xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision d41222bbcdea31b88e23614a3c2b1a0fe84fa572)
1 /*$Id: mpiov.c,v 1.26.1.76.2.22 2001/09/07 20:09:38 bsmith Exp $*/
2 /*
3    Routines to compute overlapping regions of a parallel MPI matrix
4   and to find submatrices that were shared across processors.
5 */
6 #include "src/mat/impls/aij/mpi/mpiaij.h"
7 #include "petscbt.h"
8 
9 static int MatIncreaseOverlap_MPIAIJ_Once(Mat,int,IS *);
10 static int MatIncreaseOverlap_MPIAIJ_Local(Mat,int,char **,int*,int**);
11 static int MatIncreaseOverlap_MPIAIJ_Receive(Mat,int,int **,int**,int*);
12 EXTERN int MatGetRow_MPIAIJ(Mat,int,int*,int**,PetscScalar**);
13 EXTERN int MatRestoreRow_MPIAIJ(Mat,int,int*,int**,PetscScalar**);
14 
15 #undef __FUNCT__
16 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
17 int MatIncreaseOverlap_MPIAIJ(Mat C,int imax,IS is[],int ov)
18 {
19   int i,ierr;
20 
21   PetscFunctionBegin;
22   if (ov < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
23   for (i=0; i<ov; ++i) {
24     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
25   }
26   PetscFunctionReturn(0);
27 }
28 
29 /*
30   Sample message format:
31   If a processor A wants processor B to process some elements corresponding
32   to index sets is[1],is[5]
33   mesg [0] = 2   (no of index sets in the mesg)
34   -----------
35   mesg [1] = 1 => is[1]
36   mesg [2] = sizeof(is[1]);
37   -----------
38   mesg [3] = 5  => is[5]
39   mesg [4] = sizeof(is[5]);
40   -----------
  mesg [5] ... mesg [n]   data for is[1]
  -----------
  mesg [n+1] ... mesg [m] data for is[5]
  -----------
47 
48   Notes:
49   nrqs - no of requests sent (or to be sent out)
  nrqr - no of requests received (which have to be, or which have been, processed)
51 */
52 #undef __FUNCT__
53 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
54 static int MatIncreaseOverlap_MPIAIJ_Once(Mat C,int imax,IS is[])
55 {
56   Mat_MPIAIJ  *c = (Mat_MPIAIJ*)C->data;
57   int         **idx,*n,*w1,*w2,*w3,*w4,*rtable,**data,len,*idx_i;
58   int         size,rank,m,i,j,k,ierr,**rbuf,row,proc,nrqs,msz,**outdat,**ptr;
59   int         *ctr,*pa,*tmp,nrqr,*isz,*isz1,**xdata,**rbuf2;
60   int         *onodes1,*olengths1,tag1,tag2,*onodes2,*olengths2;
61   PetscBT     *table;
62   MPI_Comm    comm;
63   MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2;
64   MPI_Status  *s_status,*recv_status;
65 
66   PetscFunctionBegin;
67   comm   = C->comm;
68   size   = c->size;
69   rank   = c->rank;
70   m      = C->M;
71 
72   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
73   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
74 
75   len    = (imax+1)*sizeof(int*)+ (imax + m)*sizeof(int);
76   ierr   = PetscMalloc(len,&idx);CHKERRQ(ierr);
77   n      = (int*)(idx + imax);
78   rtable = n + imax;
79 
80   for (i=0; i<imax; i++) {
81     ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
82     ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
83   }
84 
85   /* Create hash table for the mapping :row -> proc*/
86   for (i=0,j=0; i<size; i++) {
87     len = c->rowners[i+1];
88     for (; j<len; j++) {
89       rtable[j] = i;
90     }
91   }
92 
93   /* evaluate communication - mesg to who,length of mesg, and buffer space
94      required. Based on this, buffers are allocated, and data copied into them*/
95   ierr = PetscMalloc(size*4*sizeof(int),&w1);CHKERRQ(ierr);/*  mesg size */
96   w2   = w1 + size;       /* if w2[i] marked, then a message to proc i*/
97   w3   = w2 + size;       /* no of IS that needs to be sent to proc i */
98   w4   = w3 + size;       /* temp work space used in determining w1, w2, w3 */
99   ierr = PetscMemzero(w1,size*3*sizeof(int));CHKERRQ(ierr); /* initialise work vector*/
100   for (i=0; i<imax; i++) {
101     ierr  = PetscMemzero(w4,size*sizeof(int));CHKERRQ(ierr); /* initialise work vector*/
102     idx_i = idx[i];
103     len   = n[i];
104     for (j=0; j<len; j++) {
105       row  = idx_i[j];
106       if (row < 0) {
107         SETERRQ(1,"Index set cannot have negative entries");
108       }
109       proc = rtable[row];
110       w4[proc]++;
111     }
112     for (j=0; j<size; j++){
113       if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
114     }
115   }
116 
117   nrqs     = 0;              /* no of outgoing messages */
118   msz      = 0;              /* total mesg length (for all proc */
119   w1[rank] = 0;              /* no mesg sent to intself */
120   w3[rank] = 0;
121   for (i=0; i<size; i++) {
122     if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
123   }
124   /* pa - is list of processors to communicate with */
125   ierr = PetscMalloc((nrqs+1)*sizeof(int),&pa);CHKERRQ(ierr);
126   for (i=0,j=0; i<size; i++) {
127     if (w1[i]) {pa[j] = i; j++;}
128   }
129 
130   /* Each message would have a header = 1 + 2*(no of IS) + data */
131   for (i=0; i<nrqs; i++) {
132     j      = pa[i];
133     w1[j] += w2[j] + 2*w3[j];
134     msz   += w1[j];
135   }
136 
137   /* Determine the number of messages to expect, their lengths, from from-ids */
138   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
139   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
140 
141   /* Now post the Irecvs corresponding to these messages */
142   ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);
143 
144   /* Allocate Memory for outgoing messages */
145   len  = 2*size*sizeof(int*) + (size+msz)*sizeof(int);
146   ierr = PetscMalloc(len,&outdat);CHKERRQ(ierr);
147   ptr  = outdat + size;     /* Pointers to the data in outgoing buffers */
148   ierr = PetscMemzero(outdat,2*size*sizeof(int*));CHKERRQ(ierr);
149   tmp  = (int*)(outdat + 2*size);
150   ctr  = tmp + msz;
151 
152   {
153     int *iptr = tmp,ict  = 0;
154     for (i=0; i<nrqs; i++) {
155       j         = pa[i];
156       iptr     +=  ict;
157       outdat[j] = iptr;
158       ict       = w1[j];
159     }
160   }
161 
162   /* Form the outgoing messages */
163   /*plug in the headers*/
164   for (i=0; i<nrqs; i++) {
165     j            = pa[i];
166     outdat[j][0] = 0;
167     ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(int));CHKERRQ(ierr);
168     ptr[j]       = outdat[j] + 2*w3[j] + 1;
169   }
170 
171   /* Memory for doing local proc's work*/
172   {
173     int  *d_p;
174     char *t_p;
175 
176     len   = (imax)*(sizeof(PetscBT) + sizeof(int*)+ sizeof(int)) +
177       (m)*imax*sizeof(int)  + (m/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char) + 1;
178     ierr  = PetscMalloc(len,&table);CHKERRQ(ierr);
179     ierr  = PetscMemzero(table,len);CHKERRQ(ierr);
180     data  = (int **)(table + imax);
181     isz   = (int  *)(data  + imax);
182     d_p   = (int  *)(isz   + imax);
183     t_p   = (char *)(d_p   + m*imax);
184     for (i=0; i<imax; i++) {
185       table[i] = t_p + (m/PETSC_BITS_PER_BYTE+1)*i;
186       data[i]  = d_p + (m)*i;
187     }
188   }
189 
190   /* Parse the IS and update local tables and the outgoing buf with the data*/
191   {
192     int     n_i,*data_i,isz_i,*outdat_j,ctr_j;
193     PetscBT table_i;
194 
195     for (i=0; i<imax; i++) {
196       ierr    = PetscMemzero(ctr,size*sizeof(int));CHKERRQ(ierr);
197       n_i     = n[i];
198       table_i = table[i];
199       idx_i   = idx[i];
200       data_i  = data[i];
201       isz_i   = isz[i];
202       for (j=0;  j<n_i; j++) {  /* parse the indices of each IS */
203         row  = idx_i[j];
204         proc = rtable[row];
205         if (proc != rank) { /* copy to the outgoing buffer */
206           ctr[proc]++;
207           *ptr[proc] = row;
208           ptr[proc]++;
209         } else { /* Update the local table */
210           if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
211         }
212       }
213       /* Update the headers for the current IS */
214       for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
215         if ((ctr_j = ctr[j])) {
216           outdat_j        = outdat[j];
217           k               = ++outdat_j[0];
218           outdat_j[2*k]   = ctr_j;
219           outdat_j[2*k-1] = i;
220         }
221       }
222       isz[i] = isz_i;
223     }
224   }
225 
226 
227 
228   /*  Now  post the sends */
229   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
230   for (i=0; i<nrqs; ++i) {
231     j    = pa[i];
232     ierr = MPI_Isend(outdat[j],w1[j],MPI_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
233   }
234 
235   /* No longer need the original indices*/
236   for (i=0; i<imax; ++i) {
237     ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
238   }
239   ierr = PetscFree(idx);CHKERRQ(ierr);
240 
241   for (i=0; i<imax; ++i) {
242     ierr = ISDestroy(is[i]);CHKERRQ(ierr);
243   }
244 
245   /* Do Local work*/
246   ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);
247 
248   /* Receive messages*/
249   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);CHKERRQ(ierr);
250   ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);
251 
252   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);CHKERRQ(ierr);
253   ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);
254 
255   /* Phase 1 sends are complete - deallocate buffers */
256   ierr = PetscFree(outdat);CHKERRQ(ierr);
257   ierr = PetscFree(w1);CHKERRQ(ierr);
258 
259   ierr = PetscMalloc((nrqr+1)*sizeof(int *),&xdata);CHKERRQ(ierr);
260   ierr = PetscMalloc((nrqr+1)*sizeof(int),&isz1);CHKERRQ(ierr);
261   ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
262   ierr = PetscFree(rbuf);CHKERRQ(ierr);
263 
264 
265  /* Send the data back*/
266   /* Do a global reduction to know the buffer space req for incoming messages*/
267   {
268     int *rw1;
269 
270     ierr = PetscMalloc(size*sizeof(int),&rw1);CHKERRQ(ierr);
271     ierr = PetscMemzero(rw1,size*sizeof(int));CHKERRQ(ierr);
272 
273     for (i=0; i<nrqr; ++i) {
274       proc      = recv_status[i].MPI_SOURCE;
275       if (proc != onodes1[i]) SETERRQ(1,"MPI_SOURCE mismatch");
276       rw1[proc] = isz1[i];
277     }
278     ierr = PetscFree(onodes1);CHKERRQ(ierr);
279     ierr = PetscFree(olengths1);CHKERRQ(ierr);
280 
281     /* Determine the number of messages to expect, their lengths, from from-ids */
282     ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
283     PetscFree(rw1);
284   }
285   /* Now post the Irecvs corresponding to these messages */
286   ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);
287 
288   /*  Now  post the sends */
289   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
290   for (i=0; i<nrqr; ++i) {
291     j    = recv_status[i].MPI_SOURCE;
292     ierr = MPI_Isend(xdata[i],isz1[i],MPI_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
293   }
294 
295   /* receive work done on other processors*/
296   {
297     int         idex,is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
298     PetscBT     table_i;
299     MPI_Status  *status2;
300 
301     ierr = PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);CHKERRQ(ierr);
302     for (i=0; i<nrqs; ++i) {
303       ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
304       /* Process the message*/
305       rbuf2_i = rbuf2[idex];
306       ct1     = 2*rbuf2_i[0]+1;
307       jmax    = rbuf2[idex][0];
308       for (j=1; j<=jmax; j++) {
309         max     = rbuf2_i[2*j];
310         is_no   = rbuf2_i[2*j-1];
311         isz_i   = isz[is_no];
312         data_i  = data[is_no];
313         table_i = table[is_no];
314         for (k=0; k<max; k++,ct1++) {
315           row = rbuf2_i[ct1];
316           if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
317         }
318         isz[is_no] = isz_i;
319       }
320     }
321 
322     ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);
323     ierr = PetscFree(status2);CHKERRQ(ierr);
324   }
325 
326   for (i=0; i<imax; ++i) {
327     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],is+i);CHKERRQ(ierr);
328   }
329 
330   ierr = PetscFree(onodes2);CHKERRQ(ierr);
331   ierr = PetscFree(olengths2);CHKERRQ(ierr);
332 
333   ierr = PetscFree(pa);CHKERRQ(ierr);
334   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
335   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
336   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
337   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
338   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
339   ierr = PetscFree(table);CHKERRQ(ierr);
340   ierr = PetscFree(s_status);CHKERRQ(ierr);
341   ierr = PetscFree(recv_status);CHKERRQ(ierr);
342   ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
343   ierr = PetscFree(xdata);CHKERRQ(ierr);
344   ierr = PetscFree(isz1);CHKERRQ(ierr);
345   PetscFunctionReturn(0);
346 }
347 
348 #undef __FUNCT__
349 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
350 /*
   MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap, to do
352        the work on the local processor.
353 
354      Inputs:
355       C      - MAT_MPIAIJ;
356       imax - total no of index sets processed at a time;
357       table  - an array of char - size = m bits.
358 
     Output:
      isz    - array containing the count of the solution elements corresponding
361                to each index set;
362       data   - pointer to the solutions
363 */
364 static int MatIncreaseOverlap_MPIAIJ_Local(Mat C,int imax,PetscBT *table,int *isz,int **data)
365 {
366   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
367   Mat        A = c->A,B = c->B;
368   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
369   int        start,end,val,max,rstart,cstart,*ai,*aj;
370   int        *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
371   PetscBT    table_i;
372 
373   PetscFunctionBegin;
374   rstart = c->rstart;
375   cstart = c->cstart;
376   ai     = a->i;
377   aj     = a->j;
378   bi     = b->i;
379   bj     = b->j;
380   garray = c->garray;
381 
382 
383   for (i=0; i<imax; i++) {
384     data_i  = data[i];
385     table_i = table[i];
386     isz_i   = isz[i];
387     for (j=0,max=isz[i]; j<max; j++) {
388       row   = data_i[j] - rstart;
389       start = ai[row];
390       end   = ai[row+1];
391       for (k=start; k<end; k++) { /* Amat */
392         val = aj[k] + cstart;
393         if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
394       }
395       start = bi[row];
396       end   = bi[row+1];
397       for (k=start; k<end; k++) { /* Bmat */
398         val = garray[bj[k]];
399         if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
400       }
401     }
402     isz[i] = isz_i;
403   }
404   PetscFunctionReturn(0);
405 }
406 
407 #undef __FUNCT__
408 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
409 /*
      MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
411          and return the output
412 
413          Input:
414            C    - the matrix
415            nrqr - no of messages being processed.
           rbuf - an array of pointers to the received requests
417 
418          Output:
419            xdata - array of messages to be sent back
420            isz1  - size of each message
421 
  For better efficiency perhaps we should malloc separately each xdata[i],
then if a realloc is required we need only copy the data for that one row
rather than all previous rows as it is now, where a single large chunk of
memory is used.
426 
427 */
static int MatIncreaseOverlap_MPIAIJ_Receive(Mat C,int nrqr,int **rbuf,int **xdata,int * isz1)
{
  Mat_MPIAIJ  *c = (Mat_MPIAIJ*)C->data;
  Mat         A = c->A,B = c->B;
  Mat_SeqAIJ  *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  int         rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
  int         row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
  int         val,max1,max2,rank,m,no_malloc =0,*tmp,new_estimate,ctr;
  int         *rbuf_i,kmax,rbuf_0,ierr;
  PetscBT     xtable;

  PetscFunctionBegin;
  rank   = c->rank;
  m      = C->M;
  rstart = c->rstart;
  cstart = c->cstart;
  ai     = a->i;
  aj     = a->j;
  bi     = b->i;
  bj     = b->j;
  garray = c->garray;


  /* ct = total number of IS headers over all requests;
     total_sz = total number of requested rows */
  for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
    rbuf_i  =  rbuf[i];
    rbuf_0  =  rbuf_i[0];
    ct     += rbuf_0;
    for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
  }

  /* Initial reply-arena size: max of total request size and the average
     nonzeros per local row times the number of IS, tripled (+1) for slack.
     xdata[0] is one contiguous arena; grown on demand below. */
  if (C->m) max1 = ct*(a->nz + b->nz)/C->m;
  else      max1 = 1;
  mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
  ierr         = PetscMalloc(mem_estimate*sizeof(int),&xdata[0]);CHKERRQ(ierr);
  ++no_malloc;
  ierr         = PetscBTCreate(m,xtable);CHKERRQ(ierr);
  ierr         = PetscMemzero(isz1,nrqr*sizeof(int));CHKERRQ(ierr);

  /* ct1 = read cursor in rbuf_i; ct2 = write cursor in xdata[i];
     ct3 = total ints written into the arena; oct2 = ct2 at start of an IS */
  ct3 = 0;
  for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
    rbuf_i =  rbuf[i];
    rbuf_0 =  rbuf_i[0];
    ct1    =  2*rbuf_0+1;
    ct2    =  ct1;
    ct3    += ct1;
    for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
      /* xtable tracks which rows/columns are already in this IS's reply */
      ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
      oct2 = ct2;
      kmax = rbuf_i[2*j];
      for (k=0; k<kmax; k++,ct1++) {
        row = rbuf_i[ct1];
        if (!PetscBTLookupSet(xtable,row)) {
          if (!(ct3 < mem_estimate)) {
            /* arena full: grow 1.5x, copy, and re-aim all xdata[] pointers */
            new_estimate = (int)(1.5*mem_estimate)+1;
            ierr         = PetscMalloc(new_estimate*sizeof(int),&tmp);CHKERRQ(ierr);
            ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));CHKERRQ(ierr);
            ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
            xdata[0]     = tmp;
            mem_estimate = new_estimate; ++no_malloc;
            for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
          }
          xdata[i][ct2++] = row;
          ct3++;
        }
      }
      /* expand each freshly-copied row into its column indices */
      for (k=oct2,max2=ct2; k<max2; k++) {
        row   = xdata[i][k] - rstart;
        start = ai[row];
        end   = ai[row+1];
        for (l=start; l<end; l++) { /* diagonal-block columns, shifted by cstart */
          val = aj[l] + cstart;
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              /* same grow-and-repoint as above */
              new_estimate = (int)(1.5*mem_estimate)+1;
              ierr         = PetscMalloc(new_estimate*sizeof(int),&tmp);CHKERRQ(ierr);
              ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));CHKERRQ(ierr);
              ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
        start = bi[row];
        end   = bi[row+1];
        for (l=start; l<end; l++) { /* off-diagonal columns, mapped via garray */
          val = garray[bj[l]];
          if (!PetscBTLookupSet(xtable,val)) {
            if (!(ct3 < mem_estimate)) {
              /* same grow-and-repoint as above */
              new_estimate = (int)(1.5*mem_estimate)+1;
              ierr         = PetscMalloc(new_estimate*sizeof(int),&tmp);CHKERRQ(ierr);
              ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));CHKERRQ(ierr);
              ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
              xdata[0]     = tmp;
              mem_estimate = new_estimate; ++no_malloc;
              for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
            }
            xdata[i][ct2++] = val;
            ct3++;
          }
        }
      }
      /* Update the header*/
      xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
      xdata[i][2*j-1] = rbuf_i[2*j-1];
    }
    xdata[i][0] = rbuf_0;
    xdata[i+1]  = xdata[i] + ct2; /* replies are packed back-to-back in the arena */
    isz1[i]     = ct2; /* size of each message */
  }
  ierr = PetscBTDestroy(xtable);CHKERRQ(ierr);
  PetscLogInfo(0,"MatIncreaseOverlap_MPIAIJ:[%d] Allocated %d bytes, required %d bytes, no of mallocs = %d\n",rank,mem_estimate, ct3,no_malloc);
  PetscFunctionReturn(0);
}
544 /* -------------------------------------------------------------------------*/
545 EXTERN int MatGetSubMatrices_MPIAIJ_Local(Mat,int,const IS[],const IS[],MatReuse,Mat*);
546 EXTERN int MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
547 /*
548     Every processor gets the entire matrix
549 */
550 #undef __FUNCT__
551 #define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
/*
   Gathers the entire parallel matrix onto every process as one sequential
   matrix.  MAT_INITIAL_MATRIX builds the structure (row lengths + column
   indices) and the matrix; MAT_REUSE_MATRIX only refreshes the numerical
   values of the previously returned matrix.
*/
int MatGetSubMatrix_MPIAIJ_All(Mat A,MatReuse scall,Mat *Bin[])
{
  Mat          B;
  Mat_MPIAIJ   *a = (Mat_MPIAIJ *)A->data;
  Mat_SeqAIJ   *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
  int          ierr,sendcount,*recvcounts = 0,*displs = 0,size,i,*rstarts = a->rowners,rank,n,cnt,j;
  int          m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
  PetscScalar  *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(A->comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(A->comm,&rank);CHKERRQ(ierr);

  if (scall == MAT_INITIAL_MATRIX) {
    /* ----------------------------------------------------------------
         Tell every processor the number of nonzeros per row
    */
    ierr = PetscMalloc(A->M*sizeof(int),&lens);CHKERRQ(ierr);
    /* local row length = diagonal-block nnz + off-diagonal-block nnz */
    for (i=a->rstart; i<a->rend; i++) {
      lens[i] = ad->i[i-a->rstart+1] - ad->i[i-a->rstart] + bd->i[i-a->rstart+1] - bd->i[i-a->rstart];
    }
    sendcount = a->rend - a->rstart;
    /* one allocation: recvcounts[size] followed by displs[size] */
    ierr = PetscMalloc(2*size*sizeof(int),&recvcounts);CHKERRQ(ierr);
    displs     = recvcounts + size;
    for (i=0; i<size; i++) {
      recvcounts[i] = a->rowners[i+1] - a->rowners[i];
      displs[i]     = a->rowners[i];
    }
    ierr  = MPI_Allgatherv(lens+a->rstart,sendcount,MPI_INT,lens,recvcounts,displs,MPI_INT,A->comm);CHKERRQ(ierr);

    /* ---------------------------------------------------------------
         Create the sequential matrix of the same type as the local block diagonal
    */
    ierr  = MatCreate(PETSC_COMM_SELF,A->M,A->N,PETSC_DETERMINE,PETSC_DETERMINE,&B);CHKERRQ(ierr);
    ierr  = MatSetType(B,a->A->type_name);CHKERRQ(ierr);
    ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
    ierr  = PetscMalloc(sizeof(Mat),Bin);CHKERRQ(ierr);
    **Bin = B;
    b = (Mat_SeqAIJ *)B->data;

    /*--------------------------------------------------------------------
       Copy my part of matrix column indices over
    */
    sendcount  = ad->nz + bd->nz;
    /* write directly into B's j array at the slot for my row range */
    jsendbuf   = b->j + b->i[rstarts[rank]];
    a_jsendbuf = ad->j;
    b_jsendbuf = bd->j;
    n          = a->rend - a->rstart;
    cnt        = 0;
    /* merge the three column streams of each row into ascending global order:
       off-diagonal columns left of the diagonal block, then the diagonal
       block, then the remaining off-diagonal columns */
    for (i=0; i<n; i++) {

      /* put in lower diagonal portion */
      m = bd->i[i+1] - bd->i[i];
      while (m > 0) {
        /* is it above diagonal (in bd (compressed) numbering) */
        if (garray[*b_jsendbuf] > a->rstart + i) break;
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
        m--;
      }

      /* put in diagonal portion */
      for (j=ad->i[i]; j<ad->i[i+1]; j++) {
        jsendbuf[cnt++] = a->rstart + *a_jsendbuf++;
      }

      /* put in upper diagonal portion */
      while (m-- > 0) {
        jsendbuf[cnt++] = garray[*b_jsendbuf++];
      }
    }
    if (cnt != sendcount) SETERRQ2(1,"Corrupted PETSc matrix: nz given %d actual nz %d",sendcount,cnt);

    /*--------------------------------------------------------------------
       Gather all column indices to all processors
    */
    /* recvcounts[i] = total nonzeros owned by process i */
    for (i=0; i<size; i++) {
      recvcounts[i] = 0;
      for (j=a->rowners[i]; j<a->rowners[i+1]; j++) {
        recvcounts[i] += lens[j];
      }
    }
    displs[0]  = 0;
    for (i=1; i<size; i++) {
      displs[i] = displs[i-1] + recvcounts[i-1];
    }
    ierr = MPI_Allgatherv(jsendbuf,sendcount,MPI_INT,b->j,recvcounts,displs,MPI_INT,A->comm);CHKERRQ(ierr);

    /*--------------------------------------------------------------------
        Assemble the matrix into useable form (note numerical values not yet set)
    */
    /* set the b->ilen (length of each row) values */
    ierr = PetscMemcpy(b->ilen,lens,A->M*sizeof(int));CHKERRQ(ierr);
    /* set the b->i indices (prefix sums of the row lengths) */
    b->i[0] = 0;
    for (i=1; i<=A->M; i++) {
      b->i[i] = b->i[i-1] + lens[i-1];
    }
    ierr = PetscFree(lens);CHKERRQ(ierr);
    ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  } else {
    /* MAT_REUSE_MATRIX: structure already built on a previous call */
    B  = **Bin;
    b = (Mat_SeqAIJ *)B->data;
  }

  /*--------------------------------------------------------------------
       Copy my part of matrix numerical values into the values location
  */
  sendcount = ad->nz + bd->nz;
  sendbuf   = b->a + b->i[rstarts[rank]];
  a_sendbuf = ad->a;
  b_sendbuf = bd->a;
  b_sendj   = bd->j;
  n         = a->rend - a->rstart;
  cnt       = 0;
  /* values are merged in the same lower/diagonal/upper order as the
     column indices above, so they line up entry-for-entry */
  for (i=0; i<n; i++) {

    /* put in lower diagonal portion */
    m = bd->i[i+1] - bd->i[i];
    while (m > 0) {
      /* is it above diagonal (in bd (compressed) numbering) */
      if (garray[*b_sendj] > a->rstart + i) break;
      sendbuf[cnt++] = *b_sendbuf++;
      m--;
      b_sendj++;
    }

    /* put in diagonal portion */
    for (j=ad->i[i]; j<ad->i[i+1]; j++) {
      sendbuf[cnt++] = *a_sendbuf++;
    }

    /* put in upper diagonal portion */
    while (m-- > 0) {
      sendbuf[cnt++] = *b_sendbuf++;
      b_sendj++;
    }
  }
  if (cnt != sendcount) SETERRQ2(1,"Corrupted PETSc matrix: nz given %d actual nz %d",sendcount,cnt);

  /* -----------------------------------------------------------------
     Gather all numerical values to all processors
  */
  /* recvcounts/displs were freed only in the INITIAL path? No - they are
     NULL on the REUSE path, so reallocate here when needed */
  if (!recvcounts) {
    ierr   = PetscMalloc(2*size*sizeof(int),&recvcounts);CHKERRQ(ierr);
    displs = recvcounts + size;
  }
  for (i=0; i<size; i++) {
    recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
  }
  displs[0]  = 0;
  for (i=1; i<size; i++) {
    displs[i] = displs[i-1] + recvcounts[i-1];
  }
  recvbuf   = b->a;
  ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,A->comm);CHKERRQ(ierr);
  ierr = PetscFree(recvcounts);CHKERRQ(ierr);
  /* propagate symmetry information from the parallel matrix */
  if (A->symmetric){
    ierr = MatSetOption(B,MAT_SYMMETRIC);CHKERRQ(ierr);
  } else if (A->hermitian) {
    ierr = MatSetOption(B,MAT_HERMITIAN);CHKERRQ(ierr);
  } else if (A->structurally_symmetric) {
    ierr = MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC);CHKERRQ(ierr);
  }

  PetscFunctionReturn(0);
}
720 
721 #undef __FUNCT__
722 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
723 int MatGetSubMatrices_MPIAIJ(Mat C,int ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
724 {
725   int         nmax,nstages_local,nstages,i,pos,max_no,ierr,nrow,ncol;
726   PetscTruth  rowflag,colflag,wantallmatrix = PETSC_FALSE,twantallmatrix;
727 
728   PetscFunctionBegin;
729   /*
730        Check for special case each processor gets entire matrix
731   */
732   if (ismax == 1 && C->M == C->N) {
733     ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
734     ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
735     ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
736     ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
737     if (rowflag && colflag && nrow == C->M && ncol == C->N) {
738       wantallmatrix = PETSC_TRUE;
739       ierr = PetscOptionsGetLogical(C->prefix,"-use_fast_submatrix",&wantallmatrix,PETSC_NULL);CHKERRQ(ierr);
740     }
741   }
742   ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPI_INT,MPI_MIN,C->comm);CHKERRQ(ierr);
743   if (twantallmatrix) {
744     ierr = MatGetSubMatrix_MPIAIJ_All(C,scall,submat);CHKERRQ(ierr);
745     PetscFunctionReturn(0);
746   }
747 
748   /* Allocate memory to hold all the submatrices */
749   if (scall != MAT_REUSE_MATRIX) {
750     ierr = PetscMalloc((ismax+1)*sizeof(Mat),submat);CHKERRQ(ierr);
751   }
752   /* Determine the number of stages through which submatrices are done */
753   nmax          = 20*1000000 / (C->N * sizeof(int));
754   if (!nmax) nmax = 1;
755   nstages_local = ismax/nmax + ((ismax % nmax)?1:0);
756 
757   /* Make sure every processor loops through the nstages */
758   ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPI_INT,MPI_MAX,C->comm);CHKERRQ(ierr);
759 
760   for (i=0,pos=0; i<nstages; i++) {
761     if (pos+nmax <= ismax) max_no = nmax;
762     else if (pos == ismax) max_no = 0;
763     else                   max_no = ismax-pos;
764     ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr);
765     pos += max_no;
766   }
767   PetscFunctionReturn(0);
768 }
769 /* -------------------------------------------------------------------------*/
770 #undef __FUNCT__
771 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
772 int MatGetSubMatrices_MPIAIJ_Local(Mat C,int ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
773 {
774   Mat_MPIAIJ  *c = (Mat_MPIAIJ*)C->data;
775   Mat         A = c->A;
776   Mat_SeqAIJ  *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
777   int         **irow,**icol,*nrow,*ncol,*w1,*w2,*w3,*w4,*rtable,start,end,size;
778   int         **sbuf1,**sbuf2,rank,m,i,j,k,l,ct1,ct2,ierr,**rbuf1,row,proc;
779   int         nrqs,msz,**ptr,idex,*req_size,*ctr,*pa,*tmp,tcol,nrqr;
780   int         **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2,**rmap;
781   int         **cmap,**lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax,*irow_i;
782   int         len,ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*cmap_i,*lens_i;
783   int         *rmap_i,tag0,tag1,tag2,tag3;
784   MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
785   MPI_Request *r_waits4,*s_waits3,*s_waits4;
786   MPI_Status  *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
787   MPI_Status  *r_status3,*r_status4,*s_status4;
788   MPI_Comm    comm;
789   PetscScalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
790   PetscTruth  sorted;
791   int         *onodes1,*olengths1;
792 
793   PetscFunctionBegin;
794   comm   = C->comm;
795   tag0   = C->tag;
796   size   = c->size;
797   rank   = c->rank;
798   m      = C->M;
799 
800   /* Get some new tags to keep the communication clean */
801   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
802   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
803   ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
804 
805     /* Check if the col indices are sorted */
806   for (i=0; i<ismax; i++) {
807     ierr = ISSorted(isrow[i],&sorted);CHKERRQ(ierr);
808     if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"ISrow is not sorted");
809     ierr = ISSorted(iscol[i],&sorted);CHKERRQ(ierr);
810     /*    if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IScol is not sorted"); */
811   }
812 
813   len    = (2*ismax+1)*(sizeof(int*)+ sizeof(int)) + (m+1)*sizeof(int);
814   ierr   = PetscMalloc(len,&irow);CHKERRQ(ierr);
815   icol   = irow + ismax;
816   nrow   = (int*)(icol + ismax);
817   ncol   = nrow + ismax;
818   rtable = ncol + ismax;
819 
820   for (i=0; i<ismax; i++) {
821     ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
822     ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
823     ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
824     ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
825   }
826 
827   /* Create a lookup table for the mapping: row -> owning proc */
828   for (i=0,j=0; i<size; i++) {
829     jmax = c->rowners[i+1];
830     for (; j<jmax; j++) {
831       rtable[j] = i;
832     }
833   }
834 
835   /* evaluate communication - mesg to who, length of mesg, and buffer space
836      required. Based on this, buffers are allocated, and data copied into them*/
837   ierr   = PetscMalloc(size*4*sizeof(int),&w1);CHKERRQ(ierr); /* mesg size */
838   w2     = w1 + size;      /* if w2[i] marked, then a message to proc i*/
839   w3     = w2 + size;      /* no of IS that needs to be sent to proc i */
840   w4     = w3 + size;      /* temp work space used in determining w1, w2, w3 */
841   ierr   = PetscMemzero(w1,size*3*sizeof(int));CHKERRQ(ierr); /* initialize work vector*/
842   for (i=0; i<ismax; i++) {
843     ierr   = PetscMemzero(w4,size*sizeof(int));CHKERRQ(ierr); /* initialize work vector*/
844     jmax   = nrow[i];
845     irow_i = irow[i];
846     for (j=0; j<jmax; j++) {
847       row  = irow_i[j];
848       proc = rtable[row];
849       w4[proc]++;
850     }
851     for (j=0; j<size; j++) {
852       if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
853     }
854   }
855 
856   nrqs     = 0;              /* no of outgoing messages */
857   msz      = 0;              /* total mesg length (for all procs) */
858   w1[rank] = 0;              /* no mesg sent to self */
859   w3[rank] = 0;
860   for (i=0; i<size; i++) {
861     if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
862   }
863   ierr = PetscMalloc((nrqs+1)*sizeof(int),&pa);CHKERRQ(ierr); /*(proc -array)*/
864   for (i=0,j=0; i<size; i++) {
865     if (w1[i]) { pa[j] = i; j++; }
866   }
867 
868   /* Each message would have a header = 1 + 2*(no of IS) + data */
869   for (i=0; i<nrqs; i++) {
870     j     = pa[i];
871     w1[j] += w2[j] + 2* w3[j];
872     msz   += w1[j];
873   }
874 
875   /* Determine the number of messages to expect, their lengths, and from-ids */
876   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
877   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
878 
879   /* Now post the Irecvs corresponding to these messages */
880   ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
881 
882   ierr = PetscFree(onodes1);CHKERRQ(ierr);
883   ierr = PetscFree(olengths1);CHKERRQ(ierr);
884 
885   /* Allocate Memory for outgoing messages */
886   len      = 2*size*sizeof(int*) + 2*msz*sizeof(int) + size*sizeof(int);
887   ierr     = PetscMalloc(len,&sbuf1);CHKERRQ(ierr);
888   ptr      = sbuf1 + size;   /* Pointers to the data in outgoing buffers */
889   ierr     = PetscMemzero(sbuf1,2*size*sizeof(int*));CHKERRQ(ierr);
890   /* allocate memory for outgoing data + buf to receive the first reply */
891   tmp      = (int*)(ptr + size);
892   ctr      = tmp + 2*msz;
893 
894   {
895     int *iptr = tmp,ict = 0;
896     for (i=0; i<nrqs; i++) {
897       j         = pa[i];
898       iptr     += ict;
899       sbuf1[j]  = iptr;
900       ict       = w1[j];
901     }
902   }
903 
904   /* Form the outgoing messages */
905   /* Initialize the header space */
906   for (i=0; i<nrqs; i++) {
907     j           = pa[i];
908     sbuf1[j][0] = 0;
909     ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(int));CHKERRQ(ierr);
910     ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
911   }
912 
913   /* Parse the isrow and copy data into outbuf */
914   for (i=0; i<ismax; i++) {
915     ierr   = PetscMemzero(ctr,size*sizeof(int));CHKERRQ(ierr);
916     irow_i = irow[i];
917     jmax   = nrow[i];
918     for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
919       row  = irow_i[j];
920       proc = rtable[row];
921       if (proc != rank) { /* copy to the outgoing buf*/
922         ctr[proc]++;
923         *ptr[proc] = row;
924         ptr[proc]++;
925       }
926     }
927     /* Update the headers for the current IS */
928     for (j=0; j<size; j++) { /* Can Optimise this loop too */
929       if ((ctr_j = ctr[j])) {
930         sbuf1_j        = sbuf1[j];
931         k              = ++sbuf1_j[0];
932         sbuf1_j[2*k]   = ctr_j;
933         sbuf1_j[2*k-1] = i;
934       }
935     }
936   }
937 
938   /*  Now  post the sends */
939   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
940   for (i=0; i<nrqs; ++i) {
941     j    = pa[i];
942     ierr = MPI_Isend(sbuf1[j],w1[j],MPI_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
943   }
944 
945   /* Post Receives to capture the buffer size */
946   ierr     = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr);
947   ierr     = PetscMalloc((nrqs+1)*sizeof(int *),&rbuf2);CHKERRQ(ierr);
948   rbuf2[0] = tmp + msz;
949   for (i=1; i<nrqs; ++i) {
950     rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
951   }
952   for (i=0; i<nrqs; ++i) {
953     j    = pa[i];
954     ierr = MPI_Irecv(rbuf2[i],w1[j],MPI_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
955   }
956 
957   /* Send to other procs the buf size they should allocate */
958 
959 
960   /* Receive messages*/
961   ierr        = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
962   ierr        = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);CHKERRQ(ierr);
963   len         = 2*nrqr*sizeof(int) + (nrqr+1)*sizeof(int*);
964   ierr        = PetscMalloc(len,&sbuf2);CHKERRQ(ierr);
965   req_size    = (int*)(sbuf2 + nrqr);
966   req_source  = req_size + nrqr;
967 
968   {
969     Mat_SeqAIJ *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data;
970     int        *sAi = sA->i,*sBi = sB->i,id,rstart = c->rstart;
971     int        *sbuf2_i;
972 
973     for (i=0; i<nrqr; ++i) {
974       ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);
975       req_size[idex] = 0;
976       rbuf1_i         = rbuf1[idex];
977       start           = 2*rbuf1_i[0] + 1;
978       ierr            = MPI_Get_count(r_status1+i,MPI_INT,&end);CHKERRQ(ierr);
979       ierr            = PetscMalloc((end+1)*sizeof(int),&sbuf2[idex]);CHKERRQ(ierr);
980       sbuf2_i         = sbuf2[idex];
981       for (j=start; j<end; j++) {
982         id               = rbuf1_i[j] - rstart;
983         ncols            = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
984         sbuf2_i[j]       = ncols;
985         req_size[idex] += ncols;
986       }
987       req_source[idex] = r_status1[i].MPI_SOURCE;
988       /* form the header */
989       sbuf2_i[0]   = req_size[idex];
990       for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; }
991       ierr = MPI_Isend(sbuf2_i,end,MPI_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
992     }
993   }
994   ierr = PetscFree(r_status1);CHKERRQ(ierr);
995   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
996 
997   /*  recv buffer sizes */
998   /* Receive messages*/
999 
1000   ierr = PetscMalloc((nrqs+1)*sizeof(int*),&rbuf3);CHKERRQ(ierr);
1001   ierr = PetscMalloc((nrqs+1)*sizeof(PetscScalar*),&rbuf4);CHKERRQ(ierr);
1002   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);CHKERRQ(ierr);
1003   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);CHKERRQ(ierr);
1004   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);CHKERRQ(ierr);
1005 
1006   for (i=0; i<nrqs; ++i) {
1007     ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
1008     ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(int),&rbuf3[idex]);CHKERRQ(ierr);
1009     ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscScalar),&rbuf4[idex]);CHKERRQ(ierr);
1010     ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPI_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
1011     ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
1012   }
1013   ierr = PetscFree(r_status2);CHKERRQ(ierr);
1014   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
1015 
1016   /* Wait on sends1 and sends2 */
1017   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);CHKERRQ(ierr);
1018   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);CHKERRQ(ierr);
1019 
1020   ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);
1021   ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);
1022   ierr = PetscFree(s_status1);CHKERRQ(ierr);
1023   ierr = PetscFree(s_status2);CHKERRQ(ierr);
1024   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
1025   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
1026 
1027   /* Now allocate buffers for a->j, and send them off */
1028   ierr = PetscMalloc((nrqr+1)*sizeof(int*),&sbuf_aj);CHKERRQ(ierr);
1029   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1030   ierr = PetscMalloc((j+1)*sizeof(int),&sbuf_aj[0]);CHKERRQ(ierr);
1031   for (i=1; i<nrqr; i++)  sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1032 
1033   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);CHKERRQ(ierr);
1034   {
1035     int nzA,nzB,*a_i = a->i,*b_i = b->i,imark;
1036     int *cworkA,*cworkB,cstart = c->cstart,rstart = c->rstart,*bmap = c->garray;
1037     int *a_j = a->j,*b_j = b->j,ctmp;
1038 
1039     for (i=0; i<nrqr; i++) {
1040       rbuf1_i   = rbuf1[i];
1041       sbuf_aj_i = sbuf_aj[i];
1042       ct1       = 2*rbuf1_i[0] + 1;
1043       ct2       = 0;
1044       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1045         kmax = rbuf1[i][2*j];
1046         for (k=0; k<kmax; k++,ct1++) {
1047           row    = rbuf1_i[ct1] - rstart;
1048           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
1049           ncols  = nzA + nzB;
1050           cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
1051 
1052           /* load the column indices for this row into cols*/
1053           cols  = sbuf_aj_i + ct2;
1054 
1055           for (l=0; l<nzB; l++) {
1056             if ((ctmp = bmap[cworkB[l]]) < cstart)  cols[l] = ctmp;
1057             else break;
1058           }
1059           imark = l;
1060           for (l=0; l<nzA; l++)   cols[imark+l] = cstart + cworkA[l];
1061           for (l=imark; l<nzB; l++) cols[nzA+l] = bmap[cworkB[l]];
1062 
1063           ct2 += ncols;
1064         }
1065       }
1066       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPI_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
1067     }
1068   }
1069   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);CHKERRQ(ierr);
1070   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);CHKERRQ(ierr);
1071 
1072   /* Allocate buffers for a->a, and send them off */
1073   ierr = PetscMalloc((nrqr+1)*sizeof(PetscScalar*),&sbuf_aa);CHKERRQ(ierr);
1074   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1075   ierr = PetscMalloc((j+1)*sizeof(PetscScalar),&sbuf_aa[0]);CHKERRQ(ierr);
1076   for (i=1; i<nrqr; i++)  sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1077 
1078   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);CHKERRQ(ierr);
1079   {
1080     int    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,imark;
1081     int    cstart = c->cstart,rstart = c->rstart,*bmap = c->garray;
1082     int    *b_j = b->j;
1083     PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;
1084 
1085     for (i=0; i<nrqr; i++) {
1086       rbuf1_i   = rbuf1[i];
1087       sbuf_aa_i = sbuf_aa[i];
1088       ct1       = 2*rbuf1_i[0]+1;
1089       ct2       = 0;
1090       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1091         kmax = rbuf1_i[2*j];
1092         for (k=0; k<kmax; k++,ct1++) {
1093           row    = rbuf1_i[ct1] - rstart;
1094           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
1095           ncols  = nzA + nzB;
1096           cworkB = b_j + b_i[row];
1097           vworkA = a_a + a_i[row];
1098           vworkB = b_a + b_i[row];
1099 
1100           /* load the column values for this row into vals*/
1101           vals  = sbuf_aa_i+ct2;
1102 
1103           for (l=0; l<nzB; l++) {
1104             if ((bmap[cworkB[l]]) < cstart)  vals[l] = vworkB[l];
1105             else break;
1106           }
1107           imark = l;
1108           for (l=0; l<nzA; l++)   vals[imark+l] = vworkA[l];
1109           for (l=imark; l<nzB; l++) vals[nzA+l] = vworkB[l];
1110 
1111           ct2 += ncols;
1112         }
1113       }
1114       ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
1115     }
1116   }
1117   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);CHKERRQ(ierr);
1118   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);CHKERRQ(ierr);
1119   ierr = PetscFree(rbuf1);CHKERRQ(ierr);
1120 
1121   /* Form the matrix */
1122   /* create col map */
1123   {
1124     int *icol_i;
1125 
1126     len     = (1+ismax)*sizeof(int*)+ ismax*C->N*sizeof(int);
1127     ierr    = PetscMalloc(len,&cmap);CHKERRQ(ierr);
1128     cmap[0] = (int *)(cmap + ismax);
1129     ierr    = PetscMemzero(cmap[0],(1+ismax*C->N)*sizeof(int));CHKERRQ(ierr);
1130     for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + C->N; }
1131     for (i=0; i<ismax; i++) {
1132       jmax   = ncol[i];
1133       icol_i = icol[i];
1134       cmap_i = cmap[i];
1135       for (j=0; j<jmax; j++) {
1136         cmap_i[icol_i[j]] = j+1;
1137       }
1138     }
1139   }
1140 
1141   /* Create lens which is required for MatCreate... */
1142   for (i=0,j=0; i<ismax; i++) { j += nrow[i]; }
1143   len     = (1+ismax)*sizeof(int*)+ j*sizeof(int);
1144   ierr    = PetscMalloc(len,&lens);CHKERRQ(ierr);
1145   lens[0] = (int *)(lens + ismax);
1146   ierr    = PetscMemzero(lens[0],j*sizeof(int));CHKERRQ(ierr);
1147   for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; }
1148 
1149   /* Update lens from local data */
1150   for (i=0; i<ismax; i++) {
1151     jmax   = nrow[i];
1152     cmap_i = cmap[i];
1153     irow_i = irow[i];
1154     lens_i = lens[i];
1155     for (j=0; j<jmax; j++) {
1156       row  = irow_i[j];
1157       proc = rtable[row];
1158       if (proc == rank) {
1159         ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
1160         for (k=0; k<ncols; k++) {
1161           if (cmap_i[cols[k]]) { lens_i[j]++;}
1162         }
1163         ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
1164       }
1165     }
1166   }
1167 
1168   /* Create row map*/
1169   len     = (1+ismax)*sizeof(int*)+ ismax*C->M*sizeof(int);
1170   ierr    = PetscMalloc(len,&rmap);CHKERRQ(ierr);
1171   rmap[0] = (int *)(rmap + ismax);
1172   ierr    = PetscMemzero(rmap[0],ismax*C->M*sizeof(int));CHKERRQ(ierr);
1173   for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + C->M;}
1174   for (i=0; i<ismax; i++) {
1175     rmap_i = rmap[i];
1176     irow_i = irow[i];
1177     jmax   = nrow[i];
1178     for (j=0; j<jmax; j++) {
1179       rmap_i[irow_i[j]] = j;
1180     }
1181   }
1182 
1183   /* Update lens from offproc data */
1184   {
1185     int *rbuf2_i,*rbuf3_i,*sbuf1_i;
1186 
1187     for (tmp2=0; tmp2<nrqs; tmp2++) {
1188       ierr = MPI_Waitany(nrqs,r_waits3,&i,r_status3+tmp2);CHKERRQ(ierr);
1189       idex   = pa[i];
1190       sbuf1_i = sbuf1[idex];
1191       jmax    = sbuf1_i[0];
1192       ct1     = 2*jmax+1;
1193       ct2     = 0;
1194       rbuf2_i = rbuf2[i];
1195       rbuf3_i = rbuf3[i];
1196       for (j=1; j<=jmax; j++) {
1197         is_no   = sbuf1_i[2*j-1];
1198         max1    = sbuf1_i[2*j];
1199         lens_i  = lens[is_no];
1200         cmap_i  = cmap[is_no];
1201         rmap_i  = rmap[is_no];
1202         for (k=0; k<max1; k++,ct1++) {
1203           row  = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
1204           max2 = rbuf2_i[ct1];
1205           for (l=0; l<max2; l++,ct2++) {
1206             if (cmap_i[rbuf3_i[ct2]]) {
1207               lens_i[row]++;
1208             }
1209           }
1210         }
1211       }
1212     }
1213   }
1214   ierr = PetscFree(r_status3);CHKERRQ(ierr);
1215   ierr = PetscFree(r_waits3);CHKERRQ(ierr);
1216   ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);
1217   ierr = PetscFree(s_status3);CHKERRQ(ierr);
1218   ierr = PetscFree(s_waits3);CHKERRQ(ierr);
1219 
1220   /* Create the submatrices */
1221   if (scall == MAT_REUSE_MATRIX) {
1222     PetscTruth flag;
1223 
1224     /*
1225         Assumes new rows are same length as the old rows, hence bug!
1226     */
1227     for (i=0; i<ismax; i++) {
1228       mat = (Mat_SeqAIJ *)(submats[i]->data);
1229       if ((submats[i]->m != nrow[i]) || (submats[i]->n != ncol[i])) {
1230         SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1231       }
1232       ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->m*sizeof(int),&flag);CHKERRQ(ierr);
1233       if (flag == PETSC_FALSE) {
1234         SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
1235       }
1236       /* Initial matrix as if empty */
1237       ierr = PetscMemzero(mat->ilen,submats[i]->m*sizeof(int));CHKERRQ(ierr);
1238       submats[i]->factor = C->factor;
1239     }
1240   } else {
1241     for (i=0; i<ismax; i++) {
1242       ierr = MatCreate(PETSC_COMM_SELF,nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE,submats+i);CHKERRQ(ierr);
1243       ierr = MatSetType(submats[i],A->type_name);CHKERRQ(ierr);
1244       ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
1245     }
1246   }
1247 
1248   /* Assemble the matrices */
1249   /* First assemble the local rows */
1250   {
1251     int    ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
1252     PetscScalar *imat_a;
1253 
1254     for (i=0; i<ismax; i++) {
1255       mat       = (Mat_SeqAIJ*)submats[i]->data;
1256       imat_ilen = mat->ilen;
1257       imat_j    = mat->j;
1258       imat_i    = mat->i;
1259       imat_a    = mat->a;
1260       cmap_i    = cmap[i];
1261       rmap_i    = rmap[i];
1262       irow_i    = irow[i];
1263       jmax      = nrow[i];
1264       for (j=0; j<jmax; j++) {
1265         row      = irow_i[j];
1266         proc     = rtable[row];
1267         if (proc == rank) {
1268           old_row  = row;
1269           row      = rmap_i[row];
1270           ilen_row = imat_ilen[row];
1271           ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
1272           mat_i    = imat_i[row] ;
1273           mat_a    = imat_a + mat_i;
1274           mat_j    = imat_j + mat_i;
1275           for (k=0; k<ncols; k++) {
1276             if ((tcol = cmap_i[cols[k]])) {
1277               *mat_j++ = tcol - 1;
1278               *mat_a++ = vals[k];
1279               ilen_row++;
1280             }
1281           }
1282           ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
1283           imat_ilen[row] = ilen_row;
1284         }
1285       }
1286     }
1287   }
1288 
1289   /*   Now assemble the off proc rows*/
1290   {
1291     int    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
1292     int    *imat_j,*imat_i;
1293     PetscScalar *imat_a,*rbuf4_i;
1294 
1295     for (tmp2=0; tmp2<nrqs; tmp2++) {
1296       ierr = MPI_Waitany(nrqs,r_waits4,&i,r_status4+tmp2);CHKERRQ(ierr);
1297       idex   = pa[i];
1298       sbuf1_i = sbuf1[idex];
1299       jmax    = sbuf1_i[0];
1300       ct1     = 2*jmax + 1;
1301       ct2     = 0;
1302       rbuf2_i = rbuf2[i];
1303       rbuf3_i = rbuf3[i];
1304       rbuf4_i = rbuf4[i];
1305       for (j=1; j<=jmax; j++) {
1306         is_no     = sbuf1_i[2*j-1];
1307         rmap_i    = rmap[is_no];
1308         cmap_i    = cmap[is_no];
1309         mat       = (Mat_SeqAIJ*)submats[is_no]->data;
1310         imat_ilen = mat->ilen;
1311         imat_j    = mat->j;
1312         imat_i    = mat->i;
1313         imat_a    = mat->a;
1314         max1      = sbuf1_i[2*j];
1315         for (k=0; k<max1; k++,ct1++) {
1316           row   = sbuf1_i[ct1];
1317           row   = rmap_i[row];
1318           ilen  = imat_ilen[row];
1319           mat_i = imat_i[row] ;
1320           mat_a = imat_a + mat_i;
1321           mat_j = imat_j + mat_i;
1322           max2 = rbuf2_i[ct1];
1323           for (l=0; l<max2; l++,ct2++) {
1324             if ((tcol = cmap_i[rbuf3_i[ct2]])) {
1325               *mat_j++ = tcol - 1;
1326               *mat_a++ = rbuf4_i[ct2];
1327               ilen++;
1328             }
1329           }
1330           imat_ilen[row] = ilen;
1331         }
1332       }
1333     }
1334   }
1335   ierr = PetscFree(r_status4);CHKERRQ(ierr);
1336   ierr = PetscFree(r_waits4);CHKERRQ(ierr);
1337   ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);
1338   ierr = PetscFree(s_waits4);CHKERRQ(ierr);
1339   ierr = PetscFree(s_status4);CHKERRQ(ierr);
1340 
1341   /* Restore the indices */
1342   for (i=0; i<ismax; i++) {
1343     ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
1344     ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
1345   }
1346 
1347   /* Destroy allocated memory */
1348   ierr = PetscFree(irow);CHKERRQ(ierr);
1349   ierr = PetscFree(w1);CHKERRQ(ierr);
1350   ierr = PetscFree(pa);CHKERRQ(ierr);
1351 
1352   ierr = PetscFree(sbuf1);CHKERRQ(ierr);
1353   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
1354   for (i=0; i<nrqr; ++i) {
1355     ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
1356   }
1357   for (i=0; i<nrqs; ++i) {
1358     ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
1359     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
1360   }
1361 
1362   ierr = PetscFree(sbuf2);CHKERRQ(ierr);
1363   ierr = PetscFree(rbuf3);CHKERRQ(ierr);
1364   ierr = PetscFree(rbuf4);CHKERRQ(ierr);
1365   ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
1366   ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
1367   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
1368   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
1369 
1370   ierr = PetscFree(cmap);CHKERRQ(ierr);
1371   ierr = PetscFree(rmap);CHKERRQ(ierr);
1372   ierr = PetscFree(lens);CHKERRQ(ierr);
1373 
1374   for (i=0; i<ismax; i++) {
1375     ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1376     ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1377   }
1378   PetscFunctionReturn(0);
1379 }
1380 
1381