1 /*$Id: mpiov.c,v 1.26.1.76.2.22 2001/09/07 20:09:38 bsmith Exp $*/
2 /*
3    Routines to compute overlapping regions of a parallel MPI matrix
4    and to find submatrices that are shared across processors.
5 */
6 #include "src/mat/impls/aij/mpi/mpiaij.h"
7 #include "petscbt.h"
8 
9 static int MatIncreaseOverlap_MPIAIJ_Once(Mat,int,IS *);
10 static int MatIncreaseOverlap_MPIAIJ_Local(Mat,int,char **,int*,int**);
11 static int MatIncreaseOverlap_MPIAIJ_Receive(Mat,int,int **,int**,int*);
12 EXTERN int MatGetRow_MPIAIJ(Mat,int,int*,int**,PetscScalar**);
13 EXTERN int MatRestoreRow_MPIAIJ(Mat,int,int*,int**,PetscScalar**);
14 
15 #undef __FUNCT__
16 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
17 int MatIncreaseOverlap_MPIAIJ(Mat C,int imax,IS *is,int ov)
18 {
19   int i,ierr;
20 
21   PetscFunctionBegin;
22   if (ov < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
23   for (i=0; i<ov; ++i) {
24     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
25   }
26   PetscFunctionReturn(0);
27 }
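/*
   Usage sketch (illustrative, not part of this file): for MPIAIJ matrices the public
   MatIncreaseOverlap() dispatches to the routine above; each input IS is destroyed and
   replaced by the enlarged index set.  Assuming A is an assembled MPIAIJ matrix and
   rows[] holds nlocal locally chosen global row indices:

     IS is[1];
     ierr = ISCreateGeneral(PETSC_COMM_SELF,nlocal,rows,&is[0]);CHKERRQ(ierr);
     ierr = MatIncreaseOverlap(A,1,is,2);CHKERRQ(ierr);    (grow the overlap by 2 levels)
     ierr = ISDestroy(is[0]);CHKERRQ(ierr);
*/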
28 
29 /*
30   Sample message format:
31   If processor A wants processor B to process some elements corresponding
32   to index sets is[1],is[5]
33   mesg [0] = 2   (no of index sets in the mesg)
34   -----------
35   mesg [1] = 1 => is[1]
36   mesg [2] = sizeof(is[1]);
37   -----------
38   mesg [3] = 5  => is[5]
39   mesg [4] = sizeof(is[5]);
40   -----------
41   mesg [5] ... mesg [n]   => data for is[1]
42   -----------
43   mesg [n+1] ... mesg [m] => data for is[5]
44   -----------
45 
46   Notes:
47   nrqs - no of requests sent (or to be sent out)
48   nrqr - no of requests received (which have to be, or which have been,
49          processed)
50 
51 */
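/*
   Illustrative example of the format above (hypothetical numbers): if processor A asks
   processor B to process rows {3,7} for is[1] and rows {0,9,12} for is[5], the outgoing
   message is

     mesg = [ 2,  1, 2,  5, 3,  3, 7,  0, 9, 12 ]
             no.  hdr    hdr    data   data
             IS   is[1]  is[5]  is[1]  is[5]
*/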
52 #undef __FUNCT__
53 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
54 static int MatIncreaseOverlap_MPIAIJ_Once(Mat C,int imax,IS *is)
55 {
56   Mat_MPIAIJ  *c = (Mat_MPIAIJ*)C->data;
57   int         **idx,*n,*w1,*w2,*w3,*w4,*rtable,**data,len,*idx_i;
58   int         size,rank,m,i,j,k,ierr,**rbuf,row,proc,nrqs,msz,**outdat,**ptr;
59   int         *ctr,*pa,*tmp,nrqr,*isz,*isz1,**xdata,**rbuf2;
60   int         *onodes1,*olengths1,tag1,tag2,*onodes2,*olengths2;
61   PetscBT     *table;
62   MPI_Comm    comm;
63   MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2;
64   MPI_Status  *s_status,*recv_status;
65 
66   PetscFunctionBegin;
67   comm   = C->comm;
68   size   = c->size;
69   rank   = c->rank;
70   m      = C->M;
71 
72   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
73   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
74 
75   len    = (imax+1)*sizeof(int*)+ (imax + m)*sizeof(int);
76   ierr   = PetscMalloc(len,&idx);CHKERRQ(ierr);
77   n      = (int*)(idx + imax);
78   rtable = n + imax;
79 
80   for (i=0; i<imax; i++) {
81     ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
82     ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
83   }
84 
85   /* Create a lookup table for the mapping: row -> proc */
86   for (i=0,j=0; i<size; i++) {
87     len = c->rowners[i+1];
88     for (; j<len; j++) {
89       rtable[j] = i;
90     }
91   }
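  /* Example (illustrative sizes): with 2 processes owning rows [0,5) and [5,10), the
     table built above is rtable = {0,0,0,0,0,1,1,1,1,1}, i.e. rtable[row] is the rank
     that owns global row "row". */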
92 
93   /* evaluate communication - mesg to who, length of mesg, and buffer space
94      required. Based on this, buffers are allocated, and data copied into them*/
95   ierr = PetscMalloc(size*4*sizeof(int),&w1);CHKERRQ(ierr);/*  mesg size */
96   w2   = w1 + size;       /* if w2[i] marked, then a message to proc i*/
97   w3   = w2 + size;       /* no of IS that needs to be sent to proc i */
98   w4   = w3 + size;       /* temp work space used in determining w1, w2, w3 */
99   ierr = PetscMemzero(w1,size*3*sizeof(int));CHKERRQ(ierr); /* initialise work vector*/
100   for (i=0; i<imax; i++) {
101     ierr  = PetscMemzero(w4,size*sizeof(int));CHKERRQ(ierr); /* initialise work vector*/
102     idx_i = idx[i];
103     len   = n[i];
104     for (j=0; j<len; j++) {
105       row  = idx_i[j];
106       if (row < 0) {
107         SETERRQ(1,"Index set cannot have negative entries");
108       }
109       proc = rtable[row];
110       w4[proc]++;
111     }
112     for (j=0; j<size; j++){
113       if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
114     }
115   }
116 
117   nrqs     = 0;              /* no of outgoing messages */
118   msz      = 0;              /* total mesg length (for all procs) */
119   w1[rank] = 0;              /* no mesg sent to itself */
120   w3[rank] = 0;
121   for (i=0; i<size; i++) {
122     if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
123   }
124   /* pa - is list of processors to communicate with */
125   ierr = PetscMalloc((nrqs+1)*sizeof(int),&pa);CHKERRQ(ierr);
126   for (i=0,j=0; i<size; i++) {
127     if (w1[i]) {pa[j] = i; j++;}
128   }
129 
130   /* Each message would have a header = 1 + 2*(no of IS) + data */
131   for (i=0; i<nrqs; i++) {
132     j      = pa[i];
133     w1[j] += w2[j] + 2*w3[j];
134     msz   += w1[j];
135   }
136 
137   /* Determine the number of messages to expect, their lengths, and from-ids */
138   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
139   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
140 
141   /* Now post the Irecvs corresponding to these messages */
142   ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);
143 
144   /* Allocate Memory for outgoing messages */
145   len  = 2*size*sizeof(int*) + (size+msz)*sizeof(int);
146   ierr = PetscMalloc(len,&outdat);CHKERRQ(ierr);
147   ptr  = outdat + size;     /* Pointers to the data in outgoing buffers */
148   ierr = PetscMemzero(outdat,2*size*sizeof(int*));CHKERRQ(ierr);
149   tmp  = (int*)(outdat + 2*size);
150   ctr  = tmp + msz;
151 
152   {
153     int *iptr = tmp,ict  = 0;
154     for (i=0; i<nrqs; i++) {
155       j         = pa[i];
156       iptr     +=  ict;
157       outdat[j] = iptr;
158       ict       = w1[j];
159     }
160   }
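  /* After this loop the outgoing messages are laid out back to back in the single
     buffer tmp: outdat[pa[0]] starts at tmp, and each subsequent destination starts
     w1[previous destination] ints further along. */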
161 
162   /* Form the outgoing messages */
163   /*plug in the headers*/
164   for (i=0; i<nrqs; i++) {
165     j            = pa[i];
166     outdat[j][0] = 0;
167     ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(int));CHKERRQ(ierr);
168     ptr[j]       = outdat[j] + 2*w3[j] + 1;
169   }
170 
171   /* Memory for doing local proc's work*/
172   {
173     int  *d_p;
174     char *t_p;
175 
176     len   = (imax)*(sizeof(PetscBT) + sizeof(int*)+ sizeof(int)) +
177       (m)*imax*sizeof(int)  + (m/BITSPERBYTE+1)*imax*sizeof(char) + 1;
178     ierr  = PetscMalloc(len,&table);CHKERRQ(ierr);
179     ierr  = PetscMemzero(table,len);CHKERRQ(ierr);
180     data  = (int **)(table + imax);
181     isz   = (int  *)(data  + imax);
182     d_p   = (int  *)(isz   + imax);
183     t_p   = (char *)(d_p   + m*imax);
184     for (i=0; i<imax; i++) {
185       table[i] = t_p + (m/BITSPERBYTE+1)*i;
186       data[i]  = d_p + (m)*i;
187     }
188   }
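  /* Layout of the scratch space allocated above: for each index set i, table[i] is a
     bit array of m bits marking which global rows are already in the (growing) set,
     data[i] is an array of up to m ints holding those rows, and isz[i] counts how many
     entries of data[i] are currently in use. */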
189 
190   /* Parse the IS and update local tables and the outgoing buf with the data*/
191   {
192     int     n_i,*data_i,isz_i,*outdat_j,ctr_j;
193     PetscBT table_i;
194 
195     for (i=0; i<imax; i++) {
196       ierr    = PetscMemzero(ctr,size*sizeof(int));CHKERRQ(ierr);
197       n_i     = n[i];
198       table_i = table[i];
199       idx_i   = idx[i];
200       data_i  = data[i];
201       isz_i   = isz[i];
202       for (j=0;  j<n_i; j++) {  /* parse the indices of each IS */
203         row  = idx_i[j];
204         proc = rtable[row];
205         if (proc != rank) { /* copy to the outgoing buffer */
206           ctr[proc]++;
207           *ptr[proc] = row;
208           ptr[proc]++;
209         } else { /* Update the local table */
210           if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
211         }
212       }
213       /* Update the headers for the current IS */
214       for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
215         if ((ctr_j = ctr[j])) {
216           outdat_j        = outdat[j];
217           k               = ++outdat_j[0];
218           outdat_j[2*k]   = ctr_j;
219           outdat_j[2*k-1] = i;
220         }
221       }
222       isz[i] = isz_i;
223     }
224   }
225 
226 
227 
228   /*  Now  post the sends */
229   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
230   for (i=0; i<nrqs; ++i) {
231     j    = pa[i];
232     ierr = MPI_Isend(outdat[j],w1[j],MPI_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
233   }
234 
235   /* No longer need the original indices*/
236   for (i=0; i<imax; ++i) {
237     ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
238   }
239   ierr = PetscFree(idx);CHKERRQ(ierr);
240 
241   for (i=0; i<imax; ++i) {
242     ierr = ISDestroy(is[i]);CHKERRQ(ierr);
243   }
244 
245   /* Do Local work*/
246   ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);
247 
248   /* Receive messages*/
249   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);CHKERRQ(ierr);
250   ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);
251 
252   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);CHKERRQ(ierr);
253   ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);
254 
255   /* Phase 1 sends are complete - deallocate buffers */
256   ierr = PetscFree(outdat);CHKERRQ(ierr);
257   ierr = PetscFree(w1);CHKERRQ(ierr);
258 
259   ierr = PetscMalloc((nrqr+1)*sizeof(int *),&xdata);CHKERRQ(ierr);
260   ierr = PetscMalloc((nrqr+1)*sizeof(int),&isz1);CHKERRQ(ierr);
261   ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
262   ierr = PetscFree(rbuf);CHKERRQ(ierr);
263 
264 
265   /* Send the data back */
266   /* Do a global reduction to know the buffer space required for incoming messages */
267   {
268     int *rw1;
269 
270     ierr = PetscMalloc(size*sizeof(int),&rw1);CHKERRQ(ierr);
271     ierr = PetscMemzero(rw1,size*sizeof(int));CHKERRQ(ierr);
272 
273     for (i=0; i<nrqr; ++i) {
274       proc      = recv_status[i].MPI_SOURCE;
275       if (proc != onodes1[i]) SETERRQ(1,"MPI_SOURCE mismatch");
276       rw1[proc] = isz1[i];
277     }
278     ierr = PetscFree(onodes1);CHKERRQ(ierr);
279     ierr = PetscFree(olengths1);CHKERRQ(ierr);
280 
281     /* Determine the number of messages to expect, their lengths, and from-ids */
282     ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
283     ierr = PetscFree(rw1);CHKERRQ(ierr);
284   }
285   /* Now post the Irecvs corresponding to these messages */
286   ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);
287 
288   /*  Now  post the sends */
289   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
290   for (i=0; i<nrqr; ++i) {
291     j    = recv_status[i].MPI_SOURCE;
292     ierr = MPI_Isend(xdata[i],isz1[i],MPI_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
293   }
294 
295   /* receive work done on other processors*/
296   {
297     int         index,is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
298     PetscBT     table_i;
299     MPI_Status  *status2;
300 
301     ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&status2);CHKERRQ(ierr);
302     for (i=0; i<nrqs; ++i) {
303       ierr = MPI_Waitany(nrqs,r_waits2,&index,status2+i);CHKERRQ(ierr);
304       /* Process the message*/
305       rbuf2_i = rbuf2[index];
306       ct1     = 2*rbuf2_i[0]+1;
307       jmax    = rbuf2[index][0];
308       for (j=1; j<=jmax; j++) {
309         max     = rbuf2_i[2*j];
310         is_no   = rbuf2_i[2*j-1];
311         isz_i   = isz[is_no];
312         data_i  = data[is_no];
313         table_i = table[is_no];
314         for (k=0; k<max; k++,ct1++) {
315           row = rbuf2_i[ct1];
316           if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
317         }
318         isz[is_no] = isz_i;
319       }
320     }
321 
322     ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);
323     ierr = PetscFree(status2);CHKERRQ(ierr);
324   }
325 
326   for (i=0; i<imax; ++i) {
327     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],is+i);CHKERRQ(ierr);
328   }
329 
330   ierr = PetscFree(onodes2);CHKERRQ(ierr);
331   ierr = PetscFree(olengths2);CHKERRQ(ierr);
332 
333   ierr = PetscFree(pa);CHKERRQ(ierr);
334   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
335   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
336   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
337   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
338   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
339   ierr = PetscFree(table);CHKERRQ(ierr);
340   ierr = PetscFree(s_status);CHKERRQ(ierr);
341   ierr = PetscFree(recv_status);CHKERRQ(ierr);
342   ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
343   ierr = PetscFree(xdata);CHKERRQ(ierr);
344   ierr = PetscFree(isz1);CHKERRQ(ierr);
345   PetscFunctionReturn(0);
346 }
347 
348 #undef __FUNCT__
349 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
350 /*
351    MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap_MPIAIJ, to do
352        the work on the local processor.
353 
354      Inputs:
355       C      - the matrix (of type MPIAIJ)
356       imax   - total no of index sets processed at a time
357       table  - an array of PetscBT, each of size m bits
358 
359      Output:
360       isz    - array containing the count of the solution elements corresponding
361                to each index set
362       data   - pointer to the solutions
363 */
364 static int MatIncreaseOverlap_MPIAIJ_Local(Mat C,int imax,PetscBT *table,int *isz,int **data)
365 {
366   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
367   Mat        A = c->A,B = c->B;
368   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
369   int        start,end,val,max,rstart,cstart,ashift,bshift,*ai,*aj;
370   int        *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
371   PetscBT    table_i;
372 
373   PetscFunctionBegin;
374   rstart = c->rstart;
375   cstart = c->cstart;
376   ashift = a->indexshift;
377   ai     = a->i;
378   aj     = a->j +ashift;
379   bshift = b->indexshift;
380   bi     = b->i;
381   bj     = b->j +bshift;
382   garray = c->garray;
383 
384 
385   for (i=0; i<imax; i++) {
386     data_i  = data[i];
387     table_i = table[i];
388     isz_i   = isz[i];
389     for (j=0,max=isz[i]; j<max; j++) {
390       row   = data_i[j] - rstart;
391       start = ai[row];
392       end   = ai[row+1];
393       for (k=start; k<end; k++) { /* Amat */
394         val = aj[k] + ashift + cstart;
395         if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
396       }
397       start = bi[row];
398       end   = bi[row+1];
399       for (k=start; k<end; k++) { /* Bmat */
400         val = garray[bj[k]+bshift];
401         if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
402       }
403     }
404     isz[i] = isz_i;
405   }
406   PetscFunctionReturn(0);
407 }
408 
409 #undef __FUNCT__
410 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
411 /*
412       MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
413          and return the output
414 
415          Input:
416            C    - the matrix
417            nrqr - no of messages being processed
418            rbuf - an array of pointers to the received requests
419 
420          Output:
421            xdata - array of messages to be sent back
422            isz1  - size of each message
423 
424   For better efficiency perhaps we should malloc each xdata[i] separately;
425 then if a realloc is required we need only copy the data for that one
426 message rather than all previous ones, as is done now where a single large
427 chunk of memory is used.
428 
429 */
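/*
   Sketch of the reallocation strategy used below (for orientation only): all replies
   share the single allocation xdata[0]; whenever the running count ct3 reaches the
   current mem_estimate the buffer is grown by roughly 1.5x and the per-message
   pointers xdata[1..i] are re-based into the new buffer:

     new_estimate = (int)(1.5*mem_estimate) + 1;
     ierr = PetscMalloc(new_estimate*sizeof(int),&tmp);CHKERRQ(ierr);
     ierr = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));CHKERRQ(ierr);
     ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
     xdata[0] = tmp; mem_estimate = new_estimate;
     for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
*/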
430 static int MatIncreaseOverlap_MPIAIJ_Receive(Mat C,int nrqr,int **rbuf,int **xdata,int * isz1)
431 {
432   Mat_MPIAIJ  *c = (Mat_MPIAIJ*)C->data;
433   Mat         A = c->A,B = c->B;
434   Mat_SeqAIJ  *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
435   int         rstart,cstart,ashift,bshift,*ai,*aj,*bi,*bj,*garray,i,j,k;
436   int         row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
437   int         val,max1,max2,rank,m,no_malloc =0,*tmp,new_estimate,ctr;
438   int         *rbuf_i,kmax,rbuf_0,ierr;
439   PetscBT     xtable;
440 
441   PetscFunctionBegin;
442   rank   = c->rank;
443   m      = C->M;
444   rstart = c->rstart;
445   cstart = c->cstart;
446   ashift = a->indexshift;
447   ai     = a->i;
448   aj     = a->j +ashift;
449   bshift = b->indexshift;
450   bi     = b->i;
451   bj     = b->j +bshift;
452   garray = c->garray;
453 
454 
455   for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
456     rbuf_i  =  rbuf[i];
457     rbuf_0  =  rbuf_i[0];
458     ct     += rbuf_0;
459     for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
460   }
461 
462   if (C->m) max1 = ct*(a->nz + b->nz)/C->m;
463   else      max1 = 1;
464   mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
465   ierr         = PetscMalloc(mem_estimate*sizeof(int),&xdata[0]);CHKERRQ(ierr);
466   ++no_malloc;
467   ierr         = PetscBTCreate(m,xtable);CHKERRQ(ierr);
468   ierr         = PetscMemzero(isz1,nrqr*sizeof(int));CHKERRQ(ierr);
469 
470   ct3 = 0;
471   for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
472     rbuf_i =  rbuf[i];
473     rbuf_0 =  rbuf_i[0];
474     ct1    =  2*rbuf_0+1;
475     ct2    =  ct1;
476     ct3    += ct1;
477     for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
478       ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
479       oct2 = ct2;
480       kmax = rbuf_i[2*j];
481       for (k=0; k<kmax; k++,ct1++) {
482         row = rbuf_i[ct1];
483         if (!PetscBTLookupSet(xtable,row)) {
484           if (!(ct3 < mem_estimate)) {
485             new_estimate = (int)(1.5*mem_estimate)+1;
486             ierr         = PetscMalloc(new_estimate*sizeof(int),&tmp);CHKERRQ(ierr);
487             ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));CHKERRQ(ierr);
488             ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
489             xdata[0]     = tmp;
490             mem_estimate = new_estimate; ++no_malloc;
491             for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
492           }
493           xdata[i][ct2++] = row;
494           ct3++;
495         }
496       }
497       for (k=oct2,max2=ct2; k<max2; k++) {
498         row   = xdata[i][k] - rstart;
499         start = ai[row];
500         end   = ai[row+1];
501         for (l=start; l<end; l++) {
502           val = aj[l] + ashift + cstart;
503           if (!PetscBTLookupSet(xtable,val)) {
504             if (!(ct3 < mem_estimate)) {
505               new_estimate = (int)(1.5*mem_estimate)+1;
506               ierr         = PetscMalloc(new_estimate*sizeof(int),&tmp);CHKERRQ(ierr);
507               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));CHKERRQ(ierr);
508               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
509               xdata[0]     = tmp;
510               mem_estimate = new_estimate; ++no_malloc;
511               for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
512             }
513             xdata[i][ct2++] = val;
514             ct3++;
515           }
516         }
517         start = bi[row];
518         end   = bi[row+1];
519         for (l=start; l<end; l++) {
520           val = garray[bj[l]+bshift];
521           if (!PetscBTLookupSet(xtable,val)) {
522             if (!(ct3 < mem_estimate)) {
523               new_estimate = (int)(1.5*mem_estimate)+1;
524               ierr         = PetscMalloc(new_estimate*sizeof(int),&tmp);CHKERRQ(ierr);
525               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));CHKERRQ(ierr);
526               ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
527               xdata[0]     = tmp;
528               mem_estimate = new_estimate; ++no_malloc;
529               for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
530             }
531             xdata[i][ct2++] = val;
532             ct3++;
533           }
534         }
535       }
536       /* Update the header*/
537       xdata[i][2*j]   = ct2 - oct2; /* TODO: the vector isz1 could be replaced by a single variable */
538       xdata[i][2*j-1] = rbuf_i[2*j-1];
539     }
540     xdata[i][0] = rbuf_0;
541     xdata[i+1]  = xdata[i] + ct2;
542     isz1[i]     = ct2; /* size of each message */
543   }
544   ierr = PetscBTDestroy(xtable);CHKERRQ(ierr);
545   PetscLogInfo(0,"MatIncreaseOverlap_MPIAIJ:[%d] Allocated %d ints, required %d ints, no of mallocs = %d\n",rank,mem_estimate,ct3,no_malloc);
546   PetscFunctionReturn(0);
547 }
548 /* -------------------------------------------------------------------------*/
549 EXTERN int MatGetSubMatrices_MPIAIJ_Local(Mat,int,IS*,IS*,MatReuse,Mat*);
550 EXTERN int MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
551 /*
552     Every processor gets the entire matrix
553 */
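/*
   Summary of the routine below: with MAT_INITIAL_MATRIX the row lengths and column
   indices of the whole matrix are gathered onto every process with MPI_Allgatherv()
   and a sequential AIJ matrix of size M x N is created once; on every call (including
   MAT_REUSE_MATRIX) only the numerical values are re-gathered into that matrix with a
   second MPI_Allgatherv().
*/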
554 #undef __FUNCT__
555 #define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
556 int MatGetSubMatrix_MPIAIJ_All(Mat A,MatReuse scall,Mat **Bin)
557 {
558   Mat          B;
559   Mat_MPIAIJ   *a = (Mat_MPIAIJ *)A->data;
560   Mat_SeqAIJ   *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
561   int          ierr,sendcount,*recvcounts = 0,*displs = 0,size,i,*rstarts = a->rowners,rank,n,cnt,j;
562   int          m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
563   PetscScalar  *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;
564 
565   PetscFunctionBegin;
566   ierr = MPI_Comm_size(A->comm,&size);CHKERRQ(ierr);
567   ierr = MPI_Comm_rank(A->comm,&rank);CHKERRQ(ierr);
568 
569   if (scall == MAT_INITIAL_MATRIX) {
570     /* ----------------------------------------------------------------
571          Tell every processor the number of nonzeros per row
572     */
573     ierr = PetscMalloc(A->M*sizeof(int),&lens);CHKERRQ(ierr);
574     for (i=a->rstart; i<a->rend; i++) {
575       lens[i] = ad->i[i-a->rstart+1] - ad->i[i-a->rstart] + bd->i[i-a->rstart+1] - bd->i[i-a->rstart];
576     }
577     sendcount = a->rend - a->rstart;
578     ierr = PetscMalloc(2*size*sizeof(int),&recvcounts);CHKERRQ(ierr);
579     displs     = recvcounts + size;
580     for (i=0; i<size; i++) {
581       recvcounts[i] = a->rowners[i+1] - a->rowners[i];
582       displs[i]     = a->rowners[i];
583     }
584     ierr  = MPI_Allgatherv(lens+a->rstart,sendcount,MPI_INT,lens,recvcounts,displs,MPI_INT,A->comm);CHKERRQ(ierr);
585 
586     /* ---------------------------------------------------------------
587          Create the sequential matrix
588     */
589     ierr  = MatCreateSeqAIJ(PETSC_COMM_SELF,A->M,A->N,0,lens,&B);CHKERRQ(ierr);
590     ierr  = PetscMalloc(sizeof(Mat),Bin);CHKERRQ(ierr);
591     **Bin = B;
592     b = (Mat_SeqAIJ *)B->data;
593 
594     /*--------------------------------------------------------------------
595        Copy my part of matrix column indices over
596     */
597     sendcount  = ad->nz + bd->nz;
598     jsendbuf   = b->j + b->i[rstarts[rank]];
599     a_jsendbuf = ad->j;
600     b_jsendbuf = bd->j;
601     n          = a->rend - a->rstart;
602     cnt        = 0;
603     for (i=0; i<n; i++) {
604 
605       /* put in lower diagonal portion */
606       m = bd->i[i+1] - bd->i[i];
607       while (m > 0) {
608         /* is it above diagonal (in bd (compressed) numbering) */
609         if (garray[*b_jsendbuf] > a->rstart + i) break;
610         jsendbuf[cnt++] = garray[*b_jsendbuf++];
611         m--;
612       }
613 
614       /* put in diagonal portion */
615       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
616         jsendbuf[cnt++] = a->rstart + *a_jsendbuf++;
617       }
618 
619       /* put in upper diagonal portion */
620       while (m-- > 0) {
621         jsendbuf[cnt++] = garray[*b_jsendbuf++];
622       }
623     }
624     if (cnt != sendcount) SETERRQ2(1,"Corrupted PETSc matrix: nz given %d actual nz %d",sendcount,cnt);
625 
626     /*--------------------------------------------------------------------
627        Gather all column indices to all processors
628     */
629     for (i=0; i<size; i++) {
630       recvcounts[i] = 0;
631       for (j=a->rowners[i]; j<a->rowners[i+1]; j++) {
632         recvcounts[i] += lens[j];
633       }
634     }
635     displs[0]  = 0;
636     for (i=1; i<size; i++) {
637       displs[i] = displs[i-1] + recvcounts[i-1];
638     }
639     ierr = MPI_Allgatherv(jsendbuf,sendcount,MPI_INT,b->j,recvcounts,displs,MPI_INT,A->comm);CHKERRQ(ierr);
640 
641     /*--------------------------------------------------------------------
642         Assemble the matrix into usable form (note that the numerical values are not yet set)
643     */
644     /* set the b->ilen (length of each row) values */
645     ierr = PetscMemcpy(b->ilen,lens,A->M*sizeof(int));CHKERRQ(ierr);
646     /* set the b->i indices */
647     b->i[0] = 0;
648     for (i=1; i<=A->M; i++) {
649       b->i[i] = b->i[i-1] + lens[i-1];
650     }
651     ierr = PetscFree(lens);CHKERRQ(ierr);
652     ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
653     ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
654   } else {
655     B  = **Bin;
656     b = (Mat_SeqAIJ *)B->data;
657   }
658 
659   /*--------------------------------------------------------------------
660        Copy my part of matrix numerical values into the values location
661   */
662   sendcount = ad->nz + bd->nz;
663   sendbuf   = b->a + b->i[rstarts[rank]];
664   a_sendbuf = ad->a;
665   b_sendbuf = bd->a;
666   b_sendj   = bd->j;
667   n         = a->rend - a->rstart;
668   cnt       = 0;
669   for (i=0; i<n; i++) {
670 
671     /* put in lower diagonal portion */
672     m = bd->i[i+1] - bd->i[i];
673     while (m > 0) {
674       /* is it above diagonal (in bd (compressed) numbering) */
675       if (garray[*b_sendj] > a->rstart + i) break;
676       sendbuf[cnt++] = *b_sendbuf++;
677       m--;
678       b_sendj++;
679     }
680 
681     /* put in diagonal portion */
682     for (j=ad->i[i]; j<ad->i[i+1]; j++) {
683       sendbuf[cnt++] = *a_sendbuf++;
684     }
685 
686     /* put in upper diagonal portion */
687     while (m-- > 0) {
688       sendbuf[cnt++] = *b_sendbuf++;
689       b_sendj++;
690     }
691   }
692   if (cnt != sendcount) SETERRQ2(1,"Corrupted PETSc matrix: nz given %d actual nz %d",sendcount,cnt);
693 
694   /* -----------------------------------------------------------------
695      Gather all numerical values to all processors
696   */
697   if (!recvcounts) {
698     ierr   = PetscMalloc(2*size*sizeof(int),&recvcounts);CHKERRQ(ierr);
699     displs = recvcounts + size;
700   }
701   for (i=0; i<size; i++) {
702     recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
703   }
704   displs[0]  = 0;
705   for (i=1; i<size; i++) {
706     displs[i] = displs[i-1] + recvcounts[i-1];
707   }
708   recvbuf   = b->a;
709   ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,A->comm);CHKERRQ(ierr);
710   ierr = PetscFree(recvcounts);CHKERRQ(ierr);
711 
712   PetscFunctionReturn(0);
713 }
714 
715 #undef __FUNCT__
716 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
717 int MatGetSubMatrices_MPIAIJ(Mat C,int ismax,IS *isrow,IS *iscol,MatReuse scall,Mat **submat)
718 {
719   int         nmax,nstages_local,nstages,i,pos,max_no,ierr,nrow,ncol;
720   PetscTruth  rowflag,colflag,wantallmatrix = PETSC_FALSE,twantallmatrix;
721 
722   PetscFunctionBegin;
723   /*
724        Check for the special case where each processor gets the entire matrix
725   */
726   if (ismax == 1 && C->M == C->N) {
727     ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
728     ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
729     ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
730     ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
731     if (rowflag && colflag && nrow == C->M && ncol == C->N) {
732       wantallmatrix = PETSC_TRUE;
733       ierr = PetscOptionsGetLogical(C->prefix,"-use_fast_submatrix",&wantallmatrix,PETSC_NULL);CHKERRQ(ierr);
734     }
735   }
736   ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPI_INT,MPI_MIN,C->comm);CHKERRQ(ierr);
737   if (twantallmatrix) {
738     ierr = MatGetSubMatrix_MPIAIJ_All(C,scall,submat);CHKERRQ(ierr);
739     PetscFunctionReturn(0);
740   }
741 
742   /* Allocate memory to hold all the submatrices */
743   if (scall != MAT_REUSE_MATRIX) {
744     ierr = PetscMalloc((ismax+1)*sizeof(Mat),submat);CHKERRQ(ierr);
745   }
746   /* Determine the number of stages through which submatrices are done */
747   nmax          = 20*1000000 / (C->N * sizeof(int));
748   if (!nmax) nmax = 1;
749   nstages_local = ismax/nmax + ((ismax % nmax)?1:0);
750 
751   /* Make sure every processor loops through the nstages */
752   ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPI_INT,MPI_MAX,C->comm);CHKERRQ(ierr);
753 
754   for (i=0,pos=0; i<nstages; i++) {
755     if (pos+nmax <= ismax) max_no = nmax;
756     else if (pos == ismax) max_no = 0;
757     else                   max_no = ismax-pos;
758     ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr);
759     pos += max_no;
760   }
761   PetscFunctionReturn(0);
762 }
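/*
   Usage sketch (illustrative, not part of this file): the public MatGetSubMatrices()
   dispatches to the routine above for MPIAIJ matrices.  Each process passes its own
   (sorted) row index sets and column index sets and receives the corresponding
   sequential submatrices:

     Mat *submat;
     ierr = MatGetSubMatrices(C,n,isrow,iscol,MAT_INITIAL_MATRIX,&submat);CHKERRQ(ierr);
         ... use submat[0..n-1], each a SeqAIJ matrix ...
     ierr = MatGetSubMatrices(C,n,isrow,iscol,MAT_REUSE_MATRIX,&submat);CHKERRQ(ierr);
*/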
763 /* -------------------------------------------------------------------------*/
764 #undef __FUNCT__
765 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
766 int MatGetSubMatrices_MPIAIJ_Local(Mat C,int ismax,IS *isrow,IS *iscol,MatReuse scall,Mat *submats)
767 {
768   Mat_MPIAIJ  *c = (Mat_MPIAIJ*)C->data;
769   Mat         A = c->A;
770   Mat_SeqAIJ  *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
771   int         **irow,**icol,*nrow,*ncol,*w1,*w2,*w3,*w4,*rtable,start,end,size;
772   int         **sbuf1,**sbuf2,rank,m,i,j,k,l,ct1,ct2,ierr,**rbuf1,row,proc;
773   int         nrqs,msz,**ptr,index,*req_size,*ctr,*pa,*tmp,tcol,nrqr;
774   int         **rbuf3,*req_source,**sbuf_aj,ashift,**rbuf2,max1,max2,**rmap;
775   int         **cmap,**lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax,*irow_i;
776   int         len,ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*cmap_i,*lens_i;
777   int         *rmap_i,tag0,tag1,tag2,tag3;
778   MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
779   MPI_Request *r_waits4,*s_waits3,*s_waits4;
780   MPI_Status  *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
781   MPI_Status  *r_status3,*r_status4,*s_status4;
782   MPI_Comm    comm;
783   PetscScalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
784   PetscTruth  sorted;
785   int         *onodes1,*olengths1;
786 
787   PetscFunctionBegin;
788   comm   = C->comm;
789   tag0   = C->tag;
790   size   = c->size;
791   rank   = c->rank;
792   m      = C->M;
793   ashift = a->indexshift;
794 
795   /* Get some new tags to keep the communication clean */
796   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
797   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
798   ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
799 
800   /* Check if the row and col indices are sorted */
801   for (i=0; i<ismax; i++) {
802     ierr = ISSorted(isrow[i],&sorted);CHKERRQ(ierr);
803     if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"ISrow is not sorted");
804     ierr = ISSorted(iscol[i],&sorted);CHKERRQ(ierr);
805     /*    if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IScol is not sorted"); */
806   }
807 
808   len    = (2*ismax+1)*(sizeof(int*)+ sizeof(int)) + (m+1)*sizeof(int);
809   ierr   = PetscMalloc(len,&irow);CHKERRQ(ierr);
810   icol   = irow + ismax;
811   nrow   = (int*)(icol + ismax);
812   ncol   = nrow + ismax;
813   rtable = ncol + ismax;
814 
815   for (i=0; i<ismax; i++) {
816     ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
817     ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
818     ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
819     ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
820   }
821 
822   /* Create a lookup table for the mapping: row -> proc */
823   for (i=0,j=0; i<size; i++) {
824     jmax = c->rowners[i+1];
825     for (; j<jmax; j++) {
826       rtable[j] = i;
827     }
828   }
829 
830   /* evaluate communication - mesg to who, length of mesg, and buffer space
831      required. Based on this, buffers are allocated, and data copied into them*/
832   ierr   = PetscMalloc(size*4*sizeof(int),&w1);CHKERRQ(ierr); /* mesg size */
833   w2     = w1 + size;      /* if w2[i] marked, then a message to proc i*/
834   w3     = w2 + size;      /* no of IS that needs to be sent to proc i */
835   w4     = w3 + size;      /* temp work space used in determining w1, w2, w3 */
836   ierr   = PetscMemzero(w1,size*3*sizeof(int));CHKERRQ(ierr); /* initialize work vector*/
837   for (i=0; i<ismax; i++) {
838     ierr   = PetscMemzero(w4,size*sizeof(int));CHKERRQ(ierr); /* initialize work vector*/
839     jmax   = nrow[i];
840     irow_i = irow[i];
841     for (j=0; j<jmax; j++) {
842       row  = irow_i[j];
843       proc = rtable[row];
844       w4[proc]++;
845     }
846     for (j=0; j<size; j++) {
847       if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
848     }
849   }
850 
851   nrqs     = 0;              /* no of outgoing messages */
852   msz      = 0;              /* total mesg length (for all procs) */
853   w1[rank] = 0;              /* no mesg sent to self */
854   w3[rank] = 0;
855   for (i=0; i<size; i++) {
856     if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
857   }
858   ierr = PetscMalloc((nrqs+1)*sizeof(int),&pa);CHKERRQ(ierr); /*(proc -array)*/
859   for (i=0,j=0; i<size; i++) {
860     if (w1[i]) { pa[j] = i; j++; }
861   }
862 
863   /* Each message would have a header = 1 + 2*(no of IS) + data */
864   for (i=0; i<nrqs; i++) {
865     j     = pa[i];
866     w1[j] += w2[j] + 2* w3[j];
867     msz   += w1[j];
868   }
869 
870   /* Determine the number of messages to expect, their lengths, and from-ids */
871   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
872   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
873 
874   /* Now post the Irecvs corresponding to these messages */
875   ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
876 
877   ierr = PetscFree(onodes1);CHKERRQ(ierr);
878   ierr = PetscFree(olengths1);CHKERRQ(ierr);
879 
880   /* Allocate Memory for outgoing messages */
881   len      = 2*size*sizeof(int*) + 2*msz*sizeof(int) + size*sizeof(int);
882   ierr     = PetscMalloc(len,&sbuf1);CHKERRQ(ierr);
883   ptr      = sbuf1 + size;   /* Pointers to the data in outgoing buffers */
884   ierr     = PetscMemzero(sbuf1,2*size*sizeof(int*));CHKERRQ(ierr);
885   /* allocate memory for outgoing data + buf to receive the first reply */
886   tmp      = (int*)(ptr + size);
887   ctr      = tmp + 2*msz;
888 
889   {
890     int *iptr = tmp,ict = 0;
891     for (i=0; i<nrqs; i++) {
892       j         = pa[i];
893       iptr     += ict;
894       sbuf1[j]  = iptr;
895       ict       = w1[j];
896     }
897   }
898 
899   /* Form the outgoing messages */
900   /* Initialize the header space */
901   for (i=0; i<nrqs; i++) {
902     j           = pa[i];
903     sbuf1[j][0] = 0;
904     ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(int));CHKERRQ(ierr);
905     ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
906   }
907 
908   /* Parse the isrow and copy data into outbuf */
909   for (i=0; i<ismax; i++) {
910     ierr   = PetscMemzero(ctr,size*sizeof(int));CHKERRQ(ierr);
911     irow_i = irow[i];
912     jmax   = nrow[i];
913     for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
914       row  = irow_i[j];
915       proc = rtable[row];
916       if (proc != rank) { /* copy to the outgoing buf*/
917         ctr[proc]++;
918         *ptr[proc] = row;
919         ptr[proc]++;
920       }
921     }
922     /* Update the headers for the current IS */
923     for (j=0; j<size; j++) { /* Can Optimise this loop too */
924       if ((ctr_j = ctr[j])) {
925         sbuf1_j        = sbuf1[j];
926         k              = ++sbuf1_j[0];
927         sbuf1_j[2*k]   = ctr_j;
928         sbuf1_j[2*k-1] = i;
929       }
930     }
931   }
932 
933   /*  Now  post the sends */
934   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
935   for (i=0; i<nrqs; ++i) {
936     j    = pa[i];
937     ierr = MPI_Isend(sbuf1[j],w1[j],MPI_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
938   }
939 
940   /* Post Receives to capture the buffer size */
941   ierr     = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr);
942   ierr     = PetscMalloc((nrqs+1)*sizeof(int *),&rbuf2);CHKERRQ(ierr);
943   rbuf2[0] = tmp + msz;
944   for (i=1; i<nrqs; ++i) {
945     rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
946   }
947   for (i=0; i<nrqs; ++i) {
948     j    = pa[i];
949     ierr = MPI_Irecv(rbuf2[i],w1[j],MPI_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
950   }
951 
952   /* Send to other procs the buf size they should allocate */
953 
954 
955   /* Receive messages*/
956   ierr        = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
957   ierr        = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);CHKERRQ(ierr);
958   len         = 2*nrqr*sizeof(int) + (nrqr+1)*sizeof(int*);
959   ierr        = PetscMalloc(len,&sbuf2);CHKERRQ(ierr);
960   req_size    = (int*)(sbuf2 + nrqr);
961   req_source  = req_size + nrqr;
962 
963   {
964     Mat_SeqAIJ *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data;
965     int        *sAi = sA->i,*sBi = sB->i,id,rstart = c->rstart;
966     int        *sbuf2_i;
967 
968     for (i=0; i<nrqr; ++i) {
969       ierr = MPI_Waitany(nrqr,r_waits1,&index,r_status1+i);CHKERRQ(ierr);
970       req_size[index] = 0;
971       rbuf1_i         = rbuf1[index];
972       start           = 2*rbuf1_i[0] + 1;
973       ierr            = MPI_Get_count(r_status1+i,MPI_INT,&end);CHKERRQ(ierr);
974       ierr            = PetscMalloc((end+1)*sizeof(int),&sbuf2[index]);CHKERRQ(ierr);
975       sbuf2_i         = sbuf2[index];
976       for (j=start; j<end; j++) {
977         id               = rbuf1_i[j] - rstart;
978         ncols            = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
979         sbuf2_i[j]       = ncols;
980         req_size[index] += ncols;
981       }
982       req_source[index] = r_status1[i].MPI_SOURCE;
983       /* form the header */
984       sbuf2_i[0]   = req_size[index];
985       for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; }
986       ierr = MPI_Isend(sbuf2_i,end,MPI_INT,req_source[index],tag1,comm,s_waits2+i);CHKERRQ(ierr);
987     }
988   }
989   ierr = PetscFree(r_status1);CHKERRQ(ierr);
990   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
991 
992   /*  recv buffer sizes */
993   /* Receive messages*/
994 
995   ierr = PetscMalloc((nrqs+1)*sizeof(int*),&rbuf3);CHKERRQ(ierr);
996   ierr = PetscMalloc((nrqs+1)*sizeof(PetscScalar*),&rbuf4);CHKERRQ(ierr);
997   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);CHKERRQ(ierr);
998   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);CHKERRQ(ierr);
999   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);CHKERRQ(ierr);
1000 
1001   for (i=0; i<nrqs; ++i) {
1002     ierr = MPI_Waitany(nrqs,r_waits2,&index,r_status2+i);CHKERRQ(ierr);
1003     ierr = PetscMalloc((rbuf2[index][0]+1)*sizeof(int),&rbuf3[index]);CHKERRQ(ierr);
1004     ierr = PetscMalloc((rbuf2[index][0]+1)*sizeof(PetscScalar),&rbuf4[index]);CHKERRQ(ierr);
1005     ierr = MPI_Irecv(rbuf3[index],rbuf2[index][0],MPI_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+index);CHKERRQ(ierr);
1006     ierr = MPI_Irecv(rbuf4[index],rbuf2[index][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+index);CHKERRQ(ierr);
1007   }
1008   ierr = PetscFree(r_status2);CHKERRQ(ierr);
1009   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
1010 
1011   /* Wait on sends1 and sends2 */
1012   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);CHKERRQ(ierr);
1013   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);CHKERRQ(ierr);
1014 
1015   ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);
1016   ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);
1017   ierr = PetscFree(s_status1);CHKERRQ(ierr);
1018   ierr = PetscFree(s_status2);CHKERRQ(ierr);
1019   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
1020   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
1021 
1022   /* Now allocate buffers for a->j, and send them off */
1023   ierr = PetscMalloc((nrqr+1)*sizeof(int*),&sbuf_aj);CHKERRQ(ierr);
1024   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1025   ierr = PetscMalloc((j+1)*sizeof(int),&sbuf_aj[0]);CHKERRQ(ierr);
1026   for (i=1; i<nrqr; i++)  sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1027 
1028   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);CHKERRQ(ierr);
1029   {
1030     int nzA,nzB,*a_i = a->i,*b_i = b->i,imark;
1031     int *cworkA,*cworkB,cstart = c->cstart,rstart = c->rstart,*bmap = c->garray;
1032     int *a_j = a->j,*b_j = b->j,shift = a->indexshift,ctmp,*t_cols;
1033 
1034     for (i=0; i<nrqr; i++) {
1035       rbuf1_i   = rbuf1[i];
1036       sbuf_aj_i = sbuf_aj[i];
1037       ct1       = 2*rbuf1_i[0] + 1;
1038       ct2       = 0;
1039       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1040         kmax = rbuf1[i][2*j];
1041         for (k=0; k<kmax; k++,ct1++) {
1042           row    = rbuf1_i[ct1] - rstart;
1043           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
1044           ncols  = nzA + nzB;
1045           cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
1046 
1047           /* load the column indices for this row into cols*/
1048           cols  = sbuf_aj_i + ct2;
1049           if (!shift) {
1050             for (l=0; l<nzB; l++) {
1051               if ((ctmp = bmap[cworkB[l]]) < cstart)  cols[l] = ctmp;
1052               else break;
1053             }
1054             imark = l;
1055             for (l=0; l<nzA; l++)   cols[imark+l] = cstart + cworkA[l];
1056             for (l=imark; l<nzB; l++) cols[nzA+l] = bmap[cworkB[l]];
1057           } else {
1058             ierr = MatGetRow_MPIAIJ(C,rbuf1_i[ct1],&ncols,&t_cols,0);CHKERRQ(ierr);
1059             ierr = PetscMemcpy(cols,t_cols,ncols*sizeof(int));CHKERRQ(ierr);
1060             ierr = MatRestoreRow_MPIAIJ(C,rbuf1_i[ct1],&ncols,&t_cols,0);CHKERRQ(ierr);
1061           }
1062 
1063           ct2 += ncols;
1064         }
1065       }
1066       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPI_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
1067     }
1068   }
1069   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);CHKERRQ(ierr);
1070   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);CHKERRQ(ierr);
1071 
1072   /* Allocate buffers for a->a, and send them off */
1073   ierr = PetscMalloc((nrqr+1)*sizeof(PetscScalar*),&sbuf_aa);CHKERRQ(ierr);
1074   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1075   ierr = PetscMalloc((j+1)*sizeof(PetscScalar),&sbuf_aa[0]);CHKERRQ(ierr);
1076   for (i=1; i<nrqr; i++)  sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1077 
1078   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);CHKERRQ(ierr);
1079   {
1080     int    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,imark;
1081     int    cstart = c->cstart,rstart = c->rstart,*bmap = c->garray;
1082     int    *b_j = b->j,shift = a->indexshift;
1083     PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a,*t_vals;
1084 
1085     for (i=0; i<nrqr; i++) {
1086       rbuf1_i   = rbuf1[i];
1087       sbuf_aa_i = sbuf_aa[i];
1088       ct1       = 2*rbuf1_i[0]+1;
1089       ct2       = 0;
1090       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1091         kmax = rbuf1_i[2*j];
1092         for (k=0; k<kmax; k++,ct1++) {
1093           row    = rbuf1_i[ct1] - rstart;
1094           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
1095           ncols  = nzA + nzB;
1096           cworkB = b_j + b_i[row];
1097           vworkA = a_a + a_i[row];
1098           vworkB = b_a + b_i[row];
1099 
1100           /* load the column values for this row into vals*/
1101           vals  = sbuf_aa_i+ct2;
1102           if (!shift) {
1103             for (l=0; l<nzB; l++) {
1104               if ((bmap[cworkB[l]]) < cstart)  vals[l] = vworkB[l];
1105               else break;
1106             }
1107             imark = l;
1108             for (l=0; l<nzA; l++)   vals[imark+l] = vworkA[l];
1109             for (l=imark; l<nzB; l++) vals[nzA+l] = vworkB[l];
1110           } else {
1111             ierr = MatGetRow_MPIAIJ(C,rbuf1_i[ct1],&ncols,0,&t_vals);CHKERRQ(ierr);
1112             ierr = PetscMemcpy(vals,t_vals,ncols*sizeof(PetscScalar));CHKERRQ(ierr);
1113             ierr = MatRestoreRow_MPIAIJ(C,rbuf1_i[ct1],&ncols,0,&t_vals);CHKERRQ(ierr);
1114           }
1115           ct2 += ncols;
1116         }
1117       }
1118       ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
1119     }
1120   }
1121   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);CHKERRQ(ierr);
1122   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);CHKERRQ(ierr);
1123   ierr = PetscFree(rbuf1);CHKERRQ(ierr);
1124 
1125   /* Form the matrix */
1126   /* create col map */
1127   {
1128     int *icol_i;
1129 
1130     len     = (1+ismax)*sizeof(int*)+ ismax*C->N*sizeof(int);
1131     ierr    = PetscMalloc(len,&cmap);CHKERRQ(ierr);
1132     cmap[0] = (int *)(cmap + ismax);
1133     ierr    = PetscMemzero(cmap[0],(1+ismax*C->N)*sizeof(int));CHKERRQ(ierr);
1134     for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + C->N; }
1135     for (i=0; i<ismax; i++) {
1136       jmax   = ncol[i];
1137       icol_i = icol[i];
1138       cmap_i = cmap[i];
1139       for (j=0; j<jmax; j++) {
1140         cmap_i[icol_i[j]] = j+1;
1141       }
1142     }
1143   }
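  /* cmap[i] is a dense map over all global columns: cmap[i][col] = loc+1 if global
     column "col" is the loc-th column requested by iscol[i], and 0 otherwise, so a
     nonzero entry doubles as a membership test below. */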
1144 
1145   /* Create lens which is required for MatCreate... */
1146   for (i=0,j=0; i<ismax; i++) { j += nrow[i]; }
1147   len     = (1+ismax)*sizeof(int*)+ j*sizeof(int);
1148   ierr    = PetscMalloc(len,&lens);CHKERRQ(ierr);
1149   lens[0] = (int *)(lens + ismax);
1150   ierr    = PetscMemzero(lens[0],j*sizeof(int));CHKERRQ(ierr);
1151   for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; }
1152 
1153   /* Update lens from local data */
1154   for (i=0; i<ismax; i++) {
1155     jmax   = nrow[i];
1156     cmap_i = cmap[i];
1157     irow_i = irow[i];
1158     lens_i = lens[i];
1159     for (j=0; j<jmax; j++) {
1160       row  = irow_i[j];
1161       proc = rtable[row];
1162       if (proc == rank) {
1163         ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
1164         for (k=0; k<ncols; k++) {
1165           if (cmap_i[cols[k]]) { lens_i[j]++;}
1166         }
1167         ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
1168       }
1169     }
1170   }
1171 
1172   /* Create row map*/
1173   len     = (1+ismax)*sizeof(int*)+ ismax*C->M*sizeof(int);
1174   ierr    = PetscMalloc(len,&rmap);CHKERRQ(ierr);
1175   rmap[0] = (int *)(rmap + ismax);
1176   ierr    = PetscMemzero(rmap[0],ismax*C->M*sizeof(int));CHKERRQ(ierr);
1177   for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + C->M;}
1178   for (i=0; i<ismax; i++) {
1179     rmap_i = rmap[i];
1180     irow_i = irow[i];
1181     jmax   = nrow[i];
1182     for (j=0; j<jmax; j++) {
1183       rmap_i[irow_i[j]] = j;
1184     }
1185   }
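  /* Similarly, rmap[i][row] gives the local row index in submatrix i of global row
     "row" (only meaningful for rows that actually appear in isrow[i]). */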
1186 
1187   /* Update lens from offproc data */
1188   {
1189     int *rbuf2_i,*rbuf3_i,*sbuf1_i;
1190 
1191     for (tmp2=0; tmp2<nrqs; tmp2++) {
1192       ierr = MPI_Waitany(nrqs,r_waits3,&i,r_status3+tmp2);CHKERRQ(ierr);
1193       index   = pa[i];
1194       sbuf1_i = sbuf1[index];
1195       jmax    = sbuf1_i[0];
1196       ct1     = 2*jmax+1;
1197       ct2     = 0;
1198       rbuf2_i = rbuf2[i];
1199       rbuf3_i = rbuf3[i];
1200       for (j=1; j<=jmax; j++) {
1201         is_no   = sbuf1_i[2*j-1];
1202         max1    = sbuf1_i[2*j];
1203         lens_i  = lens[is_no];
1204         cmap_i  = cmap[is_no];
1205         rmap_i  = rmap[is_no];
1206         for (k=0; k<max1; k++,ct1++) {
1207           row  = rmap_i[sbuf1_i[ct1]]; /* the row in the new submatrix this corresponds to */
1208           max2 = rbuf2_i[ct1];
1209           for (l=0; l<max2; l++,ct2++) {
1210             if (cmap_i[rbuf3_i[ct2]]) {
1211               lens_i[row]++;
1212             }
1213           }
1214         }
1215       }
1216     }
1217   }
1218   ierr = PetscFree(r_status3);CHKERRQ(ierr);
1219   ierr = PetscFree(r_waits3);CHKERRQ(ierr);
1220   ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);
1221   ierr = PetscFree(s_status3);CHKERRQ(ierr);
1222   ierr = PetscFree(s_waits3);CHKERRQ(ierr);
1223 
1224   /* Create the submatrices */
1225   if (scall == MAT_REUSE_MATRIX) {
1226     PetscTruth flag;
1227 
1228     /*
1229         Assumes the new rows are the same length as the old rows, hence bug!
1230     */
1231     for (i=0; i<ismax; i++) {
1232       mat = (Mat_SeqAIJ *)(submats[i]->data);
1233       if ((submats[i]->m != nrow[i]) || (submats[i]->n != ncol[i])) {
1234         SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1235       }
1236       ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->m*sizeof(int),&flag);CHKERRQ(ierr);
1237       if (flag == PETSC_FALSE) {
1238         SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
1239       }
1240       /* Initialize the matrix as if it were empty */
1241       ierr = PetscMemzero(mat->ilen,submats[i]->m*sizeof(int));CHKERRQ(ierr);
1242       submats[i]->factor = C->factor;
1243     }
1244   } else {
1245     for (i=0; i<ismax; i++) {
1246       ierr = MatCreateSeqAIJ(PETSC_COMM_SELF,nrow[i],ncol[i],0,lens[i],submats+i);CHKERRQ(ierr);
1247     }
1248   }
1249 
1250   /* Assemble the matrices */
1251   /* First assemble the local rows */
1252   {
1253     int    ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
1254     PetscScalar *imat_a;
1255 
1256     for (i=0; i<ismax; i++) {
1257       mat       = (Mat_SeqAIJ*)submats[i]->data;
1258       imat_ilen = mat->ilen;
1259       imat_j    = mat->j;
1260       imat_i    = mat->i;
1261       imat_a    = mat->a;
1262       cmap_i    = cmap[i];
1263       rmap_i    = rmap[i];
1264       irow_i    = irow[i];
1265       jmax      = nrow[i];
1266       for (j=0; j<jmax; j++) {
1267         row      = irow_i[j];
1268         proc     = rtable[row];
1269         if (proc == rank) {
1270           old_row  = row;
1271           row      = rmap_i[row];
1272           ilen_row = imat_ilen[row];
1273           ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
1274           mat_i    = imat_i[row] + ashift;
1275           mat_a    = imat_a + mat_i;
1276           mat_j    = imat_j + mat_i;
1277           for (k=0; k<ncols; k++) {
1278             if ((tcol = cmap_i[cols[k]])) {
1279               *mat_j++ = tcol - (!ashift);
1280               *mat_a++ = vals[k];
1281               ilen_row++;
1282             }
1283           }
1284           ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
1285           imat_ilen[row] = ilen_row;
1286         }
1287       }
1288     }
1289   }
1290 
1291   /*   Now assemble the off proc rows*/
1292   {
1293     int    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
1294     int    *imat_j,*imat_i;
1295     PetscScalar *imat_a,*rbuf4_i;
1296 
1297     for (tmp2=0; tmp2<nrqs; tmp2++) {
1298       ierr = MPI_Waitany(nrqs,r_waits4,&i,r_status4+tmp2);CHKERRQ(ierr);
1299       index   = pa[i];
1300       sbuf1_i = sbuf1[index];
1301       jmax    = sbuf1_i[0];
1302       ct1     = 2*jmax + 1;
1303       ct2     = 0;
1304       rbuf2_i = rbuf2[i];
1305       rbuf3_i = rbuf3[i];
1306       rbuf4_i = rbuf4[i];
1307       for (j=1; j<=jmax; j++) {
1308         is_no     = sbuf1_i[2*j-1];
1309         rmap_i    = rmap[is_no];
1310         cmap_i    = cmap[is_no];
1311         mat       = (Mat_SeqAIJ*)submats[is_no]->data;
1312         imat_ilen = mat->ilen;
1313         imat_j    = mat->j;
1314         imat_i    = mat->i;
1315         imat_a    = mat->a;
1316         max1      = sbuf1_i[2*j];
1317         for (k=0; k<max1; k++,ct1++) {
1318           row   = sbuf1_i[ct1];
1319           row   = rmap_i[row];
1320           ilen  = imat_ilen[row];
1321           mat_i = imat_i[row] + ashift;
1322           mat_a = imat_a + mat_i;
1323           mat_j = imat_j + mat_i;
1324           max2 = rbuf2_i[ct1];
1325           for (l=0; l<max2; l++,ct2++) {
1326             if ((tcol = cmap_i[rbuf3_i[ct2]])) {
1327               *mat_j++ = tcol - (!ashift);
1328               *mat_a++ = rbuf4_i[ct2];
1329               ilen++;
1330             }
1331           }
1332           imat_ilen[row] = ilen;
1333         }
1334       }
1335     }
1336   }
1337   ierr = PetscFree(r_status4);CHKERRQ(ierr);
1338   ierr = PetscFree(r_waits4);CHKERRQ(ierr);
1339   ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);
1340   ierr = PetscFree(s_waits4);CHKERRQ(ierr);
1341   ierr = PetscFree(s_status4);CHKERRQ(ierr);
1342 
1343   /* Restore the indices */
1344   for (i=0; i<ismax; i++) {
1345     ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
1346     ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
1347   }
1348 
1349   /* Destroy allocated memory */
1350   ierr = PetscFree(irow);CHKERRQ(ierr);
1351   ierr = PetscFree(w1);CHKERRQ(ierr);
1352   ierr = PetscFree(pa);CHKERRQ(ierr);
1353 
1354   ierr = PetscFree(sbuf1);CHKERRQ(ierr);
1355   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
1356   for (i=0; i<nrqr; ++i) {
1357     ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
1358   }
1359   for (i=0; i<nrqs; ++i) {
1360     ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
1361     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
1362   }
1363 
1364   ierr = PetscFree(sbuf2);CHKERRQ(ierr);
1365   ierr = PetscFree(rbuf3);CHKERRQ(ierr);
1366   ierr = PetscFree(rbuf4);CHKERRQ(ierr);
1367   ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
1368   ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
1369   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
1370   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
1371 
1372   ierr = PetscFree(cmap);CHKERRQ(ierr);
1373   ierr = PetscFree(rmap);CHKERRQ(ierr);
1374   ierr = PetscFree(lens);CHKERRQ(ierr);
1375 
1376   for (i=0; i<ismax; i++) {
1377     ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1378     ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1379   }
1380   PetscFunctionReturn(0);
1381 }
1382 
1383