xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision 4482741e5b2e2bbc854fb1f8dba65221386520f2)
1 /*$Id: mpiov.c,v 1.26.1.76.2.22 2001/09/07 20:09:38 bsmith Exp $*/
2 /*
3    Routines to compute overlapping regions of a parallel MPI matrix
4   and to find submatrices that were shared across processors.
5 */
6 #include "src/mat/impls/aij/mpi/mpiaij.h"
7 #include "petscbt.h"
8 
9 static int MatIncreaseOverlap_MPIAIJ_Once(Mat,int,IS *);
10 static int MatIncreaseOverlap_MPIAIJ_Local(Mat,int,char **,int*,int**);
11 static int MatIncreaseOverlap_MPIAIJ_Receive(Mat,int,int **,int**,int*);
12 EXTERN int MatGetRow_MPIAIJ(Mat,int,int*,int**,PetscScalar**);
13 EXTERN int MatRestoreRow_MPIAIJ(Mat,int,int*,int**,PetscScalar**);
14 
15 #undef __FUNCT__
16 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ"
17 int MatIncreaseOverlap_MPIAIJ(Mat C,int imax,IS is[],int ov)
18 {
19   int i,ierr;
20 
21   PetscFunctionBegin;
22   if (ov < 0) SETERRQ(PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
23   for (i=0; i<ov; ++i) {
24     ierr = MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);CHKERRQ(ierr);
25   }
26   PetscFunctionReturn(0);
27 }
28 
29 /*
30   Sample message format:
31   If a processor A wants processor B to process some elements corresponding
32   to index sets is[1],is[5]
33   mesg [0] = 2   (no of index sets in the mesg)
34   -----------
35   mesg [1] = 1 => is[1]
36   mesg [2] = sizeof(is[1]);
37   -----------
38   mesg [3] = 5  => is[5]
39   mesg [4] = sizeof(is[5]);
40   -----------
41   mesg [5]
42   mesg [n]  datas[1]
43   -----------
44   mesg[n+1]
45   mesg[m]  data(is[5])
46   -----------
47 
48   Notes:
49   nrqs - no of requests sent (or to be sent out)
50   nrqr - no of requests recieved (which have to be or which have been processed
51 */
52 #undef __FUNCT__
53 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Once"
54 static int MatIncreaseOverlap_MPIAIJ_Once(Mat C,int imax,IS is[])
55 {
56   Mat_MPIAIJ  *c = (Mat_MPIAIJ*)C->data;
57   int         **idx,*n,*w1,*w2,*w3,*w4,*rtable,**data,len,*idx_i;
58   int         size,rank,m,i,j,k,ierr,**rbuf,row,proc,nrqs,msz,**outdat,**ptr;
59   int         *ctr,*pa,*tmp,nrqr,*isz,*isz1,**xdata,**rbuf2;
60   int         *onodes1,*olengths1,tag1,tag2,*onodes2,*olengths2;
61   PetscBT     *table;
62   MPI_Comm    comm;
63   MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2;
64   MPI_Status  *s_status,*recv_status;
65 
66   PetscFunctionBegin;
67   comm   = C->comm;
68   size   = c->size;
69   rank   = c->rank;
70   m      = C->M;
71 
72   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
73   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
74 
75   len    = (imax+1)*sizeof(int*)+ (imax + m)*sizeof(int);
76   ierr   = PetscMalloc(len,&idx);CHKERRQ(ierr);
77   n      = (int*)(idx + imax);
78   rtable = n + imax;
79 
80   for (i=0; i<imax; i++) {
81     ierr = ISGetIndices(is[i],&idx[i]);CHKERRQ(ierr);
82     ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
83   }
84 
85   /* Create hash table for the mapping :row -> proc*/
86   for (i=0,j=0; i<size; i++) {
87     len = c->rowners[i+1];
88     for (; j<len; j++) {
89       rtable[j] = i;
90     }
91   }
92 
93   /* evaluate communication - mesg to who,length of mesg, and buffer space
94      required. Based on this, buffers are allocated, and data copied into them*/
95   ierr = PetscMalloc(size*4*sizeof(int),&w1);CHKERRQ(ierr);/*  mesg size */
96   w2   = w1 + size;       /* if w2[i] marked, then a message to proc i*/
97   w3   = w2 + size;       /* no of IS that needs to be sent to proc i */
98   w4   = w3 + size;       /* temp work space used in determining w1, w2, w3 */
99   ierr = PetscMemzero(w1,size*3*sizeof(int));CHKERRQ(ierr); /* initialise work vector*/
100   for (i=0; i<imax; i++) {
101     ierr  = PetscMemzero(w4,size*sizeof(int));CHKERRQ(ierr); /* initialise work vector*/
102     idx_i = idx[i];
103     len   = n[i];
104     for (j=0; j<len; j++) {
105       row  = idx_i[j];
106       if (row < 0) {
107         SETERRQ(1,"Index set cannot have negative entries");
108       }
109       proc = rtable[row];
110       w4[proc]++;
111     }
112     for (j=0; j<size; j++){
113       if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
114     }
115   }
116 
117   nrqs     = 0;              /* no of outgoing messages */
118   msz      = 0;              /* total mesg length (for all proc */
119   w1[rank] = 0;              /* no mesg sent to intself */
120   w3[rank] = 0;
121   for (i=0; i<size; i++) {
122     if (w1[i])  {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
123   }
124   /* pa - is list of processors to communicate with */
125   ierr = PetscMalloc((nrqs+1)*sizeof(int),&pa);CHKERRQ(ierr);
126   for (i=0,j=0; i<size; i++) {
127     if (w1[i]) {pa[j] = i; j++;}
128   }
129 
130   /* Each message would have a header = 1 + 2*(no of IS) + data */
131   for (i=0; i<nrqs; i++) {
132     j      = pa[i];
133     w1[j] += w2[j] + 2*w3[j];
134     msz   += w1[j];
135   }
136 
137   /* Determine the number of messages to expect, their lengths, from from-ids */
138   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
139   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
140 
141   /* Now post the Irecvs corresponding to these messages */
142   ierr = PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);CHKERRQ(ierr);
143 
144   /* Allocate Memory for outgoing messages */
145   len  = 2*size*sizeof(int*) + (size+msz)*sizeof(int);
146   ierr = PetscMalloc(len,&outdat);CHKERRQ(ierr);
147   ptr  = outdat + size;     /* Pointers to the data in outgoing buffers */
148   ierr = PetscMemzero(outdat,2*size*sizeof(int*));CHKERRQ(ierr);
149   tmp  = (int*)(outdat + 2*size);
150   ctr  = tmp + msz;
151 
152   {
153     int *iptr = tmp,ict  = 0;
154     for (i=0; i<nrqs; i++) {
155       j         = pa[i];
156       iptr     +=  ict;
157       outdat[j] = iptr;
158       ict       = w1[j];
159     }
160   }
161 
162   /* Form the outgoing messages */
163   /*plug in the headers*/
164   for (i=0; i<nrqs; i++) {
165     j            = pa[i];
166     outdat[j][0] = 0;
167     ierr         = PetscMemzero(outdat[j]+1,2*w3[j]*sizeof(int));CHKERRQ(ierr);
168     ptr[j]       = outdat[j] + 2*w3[j] + 1;
169   }
170 
171   /* Memory for doing local proc's work*/
172   {
173     int  *d_p;
174     char *t_p;
175 
176     len   = (imax)*(sizeof(PetscBT) + sizeof(int*)+ sizeof(int)) +
177       (m)*imax*sizeof(int)  + (m/PETSC_BITS_PER_BYTE+1)*imax*sizeof(char) + 1;
178     ierr  = PetscMalloc(len,&table);CHKERRQ(ierr);
179     ierr  = PetscMemzero(table,len);CHKERRQ(ierr);
180     data  = (int **)(table + imax);
181     isz   = (int  *)(data  + imax);
182     d_p   = (int  *)(isz   + imax);
183     t_p   = (char *)(d_p   + m*imax);
184     for (i=0; i<imax; i++) {
185       table[i] = t_p + (m/PETSC_BITS_PER_BYTE+1)*i;
186       data[i]  = d_p + (m)*i;
187     }
188   }
189 
190   /* Parse the IS and update local tables and the outgoing buf with the data*/
191   {
192     int     n_i,*data_i,isz_i,*outdat_j,ctr_j;
193     PetscBT table_i;
194 
195     for (i=0; i<imax; i++) {
196       ierr    = PetscMemzero(ctr,size*sizeof(int));CHKERRQ(ierr);
197       n_i     = n[i];
198       table_i = table[i];
199       idx_i   = idx[i];
200       data_i  = data[i];
201       isz_i   = isz[i];
202       for (j=0;  j<n_i; j++) {  /* parse the indices of each IS */
203         row  = idx_i[j];
204         proc = rtable[row];
205         if (proc != rank) { /* copy to the outgoing buffer */
206           ctr[proc]++;
207           *ptr[proc] = row;
208           ptr[proc]++;
209         } else { /* Update the local table */
210           if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
211         }
212       }
213       /* Update the headers for the current IS */
214       for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
215         if ((ctr_j = ctr[j])) {
216           outdat_j        = outdat[j];
217           k               = ++outdat_j[0];
218           outdat_j[2*k]   = ctr_j;
219           outdat_j[2*k-1] = i;
220         }
221       }
222       isz[i] = isz_i;
223     }
224   }
225 
226 
227 
228   /*  Now  post the sends */
229   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
230   for (i=0; i<nrqs; ++i) {
231     j    = pa[i];
232     ierr = MPI_Isend(outdat[j],w1[j],MPI_INT,j,tag1,comm,s_waits1+i);CHKERRQ(ierr);
233   }
234 
235   /* No longer need the original indices*/
236   for (i=0; i<imax; ++i) {
237     ierr = ISRestoreIndices(is[i],idx+i);CHKERRQ(ierr);
238   }
239   ierr = PetscFree(idx);CHKERRQ(ierr);
240 
241   for (i=0; i<imax; ++i) {
242     ierr = ISDestroy(is[i]);CHKERRQ(ierr);
243   }
244 
245   /* Do Local work*/
246   ierr = MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data);CHKERRQ(ierr);
247 
248   /* Receive messages*/
249   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&recv_status);CHKERRQ(ierr);
250   ierr = MPI_Waitall(nrqr,r_waits1,recv_status);CHKERRQ(ierr);
251 
252   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status);CHKERRQ(ierr);
253   ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);
254 
255   /* Phase 1 sends are complete - deallocate buffers */
256   ierr = PetscFree(outdat);CHKERRQ(ierr);
257   ierr = PetscFree(w1);CHKERRQ(ierr);
258 
259   ierr = PetscMalloc((nrqr+1)*sizeof(int *),&xdata);CHKERRQ(ierr);
260   ierr = PetscMalloc((nrqr+1)*sizeof(int),&isz1);CHKERRQ(ierr);
261   ierr = MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);CHKERRQ(ierr);
262   ierr = PetscFree(rbuf);CHKERRQ(ierr);
263 
264 
265  /* Send the data back*/
266   /* Do a global reduction to know the buffer space req for incoming messages*/
267   {
268     int *rw1;
269 
270     ierr = PetscMalloc(size*sizeof(int),&rw1);CHKERRQ(ierr);
271     ierr = PetscMemzero(rw1,size*sizeof(int));CHKERRQ(ierr);
272 
273     for (i=0; i<nrqr; ++i) {
274       proc      = recv_status[i].MPI_SOURCE;
275       if (proc != onodes1[i]) SETERRQ(1,"MPI_SOURCE mismatch");
276       rw1[proc] = isz1[i];
277     }
278     ierr = PetscFree(onodes1);CHKERRQ(ierr);
279     ierr = PetscFree(olengths1);CHKERRQ(ierr);
280 
281     /* Determine the number of messages to expect, their lengths, from from-ids */
282     ierr = PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);CHKERRQ(ierr);
283     PetscFree(rw1);
284   }
285   /* Now post the Irecvs corresponding to these messages */
286   ierr = PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);CHKERRQ(ierr);
287 
288   /*  Now  post the sends */
289   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
290   for (i=0; i<nrqr; ++i) {
291     j    = recv_status[i].MPI_SOURCE;
292     ierr = MPI_Isend(xdata[i],isz1[i],MPI_INT,j,tag2,comm,s_waits2+i);CHKERRQ(ierr);
293   }
294 
295   /* receive work done on other processors*/
296   {
297     int         idex,is_no,ct1,max,*rbuf2_i,isz_i,*data_i,jmax;
298     PetscBT     table_i;
299     MPI_Status  *status2;
300 
301     ierr = PetscMalloc((PetscMax(nrqr,nrqs)+1)*sizeof(MPI_Status),&status2);CHKERRQ(ierr);
302     for (i=0; i<nrqs; ++i) {
303       ierr = MPI_Waitany(nrqs,r_waits2,&idex,status2+i);CHKERRQ(ierr);
304       /* Process the message*/
305       rbuf2_i = rbuf2[idex];
306       ct1     = 2*rbuf2_i[0]+1;
307       jmax    = rbuf2[idex][0];
308       for (j=1; j<=jmax; j++) {
309         max     = rbuf2_i[2*j];
310         is_no   = rbuf2_i[2*j-1];
311         isz_i   = isz[is_no];
312         data_i  = data[is_no];
313         table_i = table[is_no];
314         for (k=0; k<max; k++,ct1++) {
315           row = rbuf2_i[ct1];
316           if (!PetscBTLookupSet(table_i,row)) { data_i[isz_i++] = row;}
317         }
318         isz[is_no] = isz_i;
319       }
320     }
321 
322     ierr = MPI_Waitall(nrqr,s_waits2,status2);CHKERRQ(ierr);
323     ierr = PetscFree(status2);CHKERRQ(ierr);
324   }
325 
326   for (i=0; i<imax; ++i) {
327     ierr = ISCreateGeneral(PETSC_COMM_SELF,isz[i],data[i],is+i);CHKERRQ(ierr);
328   }
329 
330   ierr = PetscFree(onodes2);CHKERRQ(ierr);
331   ierr = PetscFree(olengths2);CHKERRQ(ierr);
332 
333   ierr = PetscFree(pa);CHKERRQ(ierr);
334   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
335   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
336   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
337   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
338   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
339   ierr = PetscFree(table);CHKERRQ(ierr);
340   ierr = PetscFree(s_status);CHKERRQ(ierr);
341   ierr = PetscFree(recv_status);CHKERRQ(ierr);
342   ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
343   ierr = PetscFree(xdata);CHKERRQ(ierr);
344   ierr = PetscFree(isz1);CHKERRQ(ierr);
345   PetscFunctionReturn(0);
346 }
347 
348 #undef __FUNCT__
349 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Local"
350 /*
351    MatIncreaseOverlap_MPIAIJ_Local - Called by MatincreaseOverlap, to do
352        the work on the local processor.
353 
354      Inputs:
355       C      - MAT_MPIAIJ;
356       imax - total no of index sets processed at a time;
357       table  - an array of char - size = m bits.
358 
359      Output:
360       isz    - array containing the count of the solution elements correspondign
361                to each index set;
362       data   - pointer to the solutions
363 */
364 static int MatIncreaseOverlap_MPIAIJ_Local(Mat C,int imax,PetscBT *table,int *isz,int **data)
365 {
366   Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
367   Mat        A = c->A,B = c->B;
368   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
369   int        start,end,val,max,rstart,cstart,*ai,*aj;
370   int        *bi,*bj,*garray,i,j,k,row,*data_i,isz_i;
371   PetscBT    table_i;
372 
373   PetscFunctionBegin;
374   rstart = c->rstart;
375   cstart = c->cstart;
376   ai     = a->i;
377   aj     = a->j;
378   bi     = b->i;
379   bj     = b->j;
380   garray = c->garray;
381 
382 
383   for (i=0; i<imax; i++) {
384     data_i  = data[i];
385     table_i = table[i];
386     isz_i   = isz[i];
387     for (j=0,max=isz[i]; j<max; j++) {
388       row   = data_i[j] - rstart;
389       start = ai[row];
390       end   = ai[row+1];
391       for (k=start; k<end; k++) { /* Amat */
392         val = aj[k] + cstart;
393         if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
394       }
395       start = bi[row];
396       end   = bi[row+1];
397       for (k=start; k<end; k++) { /* Bmat */
398         val = garray[bj[k]];
399         if (!PetscBTLookupSet(table_i,val)) { data_i[isz_i++] = val;}
400       }
401     }
402     isz[i] = isz_i;
403   }
404   PetscFunctionReturn(0);
405 }
406 
407 #undef __FUNCT__
408 #define __FUNCT__ "MatIncreaseOverlap_MPIAIJ_Receive"
409 /*
410       MatIncreaseOverlap_MPIAIJ_Receive - Process the recieved messages,
411          and return the output
412 
413          Input:
414            C    - the matrix
415            nrqr - no of messages being processed.
416            rbuf - an array of pointers to the recieved requests
417 
418          Output:
419            xdata - array of messages to be sent back
420            isz1  - size of each message
421 
422   For better efficiency perhaps we should malloc seperately each xdata[i],
423 then if a remalloc is required we need only copy the data for that one row
424 rather then all previous rows as it is now where a single large chunck of
425 memory is used.
426 
427 */
428 static int MatIncreaseOverlap_MPIAIJ_Receive(Mat C,int nrqr,int **rbuf,int **xdata,int * isz1)
429 {
430   Mat_MPIAIJ  *c = (Mat_MPIAIJ*)C->data;
431   Mat         A = c->A,B = c->B;
432   Mat_SeqAIJ  *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
433   int         rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
434   int         row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
435   int         val,max1,max2,rank,m,no_malloc =0,*tmp,new_estimate,ctr;
436   int         *rbuf_i,kmax,rbuf_0,ierr;
437   PetscBT     xtable;
438 
439   PetscFunctionBegin;
440   rank   = c->rank;
441   m      = C->M;
442   rstart = c->rstart;
443   cstart = c->cstart;
444   ai     = a->i;
445   aj     = a->j;
446   bi     = b->i;
447   bj     = b->j;
448   garray = c->garray;
449 
450 
451   for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
452     rbuf_i  =  rbuf[i];
453     rbuf_0  =  rbuf_i[0];
454     ct     += rbuf_0;
455     for (j=1; j<=rbuf_0; j++) { total_sz += rbuf_i[2*j]; }
456   }
457 
458   if (C->m) max1 = ct*(a->nz + b->nz)/C->m;
459   else      max1 = 1;
460   mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
461   ierr         = PetscMalloc(mem_estimate*sizeof(int),&xdata[0]);CHKERRQ(ierr);
462   ++no_malloc;
463   ierr         = PetscBTCreate(m,xtable);CHKERRQ(ierr);
464   ierr         = PetscMemzero(isz1,nrqr*sizeof(int));CHKERRQ(ierr);
465 
466   ct3 = 0;
467   for (i=0; i<nrqr; i++) { /* for easch mesg from proc i */
468     rbuf_i =  rbuf[i];
469     rbuf_0 =  rbuf_i[0];
470     ct1    =  2*rbuf_0+1;
471     ct2    =  ct1;
472     ct3    += ct1;
473     for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
474       ierr = PetscBTMemzero(m,xtable);CHKERRQ(ierr);
475       oct2 = ct2;
476       kmax = rbuf_i[2*j];
477       for (k=0; k<kmax; k++,ct1++) {
478         row = rbuf_i[ct1];
479         if (!PetscBTLookupSet(xtable,row)) {
480           if (!(ct3 < mem_estimate)) {
481             new_estimate = (int)(1.5*mem_estimate)+1;
482             ierr         = PetscMalloc(new_estimate*sizeof(int),&tmp);CHKERRQ(ierr);
483             ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));CHKERRQ(ierr);
484             ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
485             xdata[0]     = tmp;
486             mem_estimate = new_estimate; ++no_malloc;
487             for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
488           }
489           xdata[i][ct2++] = row;
490           ct3++;
491         }
492       }
493       for (k=oct2,max2=ct2; k<max2; k++) {
494         row   = xdata[i][k] - rstart;
495         start = ai[row];
496         end   = ai[row+1];
497         for (l=start; l<end; l++) {
498           val = aj[l] + cstart;
499           if (!PetscBTLookupSet(xtable,val)) {
500             if (!(ct3 < mem_estimate)) {
501               new_estimate = (int)(1.5*mem_estimate)+1;
502               ierr         = PetscMalloc(new_estimate*sizeof(int),&tmp);CHKERRQ(ierr);
503               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));CHKERRQ(ierr);
504               ierr         = PetscFree(xdata[0]);CHKERRQ(ierr);
505               xdata[0]     = tmp;
506               mem_estimate = new_estimate; ++no_malloc;
507               for (ctr=1; ctr<=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
508             }
509             xdata[i][ct2++] = val;
510             ct3++;
511           }
512         }
513         start = bi[row];
514         end   = bi[row+1];
515         for (l=start; l<end; l++) {
516           val = garray[bj[l]];
517           if (!PetscBTLookupSet(xtable,val)) {
518             if (!(ct3 < mem_estimate)) {
519               new_estimate = (int)(1.5*mem_estimate)+1;
520               ierr         = PetscMalloc(new_estimate*sizeof(int),&tmp);CHKERRQ(ierr);
521               ierr         = PetscMemcpy(tmp,xdata[0],mem_estimate*sizeof(int));CHKERRQ(ierr);
522               ierr = PetscFree(xdata[0]);CHKERRQ(ierr);
523               xdata[0]     = tmp;
524               mem_estimate = new_estimate; ++no_malloc;
525               for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
526             }
527             xdata[i][ct2++] = val;
528             ct3++;
529           }
530         }
531       }
532       /* Update the header*/
533       xdata[i][2*j]   = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
534       xdata[i][2*j-1] = rbuf_i[2*j-1];
535     }
536     xdata[i][0] = rbuf_0;
537     xdata[i+1]  = xdata[i] + ct2;
538     isz1[i]     = ct2; /* size of each message */
539   }
540   ierr = PetscBTDestroy(xtable);CHKERRQ(ierr);
541   PetscLogInfo(0,"MatIncreaseOverlap_MPIAIJ:[%d] Allocated %d bytes, required %d bytes, no of mallocs = %d\n",rank,mem_estimate, ct3,no_malloc);
542   PetscFunctionReturn(0);
543 }
544 /* -------------------------------------------------------------------------*/
545 EXTERN int MatGetSubMatrices_MPIAIJ_Local(Mat,int,const IS[],const IS[],MatReuse,Mat*);
546 EXTERN int MatAssemblyEnd_SeqAIJ(Mat,MatAssemblyType);
547 /*
548     Every processor gets the entire matrix
549 */
550 #undef __FUNCT__
551 #define __FUNCT__ "MatGetSubMatrix_MPIAIJ_All"
552 int MatGetSubMatrix_MPIAIJ_All(Mat A,MatReuse scall,Mat *Bin[])
553 {
554   Mat          B;
555   Mat_MPIAIJ   *a = (Mat_MPIAIJ *)A->data;
556   Mat_SeqAIJ   *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
557   int          ierr,sendcount,*recvcounts = 0,*displs = 0,size,i,*rstarts = a->rowners,rank,n,cnt,j;
558   int          m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
559   PetscScalar  *sendbuf,*recvbuf,*a_sendbuf,*b_sendbuf;
560 
561   PetscFunctionBegin;
562   ierr = MPI_Comm_size(A->comm,&size);CHKERRQ(ierr);
563   ierr = MPI_Comm_rank(A->comm,&rank);CHKERRQ(ierr);
564 
565   if (scall == MAT_INITIAL_MATRIX) {
566     /* ----------------------------------------------------------------
567          Tell every processor the number of nonzeros per row
568     */
569     ierr = PetscMalloc(A->M*sizeof(int),&lens);CHKERRQ(ierr);
570     for (i=a->rstart; i<a->rend; i++) {
571       lens[i] = ad->i[i-a->rstart+1] - ad->i[i-a->rstart] + bd->i[i-a->rstart+1] - bd->i[i-a->rstart];
572     }
573     sendcount = a->rend - a->rstart;
574     ierr = PetscMalloc(2*size*sizeof(int),&recvcounts);CHKERRQ(ierr);
575     displs     = recvcounts + size;
576     for (i=0; i<size; i++) {
577       recvcounts[i] = a->rowners[i+1] - a->rowners[i];
578       displs[i]     = a->rowners[i];
579     }
580     ierr  = MPI_Allgatherv(lens+a->rstart,sendcount,MPI_INT,lens,recvcounts,displs,MPI_INT,A->comm);CHKERRQ(ierr);
581 
582     /* ---------------------------------------------------------------
583          Create the sequential matrix of the same type as the local block diagonal
584     */
585     ierr  = MatCreate(PETSC_COMM_SELF,A->M,A->N,PETSC_DETERMINE,PETSC_DETERMINE,&B);CHKERRQ(ierr);
586     ierr  = MatSetType(B,a->A->type_name);CHKERRQ(ierr);
587     ierr  = MatSeqAIJSetPreallocation(B,0,lens);CHKERRQ(ierr);
588     ierr  = PetscMalloc(sizeof(Mat),Bin);CHKERRQ(ierr);
589     **Bin = B;
590     b = (Mat_SeqAIJ *)B->data;
591 
592     /*--------------------------------------------------------------------
593        Copy my part of matrix column indices over
594     */
595     sendcount  = ad->nz + bd->nz;
596     jsendbuf   = b->j + b->i[rstarts[rank]];
597     a_jsendbuf = ad->j;
598     b_jsendbuf = bd->j;
599     n          = a->rend - a->rstart;
600     cnt        = 0;
601     for (i=0; i<n; i++) {
602 
603       /* put in lower diagonal portion */
604       m = bd->i[i+1] - bd->i[i];
605       while (m > 0) {
606         /* is it above diagonal (in bd (compressed) numbering) */
607         if (garray[*b_jsendbuf] > a->rstart + i) break;
608         jsendbuf[cnt++] = garray[*b_jsendbuf++];
609         m--;
610       }
611 
612       /* put in diagonal portion */
613       for (j=ad->i[i]; j<ad->i[i+1]; j++) {
614         jsendbuf[cnt++] = a->rstart + *a_jsendbuf++;
615       }
616 
617       /* put in upper diagonal portion */
618       while (m-- > 0) {
619         jsendbuf[cnt++] = garray[*b_jsendbuf++];
620       }
621     }
622     if (cnt != sendcount) SETERRQ2(1,"Corrupted PETSc matrix: nz given %d actual nz %d",sendcount,cnt);
623 
624     /*--------------------------------------------------------------------
625        Gather all column indices to all processors
626     */
627     for (i=0; i<size; i++) {
628       recvcounts[i] = 0;
629       for (j=a->rowners[i]; j<a->rowners[i+1]; j++) {
630         recvcounts[i] += lens[j];
631       }
632     }
633     displs[0]  = 0;
634     for (i=1; i<size; i++) {
635       displs[i] = displs[i-1] + recvcounts[i-1];
636     }
637     ierr = MPI_Allgatherv(jsendbuf,sendcount,MPI_INT,b->j,recvcounts,displs,MPI_INT,A->comm);CHKERRQ(ierr);
638 
639     /*--------------------------------------------------------------------
640         Assemble the matrix into useable form (note numerical values not yet set)
641     */
642     /* set the b->ilen (length of each row) values */
643     ierr = PetscMemcpy(b->ilen,lens,A->M*sizeof(int));CHKERRQ(ierr);
644     /* set the b->i indices */
645     b->i[0] = 0;
646     for (i=1; i<=A->M; i++) {
647       b->i[i] = b->i[i-1] + lens[i-1];
648     }
649     ierr = PetscFree(lens);CHKERRQ(ierr);
650     ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
651     ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
652   } else {
653     B  = **Bin;
654     b = (Mat_SeqAIJ *)B->data;
655   }
656 
657   /*--------------------------------------------------------------------
658        Copy my part of matrix numerical values into the values location
659   */
660   sendcount = ad->nz + bd->nz;
661   sendbuf   = b->a + b->i[rstarts[rank]];
662   a_sendbuf = ad->a;
663   b_sendbuf = bd->a;
664   b_sendj   = bd->j;
665   n         = a->rend - a->rstart;
666   cnt       = 0;
667   for (i=0; i<n; i++) {
668 
669     /* put in lower diagonal portion */
670     m = bd->i[i+1] - bd->i[i];
671     while (m > 0) {
672       /* is it above diagonal (in bd (compressed) numbering) */
673       if (garray[*b_sendj] > a->rstart + i) break;
674       sendbuf[cnt++] = *b_sendbuf++;
675       m--;
676       b_sendj++;
677     }
678 
679     /* put in diagonal portion */
680     for (j=ad->i[i]; j<ad->i[i+1]; j++) {
681       sendbuf[cnt++] = *a_sendbuf++;
682     }
683 
684     /* put in upper diagonal portion */
685     while (m-- > 0) {
686       sendbuf[cnt++] = *b_sendbuf++;
687       b_sendj++;
688     }
689   }
690   if (cnt != sendcount) SETERRQ2(1,"Corrupted PETSc matrix: nz given %d actual nz %d",sendcount,cnt);
691 
692   /* -----------------------------------------------------------------
693      Gather all numerical values to all processors
694   */
695   if (!recvcounts) {
696     ierr   = PetscMalloc(2*size*sizeof(int),&recvcounts);CHKERRQ(ierr);
697     displs = recvcounts + size;
698   }
699   for (i=0; i<size; i++) {
700     recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
701   }
702   displs[0]  = 0;
703   for (i=1; i<size; i++) {
704     displs[i] = displs[i-1] + recvcounts[i-1];
705   }
706   recvbuf   = b->a;
707   ierr = MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,A->comm);CHKERRQ(ierr);
708   ierr = PetscFree(recvcounts);CHKERRQ(ierr);
709 
710   PetscFunctionReturn(0);
711 }
712 
713 #undef __FUNCT__
714 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ"
715 int MatGetSubMatrices_MPIAIJ(Mat C,int ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
716 {
717   int         nmax,nstages_local,nstages,i,pos,max_no,ierr,nrow,ncol;
718   PetscTruth  rowflag,colflag,wantallmatrix = PETSC_FALSE,twantallmatrix;
719 
720   PetscFunctionBegin;
721   /*
722        Check for special case each processor gets entire matrix
723   */
724   if (ismax == 1 && C->M == C->N) {
725     ierr = ISIdentity(*isrow,&rowflag);CHKERRQ(ierr);
726     ierr = ISIdentity(*iscol,&colflag);CHKERRQ(ierr);
727     ierr = ISGetLocalSize(*isrow,&nrow);CHKERRQ(ierr);
728     ierr = ISGetLocalSize(*iscol,&ncol);CHKERRQ(ierr);
729     if (rowflag && colflag && nrow == C->M && ncol == C->N) {
730       wantallmatrix = PETSC_TRUE;
731       ierr = PetscOptionsGetLogical(C->prefix,"-use_fast_submatrix",&wantallmatrix,PETSC_NULL);CHKERRQ(ierr);
732     }
733   }
734   ierr = MPI_Allreduce(&wantallmatrix,&twantallmatrix,1,MPI_INT,MPI_MIN,C->comm);CHKERRQ(ierr);
735   if (twantallmatrix) {
736     ierr = MatGetSubMatrix_MPIAIJ_All(C,scall,submat);CHKERRQ(ierr);
737     PetscFunctionReturn(0);
738   }
739 
740   /* Allocate memory to hold all the submatrices */
741   if (scall != MAT_REUSE_MATRIX) {
742     ierr = PetscMalloc((ismax+1)*sizeof(Mat),submat);CHKERRQ(ierr);
743   }
744   /* Determine the number of stages through which submatrices are done */
745   nmax          = 20*1000000 / (C->N * sizeof(int));
746   if (!nmax) nmax = 1;
747   nstages_local = ismax/nmax + ((ismax % nmax)?1:0);
748 
749   /* Make sure every processor loops through the nstages */
750   ierr = MPI_Allreduce(&nstages_local,&nstages,1,MPI_INT,MPI_MAX,C->comm);CHKERRQ(ierr);
751 
752   for (i=0,pos=0; i<nstages; i++) {
753     if (pos+nmax <= ismax) max_no = nmax;
754     else if (pos == ismax) max_no = 0;
755     else                   max_no = ismax-pos;
756     ierr = MatGetSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);CHKERRQ(ierr);
757     pos += max_no;
758   }
759   PetscFunctionReturn(0);
760 }
761 /* -------------------------------------------------------------------------*/
762 #undef __FUNCT__
763 #define __FUNCT__ "MatGetSubMatrices_MPIAIJ_Local"
764 int MatGetSubMatrices_MPIAIJ_Local(Mat C,int ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
765 {
766   Mat_MPIAIJ  *c = (Mat_MPIAIJ*)C->data;
767   Mat         A = c->A;
768   Mat_SeqAIJ  *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*mat;
769   int         **irow,**icol,*nrow,*ncol,*w1,*w2,*w3,*w4,*rtable,start,end,size;
770   int         **sbuf1,**sbuf2,rank,m,i,j,k,l,ct1,ct2,ierr,**rbuf1,row,proc;
771   int         nrqs,msz,**ptr,idex,*req_size,*ctr,*pa,*tmp,tcol,nrqr;
772   int         **rbuf3,*req_source,**sbuf_aj,**rbuf2,max1,max2,**rmap;
773   int         **cmap,**lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax,*irow_i;
774   int         len,ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*cmap_i,*lens_i;
775   int         *rmap_i,tag0,tag1,tag2,tag3;
776   MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
777   MPI_Request *r_waits4,*s_waits3,*s_waits4;
778   MPI_Status  *r_status1,*r_status2,*s_status1,*s_status3,*s_status2;
779   MPI_Status  *r_status3,*r_status4,*s_status4;
780   MPI_Comm    comm;
781   PetscScalar **rbuf4,**sbuf_aa,*vals,*mat_a,*sbuf_aa_i;
782   PetscTruth  sorted;
783   int         *onodes1,*olengths1;
784 
785   PetscFunctionBegin;
786   comm   = C->comm;
787   tag0   = C->tag;
788   size   = c->size;
789   rank   = c->rank;
790   m      = C->M;
791 
792   /* Get some new tags to keep the communication clean */
793   ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
794   ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);
795   ierr = PetscObjectGetNewTag((PetscObject)C,&tag3);CHKERRQ(ierr);
796 
797     /* Check if the col indices are sorted */
798   for (i=0; i<ismax; i++) {
799     ierr = ISSorted(isrow[i],&sorted);CHKERRQ(ierr);
800     if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"ISrow is not sorted");
801     ierr = ISSorted(iscol[i],&sorted);CHKERRQ(ierr);
802     /*    if (!sorted) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"IScol is not sorted"); */
803   }
804 
805   len    = (2*ismax+1)*(sizeof(int*)+ sizeof(int)) + (m+1)*sizeof(int);
806   ierr   = PetscMalloc(len,&irow);CHKERRQ(ierr);
807   icol   = irow + ismax;
808   nrow   = (int*)(icol + ismax);
809   ncol   = nrow + ismax;
810   rtable = ncol + ismax;
811 
812   for (i=0; i<ismax; i++) {
813     ierr = ISGetIndices(isrow[i],&irow[i]);CHKERRQ(ierr);
814     ierr = ISGetIndices(iscol[i],&icol[i]);CHKERRQ(ierr);
815     ierr = ISGetLocalSize(isrow[i],&nrow[i]);CHKERRQ(ierr);
816     ierr = ISGetLocalSize(iscol[i],&ncol[i]);CHKERRQ(ierr);
817   }
818 
819   /* Create hash table for the mapping :row -> proc*/
820   for (i=0,j=0; i<size; i++) {
821     jmax = c->rowners[i+1];
822     for (; j<jmax; j++) {
823       rtable[j] = i;
824     }
825   }
826 
827   /* evaluate communication - mesg to who, length of mesg, and buffer space
828      required. Based on this, buffers are allocated, and data copied into them*/
829   ierr   = PetscMalloc(size*4*sizeof(int),&w1);CHKERRQ(ierr); /* mesg size */
830   w2     = w1 + size;      /* if w2[i] marked, then a message to proc i*/
831   w3     = w2 + size;      /* no of IS that needs to be sent to proc i */
832   w4     = w3 + size;      /* temp work space used in determining w1, w2, w3 */
833   ierr   = PetscMemzero(w1,size*3*sizeof(int));CHKERRQ(ierr); /* initialize work vector*/
834   for (i=0; i<ismax; i++) {
835     ierr   = PetscMemzero(w4,size*sizeof(int));CHKERRQ(ierr); /* initialize work vector*/
836     jmax   = nrow[i];
837     irow_i = irow[i];
838     for (j=0; j<jmax; j++) {
839       row  = irow_i[j];
840       proc = rtable[row];
841       w4[proc]++;
842     }
843     for (j=0; j<size; j++) {
844       if (w4[j]) { w1[j] += w4[j];  w3[j]++;}
845     }
846   }
847 
848   nrqs     = 0;              /* no of outgoing messages */
849   msz      = 0;              /* total mesg length (for all procs) */
850   w1[rank] = 0;              /* no mesg sent to self */
851   w3[rank] = 0;
852   for (i=0; i<size; i++) {
853     if (w1[i])  { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
854   }
855   ierr = PetscMalloc((nrqs+1)*sizeof(int),&pa);CHKERRQ(ierr); /*(proc -array)*/
856   for (i=0,j=0; i<size; i++) {
857     if (w1[i]) { pa[j] = i; j++; }
858   }
859 
860   /* Each message would have a header = 1 + 2*(no of IS) + data */
861   for (i=0; i<nrqs; i++) {
862     j     = pa[i];
863     w1[j] += w2[j] + 2* w3[j];
864     msz   += w1[j];
865   }
866 
867   /* Determine the number of messages to expect, their lengths, from from-ids */
868   ierr = PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);CHKERRQ(ierr);
869   ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);CHKERRQ(ierr);
870 
871   /* Now post the Irecvs corresponding to these messages */
872   ierr = PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);CHKERRQ(ierr);
873 
874   ierr = PetscFree(onodes1);CHKERRQ(ierr);
875   ierr = PetscFree(olengths1);CHKERRQ(ierr);
876 
877   /* Allocate Memory for outgoing messages */
878   len      = 2*size*sizeof(int*) + 2*msz*sizeof(int) + size*sizeof(int);
879   ierr     = PetscMalloc(len,&sbuf1);CHKERRQ(ierr);
880   ptr      = sbuf1 + size;   /* Pointers to the data in outgoing buffers */
881   ierr     = PetscMemzero(sbuf1,2*size*sizeof(int*));CHKERRQ(ierr);
882   /* allocate memory for outgoing data + buf to receive the first reply */
883   tmp      = (int*)(ptr + size);
884   ctr      = tmp + 2*msz;
885 
886   {
887     int *iptr = tmp,ict = 0;
888     for (i=0; i<nrqs; i++) {
889       j         = pa[i];
890       iptr     += ict;
891       sbuf1[j]  = iptr;
892       ict       = w1[j];
893     }
894   }
895 
896   /* Form the outgoing messages */
897   /* Initialize the header space */
898   for (i=0; i<nrqs; i++) {
899     j           = pa[i];
900     sbuf1[j][0] = 0;
901     ierr        = PetscMemzero(sbuf1[j]+1,2*w3[j]*sizeof(int));CHKERRQ(ierr);
902     ptr[j]      = sbuf1[j] + 2*w3[j] + 1;
903   }
904 
905   /* Parse the isrow and copy data into outbuf */
906   for (i=0; i<ismax; i++) {
907     ierr   = PetscMemzero(ctr,size*sizeof(int));CHKERRQ(ierr);
908     irow_i = irow[i];
909     jmax   = nrow[i];
910     for (j=0; j<jmax; j++) {  /* parse the indices of each IS */
911       row  = irow_i[j];
912       proc = rtable[row];
913       if (proc != rank) { /* copy to the outgoing buf*/
914         ctr[proc]++;
915         *ptr[proc] = row;
916         ptr[proc]++;
917       }
918     }
919     /* Update the headers for the current IS */
920     for (j=0; j<size; j++) { /* Can Optimise this loop too */
921       if ((ctr_j = ctr[j])) {
922         sbuf1_j        = sbuf1[j];
923         k              = ++sbuf1_j[0];
924         sbuf1_j[2*k]   = ctr_j;
925         sbuf1_j[2*k-1] = i;
926       }
927     }
928   }
929 
930   /*  Now  post the sends */
931   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&s_waits1);CHKERRQ(ierr);
932   for (i=0; i<nrqs; ++i) {
933     j    = pa[i];
934     ierr = MPI_Isend(sbuf1[j],w1[j],MPI_INT,j,tag0,comm,s_waits1+i);CHKERRQ(ierr);
935   }
936 
937   /* Post Receives to capture the buffer size */
938   ierr     = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits2);CHKERRQ(ierr);
939   ierr     = PetscMalloc((nrqs+1)*sizeof(int *),&rbuf2);CHKERRQ(ierr);
940   rbuf2[0] = tmp + msz;
941   for (i=1; i<nrqs; ++i) {
942     rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
943   }
944   for (i=0; i<nrqs; ++i) {
945     j    = pa[i];
946     ierr = MPI_Irecv(rbuf2[i],w1[j],MPI_INT,j,tag1,comm,r_waits2+i);CHKERRQ(ierr);
947   }
948 
949   /* Send to other procs the buf size they should allocate */
950 
951 
952   /* Receive messages*/
953   ierr        = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits2);CHKERRQ(ierr);
954   ierr        = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&r_status1);CHKERRQ(ierr);
955   len         = 2*nrqr*sizeof(int) + (nrqr+1)*sizeof(int*);
956   ierr        = PetscMalloc(len,&sbuf2);CHKERRQ(ierr);
957   req_size    = (int*)(sbuf2 + nrqr);
958   req_source  = req_size + nrqr;
959 
960   {
961     Mat_SeqAIJ *sA = (Mat_SeqAIJ*)c->A->data,*sB = (Mat_SeqAIJ*)c->B->data;
962     int        *sAi = sA->i,*sBi = sB->i,id,rstart = c->rstart;
963     int        *sbuf2_i;
964 
965     for (i=0; i<nrqr; ++i) {
966       ierr = MPI_Waitany(nrqr,r_waits1,&idex,r_status1+i);CHKERRQ(ierr);
967       req_size[idex] = 0;
968       rbuf1_i         = rbuf1[idex];
969       start           = 2*rbuf1_i[0] + 1;
970       ierr            = MPI_Get_count(r_status1+i,MPI_INT,&end);CHKERRQ(ierr);
971       ierr            = PetscMalloc((end+1)*sizeof(int),&sbuf2[idex]);CHKERRQ(ierr);
972       sbuf2_i         = sbuf2[idex];
973       for (j=start; j<end; j++) {
974         id               = rbuf1_i[j] - rstart;
975         ncols            = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
976         sbuf2_i[j]       = ncols;
977         req_size[idex] += ncols;
978       }
979       req_source[idex] = r_status1[i].MPI_SOURCE;
980       /* form the header */
981       sbuf2_i[0]   = req_size[idex];
982       for (j=1; j<start; j++) { sbuf2_i[j] = rbuf1_i[j]; }
983       ierr = MPI_Isend(sbuf2_i,end,MPI_INT,req_source[idex],tag1,comm,s_waits2+i);CHKERRQ(ierr);
984     }
985   }
986   ierr = PetscFree(r_status1);CHKERRQ(ierr);
987   ierr = PetscFree(r_waits1);CHKERRQ(ierr);
988 
989   /*  recv buffer sizes */
990   /* Receive messages*/
991 
992   ierr = PetscMalloc((nrqs+1)*sizeof(int*),&rbuf3);CHKERRQ(ierr);
993   ierr = PetscMalloc((nrqs+1)*sizeof(PetscScalar*),&rbuf4);CHKERRQ(ierr);
994   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits3);CHKERRQ(ierr);
995   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Request),&r_waits4);CHKERRQ(ierr);
996   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status2);CHKERRQ(ierr);
997 
998   for (i=0; i<nrqs; ++i) {
999     ierr = MPI_Waitany(nrqs,r_waits2,&idex,r_status2+i);CHKERRQ(ierr);
1000     ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(int),&rbuf3[idex]);CHKERRQ(ierr);
1001     ierr = PetscMalloc((rbuf2[idex][0]+1)*sizeof(PetscScalar),&rbuf4[idex]);CHKERRQ(ierr);
1002     ierr = MPI_Irecv(rbuf3[idex],rbuf2[idex][0],MPI_INT,r_status2[i].MPI_SOURCE,tag2,comm,r_waits3+idex);CHKERRQ(ierr);
1003     ierr = MPI_Irecv(rbuf4[idex],rbuf2[idex][0],MPIU_SCALAR,r_status2[i].MPI_SOURCE,tag3,comm,r_waits4+idex);CHKERRQ(ierr);
1004   }
1005   ierr = PetscFree(r_status2);CHKERRQ(ierr);
1006   ierr = PetscFree(r_waits2);CHKERRQ(ierr);
1007 
1008   /* Wait on sends1 and sends2 */
1009   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&s_status1);CHKERRQ(ierr);
1010   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status2);CHKERRQ(ierr);
1011 
1012   ierr = MPI_Waitall(nrqs,s_waits1,s_status1);CHKERRQ(ierr);
1013   ierr = MPI_Waitall(nrqr,s_waits2,s_status2);CHKERRQ(ierr);
1014   ierr = PetscFree(s_status1);CHKERRQ(ierr);
1015   ierr = PetscFree(s_status2);CHKERRQ(ierr);
1016   ierr = PetscFree(s_waits1);CHKERRQ(ierr);
1017   ierr = PetscFree(s_waits2);CHKERRQ(ierr);
1018 
1019   /* Now allocate buffers for a->j, and send them off */
1020   ierr = PetscMalloc((nrqr+1)*sizeof(int*),&sbuf_aj);CHKERRQ(ierr);
1021   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1022   ierr = PetscMalloc((j+1)*sizeof(int),&sbuf_aj[0]);CHKERRQ(ierr);
1023   for (i=1; i<nrqr; i++)  sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1024 
1025   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits3);CHKERRQ(ierr);
1026   {
1027     int nzA,nzB,*a_i = a->i,*b_i = b->i,imark;
1028     int *cworkA,*cworkB,cstart = c->cstart,rstart = c->rstart,*bmap = c->garray;
1029     int *a_j = a->j,*b_j = b->j,ctmp;
1030 
1031     for (i=0; i<nrqr; i++) {
1032       rbuf1_i   = rbuf1[i];
1033       sbuf_aj_i = sbuf_aj[i];
1034       ct1       = 2*rbuf1_i[0] + 1;
1035       ct2       = 0;
1036       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1037         kmax = rbuf1[i][2*j];
1038         for (k=0; k<kmax; k++,ct1++) {
1039           row    = rbuf1_i[ct1] - rstart;
1040           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
1041           ncols  = nzA + nzB;
1042           cworkA = a_j + a_i[row]; cworkB = b_j + b_i[row];
1043 
1044           /* load the column indices for this row into cols*/
1045           cols  = sbuf_aj_i + ct2;
1046 
1047           for (l=0; l<nzB; l++) {
1048             if ((ctmp = bmap[cworkB[l]]) < cstart)  cols[l] = ctmp;
1049             else break;
1050           }
1051           imark = l;
1052           for (l=0; l<nzA; l++)   cols[imark+l] = cstart + cworkA[l];
1053           for (l=imark; l<nzB; l++) cols[nzA+l] = bmap[cworkB[l]];
1054 
1055           ct2 += ncols;
1056         }
1057       }
1058       ierr = MPI_Isend(sbuf_aj_i,req_size[i],MPI_INT,req_source[i],tag2,comm,s_waits3+i);CHKERRQ(ierr);
1059     }
1060   }
1061   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status3);CHKERRQ(ierr);
1062   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status3);CHKERRQ(ierr);
1063 
1064   /* Allocate buffers for a->a, and send them off */
1065   ierr = PetscMalloc((nrqr+1)*sizeof(PetscScalar*),&sbuf_aa);CHKERRQ(ierr);
1066   for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1067   ierr = PetscMalloc((j+1)*sizeof(PetscScalar),&sbuf_aa[0]);CHKERRQ(ierr);
1068   for (i=1; i<nrqr; i++)  sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1069 
1070   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Request),&s_waits4);CHKERRQ(ierr);
1071   {
1072     int    nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,imark;
1073     int    cstart = c->cstart,rstart = c->rstart,*bmap = c->garray;
1074     int    *b_j = b->j;
1075     PetscScalar *vworkA,*vworkB,*a_a = a->a,*b_a = b->a;
1076 
1077     for (i=0; i<nrqr; i++) {
1078       rbuf1_i   = rbuf1[i];
1079       sbuf_aa_i = sbuf_aa[i];
1080       ct1       = 2*rbuf1_i[0]+1;
1081       ct2       = 0;
1082       for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
1083         kmax = rbuf1_i[2*j];
1084         for (k=0; k<kmax; k++,ct1++) {
1085           row    = rbuf1_i[ct1] - rstart;
1086           nzA    = a_i[row+1] - a_i[row];     nzB = b_i[row+1] - b_i[row];
1087           ncols  = nzA + nzB;
1088           cworkB = b_j + b_i[row];
1089           vworkA = a_a + a_i[row];
1090           vworkB = b_a + b_i[row];
1091 
1092           /* load the column values for this row into vals*/
1093           vals  = sbuf_aa_i+ct2;
1094 
1095           for (l=0; l<nzB; l++) {
1096             if ((bmap[cworkB[l]]) < cstart)  vals[l] = vworkB[l];
1097             else break;
1098           }
1099           imark = l;
1100           for (l=0; l<nzA; l++)   vals[imark+l] = vworkA[l];
1101           for (l=imark; l<nzB; l++) vals[nzA+l] = vworkB[l];
1102 
1103           ct2 += ncols;
1104         }
1105       }
1106       ierr = MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source[i],tag3,comm,s_waits4+i);CHKERRQ(ierr);
1107     }
1108   }
1109   ierr = PetscMalloc((nrqs+1)*sizeof(MPI_Status),&r_status4);CHKERRQ(ierr);
1110   ierr = PetscMalloc((nrqr+1)*sizeof(MPI_Status),&s_status4);CHKERRQ(ierr);
1111   ierr = PetscFree(rbuf1);CHKERRQ(ierr);
1112 
1113   /* Form the matrix */
1114   /* create col map */
1115   {
1116     int *icol_i;
1117 
1118     len     = (1+ismax)*sizeof(int*)+ ismax*C->N*sizeof(int);
1119     ierr    = PetscMalloc(len,&cmap);CHKERRQ(ierr);
1120     cmap[0] = (int *)(cmap + ismax);
1121     ierr    = PetscMemzero(cmap[0],(1+ismax*C->N)*sizeof(int));CHKERRQ(ierr);
1122     for (i=1; i<ismax; i++) { cmap[i] = cmap[i-1] + C->N; }
1123     for (i=0; i<ismax; i++) {
1124       jmax   = ncol[i];
1125       icol_i = icol[i];
1126       cmap_i = cmap[i];
1127       for (j=0; j<jmax; j++) {
1128         cmap_i[icol_i[j]] = j+1;
1129       }
1130     }
1131   }
1132 
1133   /* Create lens which is required for MatCreate... */
1134   for (i=0,j=0; i<ismax; i++) { j += nrow[i]; }
1135   len     = (1+ismax)*sizeof(int*)+ j*sizeof(int);
1136   ierr    = PetscMalloc(len,&lens);CHKERRQ(ierr);
1137   lens[0] = (int *)(lens + ismax);
1138   ierr    = PetscMemzero(lens[0],j*sizeof(int));CHKERRQ(ierr);
1139   for (i=1; i<ismax; i++) { lens[i] = lens[i-1] + nrow[i-1]; }
1140 
1141   /* Update lens from local data */
1142   for (i=0; i<ismax; i++) {
1143     jmax   = nrow[i];
1144     cmap_i = cmap[i];
1145     irow_i = irow[i];
1146     lens_i = lens[i];
1147     for (j=0; j<jmax; j++) {
1148       row  = irow_i[j];
1149       proc = rtable[row];
1150       if (proc == rank) {
1151         ierr = MatGetRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
1152         for (k=0; k<ncols; k++) {
1153           if (cmap_i[cols[k]]) { lens_i[j]++;}
1154         }
1155         ierr = MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,0);CHKERRQ(ierr);
1156       }
1157     }
1158   }
1159 
1160   /* Create row map*/
1161   len     = (1+ismax)*sizeof(int*)+ ismax*C->M*sizeof(int);
1162   ierr    = PetscMalloc(len,&rmap);CHKERRQ(ierr);
1163   rmap[0] = (int *)(rmap + ismax);
1164   ierr    = PetscMemzero(rmap[0],ismax*C->M*sizeof(int));CHKERRQ(ierr);
1165   for (i=1; i<ismax; i++) { rmap[i] = rmap[i-1] + C->M;}
1166   for (i=0; i<ismax; i++) {
1167     rmap_i = rmap[i];
1168     irow_i = irow[i];
1169     jmax   = nrow[i];
1170     for (j=0; j<jmax; j++) {
1171       rmap_i[irow_i[j]] = j;
1172     }
1173   }
1174 
1175   /* Update lens from offproc data */
1176   {
1177     int *rbuf2_i,*rbuf3_i,*sbuf1_i;
1178 
1179     for (tmp2=0; tmp2<nrqs; tmp2++) {
1180       ierr = MPI_Waitany(nrqs,r_waits3,&i,r_status3+tmp2);CHKERRQ(ierr);
1181       idex   = pa[i];
1182       sbuf1_i = sbuf1[idex];
1183       jmax    = sbuf1_i[0];
1184       ct1     = 2*jmax+1;
1185       ct2     = 0;
1186       rbuf2_i = rbuf2[i];
1187       rbuf3_i = rbuf3[i];
1188       for (j=1; j<=jmax; j++) {
1189         is_no   = sbuf1_i[2*j-1];
1190         max1    = sbuf1_i[2*j];
1191         lens_i  = lens[is_no];
1192         cmap_i  = cmap[is_no];
1193         rmap_i  = rmap[is_no];
1194         for (k=0; k<max1; k++,ct1++) {
1195           row  = rmap_i[sbuf1_i[ct1]]; /* the val in the new matrix to be */
1196           max2 = rbuf2_i[ct1];
1197           for (l=0; l<max2; l++,ct2++) {
1198             if (cmap_i[rbuf3_i[ct2]]) {
1199               lens_i[row]++;
1200             }
1201           }
1202         }
1203       }
1204     }
1205   }
1206   ierr = PetscFree(r_status3);CHKERRQ(ierr);
1207   ierr = PetscFree(r_waits3);CHKERRQ(ierr);
1208   ierr = MPI_Waitall(nrqr,s_waits3,s_status3);CHKERRQ(ierr);
1209   ierr = PetscFree(s_status3);CHKERRQ(ierr);
1210   ierr = PetscFree(s_waits3);CHKERRQ(ierr);
1211 
1212   /* Create the submatrices */
1213   if (scall == MAT_REUSE_MATRIX) {
1214     PetscTruth flag;
1215 
1216     /*
1217         Assumes new rows are same length as the old rows,hence bug!
1218     */
1219     for (i=0; i<ismax; i++) {
1220       mat = (Mat_SeqAIJ *)(submats[i]->data);
1221       if ((submats[i]->m != nrow[i]) || (submats[i]->n != ncol[i])) {
1222         SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1223       }
1224       ierr = PetscMemcmp(mat->ilen,lens[i],submats[i]->m*sizeof(int),&flag);CHKERRQ(ierr);
1225       if (flag == PETSC_FALSE) {
1226         SETERRQ(PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
1227       }
1228       /* Initial matrix as if empty */
1229       ierr = PetscMemzero(mat->ilen,submats[i]->m*sizeof(int));CHKERRQ(ierr);
1230       submats[i]->factor = C->factor;
1231     }
1232   } else {
1233     for (i=0; i<ismax; i++) {
1234       ierr = MatCreate(PETSC_COMM_SELF,nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE,submats+i);CHKERRQ(ierr);
1235       ierr = MatSetType(submats[i],A->type_name);CHKERRQ(ierr);
1236       ierr = MatSeqAIJSetPreallocation(submats[i],0,lens[i]);CHKERRQ(ierr);
1237     }
1238   }
1239 
1240   /* Assemble the matrices */
1241   /* First assemble the local rows */
1242   {
1243     int    ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
1244     PetscScalar *imat_a;
1245 
1246     for (i=0; i<ismax; i++) {
1247       mat       = (Mat_SeqAIJ*)submats[i]->data;
1248       imat_ilen = mat->ilen;
1249       imat_j    = mat->j;
1250       imat_i    = mat->i;
1251       imat_a    = mat->a;
1252       cmap_i    = cmap[i];
1253       rmap_i    = rmap[i];
1254       irow_i    = irow[i];
1255       jmax      = nrow[i];
1256       for (j=0; j<jmax; j++) {
1257         row      = irow_i[j];
1258         proc     = rtable[row];
1259         if (proc == rank) {
1260           old_row  = row;
1261           row      = rmap_i[row];
1262           ilen_row = imat_ilen[row];
1263           ierr     = MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
1264           mat_i    = imat_i[row] ;
1265           mat_a    = imat_a + mat_i;
1266           mat_j    = imat_j + mat_i;
1267           for (k=0; k<ncols; k++) {
1268             if ((tcol = cmap_i[cols[k]])) {
1269               *mat_j++ = tcol - 1;
1270               *mat_a++ = vals[k];
1271               ilen_row++;
1272             }
1273           }
1274           ierr = MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);CHKERRQ(ierr);
1275           imat_ilen[row] = ilen_row;
1276         }
1277       }
1278     }
1279   }
1280 
1281   /*   Now assemble the off proc rows*/
1282   {
1283     int    *sbuf1_i,*rbuf2_i,*rbuf3_i,*imat_ilen,ilen;
1284     int    *imat_j,*imat_i;
1285     PetscScalar *imat_a,*rbuf4_i;
1286 
1287     for (tmp2=0; tmp2<nrqs; tmp2++) {
1288       ierr = MPI_Waitany(nrqs,r_waits4,&i,r_status4+tmp2);CHKERRQ(ierr);
1289       idex   = pa[i];
1290       sbuf1_i = sbuf1[idex];
1291       jmax    = sbuf1_i[0];
1292       ct1     = 2*jmax + 1;
1293       ct2     = 0;
1294       rbuf2_i = rbuf2[i];
1295       rbuf3_i = rbuf3[i];
1296       rbuf4_i = rbuf4[i];
1297       for (j=1; j<=jmax; j++) {
1298         is_no     = sbuf1_i[2*j-1];
1299         rmap_i    = rmap[is_no];
1300         cmap_i    = cmap[is_no];
1301         mat       = (Mat_SeqAIJ*)submats[is_no]->data;
1302         imat_ilen = mat->ilen;
1303         imat_j    = mat->j;
1304         imat_i    = mat->i;
1305         imat_a    = mat->a;
1306         max1      = sbuf1_i[2*j];
1307         for (k=0; k<max1; k++,ct1++) {
1308           row   = sbuf1_i[ct1];
1309           row   = rmap_i[row];
1310           ilen  = imat_ilen[row];
1311           mat_i = imat_i[row] ;
1312           mat_a = imat_a + mat_i;
1313           mat_j = imat_j + mat_i;
1314           max2 = rbuf2_i[ct1];
1315           for (l=0; l<max2; l++,ct2++) {
1316             if ((tcol = cmap_i[rbuf3_i[ct2]])) {
1317               *mat_j++ = tcol - 1;
1318               *mat_a++ = rbuf4_i[ct2];
1319               ilen++;
1320             }
1321           }
1322           imat_ilen[row] = ilen;
1323         }
1324       }
1325     }
1326   }
1327   ierr = PetscFree(r_status4);CHKERRQ(ierr);
1328   ierr = PetscFree(r_waits4);CHKERRQ(ierr);
1329   ierr = MPI_Waitall(nrqr,s_waits4,s_status4);CHKERRQ(ierr);
1330   ierr = PetscFree(s_waits4);CHKERRQ(ierr);
1331   ierr = PetscFree(s_status4);CHKERRQ(ierr);
1332 
1333   /* Restore the indices */
1334   for (i=0; i<ismax; i++) {
1335     ierr = ISRestoreIndices(isrow[i],irow+i);CHKERRQ(ierr);
1336     ierr = ISRestoreIndices(iscol[i],icol+i);CHKERRQ(ierr);
1337   }
1338 
1339   /* Destroy allocated memory */
1340   ierr = PetscFree(irow);CHKERRQ(ierr);
1341   ierr = PetscFree(w1);CHKERRQ(ierr);
1342   ierr = PetscFree(pa);CHKERRQ(ierr);
1343 
1344   ierr = PetscFree(sbuf1);CHKERRQ(ierr);
1345   ierr = PetscFree(rbuf2);CHKERRQ(ierr);
1346   for (i=0; i<nrqr; ++i) {
1347     ierr = PetscFree(sbuf2[i]);CHKERRQ(ierr);
1348   }
1349   for (i=0; i<nrqs; ++i) {
1350     ierr = PetscFree(rbuf3[i]);CHKERRQ(ierr);
1351     ierr = PetscFree(rbuf4[i]);CHKERRQ(ierr);
1352   }
1353 
1354   ierr = PetscFree(sbuf2);CHKERRQ(ierr);
1355   ierr = PetscFree(rbuf3);CHKERRQ(ierr);
1356   ierr = PetscFree(rbuf4);CHKERRQ(ierr);
1357   ierr = PetscFree(sbuf_aj[0]);CHKERRQ(ierr);
1358   ierr = PetscFree(sbuf_aj);CHKERRQ(ierr);
1359   ierr = PetscFree(sbuf_aa[0]);CHKERRQ(ierr);
1360   ierr = PetscFree(sbuf_aa);CHKERRQ(ierr);
1361 
1362   ierr = PetscFree(cmap);CHKERRQ(ierr);
1363   ierr = PetscFree(rmap);CHKERRQ(ierr);
1364   ierr = PetscFree(lens);CHKERRQ(ierr);
1365 
1366   for (i=0; i<ismax; i++) {
1367     ierr = MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1368     ierr = MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
1369   }
1370   PetscFunctionReturn(0);
1371 }
1372 
1373