xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision 2b479c3630a9af790806813ea013d1ee4f5b1177)
1 #ifndef lint
2 static char vcid[] = "$Id: mpiov.c,v 1.17 1996/02/07 23:36:30 balay Exp balay $";
3 #endif
4 
5 #include "mpiaij.h"
6 #include "inline/bitarray.h"
7 
8 static int MatIncreaseOverlap_MPIAIJ_private(Mat, int, IS *);
9 static int FindOverlapLocal(Mat , int , char **,int*, int**);
10 static int FindOverlapRecievedMesg(Mat , int, int **, int**, int* );
11 
12 int MatIncreaseOverlap_MPIAIJ(Mat C, int imax, IS *is, int ov)
13 {
14   int i, ierr;
15   if (ov < 0){ SETERRQ(1," MatIncreaseOverlap_MPIAIJ: negative overlap specified\n");}
16   for (i =0; i<ov; ++i) {
17     ierr = MatIncreaseOverlap_MPIAIJ_private(C, imax, is); CHKERRQ(ierr);
18   }
19   return 0;
20 }
21 
22 /*
23   Sample message format:
24   If a processor A wants processor B to process some elements corresponding
  to index sets is[1], is[5]
26   mesg [0] = 2   ( no of index sets in the mesg)
27   -----------
28   mesg [1] = 1 => is[1]
29   mesg [2] = sizeof(is[1]);
30   -----------
31   mesg [5] = 5  => is[5]
32   mesg [6] = sizeof(is[5]);
33   -----------
  mesg [7..n]   data(is[1])
  -----------
  mesg [n+1..m] data(is[5])
39   -----------
40 */
/*
   MatIncreaseOverlap_MPIAIJ_private - Performs ONE level of overlap increase
   on the imax index sets in is[].

   Overall algorithm (see the message-format comment above):
     1. Count, per destination processor, how many indices each IS contributes
        (w1..w4 work arrays) and size the outgoing request messages.
     2. Allreduce to learn the max message length (bsz) and how many messages
        this processor will receive (nmsg); post nonblocking receives.
     3. Pack one request message per destination (header + row indices) and
        post nonblocking sends; indices owned locally go straight into
        data[]/table[] instead.
     4. FindOverlapLocal() expands the locally-owned indices.
     5. FindOverlapRecievedMesg() processes the received requests and builds
        the replies (xdata/isz1); replies are exchanged in a second
        send/receive phase and merged into data[]/table[].
     6. The original index sets are destroyed and recreated from data[].
*/
static int MatIncreaseOverlap_MPIAIJ_private(Mat C, int imax, IS *is)
{
  Mat_MPIAIJ  *c = (Mat_MPIAIJ *) C->data;
  int         **idx, *n, *w1, *w2, *w3, *w4, *rtable,**data;
  int         size, rank, m,i,j,k, ierr, **rbuf, row, proc, mct, msz, **outdat, **ptr;
  int         *ctr, *pa, tag, *tmp,bsz, nmsg , *isz, *isz1, **xdata;
  int          bsz1, **rbuf2;
  char        **table;
  MPI_Comm    comm;
  MPI_Request *send_waits,*recv_waits,*send_waits2,*recv_waits2 ;
  MPI_Status  *send_status ,*recv_status;
  double         space, fr, maxs;

  comm   = C->comm;
  tag    = C->tag;
  size   = c->size;             /* number of processors */
  rank   = c->rank;
  m      = c->M;                /* global number of rows */


  TrSpace( &space, &fr, &maxs );
  /*  MPIU_printf(MPI_COMM_SELF,"[%d] allocated space = %f fragments = %f max ever allocated = %f\n", rank, space, fr, maxs); */

  idx    = (int **)PetscMalloc((imax+1)*sizeof(int *)); CHKPTRQ(idx);
  n      = (int *)PetscMalloc((imax+1)*sizeof(int )); CHKPTRQ(n);
  rtable = (int *)PetscMalloc((m+1)*sizeof(int )); CHKPTRQ(rtable);
                                /* Hash table for mapping row ->proc */

  for ( i=0 ; i<imax ; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);  CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);  CHKERRQ(ierr);
  }

  /* Create hash table for the mapping :row -> proc (rowners gives the
     contiguous row-ownership ranges) */
  for( i=0, j=0; i< size; i++) {
    for (; j <c->rowners[i+1]; j++) {
      rtable[j] = i;
    }
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them*/
  w1     = (int *)PetscMalloc((size)*4*sizeof(int )); CHKPTRQ(w1); /*  mesg size */
  w2     = w1 + size;         /* if w2[i] marked, then a message to proc i*/
  w3     = w2 + size;         /* no of IS that needs to be sent to proc i */
  w4     = w3 + size;         /* temp work space used in determining w1, w2, w3 */
  PetscMemzero(w1,(size)*3*sizeof(int)); /* initialise work vector (w4 is
                                            re-zeroed inside the loop) */
  for ( i=0;  i<imax ; i++) {
    PetscMemzero(w4,(size)*sizeof(int)); /* initialise work vector*/
    for ( j =0 ; j < n[i] ; j++) {
      row  = idx[i][j];
      proc = rtable[row];
      w4[proc]++;
    }
    for( j = 0; j < size; j++){
      if( w4[j] ) { w1[j] += w4[j];  w3[j] += 1;}
    }
  }

  mct      = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no mesg sent to itself */
  w3[rank] = 0;
  for (i =0; i < size ; i++) {
    if (w1[i])  { w2[i] = 1; mct++;} /* there exists a message to proc i */
  }
  pa = (int *)PetscMalloc((mct +1)*sizeof(int)); CHKPTRQ(pa); /* (proc -array) */
  for (i =0, j=0; i < size ; i++) {
    if (w1[i]) { pa[j] = i; j++; }
  }

  /* Each message would have a header = 1 + 2*(no of IS) + data */
  for (i = 0; i<mct ; i++) {
    j = pa[i];
    w1[j] += w2[j] + 2* w3[j];
    msz   += w1[j];
  }


  /* Do a global reduction to determine how many messages to expect*/
  {
    int *rw1, *rw2;
    rw1 = (int *)PetscMalloc(2*size*sizeof(int)); CHKPTRQ(rw1);
    rw2 = rw1+size;
    /* MAX of w1 over all senders = an upper bound on any message sent to us */
    MPI_Allreduce((void *)w1, rw1, size, MPI_INT, MPI_MAX, comm);
    bsz   = rw1[rank];
    /* SUM of w2 = how many processors will send to us */
    MPI_Allreduce((void *)w2, rw2, size, MPI_INT, MPI_SUM, comm);
    nmsg  = rw2[rank];
    PetscFree(rw1);
  }

  /* Allocate memory for recv buffers . Prob none if nmsg = 0 ???? */
  rbuf    = (int**) PetscMalloc((nmsg+1) *sizeof(int*));  CHKPTRQ(rbuf);
  rbuf[0] = (int *) PetscMalloc((nmsg *bsz+1) * sizeof(int));  CHKPTRQ(rbuf[0]);
  for (i=1; i<nmsg ; ++i) rbuf[i] = rbuf[i-1] + bsz;

  /* Now post the receives */
  recv_waits = (MPI_Request *) PetscMalloc((nmsg+1)*sizeof(MPI_Request));
  CHKPTRQ(recv_waits);
  for ( i=0; i<nmsg; ++i){
    MPI_Irecv((void *)rbuf[i], bsz, MPI_INT, MPI_ANY_SOURCE, tag, comm, recv_waits+i);
  }

  /* Allocate Memory for outgoing messages */
  outdat    = (int **)PetscMalloc( 2*size*sizeof(int*)); CHKPTRQ(outdat);
  PetscMemzero(outdat,  2*size*sizeof(int*));
  tmp       = (int *)PetscMalloc((msz+1) *sizeof (int)); CHKPTRQ(tmp); /* mesg array:
                                all outgoing messages are carved out of this one block */
  ptr       = outdat +size;     /* Pointers to the data in outgoing buffers */
  ctr       = (int *)PetscMalloc( size*sizeof(int));   CHKPTRQ(ctr);

  {
    /* hand out consecutive w1[j]-sized slices of tmp to each destination */
    int *iptr = tmp;
    int ict  = 0;
    for (i = 0; i < mct ; i++) {
      j         = pa[i];
      iptr     +=  ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* plug in the headers: count word zeroed, header slots cleared, ptr[j]
     positioned at the first data slot after the header */
  for ( i=0 ; i<mct ; i++) {
    j = pa[i];
    outdat[j][0] = 0;
    PetscMemzero(outdat[j]+1, 2 * w3[j]*sizeof(int));
    ptr[j] = outdat[j] + 2*w3[j] +1;
  }

  /* Memory for doing local proc's work: one bit-array (table) and one
     result array (data) per IS, each covering all m global rows */
  table = (char **)PetscMalloc((imax+1)*sizeof(int *));  CHKPTRQ(table);
  data  = (int **)PetscMalloc((imax+1)*sizeof(int *)); CHKPTRQ(data);
  table[0] = (char *)PetscMalloc((m/BITSPERBYTE +1)*(imax)); CHKPTRQ(table[0]);
  data [0] = (int *)PetscMalloc((m+1)*(imax)*sizeof(int)); CHKPTRQ(data[0]);

  for(i = 1; i<imax ; i++) {
    table[i] = table[0] + (m/BITSPERBYTE+1)*i;
    data[i]  = data[0] + (m+1)*i;
  }

  PetscMemzero((void*)*table,(m/BITSPERBYTE+1)*(imax));
  isz = (int *)PetscMalloc((imax+1) *sizeof(int)); CHKPTRQ(isz);
  PetscMemzero((void *)isz,(imax+1) *sizeof(int));

  /* Parse the IS and update local tables and the outgoing buf with the data*/
  for ( i=0 ; i<imax ; i++) {
    PetscMemzero(ctr,size*sizeof(int));
    for( j=0;  j<n[i]; j++) {  /* parse the indices of each IS */
      row  = idx[i][j];
      proc = rtable[row];
      if (proc != rank) { /* copy to the outgoing buf*/
        ctr[proc]++;
        *ptr[proc] = row;
        ptr[proc]++;
      }
      else { /* Update the table (BT_LOOKUP tests AND sets the bit) */
        if ( !BT_LOOKUP(table[i],row)) { data[i][isz[i]++] = row;}
      }
    }
    /* Update the headers for the current IS */
    for( j = 0; j<size; j++) { /* Can Optimise this loop too */
      if (ctr[j]) {
        k= ++outdat[j][0];
        outdat[j][2*k]   = ctr[j];
        outdat[j][2*k-1] = i;
      }
    }
  }



  /*  Now  post the sends */
  send_waits = (MPI_Request *) PetscMalloc((mct+1)*sizeof(MPI_Request));
  CHKPTRQ(send_waits);
  for( i =0; i< mct; ++i){
    j = pa[i];
    MPI_Isend( (void *)outdat[j], w1[j], MPI_INT, j, tag, comm, send_waits+i);
  }

  /* I no longer need the original indices*/
  for( i=0; i< imax; ++i) {
    ierr = ISRestoreIndices(is[i], idx+i); CHKERRQ(ierr);
  }
  PetscFree(idx);
  PetscFree(n);
  PetscFree(rtable);
  /* the enlarged sets are recreated from data[]/isz[] at the end */
  for( i=0; i< imax; ++i) {
    ierr = ISDestroy(is[i]); CHKERRQ(ierr);
  }

  /* Do Local work (overlaps the communication posted above) */
  ierr = FindOverlapLocal(C, imax, table,isz, data); CHKERRQ(ierr);
  /* Extract the matrices */

  /* Receive messages*/
  {
    int        index;

    recv_status = (MPI_Status *) PetscMalloc( (nmsg+1)*sizeof(MPI_Status) );
    CHKPTRQ(recv_status);
    for ( i=0; i< nmsg; ++i) {
      MPI_Waitany(nmsg, recv_waits, &index, recv_status+i);
    }

    send_status = (MPI_Status *) PetscMalloc( (mct+1)*sizeof(MPI_Status) );
    CHKPTRQ(send_status);
    MPI_Waitall(mct,send_waits,send_status);
  }
  /* Phase 1 sends are complete - deallocate buffers */
  PetscFree(outdat);
  PetscFree(w1);
  PetscFree(tmp);

  /* Build the replies: xdata[i]/isz1[i] answer the i-th received request */
  xdata    = (int **)PetscMalloc((nmsg+1)*sizeof(int *)); CHKPTRQ(xdata);
  isz1     = (int *)PetscMalloc((nmsg+1) *sizeof(int)); CHKPTRQ(isz1);
  ierr = FindOverlapRecievedMesg(C, nmsg, rbuf,xdata,isz1); CHKERRQ(ierr);

  /* No longer need rbuf. */
  PetscFree(rbuf[0]);
  PetscFree(rbuf);


  /* Send the data back*/
  /* Do a global reduction to know the buffer space req for incoming messages*/
  {
    int *rw1, *rw2;

    rw1 = (int *)PetscMalloc(2*size*sizeof(int)); CHKPTRQ(rw1);
    PetscMemzero((void*)rw1,2*size*sizeof(int));
    rw2 = rw1+size;
    for (i =0; i < nmsg ; ++i) {
      proc      = recv_status[i].MPI_SOURCE;
      rw1[proc] = isz1[i];
    }

    MPI_Allreduce((void *)rw1, (void *)rw2, size, MPI_INT, MPI_MAX, comm);
    bsz1   = rw2[rank];
    PetscFree(rw1);
  }

  /* Allocate buffers*/

  /* Allocate memory for recv buffers . Prob none if nmsg = 0 ???? */
  rbuf2    = (int**) PetscMalloc((mct+1) *sizeof(int*));  CHKPTRQ(rbuf2);
  rbuf2[0] = (int *) PetscMalloc((mct*bsz1+1) * sizeof(int));  CHKPTRQ(rbuf2[0]);
  for (i=1; i<mct ; ++i) rbuf2[i] = rbuf2[i-1] + bsz1;

  /* Now post the receives */
  /* NOTE(review): CHKPTRQ appears twice on the next two lines (the first
     without a trailing semicolon) - looks like a copy/paste slip; presumably
     harmless but should be reduced to a single CHKPTRQ(recv_waits2); */
  /* NOTE(review): this phase reuses the same 'tag' as phase 1 together with
     MPI_ANY_SOURCE; an early phase-2 reply could in principle match a still
     pending phase-1 receive - confirm a distinct tag is not required */
  recv_waits2 = (MPI_Request *)PetscMalloc((mct+1)*sizeof(MPI_Request)); CHKPTRQ(recv_waits2)
  CHKPTRQ(recv_waits2);
  for ( i=0; i<mct; ++i){
    MPI_Irecv((void *)rbuf2[i], bsz1, MPI_INT, MPI_ANY_SOURCE, tag, comm, recv_waits2+i);
  }

  /*  Now  post the sends */
  send_waits2 = (MPI_Request *) PetscMalloc((nmsg+1)*sizeof(MPI_Request));
  CHKPTRQ(send_waits2);
  for( i =0; i< nmsg; ++i){
    j = recv_status[i].MPI_SOURCE;   /* reply goes back to the requester */
    MPI_Isend( (void *)xdata[i], isz1[i], MPI_INT, j, tag, comm, send_waits2+i);
  }

  /* receive work done on other processors*/
  {
    int         index, is_no, ct1, max;
    MPI_Status  *send_status2 ,*recv_status2;

    recv_status2 = (MPI_Status *) PetscMalloc( (mct+1)*sizeof(MPI_Status) );
    CHKPTRQ(recv_status2);


    for ( i=0; i< mct; ++i) {
      MPI_Waitany(mct, recv_waits2, &index, recv_status2+i);
      /* Process the message: same header layout as the requests */
      ct1 = 2*rbuf2[index][0]+1;
      for (j=1; j<=rbuf2[index][0]; j++) {
        max   = rbuf2[index][2*j];
        is_no = rbuf2[index][2*j-1];
        for (k=0; k < max ; k++, ct1++) {
          row = rbuf2[index][ct1];
          if(!BT_LOOKUP(table[is_no],row)) { data[is_no][isz[is_no]++] = row;}
        }
      }
    }


    send_status2 = (MPI_Status *) PetscMalloc( (nmsg+1)*sizeof(MPI_Status) );
    CHKPTRQ(send_status2);
    MPI_Waitall(nmsg,send_waits2,send_status2);

    PetscFree(send_status2); PetscFree(recv_status2);
  }

  TrSpace( &space, &fr, &maxs );
  /*  MPIU_printf(MPI_COMM_SELF,"[%d] allocated space = %f fragments = %f max ever allocated = %f\n", rank, space, fr, maxs);*/

  PetscFree(ctr);
  PetscFree(pa);
  PetscFree(rbuf2[0]);
  PetscFree(rbuf2);
  PetscFree(send_waits);
  PetscFree(recv_waits);
  PetscFree(send_waits2);
  PetscFree(recv_waits2);
  PetscFree(table[0]);
  PetscFree(table);
  PetscFree(send_status);
  PetscFree(recv_status);
  PetscFree(isz1);
  PetscFree(xdata[0]);
  PetscFree(xdata);

  /* Recreate the (now enlarged) index sets */
  for ( i=0; i<imax; ++i) {
    ierr = ISCreateSeq(MPI_COMM_SELF, isz[i], data[i], is+i); CHKERRQ(ierr);
  }
  PetscFree(isz);
  PetscFree(data[0]);
  PetscFree(data);

  return 0;
}
364 
/*   FindOverlapLocal() - Called by MatIncreaseOverlap_MPIAIJ_private(), to do the work on
366      the local processor.
367 
368      Inputs:
369       C      - MAT_MPIAIJ;
370       imax - total no of index sets processed at a time;
371       table  - an array of char - size = m bits.
372 
373      Output:
      isz    - array containing the count of the solution elements corresponding
375                to each index set;
376       data   - pointer to the solutions
377 */
378 static int FindOverlapLocal(Mat C, int imax, char **table, int *isz,int **data)
379 {
380   Mat_MPIAIJ *c = (Mat_MPIAIJ *) C->data;
381   Mat        A = c->A, B = c->B;
382   Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
383   int        start, end, val, max, rstart,cstart, ashift, bshift,*ai, *aj;
384   int        *bi, *bj, *garray, i, j, k, row;
385 
386   rstart = c->rstart;
387   cstart = c->cstart;
388   ashift = a->indexshift;
389   ai     = a->i;
390   aj     = a->j +ashift;
391   bshift = b->indexshift;
392   bi     = b->i;
393   bj     = b->j +bshift;
394   garray = c->garray;
395 
396 
397   for( i=0; i<imax; i++) {
398     for ( j=0, max =isz[i] ; j< max; j++) {
399       row   = data[i][j] - rstart;
400       start = ai[row];
401       end   = ai[row+1];
402       for ( k=start; k < end; k++) { /* Amat */
403         val = aj[k] + ashift + cstart;
404         if(!BT_LOOKUP(table[i],val)) { data[i][isz[i]++] = val;}
405       }
406       start = bi[row];
407       end   = bi[row+1];
408       for ( k=start; k < end; k++) { /* Bmat */
409         val = garray[bj[k]+bshift] ;
410         if(! BT_LOOKUP(table[i],val)) { data[i][isz[i]++] = val;}
411       }
412     }
413   }
414 
415 return 0;
416 }
/*       FindOverlapRecievedMesg - Process the received messages,
         and return the output

         Input:
           C    - the matrix
           nmsg - no of messages being processed.
           rbuf - an array of pointers to the received requests

         Output:
           xdata - array of messages to be sent back
           isz1  - size of each message
*/
429 static int FindOverlapRecievedMesg(Mat C, int nmsg, int ** rbuf, int ** xdata, int * isz1 )
430 {
431   Mat_MPIAIJ  *c = (Mat_MPIAIJ *) C->data;
432   Mat         A = c->A, B = c->B;
433   Mat_SeqAIJ  *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
434   int         rstart,cstart, ashift, bshift,*ai, *aj, *bi, *bj, *garray, i, j, k;
435   int         row,total_sz,ct, ct1, ct2, ct3,mem_estimate, oct2, l, start, end;
436   int         val, max1, max2, rank, m, no_malloc =0, *tmp, new_estimate, ctr;
437   char        *xtable;
438 
439   rank   = c->rank;
440   m      = c->M;
441   rstart = c->rstart;
442   cstart = c->cstart;
443   ashift = a->indexshift;
444   ai     = a->i;
445   aj     = a->j +ashift;
446   bshift = b->indexshift;
447   bi     = b->i;
448   bj     = b->j +bshift;
449   garray = c->garray;
450 
451 
452   for (i =0, ct =0, total_sz =0; i< nmsg; ++i){
453     ct+= rbuf[i][0];
454     for ( j = 1; j <= rbuf[i][0] ; j++ ) { total_sz += rbuf[i][2*j]; }
455     }
456 
457   max1 = ct*(a->nz +b->nz)/c->m;
458   mem_estimate =  3*((total_sz > max1?total_sz:max1)+1);
459   xdata[0] = (int *)PetscMalloc(mem_estimate *sizeof(int)); CHKPTRQ(xdata[0]);
460   ++no_malloc;
461   xtable   = (char *)PetscMalloc((m/BITSPERBYTE+1)); CHKPTRQ(xtable);
462   PetscMemzero((void *)isz1,(nmsg+1) *sizeof(int));
463 
464   ct3 = 0;
465   for (i =0; i< nmsg; i++) { /* for easch mesg from proc i */
466     ct1 = 2*rbuf[i][0]+1;
467     ct2 = ct1;
468     ct3+= ct1;
469     for (j = 1, max1= rbuf[i][0]; j<=max1; j++) { /* for each IS from proc i*/
470       PetscMemzero((void *)xtable,(m/BITSPERBYTE+1));
471       oct2 = ct2;
472       for (k =0; k < rbuf[i][2*j]; k++, ct1++) {
473         row = rbuf[i][ct1];
474         if(!BT_LOOKUP(xtable,row)) {
475           if (!(ct3 < mem_estimate)) {
476             new_estimate = (int)1.5*mem_estimate+1;
477             tmp = (int*) PetscMalloc(new_estimate * sizeof(int)); CHKPTRQ(tmp);
478             PetscMemcpy((char *)tmp,(char *)xdata[0],mem_estimate*sizeof(int));
479             PetscFree(xdata[0]);
480             xdata[0] = tmp;
481             mem_estimate = new_estimate; ++no_malloc;
482             for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
483           }
484            xdata[i][ct2++] = row;ct3++;
485         }
486       }
487       for ( k=oct2, max2 =ct2 ; k< max2; k++) {
488         row   = xdata[i][k] - rstart;
489         start = ai[row];
490         end   = ai[row+1];
491         for ( l=start; l < end; l++) {
492           val = aj[l] +ashift + cstart;
493           if(!BT_LOOKUP(xtable,val)) {
494             if (!(ct3 < mem_estimate)) {
495               new_estimate = (int)1.5*mem_estimate+1;
496               tmp = (int*) PetscMalloc(new_estimate * sizeof(int)); CHKPTRQ(tmp);
497               PetscMemcpy((char *)tmp,(char *)xdata[0],mem_estimate*sizeof(int));
498               PetscFree(xdata[0]);
499               xdata[0] = tmp;
500               mem_estimate = new_estimate; ++no_malloc;
501               for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
502             }
503             xdata[i][ct2++] = val;ct3++;
504           }
505         }
506         start = bi[row];
507         end   = bi[row+1];
508         for ( l=start; l < end; l++) {
509           val = garray[bj[l]+bshift] ;
510           if(!BT_LOOKUP(xtable,val)) {
511             if (!(ct3 < mem_estimate)) {
512               new_estimate = (int)1.5*mem_estimate+1;
513               tmp = (int*) PetscMalloc(new_estimate * sizeof(int)); CHKPTRQ(tmp);
514               PetscMemcpy((char *)tmp,(char *)xdata[0],mem_estimate*sizeof(int));
515               PetscFree(xdata[0]);
516               xdata[0] = tmp;
517               mem_estimate = new_estimate; ++no_malloc;
518               for (ctr =1; ctr <=i; ctr++) { xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];}
519             }
520             xdata[i][ct2++] = val;ct3++;
521           }
522         }
523       }
524       /* Update the header*/
525       xdata[i][2*j]   = ct2-oct2; /* Undo the vector isz1 and use only a var*/
526       xdata[i][2*j-1] = rbuf[i][2*j-1];
527     }
528     xdata[i][0] = rbuf[i][0];
529     xdata[i+1]  = xdata[i] +ct2;
530     isz1[i]     = ct2; /* size of each message */
531   }
532   PetscFree(xtable);
533   PLogInfo(0,"MatIncreaseOverlap_MPIAIJ:[%d] Allocated %d bytes, required %d bytes, no of mallocs = %d\n",rank,mem_estimate, ct3,no_malloc);
534   return 0;
535 }
536 
537 
538 int MatGetSubMatrices_MPIAIJ (Mat C,int imax, IS *irow,IS *icol,MatGetSubMatrixCall
539                               scall, Mat **submat)
540 {
541   Mat_MPIAIJ  *c = (Mat_MPIAIJ *) C->data;
542   Mat         A = c->A, B = c->B;
543   Mat_SeqAIJ  *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
544   int         **idx, *n, *w1, *w2, *w3, *w4, *rtable, start, end, size,**outdat, rank;
545   int         m,i,j,k,l,ct1,ct2, oct2, ierr, **rbuf, row, proc, mct, msz, **ptr, index;
546   int         *req_size, *ctr, *pa, tag, *tmp,bsz, nmsg , **rbuf2,*req_source,**sbuf_aj;
547   int          rstart,cstart, ashift, bshift,*ai, *aj, *bi, *bj,*garray,*rbufsize, max1;
548   MPI_Request *send_waits, *recv_waits, *send_waits2, *recv_waits2, *recv_waits3 ;
549   MPI_Request *recv_waits4,*send_waits3,*send_waits4;
550   MPI_Status  *recv_status ,*recv_status2,*send_status,*send_status3 ,*send_status2;
551   MPI_Status  *recv_status3,*recv_status4,*send_status4;
552   MPI_Comm    comm;
553   Scalar      **rbuf3, *aa, *ba, **sbuf_aa;
554   double      space, fr, maxs;
555 
556 
557   comm   = C->comm;
558   tag    = C->tag;
559   size   = c->size;
560   rank   = c->rank;
561   m      = c->M;
562   rstart = c->rstart;
563   cstart = c->cstart;
564   ashift = a->indexshift;
565   ai     = a->i;
566   aj     = a->j;
567   aa     = a->a;
568   bshift = b->indexshift;
569   bi     = b->i;
570   bj     = b->j;
571   ba     = b->a;
572   garray = c->garray;
573 
574   TrSpace( &space, &fr, &maxs );
575   /*  MPIU_printf(MPI_COMM_SELF,"[%d] allocated space = %f fragments = %f max ever allocated = %f\n", rank, space, fr, maxs); */
576 
577   idx    = (int **)PetscMalloc((imax+1)*sizeof(int *)); CHKPTRQ(idx);
578   n      = (int *)PetscMalloc((imax+1)*sizeof(int )); CHKPTRQ(n);
579   rtable = (int *)PetscMalloc((m+1)*sizeof(int )); CHKPTRQ(rtable);
580                                 /* Hash table for maping row ->proc */
581 
582   for ( i=0 ; i<imax ; i++) {
583     ierr = ISGetIndices(irow[i],&idx[i]);  CHKERRQ(ierr);
584     ierr = ISGetLocalSize(irow[i],&n[i]);  CHKERRQ(ierr);
585   }
586 
587   /* Create hash table for the mapping :row -> proc*/
588   for( i=0, j=0; i< size; i++) {
589     for (; j <c->rowners[i+1]; j++) {
590       rtable[j] = i;
591     }
592   }
593 
594   /* evaluate communication - mesg to who, length of mesg, and buffer space
595      required. Based on this, buffers are allocated, and data copied into them*/
596   w1     = (int *)PetscMalloc((size)*4*sizeof(int )); CHKPTRQ(w1); /*  mesg size */
597   w2     = w1 + size;         /* if w2[i] marked, then a message to proc i*/
598   w3     = w2 + size;         /* no of IS that needs to be sent to proc i */
599   w4     = w3 + size;         /* temp work space used in determining w1, w2, w3 */
600   PetscMemzero(w1,(size)*3*sizeof(int)); /* initialise work vector*/
601   for ( i=0;  i<imax ; i++) {
602     PetscMemzero(w4,(size)*sizeof(int)); /* initialise work vector*/
603     for ( j =0 ; j < n[i] ; j++) {
604       row  = idx[i][j];
605       proc = rtable[row];
606       w4[proc]++;
607     }
608     for( j = 0; j < size; j++){
609       if( w4[j] ) { w1[j] += w4[j];  w3[j] += 1;}
610     }
611   }
612 
613   mct      = 0;              /* no of outgoing messages */
614   msz      = 0;              /* total mesg length (for all proc */
615   w1[rank] = 0;              /* no mesg sent to intself */
616   w3[rank] = 0;
617   for (i =0; i < size ; i++) {
618     if (w1[i])  { w2[i] = 1; mct++;} /* there exists a message to proc i */
619   }
620   pa = (int *)PetscMalloc((mct +1)*sizeof(int)); CHKPTRQ(pa); /* (proc -array) */
621   for (i =0, j=0; i < size ; i++) {
622     if (w1[i]) { pa[j] = i; j++; }
623   }
624 
625   /* Each message would have a header = 1 + 2*(no of IS) + data */
626   for (i = 0; i<mct ; i++) {
627     j = pa[i];
628     w1[j] += w2[j] + 2* w3[j];
629     msz   += w1[j];
630   }
631 
632 
633   /* Do a global reduction to determine how many messages to expect*/
634   {
635     int *rw1, *rw2;
636     rw1 = (int *)PetscMalloc(2*size*sizeof(int)); CHKPTRQ(rw1);
637     rw2 = rw1+size;
638     MPI_Allreduce((void *)w1, rw1, size, MPI_INT, MPI_MAX, comm);
639     bsz   = rw1[rank];
640     MPI_Allreduce((void *)w2, rw2, size, MPI_INT, MPI_SUM, comm);
641     nmsg  = rw2[rank];
642     PetscFree(rw1);
643   }
644 
645   /* Allocate memory for recv buffers . Prob none if nmsg = 0 ???? */
646   rbuf    = (int**) PetscMalloc((nmsg+1) *sizeof(int*));  CHKPTRQ(rbuf);
647   rbuf[0] = (int *) PetscMalloc((nmsg *bsz+1) * sizeof(int));  CHKPTRQ(rbuf[0]);
648   for (i=1; i<nmsg ; ++i) rbuf[i] = rbuf[i-1] + bsz;
649 
650   /* Now post the receives */
651   recv_waits = (MPI_Request *) PetscMalloc((nmsg+1)*sizeof(MPI_Request));
652   CHKPTRQ(recv_waits);
653   for ( i=0; i<nmsg; ++i){
654     MPI_Irecv((void *)rbuf[i], bsz, MPI_INT, MPI_ANY_SOURCE, tag, comm, recv_waits+i);
655   }
656 
657   /* Allocate Memory for outgoing messages */
658   outdat    = (int **)PetscMalloc( 2*size*sizeof(int*)); CHKPTRQ(outdat);
659   PetscMemzero(outdat,  2*size*sizeof(int*));
660   tmp       = (int *)PetscMalloc((msz+1) *sizeof (int)); CHKPTRQ(tmp); /*mrsg arr */
661   ptr       = outdat +size;     /* Pointers to the data in outgoing buffers */
662   ctr       = (int *)PetscMalloc( size*sizeof(int));   CHKPTRQ(ctr);
663 
664   {
665     int *iptr = tmp;
666     int ict  = 0;
667     for (i = 0; i < mct ; i++) {
668       j         = pa[i];
669       iptr     +=  ict;
670       outdat[j] = iptr;
671       ict       = w1[j];
672     }
673   }
674 
675   /* Form the outgoing messages */
676   /* Initialise the header space */
677   for ( i=0 ; i<mct ; i++) {
678     j = pa[i];
679     outdat[j][0] = 0;
680     PetscMemzero(outdat[j]+1, 2 * w3[j]*sizeof(int));
681     ptr[j] = outdat[j] + 2*w3[j] +1;
682   }
683 
684 
685   /* Parse the irow and copy data into outbuf */
686   for ( i=0 ; i<imax ; i++) {
687     PetscMemzero(ctr,size*sizeof(int));
688     for( j=0;  j<n[i]; j++) {  /* parse the indices of each IS */
689       row  = idx[i][j];
690       proc = rtable[row];
691       if (proc != rank) { /* copy to the outgoing buf*/
692         ctr[proc]++;
693         *ptr[proc] = row;
694         ptr[proc]++;
695       }
696     }
697     /* Update the headers for the current IS */
698     for( j = 0; j<size; j++) { /* Can Optimise this loop too */
699       if (ctr[j]) {
700         k= ++outdat[j][0];
701         outdat[j][2*k]   = ctr[j];
702         outdat[j][2*k-1] = i;
703       }
704     }
705   }
706 
707   /*  Now  post the sends */
708   send_waits = (MPI_Request *) PetscMalloc((mct+1)*sizeof(MPI_Request));
709   CHKPTRQ(send_waits);
710   for( i =0; i< mct; ++i){
711     j = pa[i];
712     MPI_Isend( (void *)outdat[j], w1[j], MPI_INT, j, tag, comm, send_waits+i);
713   }
714 
715   /* Post Recieves to capture the buffer size */
716   recv_waits2 = (MPI_Request *) PetscMalloc((mct+1)*sizeof(MPI_Request));
717   CHKPTRQ(recv_waits2);
718   rbufsize = (int*)PetscMalloc((mct+1) *sizeof(int)); CHKPTRQ(rbufsize);
719   for( i =0; i< mct; ++i){
720     j = pa[i];
721     MPI_Irecv( (void *)(rbufsize+i), 1, MPI_INT, j, tag+1, comm, recv_waits2+i);
722   }
723 
724   /* Send to other procs the buf size they should allocate */
725 
726 
727   /* Receive messages*/
728   send_waits2 = (MPI_Request *) PetscMalloc((nmsg+1)*sizeof(MPI_Request));
729   CHKPTRQ(send_waits2);
730   recv_status = (MPI_Status *) PetscMalloc( (nmsg+1)*sizeof(MPI_Status) );
731   CHKPTRQ(recv_status);
732   req_size    = (int *) PetscMalloc( (nmsg +1) * sizeof(int)) ; CHKPTRQ(req_size);
733   req_source  = (int *) PetscMalloc( (nmsg +1) * sizeof(int)) ; CHKPTRQ(req_source);
734 
735   for ( i=0; i< nmsg; ++i) {
736     MPI_Waitany(nmsg, recv_waits, &index, recv_status+i);
737     /* Reply with the size of buffer required */
738     req_size[index] = 2*rbuf[index][0];
739     start           = 2*rbuf[index][0] + 1 ;
740     MPI_Get_count(recv_status+i,MPI_INT, &end);
741     for (j=start; j< end; j++) {
742       row       = rbuf[index][j] - rstart;
743       req_size[index] += ai[row+1] - ai[row];
744       req_size[index] += bi[row+1] - bi[row];
745     }
746     req_source[index] = recv_status[i].MPI_SOURCE;
747     printf("[%d] Send to %d: size %d, index = %d start = %d end = %d \n", rank,
748           req_source[index] , *(req_size+index), index, start, end);
749     MPI_Isend((void *)(req_size+index),1,MPI_INT,req_source[index],tag+1, comm, send_waits2+i);
750 
751   }
752 
753 
754   /*  recv buffer sizes */
755  /* Receive messages*/
756 
757   rbuf2 = (int**)PetscMalloc((mct+1) *sizeof(int*)); CHKPTRQ(rbuf2);
758   rbuf3 = (Scalar**)PetscMalloc((mct+1) *sizeof(Scalar*)); CHKPTRQ(rbuf3);
759   recv_waits3 = (MPI_Request *) PetscMalloc((mct+1)*sizeof(MPI_Request));
760   CHKPTRQ(recv_waits3);
761   recv_waits4 = (MPI_Request *) PetscMalloc((mct+1)*sizeof(MPI_Request));
762   CHKPTRQ(recv_waits4);
763   recv_status2 = (MPI_Status *) PetscMalloc( (mct+1)*sizeof(MPI_Status) );
764   CHKPTRQ(recv_status2);
765   for ( i=0; i< mct; ++i) {
766     MPI_Waitany(mct, recv_waits2, &index, recv_status2+i);
767 
768     rbuf2[index] = (int *)PetscMalloc(rbufsize[index]*sizeof(int));
769     CHKPTRQ(rbuf2[index]);
770     rbuf3[index] = (Scalar *)PetscMalloc(rbufsize[index]*sizeof(Scalar));
771     CHKPTRQ(rbuf3[index]);
772 
773     MPI_Irecv((void *)rbuf2[index],rbufsize[index], MPI_INT,
774               recv_status2[i].MPI_SOURCE, tag+2, comm, recv_waits3+index);
775     MPI_Irecv((void *)rbuf3[index],rbufsize[index], MPIU_SCALAR,
776               recv_status2[i].MPI_SOURCE, tag+3, comm, recv_waits4+index);
777 
778   }
779 
780   /* Wait on sends1 and sends2 */
781     send_status = (MPI_Status *) PetscMalloc( (mct+1)*sizeof(MPI_Status) );
782     CHKPTRQ(send_status);
783     send_status2 = (MPI_Status *) PetscMalloc( (nmsg+1)*sizeof(MPI_Status) );
784     CHKPTRQ(send_status2);
785 
786   MPI_Waitall(mct,send_waits,send_status);
787   MPI_Waitall(nmsg,send_waits2,send_status2);
788 
789 
790   /* Now allocate buffers for a->j, and send them off */
791   sbuf_aj = (int **)PetscMalloc((nmsg+1)*sizeof(int *)); CHKPTRQ(sbuf_aj);
792   for ( i=0, j =0; i< nmsg; i++) j += req_size[i];
793   sbuf_aj[0] = (int*) PetscMalloc((j+1)*sizeof(int)); CHKPTRQ(sbuf_aj[0]);
794   for  (i =1; i< nmsg; i++)  sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
795 
796   send_waits3 = (MPI_Request *) PetscMalloc((nmsg+1)*sizeof(MPI_Request));
797   CHKPTRQ(send_waits3);
798   for (i=0; i<nmsg; i++) {
799     ct1 = 2*rbuf[i][0]+1;
800     ct2 = ct1;
801     for (j=1, max1 = rbuf[i][0]; j< max1; j++){
802       oct2 = ct2;
803       for( k=0; k< rbuf[i][2*j]; k++, ct1++) {
804         row   = rbuf[i][ct1] - rstart;
805         start = ai[row];
806         end   = ai[row+1];
807         for (l = start; l< end; l++) {
808           sbuf_aj[i][ct2++] = aj[l] + cstart;
809         }
810         start = bi[row];
811         end   = bi[row+1];
812         for (l = start; l< end; l++) {
813           sbuf_aj[i][ct2++] = garray[bj[l]];
814         }
815       }
816       /* update the header  */
817       sbuf_aj[i][2*j]   = ct2 - oct2;
818       sbuf_aj[i][2*j-1] = rbuf[i][2*j-1];
819     }
820     sbuf_aj[i][0] = rbuf[i][0];
821     MPI_Isend((void *)(sbuf_aj+i),req_size[i],MPI_INT,req_source[i],tag+2, comm, send_waits3+i);
822   }
823   recv_status3 = (MPI_Status *) PetscMalloc( (mct+1)*sizeof(MPI_Status) );
824   CHKPTRQ(recv_status3);
825   send_status3 = (MPI_Status *) PetscMalloc( (nmsg+1)*sizeof(MPI_Status) );
826   CHKPTRQ(send_status3);
827 
828 
829   MPI_Waitall(mct,recv_waits3,recv_status3);
830   MPI_Waitall(nmsg,send_waits3,send_status3);
831 
832 
833   /* Now allocate buffers for a->a, and send them off */
834   sbuf_aa = (Scalar **)PetscMalloc((nmsg+1)*sizeof(Scalar *)); CHKPTRQ(sbuf_aa);
835   for ( i=0, j =0; i< nmsg; i++) j += req_size[i];
836   sbuf_aa[0] = (Scalar*) PetscMalloc((j+1)*sizeof(Scalar)); CHKPTRQ(sbuf_aa[0]);
837   for  (i =1; i< nmsg; i++)  sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
838 
839   send_waits4 = (MPI_Request *) PetscMalloc((nmsg+1)*sizeof(MPI_Request));
840   CHKPTRQ(send_waits4);
841   for (i=0; i<nmsg; i++) {
842     ct1 = 2*rbuf[i][0]+1;
843     ct2 = ct1;
844     for (j=1, max1 = rbuf[i][0]; j< max1; j++){
845       oct2 = ct2;
846       for( k=0; k< rbuf[i][2*j]; k++, ct1++) {
847         row   = rbuf[i][ct1] - rstart;
848         start = ai[row];
849         end   = ai[row+1];
850         for (l = start; l< end; l++) {
851           sbuf_aa[i][ct2++] = aa[l];
852         }
853         start = bi[row];
854         end   = bi[row+1];
855         for (l = start; l< end; l++) {
856           sbuf_aj[i][ct2++] = ba[l];
857         }
858       }
859       /* update the header  */
860       sbuf_aj[i][2*j]   = (Scalar)(ct2 - oct2);
861       sbuf_aj[i][2*j-1] = (Scalar)(rbuf[i][2*j-1]);
862     }
863     sbuf_aj[i][0] = (Scalar)rbuf[i][0];
864     MPI_Isend((void *)(sbuf_aa+i),req_size[i],MPIU_SCALAR,req_source[i],tag+3, comm, send_waits4+i);
865   }
866   recv_status4 = (MPI_Status *) PetscMalloc( (mct+1)*sizeof(MPI_Status) );
867   CHKPTRQ(recv_status4);
868   send_status4 = (MPI_Status *) PetscMalloc( (nmsg+1)*sizeof(MPI_Status) );
869   CHKPTRQ(send_status4);
870 
871 
872   MPI_Waitall(mct,recv_waits4,recv_status4);
873   MPI_Waitall(nmsg,send_waits4,send_status4);
874 
875 
876   /* Now u have all the data required, so form the matrix */
877   /* rbuf2->aj, rbuf3 -> aa */
878 
879 
880   return 0;
881 }
882 
883 
884 
885 
886