xref: /petsc/src/mat/impls/aij/mpi/mpiov.c (revision 8508945acccfa812e1e92b793a51e65c8d708d52) !
1 #ifndef lint
2 static char vcid[] = "$Id: mpiov.c,v 1.4 1996/01/23 20:13:36 balay Exp balay $";
3 #endif
4 
5 #include "mpiaij.h"
6 int MatIncreaseOverlap_MPIAIJ_private(Mat, int, IS *);
7 
8 int MatIncreaseOverlap_MPIAIJ(Mat C, int is_max, IS *is, int ov)
9 {
10   int i, ierr;
11   if (ov < 1){ SETERRQ(1," MatIncreaseOverlap_MPIAIJ: overlap should be atleast 1\n");}
12   for (i =1; i<=ov; ++i) {
13     ierr = MatIncreaseOverlap_MPIAIJ_private(C, is_max, is); CHKERRQ(ierr);
14   }
15   return 0;
16 }
17 
18 
19 
/*
   MatIncreaseOverlap_MPIAIJ_private - Performs ONE level of overlap increase
   for the index sets 'is' on an MPIAIJ matrix.

   Outline (as implemented below):
     1. Map every global row in every IS to its owning processor (rtable).
     2. Count, per destination processor, the message sizes (w1), whether a
        message exists at all (w2), and how many ISs contribute (w3).
     3. Pack outgoing messages.  Message format:
          [0]            = number of ISs in this message
          [1..nIS]       = count of rows contributed by each IS
          [nIS+1 .. end] = the row indices themselves, grouped by IS
     4. Exchange sizes via MPI_Allreduce, post MPI_Irecv/MPI_Isend (round 1).
     5. While messages are in flight, process locally-owned rows: for each
        row, add the column indices of the diagonal block (A, shifted by
        cstart) and off-diagonal block (B, mapped through garray) to the IS.
     6. Receive round-1 messages, do the same row expansion on behalf of the
        senders (xdata/xtable), and send the results back (round 2).

   NOTE(review): the function currently ENDS mid-algorithm — round-2 messages
   are posted but never waited on or unpacked into the ISs, several buffers
   allocated below are never freed (isz, isz1, xdata, rbuf2, recv_status,
   recv_waits2, send_waits2), and ISRestoreIndices is never called (the
   original trailing comment admits this).  This revision is clearly
   work-in-progress; see inline NOTEs for specific suspect lines.
*/
int MatIncreaseOverlap_MPIAIJ_private(Mat C, int is_max, IS *is)
{
  Mat_MPIAIJ  *c = (Mat_MPIAIJ *) C->data;
  Mat          A = c->A, B = c->B;
  Mat_SeqAIJ   *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
  int         **idx, *n, *w1, *w2, *w3, *w4, *rtable,**table,**data;
  int         size, rank, m,i,j,k, ierr, **rbuf, row, proc, mct, msz, **outdat, **ptr;
  int         *ctr, sum, *pa, tag, *tmp,bsz, nmsg , *isz, *isz1, **xdata, rstart;
  int         cstart, *ai, *aj, *bi, *bj, *garray, bsz1, **rbuf2;
  MPI_Comm    comm;
  MPI_Request *send_waits,*recv_waits,*send_waits2,*recv_waits2 ;
  MPI_Status  *send_status ,*recv_status;
                                /* assume overlap = 1 */
  comm   = C->comm;
  tag    = C->tag;              /* NOTE(review): same tag is reused for both
                                   communication rounds — confirm messages
                                   cannot cross-match */
  size   = c->size;
  rank   = c->rank;
  m      = c->M;                /* global number of rows */
  rstart = c->rstart;           /* first local row (diagonal-block offset) */
  cstart = c->cstart;           /* first local column of the diagonal block */
  ai     = a->i;                /* CSR row pointers / column indices of A (diag) */
  aj     = a->j;
  bi     = b->i;                /* CSR row pointers / column indices of B (off-diag) */
  bj     = b->j;
  garray = c->garray;           /* local->global column map for B */

  idx    = (int **)PetscMalloc((is_max+1)*sizeof(int *)); CHKPTRQ(idx);
  n      = (int *)PetscMalloc((is_max+1)*sizeof(int )); CHKPTRQ(n);
  rtable = (int *)PetscMalloc((m+1)*sizeof(int )); CHKPTRQ(rtable);
                                /* Lookup table mapping row -> owning proc */

  /* Grab the raw indices and sizes of each IS */
  for ( i=0 ; i<is_max ; i++) {
    ierr = ISGetIndices(is[i],&idx[i]);  CHKERRQ(ierr);
    ierr = ISGetLocalSize(is[i],&n[i]);  CHKERRQ(ierr);
  }

  /* Fill the row -> proc table from the ownership ranges */
  for( i=0, j=0; i< size; i++) {
    for (; j <c->rowners[i+1]; j++) {
      rtable[j] = i;
    }
  }

  /* evaluate communication - mesg to who, length of mesg, and buffer space
     required. Based on this, buffers are allocated, and data copied into them*/
  w1     = (int *)PetscMalloc((size)*4*sizeof(int )); CHKPTRQ(w1); /*  mesg size */
  w2     = w1 + size;         /* if w2[i] marked, then a message to proc i*/
  w3     = w2 + size;         /* no of IS that needs to be sent to proc i */
  w4     = w3 + size;         /* temp work space used in determining w1, w2, w3 */
  PetscMemzero(w1,(size)*3*sizeof(int)); /* initialize w1,w2,w3 (w4 is reset per IS) */
  for ( i=0;  i<is_max ; i++) {
    PetscMemzero(w4,(size)*sizeof(int)); /* initialize work vector */
    for ( j =0 ; j < n[i] ; j++) {
      row  = idx[i][j];
      proc = rtable[row];
      w4[proc]++;
    }
    for( j = 0; j < size; j++){
      if( w4[j] ) { w1[j] += w4[j];  w3[j] += 1;}
    }
  }

  mct      = 0;              /* no of outgoing messages */
  msz      = 0;              /* total mesg length (for all procs) */
  w1[rank] = 0;              /* no message sent to self */
  w3[rank] = 0;
  for (i =0; i < size ; i++) {
    if (w1[i])  { w2[i] = 1; mct++;} /* there exists a message to proc i */
  }
  pa = (int *)PetscMalloc((mct +1)*sizeof(int)); CHKPTRQ(pa); /* (proc -array) */
  for (i =0, j=0; i < size ; i++) {
    if (w1[i]) { pa[j] = i; j++; }
  }

  /* Each message would have a header = 1 + no of IS + data */
  for (i = 0; i<mct ; i++) {
    j = pa[i];
    w1[j] += w2[j] + w3[j];    /* w1[j] becomes the full message length */
    msz   += w1[j];
  }

  /* Allocate Memory for outgoing messages */
  outdat    = (int **)PetscMalloc( 2*size*sizeof(int*)); CHKPTRQ(outdat);
  PetscMemzero(outdat,  2*size*sizeof(int*));
  tmp       = (int *)PetscMalloc((msz+1) *sizeof (int)); CHKPTRQ(tmp); /* single arena for all messages */
  ptr       = outdat +size;     /* Pointers to the data in outgoing buffers */
  ctr       = (int *)PetscMalloc( size*sizeof(int));   CHKPTRQ(ctr);

  {
    /* Carve 'tmp' into one contiguous slice per destination proc */
    int *iptr = tmp;
    int ict  = 0;
    for (i = 0; i < mct ; i++) {
      j         = pa[i];
      iptr     +=  ict;
      outdat[j] = iptr;
      ict       = w1[j];
    }
  }

  /* Form the outgoing messages */
  /* Plug in the headers: [0] = #IS (filled below), [1..w3] = per-IS counts */
  for ( i=0 ; i<mct ; i++) {
    j = pa[i];
    outdat[j][0] = 0;
    PetscMemzero(outdat[j]+1, w3[j]*sizeof(int));
    ptr[j] = outdat[j] + w3[j];  /* NOTE(review): data cursor starts at w3[j],
                                    i.e. header slot [w3[j]] is written by the
                                    first ptr[j]++ — confirm off-by-one vs the
                                    1 + w3[j] header size assumed above */
  }

  /* Memory for doing local proc's work: per-IS membership table + data list */
  table = (int **)PetscMalloc((is_max+1)*sizeof(int *));  CHKPTRQ(table);
  data  = (int **)PetscMalloc((is_max+1)*sizeof(int *)); CHKPTRQ(data);
  table[0] = (int *)PetscMalloc((m+1)*(is_max)*2*sizeof(int)); CHKPTRQ(table[0]);
  data [0] = table[0] + (m+1)*(is_max);

  for(i = 1; i<is_max ; i++) {
    table[i] = table[0] + (m+1)*i;
    data[i]  = data[0] + (m+1)*i;
  }

  PetscMemzero((void*)*table,(m+1)*(is_max)*sizeof(int));
  isz = (int *)PetscMalloc((is_max+1) *sizeof(int)); CHKPTRQ(isz); /* rows collected per IS */
  PetscMemzero((void *)isz,(is_max+1) *sizeof(int));

  /* Parse the IS and update local tables and the outgoing buf with the data*/
  for ( i=0 ; i<is_max ; i++) {
    PetscMemzero(ctr,size*sizeof(int));
    for( j=0;  j<n[i]; j++) {  /* parse the indices of each IS */
      row  = idx[i][j];
      proc = rtable[row];
      if (proc != rank) { /* copy to the outgoing buf*/
        ctr[proc]++;
        *ptr[proc] = row;
        ptr[proc]++;
      }
      else { /* Update the table */
        /* table[i][row] marks membership; data[i] is the dedup'd row list */
        if(!table[i][row]++) { data[i][isz[i]++] = row;}
      }
    }

    /*    Update the headers*/
    for( j = 0; j<size; j++) { /* Can Optimise this loop too */
      if (ctr[j]) {
        outdat[j][0] ++;
        outdat[j][ outdat[j][0]] = ctr[j];
      }
    }
  }

  /* Check Validity of the outgoing messages */
  for ( i=0 ; i<mct ; i++) {
    j = pa[i];
    if (w3[j] != outdat[j][0]) {SETERRQ(1,"MatIncreaseOverlap_MPIAIJ: Blew it! Header[1] mismatch!\n"); }
  }

  for ( i=0 ; i<mct ; i++) {
    j = pa[i];
    sum = 1;
    for (k = 1; k <= w3[j]; k++) sum += outdat[j][k]+1;
    /* NOTE(review): 'sum' counts 1 + per-IS(count+1) but the data cursor above
       only reserved w3[j] header slots — verify against ptr[j] init */
    if (sum != w1[j]) { SETERRQ(1,"MatIncreaseOverlap_MPIAIJ: Blew it! Header[2-n] mismatch!  \n"); }
  }

  /* NOTE(review): debug chatter — should be removed or guarded before release */
  MPIU_printf(MPI_COMM_SELF,"[%d]Whew!!! sending to %d nodes\n",rank, mct);


  /* Do a global reduction to determine how many messages to expect*/
  {
    int *rw1, *rw2;
    rw1 = (int *)PetscMalloc(2*size*sizeof(int)); CHKPTRQ(rw1);
    rw2 = rw1+size;
    /* MAX of w1 gives the largest message anyone sends to each proc ... */
    MPI_Allreduce((void *)w1, rw1, size, MPI_INT, MPI_MAX, comm);
    bsz   = rw1[rank];          /* ... so bsz bounds every incoming message */
    /* SUM of w2 counts how many procs send to each proc */
    MPI_Allreduce((void *)w2, rw2, size, MPI_INT, MPI_SUM, comm);
    nmsg  = rw2[rank];          /* number of messages this proc will receive */
    PetscFree(rw1);
  }

  /* Allocate memory for recv buffers . Prob none if nmsg = 0 ???? */
  rbuf    = (int**) PetscMalloc((nmsg+1) *sizeof(int*));  CHKPTRQ(rbuf);
  rbuf[0] = (int *) PetscMalloc((nmsg *bsz+1) * sizeof(int));  CHKPTRQ(rbuf[0]);
  for (i=1; i<nmsg ; ++i) rbuf[i] = rbuf[i-1] + bsz;

  /* Now post the receives */
  recv_waits = (MPI_Request *) PetscMalloc((nmsg+1)*sizeof(MPI_Request));
  CHKPTRQ(recv_waits);
  for ( i=0; i<nmsg; ++i){
    MPI_Irecv((void *)rbuf[i], bsz, MPI_INT, MPI_ANY_SOURCE, tag, comm, recv_waits+i);
  }

  /*  Now  post the sends */
  send_waits = (MPI_Request *) PetscMalloc((mct+1)*sizeof(MPI_Request));
  CHKPTRQ(send_waits);
  for( i =0; i< mct; ++i){
    j = pa[i];
    MPI_Isend( (void *)outdat[j], w1[j], MPI_INT, j, tag, comm, send_waits+i);
  }

  /* NOTE(review): debug chatter — remove/guard before release */
  MPIU_printf(MPI_COMM_SELF,"[%d]:) posted %d sends and %d recieves\n",rank, mct, nmsg);


  /* Do Local work (overlaps with communication) */
  /* Expand each locally-owned row into its column indices */
  {
    int  start, end, val, max;

    for( i=0; i<is_max; i++) {
      /* 'max' is frozen at entry, so rows appended inside this loop are NOT
         re-expanded — that is what keeps this a single overlap level */
      for ( j=0, max =isz[i] ; j< max; j++) {
        row   = data[i][j] - rstart;    /* global -> local row */
        start = ai[row];
        end   = ai[row+1];
        for ( k=start; k < end; k++) { /* Amat */
          val = aj[k] +cstart;          /* local -> global column (diag block) */
          if(!table[i][val]++) { data[i][isz[i]++] = val;}
        }
        start = bi[row];
        end   = bi[row+1];
        for ( k=start; k < end; k++) { /* Bmat */
          val = garray[bj[k]] ;         /* local -> global column (off-diag) */
          if(!table[i][val]++) { data[i][isz[i]++] = val;}
        }
      }
    }
  }
  /* Receive messages*/
  {
    int        index;

    recv_status = (MPI_Status *) PetscMalloc( (nmsg+1)*sizeof(MPI_Status) );
    CHKPTRQ(recv_status);
    /* Completion order is arbitrary; recv_status[i] records the i-th
       COMPLETED message, which is why rbuf indexing below must go through
       the status, not the post order */
    for ( i=0; i< nmsg; ++i) {
      MPI_Waitany(nmsg, recv_waits, &index, recv_status+i);
    }

    send_waits = send_waits; /* (no-op) */
    send_status = (MPI_Status *) PetscMalloc( (mct+1)*sizeof(MPI_Status) );
    CHKPTRQ(send_status);
    MPI_Waitall(mct,send_waits,send_status);
  }


  /* Do work on received messages: expand the rows requested by other procs */
  {
    int ct, ct1, ct2, *xtable;
    int oct2, l, start, end, val, max1, max2;


    for (i =0, ct =0; i< nmsg; ++i) ct+= rbuf[i][0]; /* total #IS received */

    xdata   = (int **)PetscMalloc((nmsg+1)*sizeof(int *)); CHKPTRQ(xdata);
    /* One arena: first (m+1) ints are the dedup table, rest holds replies.
       NOTE(review): the sizing formula (ct+nmsg+(m+1)*(ct+1)) looks ad hoc —
       confirm it bounds headers + expanded rows for all messages */
    xtable  = (int *)PetscMalloc((ct+nmsg+(m+1)*(ct+1))*sizeof(int)); CHKPTRQ(xtable);
    xdata [0] = xtable + (m+1);
    isz1 = (int *)PetscMalloc((nmsg+1) *sizeof(int)); CHKPTRQ(isz1);
    PetscMemzero((void *)isz1,(nmsg+1) *sizeof(int));


    for (i =0; i< nmsg; i++) { /* for each mesg from proc i */
      ct1 = rbuf[i][0]+1;       /* read cursor: first row index after header */
      ct2 = rbuf[i][0]+1;       /* write cursor into xdata[i], past reply header */
      for (j = 0, max1= rbuf[i][0]; j<max1; j++) { /* for each IS from proc i*/
        PetscMemzero((void *)xtable,(m+1)*sizeof(int));
        oct2 = ct2;
        /* NOTE(review): header count for IS j lives at rbuf[i][j+1], but the
           bound below reads rbuf[i][j]+1 (header value at j, plus one) —
           strongly suspect this should be 'k < rbuf[i][j+1]'; confirm */
        for (k =0; k < rbuf[i][j]+1; k++, ct1++) {
          row = rbuf[i][ct1];
          if(!xtable[row]++) { xdata[i][ct2++] = row;}
        }
        /* Expand each requested row, same as the local-work loop above */
        for ( k=oct2, max2 =ct2 ; k< max2; k++) {
          row   = xdata[i][k] - rstart;
          start = ai[row];
          end   = ai[row+1];
          for ( l=start; l < end; l++) {
            val = aj[l] +cstart;
            if(!xtable[val]++) { xdata[i][ct2++] = val;}
          }
          start = bi[row];
          end   = bi[row+1];
          for ( l=start; l < end; l++) {
            val = garray[bj[l]] ;
            if(!xtable[val]++) { xdata[i][ct2++] = val;}
          }
        }
        /* Update the header*/
        xdata[i][j] = ct2-oct2; /* NOTE(review): reply counts written at [j],
                                   request counts were read at [j+1] — verify
                                   the reply header layout is intentional */
      }
      xdata[i][0] = rbuf[i][0];
      xdata[i+1]  = xdata[i] +ct2; /* next reply starts where this one ends */
      isz1[i]     = ct2; /* size of each message */
    }
  }
  /* need isz, xdata;*/

  /* Send the data back*/
  /* Do a global reduction to know the buffer space req for incoming messages*/
  {
    int *rw1, *rw2;

    rw1 = (int *)PetscMalloc(2*size*sizeof(int)); CHKPTRQ(rw1);
    PetscMemzero((void*)rw1,2*size*sizeof(int));
    rw2 = rw1+size;
    for (i =0; i < nmsg ; ++i) {
      proc      = recv_status[i].MPI_SOURCE; /* reply goes back to the sender */
      rw1[proc] = isz1[i];
    }

    MPI_Allreduce((void *)rw1, (void *)rw2, size, MPI_INT, MPI_MAX, comm);
    bsz1   = rw2[rank];         /* upper bound on any round-2 incoming message */
    PetscFree(rw1);
  }

  /* Allocate buffers*/

  /* Allocate memory for recv buffers . Prob none if nmsg = 0 ???? */
  rbuf2    = (int**) PetscMalloc((mct+1) *sizeof(int*));  CHKPTRQ(rbuf2);
  rbuf2[0] = (int *) PetscMalloc((mct*bsz1+1) * sizeof(int));  CHKPTRQ(rbuf2[0]);
  for (i=1; i<mct ; ++i) rbuf2[i] = rbuf2[i-1] + bsz1;

  /* Now post the receives */
  /* NOTE(review): duplicated CHKPTRQ below, and the first one lacks a
     semicolon — only compiles if the macro supplies its own; clean up */
  recv_waits2 = (MPI_Request *)PetscMalloc((mct+1)*sizeof(MPI_Request)); CHKPTRQ(recv_waits2)
  CHKPTRQ(recv_waits2);
  for ( i=0; i<mct; ++i){
    MPI_Irecv((void *)rbuf2[i], bsz1, MPI_INT, MPI_ANY_SOURCE, tag, comm, recv_waits2+i);
  }

  /*  Now  post the sends */
  send_waits2 = (MPI_Request *) PetscMalloc((nmsg+1)*sizeof(MPI_Request));
  CHKPTRQ(send_waits2);
  for( i =0; i< nmsg; ++i){
    j = recv_status[i].MPI_SOURCE;
    MPI_Isend( (void *)xdata[i], isz1[i], MPI_INT, j, tag, comm, send_waits2+i);
  }

  /* NOTE(review): debug chatter — remove/guard before release */
  MPIU_printf(MPI_COMM_SELF,"[%d]:):) posted %d sends and %d recieves\n",rank, mct, nmsg);

  /* recieve work done on other processors*/
  /* NOTE(review): UNFINISHED — round-2 requests are never waited on, rbuf2
     is never unpacked into the ISs, and the ISs themselves are never
     rebuilt from data/isz.  The routine as written does not yet produce
     the enlarged index sets. */

  /* pack up*/

  /* done !!!! */

  /* whew! already done? check again :) */
  PetscFree(idx);
  PetscFree(n);
  PetscFree(rtable);
  PetscFree(w1);
  PetscFree(tmp);
  PetscFree(outdat);
  PetscFree(ctr);
  PetscFree(pa);
  PetscFree(rbuf[0]);
  PetscFree(rbuf);
  PetscFree(send_waits);
  PetscFree(recv_waits);
  PetscFree(table[0]);
  PetscFree(table);
  PetscFree(data);
  PetscFree(send_status);
  /* NOTE(review): leaks — isz, isz1, xdata, xtable arena, rbuf2[0], rbuf2,
     recv_status, recv_waits2, send_waits2 are never freed */
  /* Dont forget to ISRestoreIndices */
  return 0;
}
376