xref: /petsc/src/ksp/pc/impls/tfs/comm.c (revision 4c8c32f384357201972f9dd67b0f2e901704fd35)
1 
2 /***********************************comm.c*************************************
3 SPARSE GATHER-SCATTER PACKAGE: bss_malloc bss_malloc ivec error comm gs queue
4 
5 Author: Henry M. Tufo III
6 
7 e-mail: hmt@cs.brown.edu
8 
9 snail-mail:
10 Division of Applied Mathematics
11 Brown University
12 Providence, RI 02912
13 
14 Last Modification:
15 11.21.97
16 ***********************************comm.c*************************************/
17 
18 /***********************************comm.c*************************************
19 File Description:
20 -----------------
21 
22 ***********************************comm.c*************************************/
23 #include <stdio.h>
24 #include "petsc.h"
25 
26 #if   defined NXSRC
27 #ifndef DELTA
28 #include <nx.h>
29 #endif
30 
31 #elif defined MPISRC
32 #include <mpi.h>
33 
34 #endif
35 
36 
37 #include "const.h"
38 #include "types.h"
39 #include "error.h"
40 #include "ivec.h"
41 #include "bss_malloc.h"
42 #include "comm.h"
43 
44 
/* global program control variables - explicitly exported */
int my_id            = 0;   /* this process's rank in MPI_COMM_WORLD */
int num_nodes        = 1;   /* total number of processes */
int floor_num_nodes  = 0;   /* largest power of 2 <= num_nodes (set in comm_init) */
int i_log2_num_nodes = 0;   /* log2(floor_num_nodes) (set in comm_init) */

/* global program control variables */
static int p_init = 0;          /* nonzero once comm_init() has run */
static int modfl_num_nodes;     /* num_nodes - floor_num_nodes; 0 iff num_nodes is a power of 2 */
static int edge_not_pow_2;      /* partner rank used to fold/expand the non-power-of-2 excess; 0 if none */

/* hypercube edge partners: edge_node[k] = my_id ^ 2^k, filled in comm_init() */
static unsigned int edge_node[INT_LEN*32];

/* forward declarations - staged global reduction helpers defined later in this file */
static void sgl_iadd(int    *vals, int level);
static void sgl_radd(double *vals, int level);
static void hmt_concat(REAL *vals, REAL *work, int size);
61 
62 
63 /***********************************comm.c*************************************
64 Function: comm_init()
65 
66 Input :
67 Output:
68 Return:
69 Description:
70 ***********************************comm.c*************************************/
/* comm_init() - one-time initialization of the comm package.
 * Queries MPI for rank/size and derives the hypercube bookkeeping used by
 * the fan-in/out routines below: floor_num_nodes, i_log2_num_nodes,
 * edge_node[], and edge_not_pow_2 (the partner each "excess" rank pairs
 * with when num_nodes is not a power of 2).  Idempotent via p_init. */
void
comm_init (void)
{
#ifdef DEBUG
  error_msg_warning("c_init() :: start\n");
#endif

  /* already initialized - nothing to do */
  if (p_init++) return;

#if PETSC_SIZEOF_INT != 4
  error_msg_warning("Int != 4 Bytes!");
#endif


  MPI_Comm_size(MPI_COMM_WORLD,&num_nodes);
  MPI_Comm_rank(MPI_COMM_WORLD,&my_id);

  /* message tags are built as MSGTAGn + my_id (+ num_nodes*edge), so rank
     values must leave headroom in an int */
  if (num_nodes> (INT_MAX >> 1))
  {error_msg_fatal("Can't have more then MAX_INT/2 nodes!!!");}

  ivec_zero((int*)edge_node,INT_LEN*32);

  /* find largest power of 2 <= num_nodes, recording the hypercube partner
     (my_id with bit k flipped) for every dimension k along the way */
  floor_num_nodes = 1;
  i_log2_num_nodes = modfl_num_nodes = 0;
  while (floor_num_nodes <= num_nodes)
    {
      edge_node[i_log2_num_nodes] = my_id ^ floor_num_nodes;
      floor_num_nodes <<= 1;
      i_log2_num_nodes++;
    }

  /* loop overshoots by one dimension - back off */
  i_log2_num_nodes--;
  floor_num_nodes >>= 1;
  modfl_num_nodes = (num_nodes - floor_num_nodes);

  /* pair each excess rank (>= floor_num_nodes) with a rank inside the
     power-of-2 core; edge_not_pow_2 == 0 means "not part of such a pair" */
  if ((my_id > 0) && (my_id <= modfl_num_nodes))
    {edge_not_pow_2=((my_id|floor_num_nodes)-1);}
  else if (my_id >= floor_num_nodes)
    {edge_not_pow_2=((my_id^floor_num_nodes)+1);
    }
  else
    {edge_not_pow_2 = 0;}

#ifdef DEBUG
  error_msg_warning("c_init() done :: my_id=%d, num_nodes=%d",my_id,num_nodes);
#endif
}
118 
119 
120 
121 /***********************************comm.c*************************************
122 Function: giop()
123 
124 Input :
125 Output:
126 Return:
127 Description: fan-in/out version
128 ***********************************comm.c*************************************/
/* giop() - global integer operation (add/min/max/...) across all processes.
 *
 * vals : in/out - n local entries on entry; every rank holds the combined
 *        result on exit
 * work : scratch buffer of at least n ints
 * n    : number of entries (n==0 is a no-op, n<0 is fatal)
 * oprs : oprs[0] selects the operation; NON_UNIFORM means a per-entry
 *        operation list follows in oprs[1..n]
 *
 * Algorithm: excess ranks (>= floor_num_nodes) first fold their data into
 * a partner inside the power-of-2 core, the core runs a hypercube fan-in
 * to rank 0 followed by a fan-out broadcast, then the result is pushed
 * back out to the excess ranks. */
void
giop(int *vals, int *work, int n, int *oprs)
{
  register int mask, edge;
  int type, dest;
  vfp fp;                       /* element-wise combining function */
#if defined MPISRC
  MPI_Status  status;
#elif defined NXSRC
  int len;                      /* NX message length in bytes */
#endif


#ifdef SAFE
  /* ok ... should have some data, work, and operator(s) */
  if (!vals||!work||!oprs)
    {error_msg_fatal("giop() :: vals=%d, work=%d, oprs=%d",vals,work,oprs);}

  /* non-uniform should have at least two entries */
  if ((oprs[0] == NON_UNIFORM)&&(n<2))
    {error_msg_fatal("giop() :: non_uniform and n=0,1?");}

  /* check to make sure comm package has been initialized */
  if (!p_init)
    {comm_init();}
#endif

  /* if there's nothing to do return */
  if ((num_nodes<2)||(!n))
    {
#ifdef DEBUG
      error_msg_warning("giop() :: n=%d num_nodes=%d",n,num_nodes);
#endif
      return;
    }

  /* a negative number of items to send ==> fatal */
  if (n<0)
    {error_msg_fatal("giop() :: n=%d<0?",n);}

  /* advance to list of n operations for custom */
  if ((type=oprs[0])==NON_UNIFORM)
    {oprs++;}

  /* major league hack: if the op code is unknown, assume the caller passed
     a combining-function pointer in place of the operation list */
  if (!(fp = (vfp) ivec_fct_addr(type))) {
    error_msg_warning("giop() :: hope you passed in a rbfp!\n");
    fp = (vfp) oprs;
  }

#if  defined NXSRC
  /* all msgs will be of the same length */
  len = n*INT_LEN;

  /* if not a hypercube must collapse partial dim: excess ranks fold their
     data into their core partner */
  if (edge_not_pow_2)
    {
      if (my_id >= floor_num_nodes)
        {csend(MSGTAG0+my_id,(char *)vals,len,edge_not_pow_2,0);}
      else
        {crecv(MSGTAG0+edge_not_pow_2,(char *)work,len); (*fp)(vals,work,n,oprs);}
    }

  /* implement the mesh fan in/out exchange algorithm */
  if (my_id<floor_num_nodes)
    {
      /* fan-in: at each dimension the higher rank of the pair sends its
         partial result and drops out; the lower rank accumulates */
      for (mask=1,edge=0; edge<i_log2_num_nodes; edge++,mask<<=1)
        {
          dest = my_id^mask;
          if (my_id > dest)
            {csend(MSGTAG2+my_id,(char *)vals,len,dest,0); break;}
          else
            {crecv(MSGTAG2+dest,(char *)work,len); (*fp)(vals, work, n, oprs);}
        }

      /* fan-out: broadcast the result back down the hypercube from rank 0 */
      mask=floor_num_nodes>>1;
      for (edge=0; edge<i_log2_num_nodes; edge++,mask>>=1)
        {
          if (my_id%mask)
            {continue;}

          dest = my_id^mask;
          if (my_id < dest)
            {csend(MSGTAG4+my_id,(char *)vals,len,dest,0);}
          else
            {crecv(MSGTAG4+dest,(char *)vals,len);}
        }
    }

  /* if not a hypercube must expand to partial dim: push the final result
     out to the excess ranks */
  if (edge_not_pow_2)
    {
      if (my_id >= floor_num_nodes)
        {crecv(MSGTAG5+edge_not_pow_2,(char *)vals,len);}
      else
        {csend(MSGTAG5+my_id,(char *)vals,len,edge_not_pow_2,0);}
    }

#elif defined MPISRC
  /* all msgs will be of the same length */
  /* if not a hypercube must collapse partial dim */
  if (edge_not_pow_2)
    {
      if (my_id >= floor_num_nodes)
        {MPI_Send(vals,n,MPI_INT,edge_not_pow_2,MSGTAG0+my_id,MPI_COMM_WORLD);}
      else
        {
          MPI_Recv(work,n,MPI_INT,MPI_ANY_SOURCE,MSGTAG0+edge_not_pow_2,
                   MPI_COMM_WORLD,&status);
          (*fp)(vals,work,n,oprs);
        }
    }

  /* implement the mesh fan in/out exchange algorithm */
  if (my_id<floor_num_nodes)
    {
      /* fan-in.  NOTE(review): unlike the NX branch there is no break after
         the send, so a sending rank keeps exchanging at higher dimensions;
         the extra sends pair with the partners' recvs and the fan-out below
         overwrites vals, so the result is unaffected, but this looks like
         redundant traffic - confirm before changing */
      for (mask=1,edge=0; edge<i_log2_num_nodes; edge++,mask<<=1)
        {
          dest = my_id^mask;
          if (my_id > dest)
            {MPI_Send(vals,n,MPI_INT,dest,MSGTAG2+my_id,MPI_COMM_WORLD);}
          else
            {
              MPI_Recv(work,n,MPI_INT,MPI_ANY_SOURCE,MSGTAG2+dest,
                       MPI_COMM_WORLD, &status);
              (*fp)(vals, work, n, oprs);
            }
        }

      /* fan-out: broadcast the result back down the hypercube from rank 0 */
      mask=floor_num_nodes>>1;
      for (edge=0; edge<i_log2_num_nodes; edge++,mask>>=1)
        {
          if (my_id%mask)
            {continue;}

          dest = my_id^mask;
          if (my_id < dest)
            {MPI_Send(vals,n,MPI_INT,dest,MSGTAG4+my_id,MPI_COMM_WORLD);}
          else
            {
              MPI_Recv(vals,n,MPI_INT,MPI_ANY_SOURCE,MSGTAG4+dest,
                       MPI_COMM_WORLD, &status);
            }
        }
    }

  /* if not a hypercube must expand to partial dim */
  if (edge_not_pow_2)
    {
      if (my_id >= floor_num_nodes)
        {
          MPI_Recv(vals,n,MPI_INT,MPI_ANY_SOURCE,MSGTAG5+edge_not_pow_2,
                   MPI_COMM_WORLD,&status);
        }
      else
        {MPI_Send(vals,n,MPI_INT,edge_not_pow_2,MSGTAG5+my_id,MPI_COMM_WORLD);}
    }
#else
  return;
#endif
}
290 
291 /***********************************comm.c*************************************
292 Function: grop()
293 
294 Input :
295 Output:
296 Return:
297 Description: fan-in/out version
298 ***********************************comm.c*************************************/
/* grop() - global REAL (floating point) operation across all processes.
 * Identical in structure to giop() above, but operates on REAL data with
 * operations resolved through rvec_fct_addr().
 *
 * vals : in/out - n local entries on entry; combined result on exit
 * work : scratch buffer of at least n REALs
 * n    : number of entries (0 is a no-op, negative is fatal)
 * oprs : oprs[0] selects the operation; NON_UNIFORM => per-entry list */
void
grop(REAL *vals, REAL *work, int n, int *oprs)
{
  register int mask, edge;
  int type, dest;
  vfp fp;                       /* element-wise combining function */
#if defined MPISRC
  MPI_Status  status;
#elif defined NXSRC
  int len;                      /* NX message length in bytes */
#endif


#ifdef SAFE
  /* ok ... should have some data, work, and operator(s) */
  if (!vals||!work||!oprs)
    {error_msg_fatal("grop() :: vals=%d, work=%d, oprs=%d",vals,work,oprs);}

  /* non-uniform should have at least two entries */
  if ((oprs[0] == NON_UNIFORM)&&(n<2))
    {error_msg_fatal("grop() :: non_uniform and n=0,1?");}

  /* check to make sure comm package has been initialized */
  if (!p_init)
    {comm_init();}
#endif

  /* if there's nothing to do return */
  if ((num_nodes<2)||(!n))
    {return;}

  /* a negative number of items to send ==> fatal */
  /* NOTE(review): message text says "gdop" - presumably a typo for grop;
     left untouched since it is a runtime string */
  if (n<0)
    {error_msg_fatal("gdop() :: n=%d<0?",n);}

  /* advance to list of n operations for custom */
  if ((type=oprs[0])==NON_UNIFORM)
    {oprs++;}

  /* unknown op code => assume caller passed a function pointer (see giop) */
  if (!(fp = (vfp) rvec_fct_addr(type))) {
    error_msg_warning("grop() :: hope you passed in a rbfp!\n");
    fp = (vfp) oprs;
  }

#if  defined NXSRC
  /* all msgs will be of the same length */
  len = n*REAL_LEN;

  /* if not a hypercube must collapse partial dim: excess ranks fold into
     their core partner */
  if (edge_not_pow_2)
    {
      if (my_id >= floor_num_nodes)
        {csend(MSGTAG0+my_id,(char *)vals,len,edge_not_pow_2,0);}
      else
        {crecv(MSGTAG0+edge_not_pow_2,(char *)work,len); (*fp)(vals,work,n,oprs);}
    }

  /* implement the mesh fan in/out exchange algorithm */
  if (my_id<floor_num_nodes)
    {
      /* fan-in toward rank 0 */
      for (mask=1,edge=0; edge<i_log2_num_nodes; edge++,mask<<=1)
        {
          dest = my_id^mask;
          if (my_id > dest)
            {csend(MSGTAG2+my_id,(char *)vals,len,dest,0); break;}
          else
            {crecv(MSGTAG2+dest,(char *)work,len); (*fp)(vals, work, n, oprs);}
        }

      /* fan-out broadcast from rank 0 */
      mask=floor_num_nodes>>1;
      for (edge=0; edge<i_log2_num_nodes; edge++,mask>>=1)
        {
          if (my_id%mask)
            {continue;}

          dest = my_id^mask;
          if (my_id < dest)
            {csend(MSGTAG4+my_id,(char *)vals,len,dest,0);}
          else
            {crecv(MSGTAG4+dest,(char *)vals,len);}
        }
    }

  /* if not a hypercube must expand to partial dim */
  if (edge_not_pow_2)
    {
      if (my_id >= floor_num_nodes)
        {crecv(MSGTAG5+edge_not_pow_2,(char *)vals,len);}
      else
        {csend(MSGTAG5+my_id,(char *)vals,len,edge_not_pow_2,0);}
    }

#elif defined MPISRC
  /* all msgs will be of the same length */
  /* if not a hypercube must collapse partial dim */
  if (edge_not_pow_2)
    {
      if (my_id >= floor_num_nodes)
        {MPI_Send(vals,n,REAL_TYPE,edge_not_pow_2,MSGTAG0+my_id,
                  MPI_COMM_WORLD);}
      else
        {
          MPI_Recv(work,n,REAL_TYPE,MPI_ANY_SOURCE,MSGTAG0+edge_not_pow_2,
                   MPI_COMM_WORLD,&status);
          (*fp)(vals,work,n,oprs);
        }
    }

  /* implement the mesh fan in/out exchange algorithm */
  if (my_id<floor_num_nodes)
    {
      /* fan-in (no break after send - see the matching note in giop()) */
      for (mask=1,edge=0; edge<i_log2_num_nodes; edge++,mask<<=1)
        {
          dest = my_id^mask;
          if (my_id > dest)
            {MPI_Send(vals,n,REAL_TYPE,dest,MSGTAG2+my_id,MPI_COMM_WORLD);}
          else
            {
              MPI_Recv(work,n,REAL_TYPE,MPI_ANY_SOURCE,MSGTAG2+dest,
                       MPI_COMM_WORLD, &status);
              (*fp)(vals, work, n, oprs);
            }
        }

      /* fan-out broadcast from rank 0 */
      mask=floor_num_nodes>>1;
      for (edge=0; edge<i_log2_num_nodes; edge++,mask>>=1)
        {
          if (my_id%mask)
            {continue;}

          dest = my_id^mask;
          if (my_id < dest)
            {MPI_Send(vals,n,REAL_TYPE,dest,MSGTAG4+my_id,MPI_COMM_WORLD);}
          else
            {
              MPI_Recv(vals,n,REAL_TYPE,MPI_ANY_SOURCE,MSGTAG4+dest,
                       MPI_COMM_WORLD, &status);
            }
        }
    }

  /* if not a hypercube must expand to partial dim */
  if (edge_not_pow_2)
    {
      if (my_id >= floor_num_nodes)
        {
          MPI_Recv(vals,n,REAL_TYPE,MPI_ANY_SOURCE,MSGTAG5+edge_not_pow_2,
                   MPI_COMM_WORLD,&status);
        }
      else
        {MPI_Send(vals,n,REAL_TYPE,edge_not_pow_2,MSGTAG5+my_id,
                  MPI_COMM_WORLD);}
    }
#else
  return;
#endif
}
456 
457 
458 /***********************************comm.c*************************************
459 Function: grop_hc()
460 
461 Input :
462 Output:
463 Return:
464 Description: fan-in/out version
465 
466 note good only for num_nodes=2^k!!!
467 
468 ***********************************comm.c*************************************/
/* grop_hc() - global REAL operation restricted to the first `dim`
 * hypercube dimensions.  Valid ONLY when num_nodes is a power of 2
 * (fatal otherwise).  After the call, all ranks within each 2^dim-sized
 * subcube hold the result combined over that subcube.
 *
 * vals : in/out - n local entries / combined result
 * work : scratch buffer of at least n REALs
 * n    : number of entries
 * oprs : operation selector, as in grop()
 * dim  : number of hypercube dimensions to reduce over (clamped to
 *        i_log2_num_nodes; dim<=0 is a no-op) */
void
grop_hc(REAL *vals, REAL *work, int n, int *oprs, int dim)
{
  register int mask, edge;
  int type, dest;
  vfp fp;                       /* element-wise combining function */
#if defined MPISRC
  MPI_Status  status;
#elif defined NXSRC
  int len;                      /* NX message length in bytes */
#endif


#ifdef SAFE
  /* ok ... should have some data, work, and operator(s) */
  if (!vals||!work||!oprs)
    {error_msg_fatal("grop_hc() :: vals=%d, work=%d, oprs=%d",vals,work,oprs);}

  /* non-uniform should have at least two entries */
  if ((oprs[0] == NON_UNIFORM)&&(n<2))
    {error_msg_fatal("grop_hc() :: non_uniform and n=0,1?");}

  /* check to make sure comm package has been initialized */
  if (!p_init)
    {comm_init();}
#endif

  /* if there's nothing to do return */
  if ((num_nodes<2)||(!n)||(dim<=0))
    {return;}

  /* the error msg says it all!!! */
  if (modfl_num_nodes)
    {error_msg_fatal("grop_hc() :: num_nodes not a power of 2!?!");}

  /* a negative number of items to send ==> fatal */
  if (n<0)
    {error_msg_fatal("grop_hc() :: n=%d<0?",n);}

  /* can't do more dimensions then exist */
  dim = MIN(dim,i_log2_num_nodes);

  /* advance to list of n operations for custom */
  if ((type=oprs[0])==NON_UNIFORM)
    {oprs++;}

  /* unknown op code => assume caller passed a function pointer (see giop) */
  if (!(fp = (vfp) rvec_fct_addr(type))) {
    error_msg_warning("grop_hc() :: hope you passed in a rbfp!\n");
    fp = (vfp) oprs;
  }

#if  defined NXSRC
  /* all msgs will be of the same length */
  len = n*REAL_LEN;

  /* fan-in over the first dim dimensions; the higher rank of each pair
     sends and drops out */
  for (mask=1,edge=0; edge<dim; edge++,mask<<=1)
    {
      dest = my_id^mask;
      if (my_id > dest)
        {csend(MSGTAG2+my_id,(char *)vals,len,dest,0); break;}
      else
        {crecv(MSGTAG2+dest,(char *)work,len); (*fp)(vals, work, n, oprs);}
    }

  /* recover mask == 2^(dim-1) regardless of whether the loop broke early
     (after a send) or ran to completion */
  /* for (mask=1, edge=1; edge<dim; edge++,mask<<=1) {;} */
  if (edge==dim)
    {mask>>=1;}
  else
    {while (++edge<dim) {mask<<=1;}}

  /* fan-out broadcast back down the dim dimensions */
  for (edge=0; edge<dim; edge++,mask>>=1)
    {
      if (my_id%mask)
        {continue;}

      dest = my_id^mask;
      if (my_id < dest)
        {csend(MSGTAG4+my_id,(char *)vals,len,dest,0);}
      else
        {crecv(MSGTAG4+dest,(char *)vals,len);}
    }

#elif defined MPISRC
  /* fan-in over the first dim dimensions (no early break in this branch) */
  for (mask=1,edge=0; edge<dim; edge++,mask<<=1)
    {
      dest = my_id^mask;
      if (my_id > dest)
        {MPI_Send(vals,n,REAL_TYPE,dest,MSGTAG2+my_id,MPI_COMM_WORLD);}
      else
        {
          MPI_Recv(work,n,REAL_TYPE,MPI_ANY_SOURCE,MSGTAG2+dest,MPI_COMM_WORLD,
                   &status);
          (*fp)(vals, work, n, oprs);
        }
    }

  /* recover mask == 2^(dim-1) (loop above always completes here) */
  if (edge==dim)
    {mask>>=1;}
  else
    {while (++edge<dim) {mask<<=1;}}

  /* fan-out broadcast back down the dim dimensions */
  for (edge=0; edge<dim; edge++,mask>>=1)
    {
      if (my_id%mask)
        {continue;}

      dest = my_id^mask;
      if (my_id < dest)
        {MPI_Send(vals,n,REAL_TYPE,dest,MSGTAG4+my_id,MPI_COMM_WORLD);}
      else
        {
          MPI_Recv(vals,n,REAL_TYPE,MPI_ANY_SOURCE,MSGTAG4+dest,MPI_COMM_WORLD,
                   &status);
        }
    }
#else
  return;
#endif
}
589 
590 
591 /***********************************comm.c*************************************
592 Function: gfop()
593 
594 Input :
595 Output:
596 Return:
597 Description: fan-in/out version
598 ***********************************comm.c*************************************/
/* gfop() - generic global operation: like giop()/grop() but the datatype
 * and combining function are supplied by the caller.
 *
 * vals      : in/out - n local entries / combined result
 * work      : scratch buffer of at least n elements of type dt
 * n         : number of entries
 * fp        : combining function (vbfp)
 * dt        : element datatype (REAL_TYPE or INT_TYPE)
 * comm_type : under MPISRC, comm_type==MPI selects the native
 *             MPI_Op_create/MPI_Allreduce path; otherwise the hand-rolled
 *             fan-in/out below is used
 *
 * NOTE(review): in the MPI_Allreduce path the result lands in `work`
 * (vals is the send buffer) and is never copied back to vals - confirm
 * callers of that path read the result from work. */
void
gfop(void *vals, void *work, int n, vbfp fp, DATA_TYPE dt, int comm_type)
{
  register int mask, edge;
  int dest;
#if defined MPISRC
  MPI_Status  status;
  MPI_Op op;
#elif defined NXSRC
  int len;                      /* NX message length in bytes */
#endif


#ifdef SAFE
  /* check to make sure comm package has been initialized */
  if (!p_init)
    {comm_init();}

  /* ok ... should have some data, work, and operator(s) */
  if (!vals||!work||!fp)
    {error_msg_fatal("gop() :: v=%d, w=%d, f=%d",vals,work,fp);}
#endif

  /* if there's nothing to do return */
  if ((num_nodes<2)||(!n))
    {return;}

  /* a negative number of items to send ==> fatal */
  if (n<0)
    {error_msg_fatal("gop() :: n=%d<0?",n);}

#if  defined NXSRC
  /* byte length depends on the element type */
  switch (dt) {
  case REAL_TYPE:
    len = n*REAL_LEN;
    break;
  case INT_TYPE:
    len = n*INT_LEN;
    break;
  default:
    error_msg_fatal("gop() :: unrecognized datatype!!!\n");
  }

  /* if not a hypercube must collapse partial dim */
  /* NOTE(review): this branch calls (*fp)(vals,work,n,dt) by value while
     the MPI branch below uses (&n,&dt) MPI-user-function style - the vbfp
     signature apparently differs per platform; confirm before touching */
  if (edge_not_pow_2)
    {
      if (my_id >= floor_num_nodes)
        {csend(MSGTAG0+my_id,(char *)vals,len,edge_not_pow_2,0);}
      else
        {crecv(MSGTAG0+edge_not_pow_2,(char *)work,len); (*fp)(vals,work,n,dt);}
    }

  /* implement the mesh fan in/out exchange algorithm */
  if (my_id<floor_num_nodes)
    {
      /* fan-in toward rank 0 */
      for (mask=1,edge=0; edge<i_log2_num_nodes; edge++,mask<<=1)
        {
          dest = my_id^mask;
          if (my_id > dest)
            {csend(MSGTAG2+my_id,(char *)vals,len,dest,0); break;}
          else
            {crecv(MSGTAG2+dest,(char *)work,len); (*fp)(vals,work,n,dt);}
        }

      /* fan-out broadcast from rank 0 */
      mask=floor_num_nodes>>1;
      for (edge=0; edge<i_log2_num_nodes; edge++,mask>>=1)
        {
          if (my_id%mask)
            {continue;}

          dest = my_id^mask;
          if (my_id < dest)
            {csend(MSGTAG4+my_id,(char *)vals,len,dest,0);}
          else
            {crecv(MSGTAG4+dest,(char *)vals,len);}
        }
    }

  /* if not a hypercube must expand to partial dim */
  if (edge_not_pow_2)
    {
      if (my_id >= floor_num_nodes)
        {crecv(MSGTAG5+edge_not_pow_2,(char *)vals,len);}
      else
        {csend(MSGTAG5+my_id,(char *)vals,len,edge_not_pow_2,0);}
    }

#elif defined MPISRC

  /* native path: wrap fp in an MPI_Op and let MPI do the reduction */
  if (comm_type==MPI)
    {
      MPI_Op_create(fp,TRUE,&op);
      MPI_Allreduce (vals, work, n, dt, op, MPI_COMM_WORLD);
      MPI_Op_free(&op);
      return;
    }

  /* if not a hypercube must collapse partial dim */
  if (edge_not_pow_2)
    {
      if (my_id >= floor_num_nodes)
        {MPI_Send(vals,n,dt,edge_not_pow_2,MSGTAG0+my_id,
                  MPI_COMM_WORLD);}
      else
        {
          MPI_Recv(work,n,dt,MPI_ANY_SOURCE,MSGTAG0+edge_not_pow_2,
                   MPI_COMM_WORLD,&status);
          (*fp)(vals,work,&n,&dt);
        }
    }

  /* implement the mesh fan in/out exchange algorithm */
  if (my_id<floor_num_nodes)
    {
      /* fan-in (no break after send - see the matching note in giop()) */
      for (mask=1,edge=0; edge<i_log2_num_nodes; edge++,mask<<=1)
        {
          dest = my_id^mask;
          if (my_id > dest)
            {MPI_Send(vals,n,dt,dest,MSGTAG2+my_id,MPI_COMM_WORLD);}
          else
            {
              MPI_Recv(work,n,dt,MPI_ANY_SOURCE,MSGTAG2+dest,
                       MPI_COMM_WORLD, &status);
              (*fp)(vals, work, &n, &dt);
            }
        }

      /* fan-out broadcast from rank 0 */
      mask=floor_num_nodes>>1;
      for (edge=0; edge<i_log2_num_nodes; edge++,mask>>=1)
        {
          if (my_id%mask)
            {continue;}

          dest = my_id^mask;
          if (my_id < dest)
            {MPI_Send(vals,n,dt,dest,MSGTAG4+my_id,MPI_COMM_WORLD);}
          else
            {
              MPI_Recv(vals,n,dt,MPI_ANY_SOURCE,MSGTAG4+dest,
                       MPI_COMM_WORLD, &status);
            }
        }
    }

  /* if not a hypercube must expand to partial dim */
  if (edge_not_pow_2)
    {
      if (my_id >= floor_num_nodes)
        {
          MPI_Recv(vals,n,dt,MPI_ANY_SOURCE,MSGTAG5+edge_not_pow_2,
                   MPI_COMM_WORLD,&status);
        }
      else
        {MPI_Send(vals,n,dt,edge_not_pow_2,MSGTAG5+my_id,
                  MPI_COMM_WORLD);}
    }
#else
  return;
#endif
}
759 
760 
761 
762 
763 
764 
765 /******************************************************************************
766 Function: ssgl_radd()
767 
768 Input :
769 Output:
770 Return:
771 Description:
772 
773 ii+1 entries in seg :: 0 .. ii
774 
775 ******************************************************************************/
/* ssgl_radd() - staged segmented global REAL add over `level` hypercube
 * dimensions.  Message lengths SHRINK as the fan-in proceeds: at stage
 * `edge` only the tail segment vals[segs[edge] .. segs[level]-1] is
 * exchanged (segs has level+1 entries, 0..level).
 *
 * vals  : in/out - segmented data / staged sums
 * work  : scratch buffer large enough for the longest stage
 * level : number of stages (must be <= i_log2_num_nodes)
 * segs  : level+1 segment offsets into vals
 *
 * Participation masks are 2^edge - 1 (all low-order bits), so at stage
 * edge only ranks whose low edge bits are zero take part; partners come
 * from edge_node[] set up in comm_init(). */
void
ssgl_radd(register REAL *vals, register REAL *work, register int level,
          register int *segs)
{
  register int edge, type, dest, mask;
  register int stage_n;         /* entries exchanged at the current stage */
#if defined MPISRC
  MPI_Status  status;
#endif


#ifdef DEBUG
  if (level > i_log2_num_nodes)
    {error_msg_fatal("sgl_add() :: level > log_2(P)!!!");}
#endif

#ifdef SAFE
  /* check to make sure comm package has been initialized */
  if (!p_init)
    {comm_init();}
#endif


#if defined NXSRC
  /* all msgs are *NOT* the same length */
  /* fan-in: mask grows as 2^edge - 1 (0, 1, 3, 7, ...) via the <<=1 in the
     body followed by the ++ in the loop header */
  for (mask=0, edge=0; edge<level; edge++, mask++)
    {
      stage_n = (segs[level] - segs[edge]);
      if (stage_n && !(my_id & mask))
        {
          dest = edge_node[edge];
          /* tag is unique per (sender, stage) pair */
          type = MSGTAG3 + my_id + (num_nodes*edge);
          if (my_id>dest)
            {csend(type, vals+segs[edge],stage_n*REAL_LEN,dest,0);}
          else
            {
              /* rewrite the tag to match what the partner sent */
              type =  type - my_id + dest;
              crecv(type,work,stage_n*REAL_LEN);
              rvec_add(vals+segs[edge], work, stage_n);
/*            daxpy(vals+segs[edge], work, stage_n); */
            }
        }
      mask <<= 1;
    }
  /* fan-out: walk the stages in reverse with shrinking masks */
  mask>>=1;
  for (edge=0; edge<level; edge++)
    {
      stage_n = (segs[level] - segs[level-1-edge]);
      if (stage_n && !(my_id & mask))
        {
          dest = edge_node[level-edge-1];
          type = MSGTAG6 + my_id + (num_nodes*edge);
          if (my_id<dest)
            {csend(type,vals+segs[level-1-edge],stage_n*REAL_LEN,dest,0);}
          else
            {
              type =  type - my_id + dest;
              crecv(type,vals+segs[level-1-edge],stage_n*REAL_LEN);
            }
        }
      mask >>= 1;
    }

#elif defined MPISRC
  /* all msgs are *NOT* the same length */
  /* fan-in - same staging as the NX branch, MPI point-to-point calls */
  for (mask=0, edge=0; edge<level; edge++, mask++)
    {
      stage_n = (segs[level] - segs[edge]);
      if (stage_n && !(my_id & mask))
        {
          dest = edge_node[edge];
          type = MSGTAG3 + my_id + (num_nodes*edge);
          if (my_id>dest)
/*	    {csend(type, vals+segs[edge],stage_n*REAL_LEN,dest,0);} */
          {MPI_Send(vals+segs[edge],stage_n,REAL_TYPE,dest,type,
                      MPI_COMM_WORLD);}
          else
            {
              type =  type - my_id + dest;
/*	      crecv(type,work,stage_n*REAL_LEN); */
              MPI_Recv(work,stage_n,REAL_TYPE,MPI_ANY_SOURCE,type,
                       MPI_COMM_WORLD,&status);
              rvec_add(vals+segs[edge], work, stage_n);
/*            daxpy(vals+segs[edge], work, stage_n); */
            }
        }
      mask <<= 1;
    }
  /* fan-out: reverse stage order, receives overwrite vals in place */
  mask>>=1;
  for (edge=0; edge<level; edge++)
    {
      stage_n = (segs[level] - segs[level-1-edge]);
      if (stage_n && !(my_id & mask))
        {
          dest = edge_node[level-edge-1];
          type = MSGTAG6 + my_id + (num_nodes*edge);
          if (my_id<dest)
/*	    {csend(type,vals+segs[level-1-edge],stage_n*REAL_LEN,dest,0);} */
            {MPI_Send(vals+segs[level-1-edge],stage_n,REAL_TYPE,dest,type,
                      MPI_COMM_WORLD);}
          else
            {
              type =  type - my_id + dest;
/* 	      crecv(type,vals+segs[level-1-edge],stage_n*REAL_LEN); */
              MPI_Recv(vals+segs[level-1-edge],stage_n,REAL_TYPE,
                       MPI_ANY_SOURCE,type,MPI_COMM_WORLD,&status);
            }
        }
      mask >>= 1;
    }
#else
  return;
#endif
}
892 
893 
894 
895 /***********************************comm.c*************************************
896 Function: grop_hc_vvl()
897 
898 Input :
899 Output:
900 Return:
901 Description: fan-in/out version
902 
903 note good only for num_nodes=2^k!!!
904 
905 ***********************************comm.c*************************************/
/* grop_hc_vvl() - variable-vector-length variant of grop_hc(): like
 * ssgl_radd() the amount of data exchanged shrinks per stage, driven by
 * the segs[] offsets, but with a caller-selected operation.
 *
 * *** DISABLED: the unconditional error_msg_fatal() below aborts every
 * call - the code after it is unreachable until that line is removed. ***
 *
 * vals : in/out - segmented data
 * work : scratch buffer
 * segs : dim+1 segment offsets into vals
 * oprs : operation selector, as in grop()
 * dim  : number of hypercube dimensions (power-of-2 process counts only) */
void
grop_hc_vvl(REAL *vals, REAL *work, int *segs, int *oprs, int dim)
{
  register int mask, edge, n;
  int type, dest;
  vfp fp;                       /* element-wise combining function */
#if defined MPISRC
  MPI_Status  status;
#elif defined NXSRC
  int len;                      /* NX message length in bytes */
#endif


  /* hard-disabled - see header comment */
  error_msg_fatal("grop_hc_vvl() :: is not working!\n");

#ifdef SAFE
  /* ok ... should have some data, work, and operator(s) */
  if (!vals||!work||!oprs||!segs)
    {error_msg_fatal("grop_hc() :: vals=%d, work=%d, oprs=%d",vals,work,oprs);}

  /* non-uniform should have at least two entries */
#if defined(not_used)
  if ((oprs[0] == NON_UNIFORM)&&(n<2))
    {error_msg_fatal("grop_hc() :: non_uniform and n=0,1?");}
#endif

  /* check to make sure comm package has been initialized */
  if (!p_init)
    {comm_init();}
#endif

  /* if there's nothing to do return */
  if ((num_nodes<2)||(dim<=0))
    {return;}

  /* the error msg says it all!!! */
  if (modfl_num_nodes)
    {error_msg_fatal("grop_hc() :: num_nodes not a power of 2!?!");}

  /* can't do more dimensions then exist */
  dim = MIN(dim,i_log2_num_nodes);

  /* advance to list of n operations for custom */
  if ((type=oprs[0])==NON_UNIFORM)
    {oprs++;}

  /* unknown op code => assume caller passed a function pointer (see giop) */
  if (!(fp = (vfp) rvec_fct_addr(type))){
    error_msg_warning("grop_hc() :: hope you passed in a rbfp!\n");
    fp = (vfp) oprs;
  }

#if  defined NXSRC
  /* all msgs are *NOT* the same length */
  /* fan-in: stage `edge` exchanges only the tail vals[segs[edge]..segs[dim]-1] */
  for (mask=1,edge=0; edge<dim; edge++,mask<<=1)
    {
      n = segs[dim]-segs[edge];
      /*
      error_msg_warning("n=%d, segs[%d]=%d, segs[%d]=%d\n",n,dim,segs[dim],
                        edge,segs[edge]);
      */
      len = n*REAL_LEN;
      dest = my_id^mask;
      if (my_id > dest)
        {csend(MSGTAG2+my_id,(char *)vals+segs[edge],len,dest,0); break;}
      else
        {crecv(MSGTAG2+dest,(char *)work,len); (*fp)(vals+segs[edge], work, n, oprs);}
    }

  /* recover mask == 2^(dim-1) whether or not the loop broke early */
  if (edge==dim)
    {mask>>=1;}
  else
    {while (++edge<dim) {mask<<=1;}}

  /* fan-out: reverse stage order, receives overwrite vals in place */
  for (edge=0; edge<dim; edge++,mask>>=1)
    {
      if (my_id%mask)
        {continue;}
      len = (segs[dim]-segs[dim-1-edge])*REAL_LEN;
      /*
      error_msg_warning("n=%d, segs[%d]=%d, segs[%d]=%d\n",n,dim,segs[dim],
                         dim-1-edge,segs[dim-1-edge]);
      */
      dest = my_id^mask;
      if (my_id < dest)
        {csend(MSGTAG4+my_id,(char *)vals+segs[dim-1-edge],len,dest,0);}
      else
        {crecv(MSGTAG4+dest,(char *)vals+segs[dim-1-edge],len);}
    }

#elif defined MPISRC
  /* fan-in (see NX branch for the staging scheme) */
  for (mask=1,edge=0; edge<dim; edge++,mask<<=1)
    {
      n = segs[dim]-segs[edge];
      dest = my_id^mask;
      if (my_id > dest)
        {MPI_Send(vals+segs[edge],n,REAL_TYPE,dest,MSGTAG2+my_id,MPI_COMM_WORLD);}
      else
        {
          MPI_Recv(work,n,REAL_TYPE,MPI_ANY_SOURCE,MSGTAG2+dest,
                   MPI_COMM_WORLD, &status);
          (*fp)(vals+segs[edge], work, n, oprs);
        }
    }

  /* recover mask == 2^(dim-1) */
  if (edge==dim)
    {mask>>=1;}
  else
    {while (++edge<dim) {mask<<=1;}}

  /* fan-out */
  for (edge=0; edge<dim; edge++,mask>>=1)
    {
      if (my_id%mask)
        {continue;}

      n = (segs[dim]-segs[dim-1-edge]);

      dest = my_id^mask;
      if (my_id < dest)
        {MPI_Send(vals+segs[dim-1-edge],n,REAL_TYPE,dest,MSGTAG4+my_id,
                  MPI_COMM_WORLD);}
      else
        {
          MPI_Recv(vals+segs[dim-1-edge],n,REAL_TYPE,MPI_ANY_SOURCE,
                   MSGTAG4+dest,MPI_COMM_WORLD, &status);
        }
    }
#else
  return;
#endif
}
1037 
1038 
1039 #ifdef INPROG
1040 
1041 /***********************************comm.c*************************************
1042 Function: proc_sync()
1043 
1044 Input :
1045 Output:
1046 Return:
1047 Description: hc based version
1048 ***********************************comm.c*************************************/
/* proc_sync() - work-in-progress barrier built on the fan-in/out pattern.
 *
 * NOTE(review): this whole function is compiled only under INPROG and is
 * unfinished scaffolding pasted from giop(): it references vals, work,
 * len, n, fp and oprs, none of which are declared here, so it will not
 * compile if INPROG is ever defined.  Kept byte-identical; finish or
 * delete before enabling INPROG. */
void
proc_sync()
{
  register int mask, edge;
  int type, dest;
#if defined MPISRC
  MPI_Status  status;
#endif


#ifdef DEBUG
  error_msg_warning("begin proc_sync()\n");
#endif

#ifdef SAFE
  /* check to make sure comm package has been initialized */
  if (!p_init)
    {comm_init();}
#endif

#if  defined NXSRC
  /* all msgs will be of zero length */

  /* if not a hypercube must collapse partial dim */
  if (edge_not_pow_2)
    {
      if (my_id >= floor_num_nodes)
        {csend(MSGTAG0+my_id,(char *)vals,len,edge_not_pow_2,0);}
      else
        {crecv(MSGTAG0+edge_not_pow_2,(char *)work,len); (*fp)(vals,work,n,oprs);}
    }

  /* implement the mesh fan in/out exchange algorithm */
  if (my_id<floor_num_nodes)
    {
      for (mask=1,edge=0; edge<i_log2_num_nodes; edge++,mask<<=1)
        {
          dest = my_id^mask;
          if (my_id > dest)
            {csend(MSGTAG2+my_id,(char *)vals,len,dest,0); break;}
          else
            {crecv(MSGTAG2+dest,(char *)work,len); (*fp)(vals, work, n, oprs);}
        }

      mask=floor_num_nodes>>1;
      for (edge=0; edge<i_log2_num_nodes; edge++,mask>>=1)
        {
          if (my_id%mask)
            {continue;}

          dest = my_id^mask;
          if (my_id < dest)
            {csend(MSGTAG4+my_id,(char *)vals,len,dest,0);}
          else
            {crecv(MSGTAG4+dest,(char *)vals,len);}
        }
    }

  /* if not a hypercube must expand to partial dim */
  if (edge_not_pow_2)
    {
      if (my_id >= floor_num_nodes)
        {crecv(MSGTAG5+edge_not_pow_2,(char *)vals,len);}
      else
        {csend(MSGTAG5+my_id,(char *)vals,len,edge_not_pow_2,0);}
    }

#elif defined MPISRC
  /* all msgs will be of the same length */
  /* if not a hypercube must collapse partial dim */
  if (edge_not_pow_2)
    {
      if (my_id >= floor_num_nodes)
        {MPI_Send(vals,n,MPI_INT,edge_not_pow_2,MSGTAG0+my_id,MPI_COMM_WORLD);}
      else
        {
          MPI_Recv(work,n,MPI_INT,MPI_ANY_SOURCE,MSGTAG0+edge_not_pow_2,
                   MPI_COMM_WORLD,&status);
          (*fp)(vals,work,n,oprs);
        }
    }

  /* implement the mesh fan in/out exchange algorithm */
  if (my_id<floor_num_nodes)
    {
      for (mask=1,edge=0; edge<i_log2_num_nodes; edge++,mask<<=1)
        {
          dest = my_id^mask;
          if (my_id > dest)
            {MPI_Send(vals,n,MPI_INT,dest,MSGTAG2+my_id,MPI_COMM_WORLD);}
          else
            {
              MPI_Recv(work,n,MPI_INT,MPI_ANY_SOURCE,MSGTAG2+dest,
                       MPI_COMM_WORLD, &status);
              (*fp)(vals, work, n, oprs);
            }
        }

      mask=floor_num_nodes>>1;
      for (edge=0; edge<i_log2_num_nodes; edge++,mask>>=1)
        {
          if (my_id%mask)
            {continue;}

          dest = my_id^mask;
          if (my_id < dest)
            {MPI_Send(vals,n,MPI_INT,dest,MSGTAG4+my_id,MPI_COMM_WORLD);}
          else
            {
              MPI_Recv(vals,n,MPI_INT,MPI_ANY_SOURCE,MSGTAG4+dest,
                       MPI_COMM_WORLD, &status);
            }
        }
    }

  /* if not a hypercube must expand to partial dim */
  if (edge_not_pow_2)
    {
      if (my_id >= floor_num_nodes)
        {
          MPI_Recv(vals,n,MPI_INT,MPI_ANY_SOURCE,MSGTAG5+edge_not_pow_2,
                   MPI_COMM_WORLD,&status);
        }
      else
        {MPI_Send(vals,n,MPI_INT,edge_not_pow_2,MSGTAG5+my_id,MPI_COMM_WORLD);}
    }
#else
  return;
#endif
}
1179 #endif
1180 
1181 
1182 /* hmt hack */
1183 PetscErrorCode hmt_xor_ (register int *i1, register int *i2)
1184 {
1185   return(*i1^*i2);
1186 }
1187 
1188 
1189 /******************************************************************************
1190 Function: giop()
1191 
1192 Input :
1193 Output:
1194 Return:
1195 Description:
1196 
1197 ii+1 entries in seg :: 0 .. ii
1198 
1199 ******************************************************************************/
1200 void
1201 staged_gs_ (register REAL *vals, register REAL *work, register int *level,
1202 	    register int *segs)
1203 {
1204   ssgl_radd(vals, work, *level, segs);
1205 }
1206 
1207 /******************************************************************************
1208 Function: giop()
1209 
1210 Input :
1211 Output:
1212 Return:
1213 Description:
1214 ******************************************************************************/
1215 void
1216 staged_iadd_ (int *gl_num, int *level)
1217 {
1218   sgl_iadd(gl_num,*level);
1219 }
1220 
1221 
1222 
1223 /******************************************************************************
1224 Function: giop()
1225 
1226 Input :
1227 Output:
1228 Return:
1229 Description:
1230 ******************************************************************************/
1231 static void
1232 sgl_iadd(int *vals, int level)
1233 {
1234   int edge, type, dest, source, len, mask = -1;
1235   int tmp, *work;
1236 
1237 
1238 #ifdef SAFE
1239   /* check to make sure comm package has been initialized */
1240   if (!p_init)
1241     {comm_init();}
1242 #endif
1243 
1244 
1245   /* all msgs will be of the same length */
1246   work = &tmp;
1247   len = INT_LEN;
1248 
1249   if (level > i_log2_num_nodes)
1250     {error_msg_fatal("sgl_add() :: level too big?");}
1251 
1252   if (level<=0)
1253     {return;}
1254 
1255 #if defined NXSRC
1256   {
1257     long msg_id;
1258     /* implement the mesh fan in/out exchange algorithm */
1259     if (my_id<floor_num_nodes)
1260       {
1261 	mask = 0;
1262 	for (edge = 0; edge < level; edge++)
1263 	  {
1264 	    if (!(my_id & mask))
1265 	      {
1266 		source = dest = edge_node[edge];
1267 		type = MSGTAG1 + my_id + (num_nodes*edge);
1268 		if (my_id > dest)
1269 		  {
1270 		    msg_id = isend(type,vals,len,dest,0);
1271 		    msgwait(msg_id);
1272 		  }
1273 		else
1274 		  {
1275 		    type =  type - my_id + source;
1276 		    msg_id = irecv(type,work,len);
1277 		    msgwait(msg_id);
1278 		    vals[0] += work[0];
1279 		  }
1280 	      }
1281 	    mask <<= 1;
1282 	    mask += 1;
1283 	  }
1284       }
1285 
1286     if (my_id<floor_num_nodes)
1287       {
1288 	mask >>= 1;
1289 	for (edge = 0; edge < level; edge++)
1290 	  {
1291 	    if (!(my_id & mask))
1292 	      {
1293 		source = dest = edge_node[level-edge-1];
1294 		type = MSGTAG1 + my_id + (num_nodes*edge);
1295 		if (my_id < dest)
1296 		  {
1297 		    msg_id = isend(type,vals,len,dest,0);
1298 		    msgwait(msg_id);
1299 		  }
1300 		else
1301 		  {
1302 		    type =  type - my_id + source;
1303 		    msg_id = irecv(type,work,len);
1304 		    msgwait(msg_id);
1305 		    vals[0] = work[0];
1306 		  }
1307 	      }
1308 	    mask >>= 1;
1309 	  }
1310       }
1311   }
1312 #elif defined MPISRC
1313   {
1314     MPI_Request msg_id;
1315     MPI_Status status;
1316 
1317     /* implement the mesh fan in/out exchange algorithm */
1318     if (my_id<floor_num_nodes)
1319       {
1320 	mask = 0;
1321 	for (edge = 0; edge < level; edge++)
1322 	  {
1323 	    if (!(my_id & mask))
1324 	      {
1325 		source = dest = edge_node[edge];
1326 		type = MSGTAG1 + my_id + (num_nodes*edge);
1327 		if (my_id > dest)
1328 		  {
1329 		    MPI_Isend(vals,len,INT_TYPE,dest,type,MPI_COMM_WORLD,
1330 			      &msg_id);
1331 		    MPI_Wait(&msg_id,&status);
1332 		    /* msg_id = isend(type,vals,len,dest,0);
1333 		       msgwait(msg_id); */
1334 		  }
1335 		else
1336 		  {
1337 		    type =  type - my_id + source;
1338 		    MPI_Irecv(work,len,INT_TYPE,MPI_ANY_SOURCE,
1339 			      type,MPI_COMM_WORLD,&msg_id);
1340 		    MPI_Wait(&msg_id,&status);
1341 		    /* msg_id = irecv(type,work,len);
1342 		       msgwait(msg_id); */
1343 		    vals[0] += work[0];
1344 		  }
1345 	      }
1346 	    mask <<= 1;
1347 	    mask += 1;
1348 	  }
1349       }
1350 
1351     if (my_id<floor_num_nodes)
1352       {
1353 	mask >>= 1;
1354 	for (edge = 0; edge < level; edge++)
1355 	  {
1356 	    if (!(my_id & mask))
1357 	      {
1358 		source = dest = edge_node[level-edge-1];
1359 		type = MSGTAG1 + my_id + (num_nodes*edge);
1360 		if (my_id < dest)
1361 		  {
1362 		    MPI_Isend(vals,len,INT_TYPE,dest,type,MPI_COMM_WORLD,
1363 			      &msg_id);
1364 		    MPI_Wait(&msg_id,&status);
1365 		    /* msg_id = isend(type,vals,len,dest,0);
1366 		    msgwait(msg_id);*/
1367 		  }
1368 		else
1369 		  {
1370 		    type =  type - my_id + source;
1371 		    MPI_Irecv(work,len,INT_TYPE,MPI_ANY_SOURCE,
1372 			      type,MPI_COMM_WORLD,&msg_id);
1373 		    MPI_Wait(&msg_id,&status);
1374 		    /* msg_id = irecv(type,work,len);
1375 		    msgwait(msg_id); */
1376 		    vals[0] = work[0];
1377 		  }
1378 	      }
1379 	    mask >>= 1;
1380 	  }
1381       }
1382   }
1383 #else
1384   return;
1385 #endif
1386 
1387 }
1388 
1389 
1390 
1391 /******************************************************************************
1392 Function: giop()
1393 
1394 Input :
1395 Output:
1396 Return:
1397 Description:
1398 ******************************************************************************/
1399 void
1400 staged_radd_ (double *gl_num, int *level)
1401 {
1402   sgl_radd(gl_num,*level);
1403 }
1404 
1405 /******************************************************************************
1406 Function: giop()
1407 
1408 Input :
1409 Output:
1410 Return:
1411 Description:
1412 ******************************************************************************/
1413 static void
1414 sgl_radd(double *vals, int level)
1415 {
1416 #if defined NXSRC
1417   int edge, type, dest, source, len, mask;
1418   double tmp, *work;
1419 #endif
1420 
1421 #ifdef SAFE
1422   /* check to make sure comm package has been initialized */
1423   if (!p_init)
1424     {comm_init();}
1425 #endif
1426 
1427 
1428 #if defined NXSRC
1429   {
1430     long msg_id;
1431 
1432     /* all msgs will be of the same length */
1433     work = &tmp;
1434     len = REAL_LEN;
1435 
1436     if (level > i_log2_num_nodes)
1437       {error_msg_fatal("sgl_add() :: level too big?");}
1438 
1439     if (level<=0)
1440       {return;}
1441 
1442     /* implement the mesh fan in/out exchange algorithm */
1443     if (my_id<floor_num_nodes)
1444       {
1445 	mask = 0;
1446 	for (edge = 0; edge < level; edge++)
1447 	  {
1448 	    if (!(my_id & mask))
1449 	      {
1450 		source = dest = edge_node[edge];
1451 		type = MSGTAG3 + my_id + (num_nodes*edge);
1452 		if (my_id > dest)
1453 		  {
1454 		    msg_id = isend(type,vals,len,dest,0);
1455 		    msgwait(msg_id);
1456 		  }
1457 		else
1458 		  {
1459 		    type =  type - my_id + source;
1460 		    msg_id = irecv(type,work,len);
1461 		    msgwait(msg_id);
1462 		    vals[0] += work[0];
1463 		  }
1464 	      }
1465 	    mask <<= 1;
1466 	    mask += 1;
1467 	  }
1468       }
1469 
1470     if (my_id<floor_num_nodes)
1471       {
1472 	mask >>= 1;
1473 	for (edge = 0; edge < level; edge++)
1474 	  {
1475 	    if (!(my_id & mask))
1476 	      {
1477 		source = dest = edge_node[level-edge-1];
1478 		type = MSGTAG3 + my_id + (num_nodes*edge);
1479 		if (my_id < dest)
1480 		  {
1481 		    msg_id = isend(type,vals,len,dest,0);
1482 		    msgwait(msg_id);
1483 		  }
1484 		else
1485 		  {
1486 		    type =  type - my_id + source;
1487 		    msg_id = irecv(type,work,len);
1488 		    msgwait(msg_id);
1489 		    vals[0] = work[0];
1490 		  }
1491 	      }
1492 	    mask >>= 1;
1493 	  }
1494       }
1495   }
1496 #elif defined MRISRC
1497   {
1498     MPI_Request msg_id;
1499     MPI_Status status;
1500 
1501     /* all msgs will be of the same length */
1502     work = &tmp;
1503     len = REAL_LEN;
1504 
1505     if (level > i_log2_num_nodes)
1506       {error_msg_fatal("sgl_add() :: level too big?");}
1507 
1508     if (level<=0)
1509       {return;}
1510 
1511     /* implement the mesh fan in/out exchange algorithm */
1512     if (my_id<floor_num_nodes)
1513       {
1514 	mask = 0;
1515 	for (edge = 0; edge < level; edge++)
1516 	  {
1517 	    if (!(my_id & mask))
1518 	      {
1519 		source = dest = edge_node[edge];
1520 		type = MSGTAG3 + my_id + (num_nodes*edge);
1521 		if (my_id > dest)
1522 		  {
1523 		    MPI_Isend(vals,len,INT_TYPE,dest,type,MPI_COMM_WORLD,
1524 			      &msg_id);
1525 		    MPI_Wait(&msg_id,&status);
1526 		    /*msg_id = isend(type,vals,len,dest,0);
1527 		    msgwait(msg_id);*/
1528 		  }
1529 		else
1530 		  {
1531 		    type =  type - my_id + source;
1532 		    MPI_Irecv(work,len,INT_TYPE,MPI_ANY_SOURCE,
1533 			      type,MPI_COMM_WORLD,&msg_id);
1534 		    MPI_Wait(&msg_id,&status);
1535 		    /*msg_id = irecv(type,work,len);
1536 		    msgwait(msg_id); */
1537 		    vals[0] += work[0];
1538 		  }
1539 	      }
1540 	    mask <<= 1;
1541 	    mask += 1;
1542 	  }
1543       }
1544 
1545     if (my_id<floor_num_nodes)
1546       {
1547 	mask >>= 1;
1548 	for (edge = 0; edge < level; edge++)
1549 	  {
1550 	    if (!(my_id & mask))
1551 	      {
1552 		source = dest = edge_node[level-edge-1];
1553 		type = MSGTAG3 + my_id + (num_nodes*edge);
1554 		if (my_id < dest)
1555 		  {
1556 		    MPI_Isend(vals,len,INT_TYPE,dest,type,MPI_COMM_WORLD,
1557 			      &msg_id);
1558 		    MPI_Wait(&msg_id,&status);
1559 		    /* msg_id = isend(type,vals,len,dest,0);
1560 		    msgwait(msg_id); */
1561 		  }
1562 		else
1563 		  {
1564 		    type =  type - my_id + source;
1565 		    MPI_Irecv(work,len,INT_TYPE,MPI_ANY_SOURCE,
1566 			      type,MPI_COMM_WORLD,&msg_id);
1567 		    MPI_Wait(&msg_id,&status);
1568 		    /*msg_id = irecv(type,work,len);
1569 		    msgwait(msg_id); */
1570 		    vals[0] = work[0];
1571 		  }
1572 	      }
1573 	    mask >>= 1;
1574 	  }
1575       }
1576   }
1577 #else
1578   return;
1579 #endif
1580 }
1581 
1582 
1583 
1584 /******************************************************************************
1585 Function: giop()
1586 
1587 Input :
1588 Output:
1589 Return:
1590 Description:
1591 
1592 ii+1 entries in seg :: 0 .. ii
1593 ******************************************************************************/
1594 void
1595 hmt_concat_ (REAL *vals, REAL *work, int *size)
1596 {
1597   hmt_concat(vals, work, *size);
1598 }
1599 
1600 
1601 
1602 /******************************************************************************
1603 Function: giop()
1604 
1605 Input :
1606 Output:
1607 Return:
1608 Description:
1609 
1610 ii+1 entries in seg :: 0 .. ii
1611 
1612 ******************************************************************************/
1613 static void
1614 hmt_concat(REAL *vals, REAL *work, int size)
1615 {
1616 #if defined NXSRC
1617   int  mask,stage_n;
1618   int edge, type, dest, source, len;
1619   int offset;
1620   double *dptr;
1621 #endif
1622 
1623 #ifdef SAFE
1624   /* check to make sure comm package has been initialized */
1625   if (!p_init)
1626     {comm_init();}
1627 #endif
1628 
1629   /* all msgs are *NOT* the same length */
1630   /* implement the mesh fan in/out exchange algorithm */
1631   rvec_copy(work,vals,size);
1632 
1633 #if defined NXSRC
1634   mask = 0;
1635   stage_n = size;
1636   {
1637     long msg_id;
1638 
1639     dptr  = work+size;
1640     for (edge = 0; edge < i_log2_num_nodes; edge++)
1641       {
1642 	len = stage_n * REAL_LEN;
1643 
1644 	if (!(my_id & mask))
1645 	  {
1646 	    source = dest = edge_node[edge];
1647 	    type = MSGTAG3 + my_id + (num_nodes*edge);
1648 	    if (my_id > dest)
1649 	      {
1650 		msg_id = isend(type, work, len,dest,0);
1651 		msgwait(msg_id);
1652 	      }
1653 	    else
1654 	      {
1655 		type =  type - my_id + source;
1656 		msg_id = irecv(type, dptr,len);
1657 		msgwait(msg_id);
1658 	      }
1659 	  }
1660 
1661 #ifdef DEBUG_1
1662 	ierror_msg_warning_n0("stage_n = ",stage_n);
1663 #endif
1664 
1665 	dptr += stage_n;
1666 	stage_n <<=1;
1667 	mask <<= 1;
1668 	mask += 1;
1669       }
1670 
1671     size = stage_n;
1672     stage_n >>=1;
1673     dptr -= stage_n;
1674 
1675     mask >>= 1;
1676 
1677     for (edge = 0; edge < i_log2_num_nodes; edge++)
1678       {
1679 	len = (size-stage_n) * REAL_LEN;
1680 
1681 	if (!(my_id & mask) && stage_n)
1682 	  {
1683 	    source = dest = edge_node[i_log2_num_nodes-edge-1];
1684 	    type = MSGTAG6 + my_id + (num_nodes*edge);
1685 	    if (my_id < dest)
1686 	      {
1687 		msg_id = isend(type,dptr,len,dest,0);
1688 		msgwait(msg_id);
1689 	      }
1690 	    else
1691 	      {
1692 		type =  type - my_id + source;
1693 		msg_id = irecv(type,dptr,len);
1694 		msgwait(msg_id);
1695 	      }
1696 	  }
1697 
1698 #ifdef DEBUG_1
1699 	ierror_msg_warning_n0("size-stage_n = ",size-stage_n);
1700 #endif
1701 
1702 	stage_n >>= 1;
1703 	dptr -= stage_n;
1704 	mask >>= 1;
1705       }
1706   }
1707 #elif defined MRISRC
1708   {
1709     MPI_Request msg_id;
1710     MPI_Status status;
1711 
1712     dptr  = work+size;
1713     for (edge = 0; edge < i_log2_num_nodes; edge++)
1714       {
1715 	len = stage_n * REAL_LEN;
1716 
1717 	if (!(my_id & mask))
1718 	  {
1719 	    source = dest = edge_node[edge];
1720 	    type = MSGTAG3 + my_id + (num_nodes*edge);
1721 	    if (my_id > dest)
1722 	      {
1723 		MPI_Isend(vals,len,INT_TYPE,dest,type,MPI_COMM_WORLD,
1724 			  &msg_id);
1725 		MPI_Wait(&msg_id,&status);
1726 		/*msg_id = isend(type, work, len,dest,0);
1727 		  msgwait(msg_id);*/
1728 	      }
1729 	    else
1730 	      {
1731 		type =  type - my_id + source;
1732 		MPI_Irecv(dptr,len,INT_TYPE,MPI_ANY_SOURCE,
1733 			  type,MPI_COMM_WORLD,&msg_id);
1734 		MPI_Wait(&msg_id,&status);
1735 		/* msg_id = irecv(type, dptr,len);
1736 		msgwait(msg_id);*/
1737 	      }
1738 	  }
1739 
1740 #ifdef DEBUG_1
1741 	ierror_msg_warning_n0("stage_n = ",stage_n);
1742 #endif
1743 
1744 	dptr += stage_n;
1745 	stage_n <<=1;
1746 	mask <<= 1;
1747 	mask += 1;
1748       }
1749 
1750     size = stage_n;
1751     stage_n >>=1;
1752     dptr -= stage_n;
1753 
1754     mask >>= 1;
1755 
1756     for (edge = 0; edge < i_log2_num_nodes; edge++)
1757       {
1758 	len = (size-stage_n) * REAL_LEN;
1759 
1760 	if (!(my_id & mask) && stage_n)
1761 	  {
1762 	    source = dest = edge_node[i_log2_num_nodes-edge-1];
1763 	    type = MSGTAG6 + my_id + (num_nodes*edge);
1764 	    if (my_id < dest)
1765 	      {
1766 		MPI_Isend(vals,len,INT_TYPE,dest,type,MPI_COMM_WORLD,
1767 			  &msg_id);
1768 		MPI_Wait(&msg_id,&status);
1769 		/*msg_id = isend(type, work, len,dest,0);
1770 		msg_id = isend(type,dptr,len,dest,0); */
1771 		msgwait(msg_id);
1772 	      }
1773 	    else
1774 	      {
1775 		type =  type - my_id + source;
1776 		MPI_Irecv(dptr,len,INT_TYPE,MPI_ANY_SOURCE,
1777 			  type,MPI_COMM_WORLD,&msg_id);
1778 		MPI_Wait(&msg_id,&status);
1779 		/*msg_id = irecv(type,dptr,len);
1780 		msgwait(msg_id);*/
1781 	      }
1782 	  }
1783 
1784 #ifdef DEBUG_1
1785 	ierror_msg_warning_n0("size-stage_n = ",size-stage_n);
1786 #endif
1787 
1788 	stage_n >>= 1;
1789 	dptr -= stage_n;
1790 	mask >>= 1;
1791       }
1792   }
1793 #else
1794   return;
1795 #endif
1796 }
1797 
1798 
1799 
1800 /******************************************************************************
1801 Function: giop()
1802 
1803 Input :
1804 Output:
1805 Return:
1806 Description:
1807 
1808 ii+1 entries in seg :: 0 .. ii
1809 
1810 ******************************************************************************/
1811 void
1812 new_ssgl_radd(register REAL *vals, register REAL *work, register int level,
1813 #if defined MPISRC
1814 	  register int *segs, MPI_Comm ssgl_comm)
1815 #else
1816 	  register int *segs)
1817 #endif
1818 {
1819   register int edge, type, dest, mask;
1820   register int stage_n;
1821 #if defined MPISRC
1822   MPI_Status  status;
1823 #endif
1824 
1825 
1826 #ifdef DEBUG
1827   if (level > i_log2_num_nodes)
1828     {error_msg_fatal("sgl_add() :: level > log_2(P)!!!");}
1829 #endif
1830 
1831 #ifdef SAFE
1832   /* check to make sure comm package has been initialized */
1833   if (!p_init)
1834     {comm_init();}
1835 #endif
1836 
1837 
1838 #if defined NXSRC
1839   /* all msgs are *NOT* the same length */
1840   /* implement the mesh fan in/out exchange algorithm */
1841   for (mask=0, edge=0; edge<level; edge++, mask++)
1842     {
1843       stage_n = (segs[level] - segs[edge]);
1844       if (stage_n && !(my_id & mask))
1845 	{
1846 	  dest = edge_node[edge];
1847 	  type = MSGTAG3 + my_id + (num_nodes*edge);
1848 	  if (my_id>dest)
1849 	    {csend(type, vals+segs[edge],stage_n*REAL_LEN,dest,0);}
1850 	  else
1851 	    {
1852 	      type =  type - my_id + dest;
1853 	      crecv(type,work,stage_n*REAL_LEN);
1854 	      rvec_add(vals+segs[edge], work, stage_n);
1855 /*            daxpy(vals+segs[edge], work, stage_n); */
1856 	    }
1857 	}
1858       mask <<= 1;
1859     }
1860   mask>>=1;
1861   for (edge=0; edge<level; edge++)
1862     {
1863       stage_n = (segs[level] - segs[level-1-edge]);
1864       if (stage_n && !(my_id & mask))
1865 	{
1866 	  dest = edge_node[level-edge-1];
1867 	  type = MSGTAG6 + my_id + (num_nodes*edge);
1868 	  if (my_id<dest)
1869 	    {csend(type,vals+segs[level-1-edge],stage_n*REAL_LEN,dest,0);}
1870 	  else
1871 	    {
1872 	      type =  type - my_id + dest;
1873 	      crecv(type,vals+segs[level-1-edge],stage_n*REAL_LEN);
1874 	    }
1875 	}
1876       mask >>= 1;
1877     }
1878 
1879 #elif defined MPISRC
1880   /* all msgs are *NOT* the same length */
1881   /* implement the mesh fan in/out exchange algorithm */
1882   for (mask=0, edge=0; edge<level; edge++, mask++)
1883     {
1884       stage_n = (segs[level] - segs[edge]);
1885       if (stage_n && !(my_id & mask))
1886 	{
1887 	  dest = edge_node[edge];
1888 	  type = MSGTAG3 + my_id + (num_nodes*edge);
1889 	  if (my_id>dest)
1890 /*	    {csend(type, vals+segs[edge],stage_n*REAL_LEN,dest,0);} */
1891           {MPI_Send(vals+segs[edge],stage_n,REAL_TYPE,dest,type,
1892                       ssgl_comm);}
1893 	  else
1894 	    {
1895 	      type =  type - my_id + dest;
1896 /*	      crecv(type,work,stage_n*REAL_LEN); */
1897               MPI_Recv(work,stage_n,REAL_TYPE,MPI_ANY_SOURCE,type,
1898                        ssgl_comm,&status);
1899 	      rvec_add(vals+segs[edge], work, stage_n);
1900 /*            daxpy(vals+segs[edge], work, stage_n); */
1901 	    }
1902 	}
1903       mask <<= 1;
1904     }
1905   mask>>=1;
1906   for (edge=0; edge<level; edge++)
1907     {
1908       stage_n = (segs[level] - segs[level-1-edge]);
1909       if (stage_n && !(my_id & mask))
1910 	{
1911 	  dest = edge_node[level-edge-1];
1912 	  type = MSGTAG6 + my_id + (num_nodes*edge);
1913 	  if (my_id<dest)
1914 /*	    {csend(type,vals+segs[level-1-edge],stage_n*REAL_LEN,dest,0);} */
1915             {MPI_Send(vals+segs[level-1-edge],stage_n,REAL_TYPE,dest,type,
1916                       ssgl_comm);}
1917 	  else
1918 	    {
1919 	      type =  type - my_id + dest;
1920 /* 	      crecv(type,vals+segs[level-1-edge],stage_n*REAL_LEN); */
1921               MPI_Recv(vals+segs[level-1-edge],stage_n,REAL_TYPE,
1922                        MPI_ANY_SOURCE,type,ssgl_comm,&status);
1923 	    }
1924 	}
1925       mask >>= 1;
1926     }
1927 #else
1928   return;
1929 #endif
1930 }
1931 
1932 
1933 
1934 /***********************************comm.c*************************************
1935 Function: giop()
1936 
1937 Input :
1938 Output:
1939 Return:
1940 Description: fan-in/out version
1941 
1942 note good only for num_nodes=2^k!!!
1943 
1944 ***********************************comm.c*************************************/
1945 void
1946 giop_hc(int *vals, int *work, int n, int *oprs, int dim)
1947 {
1948   register int mask, edge;
1949   int type, dest;
1950   vfp fp;
1951 #if defined MPISRC
1952   MPI_Status  status;
1953 #elif defined NXSRC
1954   int len;
1955 #endif
1956 
1957 
1958 #ifdef SAFE
1959   /* ok ... should have some data, work, and operator(s) */
1960   if (!vals||!work||!oprs)
1961     {error_msg_fatal("giop_hc() :: vals=%d, work=%d, oprs=%d",vals,work,oprs);}
1962 
1963   /* non-uniform should have at least two entries */
1964   if ((oprs[0] == NON_UNIFORM)&&(n<2))
1965     {error_msg_fatal("giop_hc() :: non_uniform and n=0,1?");}
1966 
1967   /* check to make sure comm package has been initialized */
1968   if (!p_init)
1969     {comm_init();}
1970 #endif
1971 
1972   /* if there's nothing to do return */
1973   if ((num_nodes<2)||(!n)||(dim<=0))
1974     {return;}
1975 
1976   /* the error msg says it all!!! */
1977   if (modfl_num_nodes)
1978     {error_msg_fatal("giop_hc() :: num_nodes not a power of 2!?!");}
1979 
1980   /* a negative number of items to send ==> fatal */
1981   if (n<0)
1982     {error_msg_fatal("giop_hc() :: n=%d<0?",n);}
1983 
1984   /* can't do more dimensions then exist */
1985   dim = MIN(dim,i_log2_num_nodes);
1986 
1987   /* advance to list of n operations for custom */
1988   if ((type=oprs[0])==NON_UNIFORM)
1989     {oprs++;}
1990 
1991   if (!(fp = (vfp) ivec_fct_addr(type))){
1992     error_msg_warning("giop_hc() :: hope you passed in a rbfp!\n");
1993     fp = (vfp) oprs;
1994   }
1995 
1996 #if  defined NXSRC
1997   /* all msgs will be of the same length */
1998   len = n*INT_LEN;
1999 
2000   /* implement the mesh fan in/out exchange algorithm */
2001   for (mask=1,edge=0; edge<dim; edge++,mask<<=1)
2002     {
2003       dest = my_id^mask;
2004       if (my_id > dest)
2005 	{csend(MSGTAG2+my_id,(char *)vals,len,dest,0); break;}
2006       else
2007 	{crecv(MSGTAG2+dest,(char *)work,len); (*fp)(vals, work, n, oprs);}
2008     }
2009 
2010   /* for (mask=1, edge=1; edge<dim; edge++,mask<<=1) {;} */
2011   if (edge==dim)
2012     {mask>>=1;}
2013   else
2014     {while (++edge<dim) {mask<<=1;}}
2015 
2016   for (edge=0; edge<dim; edge++,mask>>=1)
2017     {
2018       if (my_id%mask)
2019 	{continue;}
2020 
2021       dest = my_id^mask;
2022       if (my_id < dest)
2023 	{csend(MSGTAG4+my_id,(char *)vals,len,dest,0);}
2024       else
2025 	{crecv(MSGTAG4+dest,(char *)vals,len);}
2026     }
2027 
2028 #elif defined MPISRC
2029   for (mask=1,edge=0; edge<dim; edge++,mask<<=1)
2030     {
2031       dest = my_id^mask;
2032       if (my_id > dest)
2033 	{MPI_Send(vals,n,INT_TYPE,dest,MSGTAG2+my_id,MPI_COMM_WORLD);}
2034       else
2035 	{
2036 	  MPI_Recv(work,n,INT_TYPE,MPI_ANY_SOURCE,MSGTAG2+dest,MPI_COMM_WORLD,
2037 		   &status);
2038 	  (*fp)(vals, work, n, oprs);
2039 	}
2040     }
2041 
2042   if (edge==dim)
2043     {mask>>=1;}
2044   else
2045     {while (++edge<dim) {mask<<=1;}}
2046 
2047   for (edge=0; edge<dim; edge++,mask>>=1)
2048     {
2049       if (my_id%mask)
2050 	{continue;}
2051 
2052       dest = my_id^mask;
2053       if (my_id < dest)
2054 	{MPI_Send(vals,n,INT_TYPE,dest,MSGTAG4+my_id,MPI_COMM_WORLD);}
2055       else
2056 	{
2057 	  MPI_Recv(vals,n,INT_TYPE,MPI_ANY_SOURCE,MSGTAG4+dest,MPI_COMM_WORLD,
2058 		   &status);
2059 	}
2060     }
2061 #else
2062   return;
2063 #endif
2064 }
2065