xref: /petsc/src/ksp/pc/impls/tfs/gs.c (revision 1f0e42cf17e8fdc56355134e13a5edcdb6dc1c19)
1 
2 /***********************************gs.c***************************************
3 
4 Author: Henry M. Tufo III
5 
6 e-mail: hmt@cs.brown.edu
7 
8 snail-mail:
9 Division of Applied Mathematics
10 Brown University
11 Providence, RI 02912
12 
13 Last Modification:
14 6.21.97
15 ************************************gs.c**************************************/
16 
17 /***********************************gs.c***************************************
18 File Description:
19 -----------------
20 
21 ************************************gs.c**************************************/
22 
23 #include <../src/ksp/pc/impls/tfs/tfs.h>
24 
25 /* default length of number of items via tree - doubles if exceeded */
26 #define TREE_BUF_SZ 2048;
27 #define GS_VEC_SZ   1
28 
29 
30 
31 /***********************************gs.c***************************************
32 Type: struct gather_scatter_id
33 ------------------------------
34 
35 ************************************gs.c**************************************/
36 typedef struct gather_scatter_id {
37   PetscInt id;
38   PetscInt nel_min;
39   PetscInt nel_max;
40   PetscInt nel_sum;
41   PetscInt negl;
42   PetscInt gl_max;
43   PetscInt gl_min;
44   PetscInt repeats;
45   PetscInt ordered;
46   PetscInt positive;
47   PetscScalar *vals;
48 
49   /* bit mask info */
50   PetscInt *my_proc_mask;
51   PetscInt mask_sz;
52   PetscInt *ngh_buf;
53   PetscInt ngh_buf_sz;
54   PetscInt *nghs;
55   PetscInt num_nghs;
56   PetscInt max_nghs;
57   PetscInt *pw_nghs;
58   PetscInt num_pw_nghs;
59   PetscInt *tree_nghs;
60   PetscInt num_tree_nghs;
61 
62   PetscInt num_loads;
63 
64   /* repeats == true -> local info */
65   PetscInt nel;         /* number of unique elememts */
66   PetscInt *elms;       /* of size nel */
67   PetscInt nel_total;
68   PetscInt *local_elms; /* of size nel_total */
69   PetscInt *companion;  /* of size nel_total */
70 
71   /* local info */
72   PetscInt num_local_total;
73   PetscInt local_strength;
74   PetscInt num_local;
75   PetscInt *num_local_reduce;
76   PetscInt **local_reduce;
77   PetscInt num_local_gop;
78   PetscInt *num_gop_local_reduce;
79   PetscInt **gop_local_reduce;
80 
81   /* pairwise info */
82   PetscInt level;
83   PetscInt num_pairs;
84   PetscInt max_pairs;
85   PetscInt loc_node_pairs;
86   PetscInt max_node_pairs;
87   PetscInt min_node_pairs;
88   PetscInt avg_node_pairs;
89   PetscInt *pair_list;
90   PetscInt *msg_sizes;
91   PetscInt **node_list;
92   PetscInt len_pw_list;
93   PetscInt *pw_elm_list;
94   PetscScalar *pw_vals;
95 
96   MPI_Request *msg_ids_in;
97   MPI_Request *msg_ids_out;
98 
99   PetscScalar *out;
100   PetscScalar *in;
101   PetscInt msg_total;
102 
103   /* tree - crystal accumulator info */
104   PetscInt max_left_over;
105   PetscInt *pre;
106   PetscInt *in_num;
107   PetscInt *out_num;
108   PetscInt **in_list;
109   PetscInt **out_list;
110 
111   /* new tree work*/
112   PetscInt  tree_nel;
113   PetscInt *tree_elms;
114   PetscScalar *tree_buf;
115   PetscScalar *tree_work;
116 
117   PetscInt  tree_map_sz;
118   PetscInt *tree_map_in;
119   PetscInt *tree_map_out;
120 
121   /* current memory status */
122   PetscInt gl_bss_min;
123   PetscInt gl_perm_min;
124 
125   /* max segment size for PCTFS_gs_gop_vec() */
126   PetscInt vec_sz;
127 
128   /* hack to make paul happy */
129   MPI_Comm PCTFS_gs_comm;
130 
131 } PCTFS_gs_id;
132 
133 static PCTFS_gs_id *gsi_check_args(PetscInt *elms, PetscInt nel, PetscInt level);
134 static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs);
135 static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs);
136 static PetscErrorCode set_pairwise(PCTFS_gs_id *gs);
137 static PCTFS_gs_id * gsi_new(void);
138 static PetscErrorCode set_tree(PCTFS_gs_id *gs);
139 
140 /* same for all but vector flavor */
141 static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs, PetscScalar *vals);
142 /* vector flavor */
143 static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
144 
145 static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step);
146 static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step);
147 static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
148 static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
149 static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
150 
151 
152 static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs, PetscScalar *vals);
153 static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals);
154 
155 static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim);
156 static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim);
157 static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim);
158 
159 /* global vars */
160 /* from comm.c module */
161 
162 static PetscInt num_gs_ids = 0;
163 
164 /* should make this dynamic ... later */
165 static PetscInt msg_buf=MAX_MSG_BUF;
166 static PetscInt vec_sz=GS_VEC_SZ;
167 static PetscInt *tree_buf=NULL;
168 static PetscInt tree_buf_sz=0;
169 static PetscInt ntree=0;
170 
171 /***************************************************************************/
172 PetscErrorCode PCTFS_gs_init_vec_sz(PetscInt size)
173 {
174   PetscFunctionBegin;
175   vec_sz = size;
176   PetscFunctionReturn(0);
177 }
178 
179 /******************************************************************************/
180 PetscErrorCode PCTFS_gs_init_msg_buf_sz(PetscInt buf_size)
181 {
182   PetscFunctionBegin;
183   msg_buf = buf_size;
184   PetscFunctionReturn(0);
185 }
186 
187 /******************************************************************************/
188 PCTFS_gs_id *PCTFS_gs_init( PetscInt *elms, PetscInt nel, PetscInt level)
189 {
190    PCTFS_gs_id *gs;
191   MPI_Group PCTFS_gs_group;
192   MPI_Comm  PCTFS_gs_comm;
193   PetscErrorCode ierr;
194 
195   PetscFunctionBegin;
196   /* ensure that communication package has been initialized */
197   PCTFS_comm_init();
198 
199 
200   /* determines if we have enough dynamic/semi-static memory */
201   /* checks input, allocs and sets gd_id template            */
202   gs = gsi_check_args(elms,nel,level);
203 
204   /* only bit mask version up and working for the moment    */
205   /* LATER :: get int list version working for sparse pblms */
206   ierr = gsi_via_bit_mask(gs);CHKERRABORT(PETSC_COMM_WORLD,ierr);
207 
208 
209   ierr = MPI_Comm_group(MPI_COMM_WORLD,&PCTFS_gs_group);CHKERRABORT(PETSC_COMM_WORLD,ierr);
210   ierr = MPI_Comm_create(MPI_COMM_WORLD,PCTFS_gs_group,&PCTFS_gs_comm);CHKERRABORT(PETSC_COMM_WORLD,ierr);
211   gs->PCTFS_gs_comm=PCTFS_gs_comm;
212 
213   return(gs);
214 }
215 
216 /******************************************************************************/
217 static PCTFS_gs_id *gsi_new(void)
218 {
219   PetscErrorCode ierr;
220   PCTFS_gs_id *gs;
221   gs = (PCTFS_gs_id *) malloc(sizeof(PCTFS_gs_id));
222   ierr = PetscMemzero(gs,sizeof(PCTFS_gs_id));CHKERRABORT(PETSC_COMM_WORLD,ierr);
223   return(gs);
224 }
225 
226 /******************************************************************************/
227 static PCTFS_gs_id * gsi_check_args(PetscInt *in_elms, PetscInt nel, PetscInt level)
228 {
229    PetscInt i, j, k, t2;
230   PetscInt *companion, *elms, *unique, *iptr;
231   PetscInt num_local=0, *num_to_reduce, **local_reduce;
232   PetscInt oprs[] = {NON_UNIFORM,GL_MIN,GL_MAX,GL_ADD,GL_MIN,GL_MAX,GL_MIN,GL_B_AND};
233   PetscInt vals[sizeof(oprs)/sizeof(oprs[0])-1];
234   PetscInt work[sizeof(oprs)/sizeof(oprs[0])-1];
235   PCTFS_gs_id *gs;
236   PetscErrorCode ierr;
237 
238 
239   if (!in_elms) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"elms point to nothing!!!\n");
240   if (nel<0) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"can't have fewer than 0 elms!!!\n");
241 
242   if (nel==0)
243     {ierr = PetscInfo(0,"I don't have any elements!!!\n");CHKERRABORT(PETSC_COMM_WORLD,ierr);}
244 
245   /* get space for gs template */
246   gs = gsi_new();
247   gs->id = ++num_gs_ids;
248 
249   /* hmt 6.4.99                                            */
250   /* caller can set global ids that don't participate to 0 */
251   /* PCTFS_gs_init ignores all zeros in elm list                 */
252   /* negative global ids are still invalid                 */
253   for (i=j=0;i<nel;i++)
254     {if (in_elms[i]!=0) {j++;}}
255 
256   k=nel; nel=j;
257 
258   /* copy over in_elms list and create inverse map */
259   elms = (PetscInt*) malloc((nel+1)*sizeof(PetscInt));
260   companion = (PetscInt*) malloc(nel*sizeof(PetscInt));
261 
262   for (i=j=0;i<k;i++)
263     {
264       if (in_elms[i]!=0)
265         {elms[j] = in_elms[i]; companion[j++] = i;}
266     }
267 
268   if (j!=nel) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"nel j mismatch!\n");
269 
270   /* pre-pass ... check to see if sorted */
271   elms[nel] = INT_MAX;
272   iptr = elms;
273   unique = elms+1;
274   j=0;
275   while (*iptr!=INT_MAX)
276     {
277       if (*iptr++>*unique++)
278         {j=1; break;}
279     }
280 
281   /* set up inverse map */
282   if (j)
283     {
284       ierr = PetscInfo(0,"gsi_check_args() :: elm list *not* sorted!\n");CHKERRABORT(PETSC_COMM_WORLD,ierr);
285       ierr = PCTFS_SMI_sort((void*)elms, (void*)companion, nel, SORT_INTEGER);CHKERRABORT(PETSC_COMM_WORLD,ierr);
286     }
287   else
288     {ierr = PetscInfo(0,"gsi_check_args() :: elm list sorted!\n");CHKERRABORT(PETSC_COMM_WORLD,ierr);}
289   elms[nel] = INT_MIN;
290 
291   /* first pass */
292   /* determine number of unique elements, check pd */
293   for (i=k=0;i<nel;i+=j)
294     {
295       t2 = elms[i];
296       j=++i;
297 
298       /* clump 'em for now */
299       while (elms[j]==t2) {j++;}
300 
301       /* how many together and num local */
302       if (j-=i)
303         {num_local++; k+=j;}
304     }
305 
306   /* how many unique elements? */
307   gs->repeats=k;
308   gs->nel = nel-k;
309 
310 
311   /* number of repeats? */
312   gs->num_local = num_local;
313   num_local+=2;
314   gs->local_reduce=local_reduce=(PetscInt **)malloc(num_local*sizeof(PetscInt*));
315   gs->num_local_reduce=num_to_reduce=(PetscInt*) malloc(num_local*sizeof(PetscInt));
316 
317   unique = (PetscInt*) malloc((gs->nel+1)*sizeof(PetscInt));
318   gs->elms = unique;
319   gs->nel_total = nel;
320   gs->local_elms = elms;
321   gs->companion = companion;
322 
323   /* compess map as well as keep track of local ops */
324   for (num_local=i=j=0;i<gs->nel;i++)
325     {
326       k=j;
327       t2 = unique[i] = elms[j];
328       companion[i] = companion[j];
329 
330       while (elms[j]==t2) {j++;}
331 
332       if ((t2=(j-k))>1)
333         {
334           /* number together */
335           num_to_reduce[num_local] = t2++;
336           iptr = local_reduce[num_local++] = (PetscInt*)malloc(t2*sizeof(PetscInt));
337 
338           /* to use binary searching don't remap until we check intersection */
339           *iptr++ = i;
340 
341           /* note that we're skipping the first one */
342           while (++k<j)
343             {*(iptr++) = companion[k];}
344           *iptr = -1;
345         }
346     }
347 
348   /* sentinel for ngh_buf */
349   unique[gs->nel]=INT_MAX;
350 
351   /* for two partition sort hack */
352   num_to_reduce[num_local] = 0;
353   local_reduce[num_local] = NULL;
354   num_to_reduce[++num_local] = 0;
355   local_reduce[num_local] = NULL;
356 
357   /* load 'em up */
358   /* note one extra to hold NON_UNIFORM flag!!! */
359   vals[2] = vals[1] = vals[0] = nel;
360   if (gs->nel>0)
361     {
362        vals[3] = unique[0];
363        vals[4] = unique[gs->nel-1];
364     }
365   else
366     {
367        vals[3] = INT_MAX;
368        vals[4] = INT_MIN;
369     }
370   vals[5] = level;
371   vals[6] = num_gs_ids;
372 
373   /* GLOBAL: send 'em out */
374   ierr = PCTFS_giop(vals,work,sizeof(oprs)/sizeof(oprs[0])-1,oprs);CHKERRABORT(PETSC_COMM_WORLD,ierr);
375 
376   /* must be semi-pos def - only pairwise depends on this */
377   /* LATER - remove this restriction */
378   if (vals[3]<0) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system not semi-pos def \n");
379   if (vals[4]==INT_MAX) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system ub too large !\n");
380 
381   gs->nel_min = vals[0];
382   gs->nel_max = vals[1];
383   gs->nel_sum = vals[2];
384   gs->gl_min  = vals[3];
385   gs->gl_max  = vals[4];
386   gs->negl    = vals[4]-vals[3]+1;
387 
388   if (gs->negl<=0) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system empty or neg :: %d\n");
389 
390   /* LATER :: add level == -1 -> program selects level */
391   if (vals[5]<0)
392     {vals[5]=0;}
393   else if (vals[5]>PCTFS_num_nodes)
394     {vals[5]=PCTFS_num_nodes;}
395   gs->level = vals[5];
396 
397   return(gs);
398 }
399 
400 /******************************************************************************/
401 static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs)
402 {
403    PetscInt i, nel, *elms;
404   PetscInt t1;
405   PetscInt **reduce;
406   PetscInt *map;
407   PetscErrorCode ierr;
408 
409   PetscFunctionBegin;
410   /* totally local removes ... PCTFS_ct_bits == 0 */
411   get_ngh_buf(gs);
412 
413   if (gs->level) set_pairwise(gs);
414   if (gs->max_left_over) set_tree(gs);
415 
416   /* intersection local and pairwise/tree? */
417   gs->num_local_total = gs->num_local;
418   gs->gop_local_reduce = gs->local_reduce;
419   gs->num_gop_local_reduce = gs->num_local_reduce;
420 
421   map = gs->companion;
422 
423   /* is there any local compression */
424   if (!gs->num_local) {
425     gs->local_strength = NONE;
426     gs->num_local_gop = 0;
427   } else {
428       /* ok find intersection */
429       map = gs->companion;
430       reduce = gs->local_reduce;
431       for (i=0, t1=0; i<gs->num_local; i++, reduce++)
432         {
433           if ((PCTFS_ivec_binary_search(**reduce,gs->pw_elm_list,gs->len_pw_list)>=0)
434               ||
435               PCTFS_ivec_binary_search(**reduce,gs->tree_map_in,gs->tree_map_sz)>=0)
436             {
437               t1++;
438               if (gs->num_local_reduce[i]<=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"nobody in list?");
439               gs->num_local_reduce[i] *= -1;
440             }
441            **reduce=map[**reduce];
442         }
443 
444       /* intersection is empty */
445       if (!t1)
446         {
447           gs->local_strength = FULL;
448           gs->num_local_gop = 0;
449         }
450       /* intersection not empty */
451       else
452         {
453           gs->local_strength = PARTIAL;
454           ierr = PCTFS_SMI_sort((void*)gs->num_local_reduce, (void*)gs->local_reduce, gs->num_local + 1, SORT_INT_PTR);CHKERRQ(ierr);
455 
456           gs->num_local_gop = t1;
457           gs->num_local_total =  gs->num_local;
458           gs->num_local    -= t1;
459           gs->gop_local_reduce = gs->local_reduce;
460           gs->num_gop_local_reduce = gs->num_local_reduce;
461 
462           for (i=0; i<t1; i++)
463             {
464               if (gs->num_gop_local_reduce[i]>=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"they aren't negative?");
465               gs->num_gop_local_reduce[i] *= -1;
466               gs->local_reduce++;
467               gs->num_local_reduce++;
468             }
469           gs->local_reduce++;
470           gs->num_local_reduce++;
471         }
472     }
473 
474   elms = gs->pw_elm_list;
475   nel  = gs->len_pw_list;
476   for (i=0; i<nel; i++)
477     {elms[i] = map[elms[i]];}
478 
479   elms = gs->tree_map_in;
480   nel  = gs->tree_map_sz;
481   for (i=0; i<nel; i++)
482     {elms[i] = map[elms[i]];}
483 
484   /* clean up */
485   free((void*) gs->local_elms);
486   free((void*) gs->companion);
487   free((void*) gs->elms);
488   free((void*) gs->ngh_buf);
489   gs->local_elms = gs->companion = gs->elms = gs->ngh_buf = NULL;
490   PetscFunctionReturn(0);
491 }
492 
493 /******************************************************************************/
494 static PetscErrorCode place_in_tree( PetscInt elm)
495 {
496    PetscInt *tp, n;
497 
498   PetscFunctionBegin;
499   if (ntree==tree_buf_sz)
500     {
501       if (tree_buf_sz)
502         {
503           tp = tree_buf;
504           n = tree_buf_sz;
505           tree_buf_sz<<=1;
506           tree_buf = (PetscInt*)malloc(tree_buf_sz*sizeof(PetscInt));
507           PCTFS_ivec_copy(tree_buf,tp,n);
508           free(tp);
509         }
510       else
511         {
512           tree_buf_sz = TREE_BUF_SZ;
513           tree_buf = (PetscInt*)malloc(tree_buf_sz*sizeof(PetscInt));
514         }
515     }
516 
517   tree_buf[ntree++] = elm;
518   PetscFunctionReturn(0);
519 }
520 
521 /******************************************************************************/
522 static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs)
523 {
524    PetscInt i, j, npw=0, ntree_map=0;
525   PetscInt p_mask_size, ngh_buf_size, buf_size;
526   PetscInt *p_mask, *sh_proc_mask, *pw_sh_proc_mask;
527   PetscInt *ngh_buf, *buf1, *buf2;
528   PetscInt offset, per_load, num_loads, or_ct, start, end;
529   PetscInt *ptr1, *ptr2, i_start, negl, nel, *elms;
530   PetscInt oper=GL_B_OR;
531   PetscInt *ptr3, *t_mask, level, ct1, ct2;
532   PetscErrorCode ierr;
533 
534   PetscFunctionBegin;
535   /* to make life easier */
536   nel   = gs->nel;
537   elms  = gs->elms;
538   level = gs->level;
539 
540   /* det #bytes needed for processor bit masks and init w/mask cor. to PCTFS_my_id */
541   p_mask = (PetscInt*) malloc(p_mask_size=PCTFS_len_bit_mask(PCTFS_num_nodes));
542   ierr = PCTFS_set_bit_mask(p_mask,p_mask_size,PCTFS_my_id);CHKERRQ(ierr);
543 
544   /* allocate space for masks and info bufs */
545   gs->nghs = sh_proc_mask = (PetscInt*) malloc(p_mask_size);
546   gs->pw_nghs = pw_sh_proc_mask = (PetscInt*) malloc(p_mask_size);
547   gs->ngh_buf_sz = ngh_buf_size = p_mask_size*nel;
548   t_mask = (PetscInt*) malloc(p_mask_size);
549   gs->ngh_buf = ngh_buf = (PetscInt*) malloc(ngh_buf_size);
550 
551   /* comm buffer size ... memory usage bounded by ~2*msg_buf */
552   /* had thought I could exploit rendezvous threshold */
553 
554   /* default is one pass */
555   per_load = negl  = gs->negl;
556   gs->num_loads = num_loads = 1;
557   i=p_mask_size*negl;
558 
559   /* possible overflow on buffer size */
560   /* overflow hack                    */
561   if (i<0) {i=INT_MAX;}
562 
563   buf_size = PetscMin(msg_buf,i);
564 
565   /* can we do it? */
566   if (p_mask_size>buf_size) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"get_ngh_buf() :: buf<pms :: %d>%d\n",p_mask_size,buf_size);
567 
568   /* get PCTFS_giop buf space ... make *only* one malloc */
569   buf1 = (PetscInt*) malloc(buf_size<<1);
570 
571   /* more than one gior exchange needed? */
572   if (buf_size!=i)
573     {
574       per_load = buf_size/p_mask_size;
575       buf_size = per_load*p_mask_size;
576       gs->num_loads = num_loads = negl/per_load + (negl%per_load>0);
577     }
578 
579 
580   /* convert buf sizes from #bytes to #ints - 32 bit only! */
581   p_mask_size/=sizeof(PetscInt); ngh_buf_size/=sizeof(PetscInt); buf_size/=sizeof(PetscInt);
582 
583   /* find PCTFS_giop work space */
584   buf2 = buf1+buf_size;
585 
586   /* hold #ints needed for processor masks */
587   gs->mask_sz=p_mask_size;
588 
589   /* init buffers */
590   ierr = PCTFS_ivec_zero(sh_proc_mask,p_mask_size);CHKERRQ(ierr);
591   ierr = PCTFS_ivec_zero(pw_sh_proc_mask,p_mask_size);CHKERRQ(ierr);
592   ierr = PCTFS_ivec_zero(ngh_buf,ngh_buf_size);CHKERRQ(ierr);
593 
594   /* HACK reset tree info */
595   tree_buf=NULL;
596   tree_buf_sz=ntree=0;
597 
598   /* ok do it */
599   for (ptr1=ngh_buf,ptr2=elms,end=gs->gl_min,or_ct=i=0; or_ct<num_loads; or_ct++)
600     {
601       /* identity for bitwise or is 000...000 */
602       PCTFS_ivec_zero(buf1,buf_size);
603 
604       /* load msg buffer */
605       for (start=end,end+=per_load,i_start=i; (offset=*ptr2)<end; i++, ptr2++)
606         {
607           offset = (offset-start)*p_mask_size;
608           PCTFS_ivec_copy(buf1+offset,p_mask,p_mask_size);
609         }
610 
611       /* GLOBAL: pass buffer */
612       ierr = PCTFS_giop(buf1,buf2,buf_size,&oper);CHKERRQ(ierr);
613 
614 
615       /* unload buffer into ngh_buf */
616       ptr2=(elms+i_start);
617       for(ptr3=buf1,j=start; j<end; ptr3+=p_mask_size,j++)
618         {
619           /* I own it ... may have to pairwise it */
620           if (j==*ptr2)
621             {
622               /* do i share it w/anyone? */
623               ct1 = PCTFS_ct_bits((char *)ptr3,p_mask_size*sizeof(PetscInt));
624               /* guess not */
625               if (ct1<2)
626                 {ptr2++; ptr1+=p_mask_size; continue;}
627 
628               /* i do ... so keep info and turn off my bit */
629               PCTFS_ivec_copy(ptr1,ptr3,p_mask_size);
630               ierr = PCTFS_ivec_xor(ptr1,p_mask,p_mask_size);CHKERRQ(ierr);
631               ierr = PCTFS_ivec_or(sh_proc_mask,ptr1,p_mask_size);CHKERRQ(ierr);
632 
633               /* is it to be done pairwise? */
634               if (--ct1<=level)
635                 {
636                   npw++;
637 
638                   /* turn on high bit to indicate pw need to process */
639                   *ptr2++ |= TOP_BIT;
640                   ierr = PCTFS_ivec_or(pw_sh_proc_mask,ptr1,p_mask_size);CHKERRQ(ierr);
641                   ptr1+=p_mask_size;
642                   continue;
643                 }
644 
645               /* get set for next and note that I have a tree contribution */
646               /* could save exact elm index for tree here -> save a search */
647               ptr2++; ptr1+=p_mask_size; ntree_map++;
648             }
649           /* i don't but still might be involved in tree */
650           else
651             {
652 
653               /* shared by how many? */
654               ct1 = PCTFS_ct_bits((char *)ptr3,p_mask_size*sizeof(PetscInt));
655 
656               /* none! */
657               if (ct1<2) continue;
658 
659               /* is it going to be done pairwise? but not by me of course!*/
660               if (--ct1<=level) continue;
661             }
662           /* LATER we're going to have to process it NOW */
663           /* nope ... tree it */
664           ierr = place_in_tree(j);CHKERRQ(ierr);
665         }
666     }
667 
668   free((void*)t_mask);
669   free((void*)buf1);
670 
671   gs->len_pw_list=npw;
672   gs->num_nghs = PCTFS_ct_bits((char *)sh_proc_mask,p_mask_size*sizeof(PetscInt));
673 
674   /* expand from bit mask list to int list and save ngh list */
675   gs->nghs = (PetscInt*) malloc(gs->num_nghs * sizeof(PetscInt));
676   PCTFS_bm_to_proc((char *)sh_proc_mask,p_mask_size*sizeof(PetscInt),gs->nghs);
677 
678   gs->num_pw_nghs = PCTFS_ct_bits((char *)pw_sh_proc_mask,p_mask_size*sizeof(PetscInt));
679 
680   oper = GL_MAX;
681   ct1 = gs->num_nghs;
682   ierr = PCTFS_giop(&ct1,&ct2,1,&oper);CHKERRQ(ierr);
683   gs->max_nghs = ct1;
684 
685   gs->tree_map_sz  = ntree_map;
686   gs->max_left_over=ntree;
687 
688   free((void*)p_mask);
689   free((void*)sh_proc_mask);
690   PetscFunctionReturn(0);
691 }
692 
693 /******************************************************************************/
694 static PetscErrorCode set_pairwise(PCTFS_gs_id *gs)
695 {
696    PetscInt i, j;
697   PetscInt p_mask_size;
698   PetscInt *p_mask, *sh_proc_mask, *tmp_proc_mask;
699   PetscInt *ngh_buf, *buf2;
700   PetscInt offset;
701   PetscInt *msg_list, *msg_size, **msg_nodes, nprs;
702   PetscInt *pairwise_elm_list, len_pair_list=0;
703   PetscInt *iptr, t1, i_start, nel, *elms;
704   PetscInt ct;
705   PetscErrorCode ierr;
706 
707   PetscFunctionBegin;
708   /* to make life easier */
709   nel  = gs->nel;
710   elms = gs->elms;
711   ngh_buf = gs->ngh_buf;
712   sh_proc_mask  = gs->pw_nghs;
713 
714   /* need a few temp masks */
715   p_mask_size   = PCTFS_len_bit_mask(PCTFS_num_nodes);
716   p_mask        = (PetscInt*) malloc(p_mask_size);
717   tmp_proc_mask = (PetscInt*) malloc(p_mask_size);
718 
719   /* set mask to my PCTFS_my_id's bit mask */
720   ierr = PCTFS_set_bit_mask(p_mask,p_mask_size,PCTFS_my_id);CHKERRQ(ierr);
721 
722   p_mask_size /= sizeof(PetscInt);
723 
724   len_pair_list=gs->len_pw_list;
725   gs->pw_elm_list=pairwise_elm_list=(PetscInt*)malloc((len_pair_list+1)*sizeof(PetscInt));
726 
727   /* how many processors (nghs) do we have to exchange with? */
728   nprs=gs->num_pairs=PCTFS_ct_bits((char *)sh_proc_mask,p_mask_size*sizeof(PetscInt));
729 
730 
731   /* allocate space for PCTFS_gs_gop() info */
732   gs->pair_list = msg_list = (PetscInt *)  malloc(sizeof(PetscInt)*nprs);
733   gs->msg_sizes = msg_size  = (PetscInt *)  malloc(sizeof(PetscInt)*nprs);
734   gs->node_list = msg_nodes = (PetscInt **) malloc(sizeof(PetscInt*)*(nprs+1));
735 
736   /* init msg_size list */
737   ierr = PCTFS_ivec_zero(msg_size,nprs);CHKERRQ(ierr);
738 
739   /* expand from bit mask list to int list */
740   ierr = PCTFS_bm_to_proc((char *)sh_proc_mask,p_mask_size*sizeof(PetscInt),msg_list);CHKERRQ(ierr);
741 
742   /* keep list of elements being handled pairwise */
743   for (i=j=0;i<nel;i++)
744     {
745       if (elms[i] & TOP_BIT)
746         {elms[i] ^= TOP_BIT; pairwise_elm_list[j++] = i;}
747     }
748   pairwise_elm_list[j] = -1;
749 
750   gs->msg_ids_out = (MPI_Request *)  malloc(sizeof(MPI_Request)*(nprs+1));
751   gs->msg_ids_out[nprs] = MPI_REQUEST_NULL;
752   gs->msg_ids_in = (MPI_Request *)  malloc(sizeof(MPI_Request)*(nprs+1));
753   gs->msg_ids_in[nprs] = MPI_REQUEST_NULL;
754   gs->pw_vals = (PetscScalar *) malloc(sizeof(PetscScalar)*len_pair_list*vec_sz);
755 
756   /* find who goes to each processor */
757   for (i_start=i=0;i<nprs;i++)
758     {
759       /* processor i's mask */
760       ierr = PCTFS_set_bit_mask(p_mask,p_mask_size*sizeof(PetscInt),msg_list[i]);CHKERRQ(ierr);
761 
762       /* det # going to processor i */
763       for (ct=j=0;j<len_pair_list;j++)
764         {
765           buf2 = ngh_buf+(pairwise_elm_list[j]*p_mask_size);
766           ierr = PCTFS_ivec_and3(tmp_proc_mask,p_mask,buf2,p_mask_size);CHKERRQ(ierr);
767           if (PCTFS_ct_bits((char *)tmp_proc_mask,p_mask_size*sizeof(PetscInt)))
768             {ct++;}
769         }
770       msg_size[i] = ct;
771       i_start = PetscMax(i_start,ct);
772 
773       /*space to hold nodes in message to first neighbor */
774       msg_nodes[i] = iptr = (PetscInt*) malloc(sizeof(PetscInt)*(ct+1));
775 
776       for (j=0;j<len_pair_list;j++)
777         {
778           buf2 = ngh_buf+(pairwise_elm_list[j]*p_mask_size);
779           ierr = PCTFS_ivec_and3(tmp_proc_mask,p_mask,buf2,p_mask_size);CHKERRQ(ierr);
780           if (PCTFS_ct_bits((char *)tmp_proc_mask,p_mask_size*sizeof(PetscInt)))
781             {*iptr++ = j;}
782         }
783       *iptr = -1;
784     }
785   msg_nodes[nprs] = NULL;
786 
787   j=gs->loc_node_pairs=i_start;
788   t1 = GL_MAX;
789   ierr = PCTFS_giop(&i_start,&offset,1,&t1);CHKERRQ(ierr);
790   gs->max_node_pairs = i_start;
791 
792   i_start=j;
793   t1 = GL_MIN;
794   ierr = PCTFS_giop(&i_start,&offset,1,&t1);CHKERRQ(ierr);
795   gs->min_node_pairs = i_start;
796 
797   i_start=j;
798   t1 = GL_ADD;
799   ierr = PCTFS_giop(&i_start,&offset,1,&t1);CHKERRQ(ierr);
800   gs->avg_node_pairs = i_start/PCTFS_num_nodes + 1;
801 
802   i_start=nprs;
803   t1 = GL_MAX;
804   PCTFS_giop(&i_start,&offset,1,&t1);
805   gs->max_pairs = i_start;
806 
807 
808   /* remap pairwise in tail of gsi_via_bit_mask() */
809   gs->msg_total = PCTFS_ivec_sum(gs->msg_sizes,nprs);
810   gs->out = (PetscScalar *) malloc(sizeof(PetscScalar)*gs->msg_total*vec_sz);
811   gs->in  = (PetscScalar *) malloc(sizeof(PetscScalar)*gs->msg_total*vec_sz);
812 
813   /* reset malloc pool */
814   free((void*)p_mask);
815   free((void*)tmp_proc_mask);
816   PetscFunctionReturn(0);
817 }
818 
819 /* to do pruned tree just save ngh buf copy for each one and decode here!
820 ******************************************************************************/
821 static PetscErrorCode set_tree(PCTFS_gs_id *gs)
822 {
823   PetscInt i, j, n, nel;
824   PetscInt *iptr_in, *iptr_out, *tree_elms, *elms;
825 
826   PetscFunctionBegin;
827   /* local work ptrs */
828   elms = gs->elms;
829   nel     = gs->nel;
830 
831   /* how many via tree */
832   gs->tree_nel  = n = ntree;
833   gs->tree_elms = tree_elms = iptr_in = tree_buf;
834   gs->tree_buf  = (PetscScalar *) malloc(sizeof(PetscScalar)*n*vec_sz);
835   gs->tree_work = (PetscScalar *) malloc(sizeof(PetscScalar)*n*vec_sz);
836   j=gs->tree_map_sz;
837   gs->tree_map_in = iptr_in  = (PetscInt*) malloc(sizeof(PetscInt)*(j+1));
838   gs->tree_map_out = iptr_out = (PetscInt*) malloc(sizeof(PetscInt)*(j+1));
839 
840   /* search the longer of the two lists */
841   /* note ... could save this info in get_ngh_buf and save searches */
842   if (n<=nel)
843     {
844       /* bijective fct w/remap - search elm list */
845       for (i=0; i<n; i++)
846         {
847           if ((j=PCTFS_ivec_binary_search(*tree_elms++,elms,nel))>=0)
848             {*iptr_in++ = j; *iptr_out++ = i;}
849         }
850     }
851   else
852     {
853       for (i=0; i<nel; i++)
854         {
855           if ((j=PCTFS_ivec_binary_search(*elms++,tree_elms,n))>=0)
856             {*iptr_in++ = i; *iptr_out++ = j;}
857         }
858     }
859 
860   /* sentinel */
861   *iptr_in = *iptr_out = -1;
862   PetscFunctionReturn(0);
863 }
864 
865 /******************************************************************************/
866 static PetscErrorCode PCTFS_gs_gop_local_out( PCTFS_gs_id *gs,  PetscScalar *vals)
867 {
868   PetscInt *num, *map, **reduce;
869   PetscScalar tmp;
870 
871   PetscFunctionBegin;
872   num    = gs->num_gop_local_reduce;
873   reduce = gs->gop_local_reduce;
874   while ((map = *reduce++))
875     {
876       /* wall */
877       if (*num == 2)
878         {
879           num ++;
880           vals[map[1]] = vals[map[0]];
881         }
882       /* corner shared by three elements */
883       else if (*num == 3)
884         {
885           num ++;
886           vals[map[2]] = vals[map[1]] = vals[map[0]];
887         }
888       /* corner shared by four elements */
889       else if (*num == 4)
890         {
891           num ++;
892           vals[map[3]] = vals[map[2]] = vals[map[1]] = vals[map[0]];
893         }
894       /* general case ... odd geoms ... 3D*/
895       else
896         {
897           num++;
898           tmp = *(vals + *map++);
899           while (*map >= 0)
900             {*(vals + *map++) = tmp;}
901         }
902     }
903   PetscFunctionReturn(0);
904 }
905 
906 /******************************************************************************/
907 static PetscErrorCode PCTFS_gs_gop_local_plus( PCTFS_gs_id *gs,  PetscScalar *vals)
908 {
909    PetscInt *num, *map, **reduce;
910    PetscScalar tmp;
911 
912   PetscFunctionBegin;
913   num    = gs->num_local_reduce;
914   reduce = gs->local_reduce;
915   while ((map = *reduce))
916     {
917       /* wall */
918       if (*num == 2)
919         {
920           num ++; reduce++;
921           vals[map[1]] = vals[map[0]] += vals[map[1]];
922         }
923       /* corner shared by three elements */
924       else if (*num == 3)
925         {
926           num ++; reduce++;
927           vals[map[2]]=vals[map[1]]=vals[map[0]]+=(vals[map[1]]+vals[map[2]]);
928         }
929       /* corner shared by four elements */
930       else if (*num == 4)
931         {
932           num ++; reduce++;
933           vals[map[1]]=vals[map[2]]=vals[map[3]]=vals[map[0]] +=
934                                  (vals[map[1]] + vals[map[2]] + vals[map[3]]);
935         }
936       /* general case ... odd geoms ... 3D*/
937       else
938         {
939           num ++;
940           tmp = 0.0;
941           while (*map >= 0)
942             {tmp += *(vals + *map++);}
943 
944           map = *reduce++;
945           while (*map >= 0)
946             {*(vals + *map++) = tmp;}
947         }
948     }
949   PetscFunctionReturn(0);
950 }
951 
952 /******************************************************************************/
953 static PetscErrorCode PCTFS_gs_gop_local_in_plus( PCTFS_gs_id *gs,  PetscScalar *vals)
954 {
955    PetscInt *num, *map, **reduce;
956    PetscScalar *base;
957 
958   PetscFunctionBegin;
959   num    = gs->num_gop_local_reduce;
960   reduce = gs->gop_local_reduce;
961   while ((map = *reduce++))
962     {
963       /* wall */
964       if (*num == 2)
965         {
966           num ++;
967           vals[map[0]] += vals[map[1]];
968         }
969       /* corner shared by three elements */
970       else if (*num == 3)
971         {
972           num ++;
973           vals[map[0]] += (vals[map[1]] + vals[map[2]]);
974         }
975       /* corner shared by four elements */
976       else if (*num == 4)
977         {
978           num ++;
979           vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]);
980         }
981       /* general case ... odd geoms ... 3D*/
982       else
983         {
984           num++;
985           base = vals + *map++;
986           while (*map >= 0)
987             {*base += *(vals + *map++);}
988         }
989     }
990   PetscFunctionReturn(0);
991 }
992 
993 /******************************************************************************/
994 PetscErrorCode PCTFS_gs_free( PCTFS_gs_id *gs)
995 {
996    PetscInt i;
997 
998   PetscFunctionBegin;
999   if (gs->nghs) {free((void*) gs->nghs);}
1000   if (gs->pw_nghs) {free((void*) gs->pw_nghs);}
1001 
1002   /* tree */
1003   if (gs->max_left_over)
1004     {
1005       if (gs->tree_elms) {free((void*) gs->tree_elms);}
1006       if (gs->tree_buf) {free((void*) gs->tree_buf);}
1007       if (gs->tree_work) {free((void*) gs->tree_work);}
1008       if (gs->tree_map_in) {free((void*) gs->tree_map_in);}
1009       if (gs->tree_map_out) {free((void*) gs->tree_map_out);}
1010     }
1011 
1012   /* pairwise info */
1013   if (gs->num_pairs)
1014     {
1015       /* should be NULL already */
1016       if (gs->ngh_buf) {free((void*) gs->ngh_buf);}
1017       if (gs->elms) {free((void*) gs->elms);}
1018       if (gs->local_elms) {free((void*) gs->local_elms);}
1019       if (gs->companion) {free((void*) gs->companion);}
1020 
1021       /* only set if pairwise */
1022       if (gs->vals) {free((void*) gs->vals);}
1023       if (gs->in) {free((void*) gs->in);}
1024       if (gs->out) {free((void*) gs->out);}
1025       if (gs->msg_ids_in) {free((void*) gs->msg_ids_in);}
1026       if (gs->msg_ids_out) {free((void*) gs->msg_ids_out);}
1027       if (gs->pw_vals) {free((void*) gs->pw_vals);}
1028       if (gs->pw_elm_list) {free((void*) gs->pw_elm_list);}
1029       if (gs->node_list)
1030         {
1031           for (i=0;i<gs->num_pairs;i++)
1032             {if (gs->node_list[i]) {free((void*) gs->node_list[i]);}}
1033           free((void*) gs->node_list);
1034         }
1035       if (gs->msg_sizes) {free((void*) gs->msg_sizes);}
1036       if (gs->pair_list) {free((void*) gs->pair_list);}
1037     }
1038 
1039   /* local info */
1040   if (gs->num_local_total>=0)
1041     {
1042       for (i=0;i<gs->num_local_total+1;i++)
1043         /*      for (i=0;i<gs->num_local_total;i++) */
1044         {
1045           if (gs->num_gop_local_reduce[i])
1046             {free((void*) gs->gop_local_reduce[i]);}
1047         }
1048     }
1049 
1050   /* if intersection tree/pairwise and local isn't empty */
1051   if (gs->gop_local_reduce) {free((void*) gs->gop_local_reduce);}
1052   if (gs->num_gop_local_reduce) {free((void*) gs->num_gop_local_reduce);}
1053 
1054   free((void*) gs);
1055   PetscFunctionReturn(0);
1056 }
1057 
1058 /******************************************************************************/
1059 PetscErrorCode PCTFS_gs_gop_vec( PCTFS_gs_id *gs,  PetscScalar *vals,  const char *op,  PetscInt step)
1060 {
1061   PetscErrorCode ierr;
1062 
1063   PetscFunctionBegin;
1064   switch (*op) {
1065   case '+':
1066     PCTFS_gs_gop_vec_plus(gs,vals,step);
1067     break;
1068   default:
1069     ierr = PetscInfo1(0,"PCTFS_gs_gop_vec() :: %c is not a valid op",op[0]);CHKERRQ(ierr);
1070     ierr = PetscInfo(0,"PCTFS_gs_gop_vec() :: default :: plus");CHKERRQ(ierr);
1071     PCTFS_gs_gop_vec_plus(gs,vals,step);
1072     break;
1073   }
1074   PetscFunctionReturn(0);
1075 }
1076 
1077 /******************************************************************************/
1078 static PetscErrorCode PCTFS_gs_gop_vec_plus( PCTFS_gs_id *gs,  PetscScalar *vals,  PetscInt step)
1079 {
1080   PetscFunctionBegin;
1081   if (!gs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"PCTFS_gs_gop_vec() passed NULL gs handle!!!");
1082 
1083   /* local only operations!!! */
1084   if (gs->num_local)
1085     {PCTFS_gs_gop_vec_local_plus(gs,vals,step);}
1086 
1087   /* if intersection tree/pairwise and local isn't empty */
1088   if (gs->num_local_gop)
1089     {
1090       PCTFS_gs_gop_vec_local_in_plus(gs,vals,step);
1091 
1092       /* pairwise */
1093       if (gs->num_pairs)
1094         {PCTFS_gs_gop_vec_pairwise_plus(gs,vals,step);}
1095 
1096       /* tree */
1097       else if (gs->max_left_over)
1098         {PCTFS_gs_gop_vec_tree_plus(gs,vals,step);}
1099 
1100       PCTFS_gs_gop_vec_local_out(gs,vals,step);
1101     }
1102   /* if intersection tree/pairwise and local is empty */
1103   else
1104     {
1105       /* pairwise */
1106       if (gs->num_pairs)
1107         {PCTFS_gs_gop_vec_pairwise_plus(gs,vals,step);}
1108 
1109       /* tree */
1110       else if (gs->max_left_over)
1111         {PCTFS_gs_gop_vec_tree_plus(gs,vals,step);}
1112     }
1113   PetscFunctionReturn(0);
1114 }
1115 
1116 /******************************************************************************/
1117 static PetscErrorCode PCTFS_gs_gop_vec_local_plus( PCTFS_gs_id *gs,  PetscScalar *vals, PetscInt step)
1118 {
1119    PetscInt *num, *map, **reduce;
1120    PetscScalar *base;
1121 
1122   PetscFunctionBegin;
1123   num    = gs->num_local_reduce;
1124   reduce = gs->local_reduce;
1125   while ((map = *reduce))
1126     {
1127       base = vals + map[0] * step;
1128 
1129       /* wall */
1130       if (*num == 2)
1131         {
1132           num++; reduce++;
1133           PCTFS_rvec_add (base,vals+map[1]*step,step);
1134           PCTFS_rvec_copy(vals+map[1]*step,base,step);
1135         }
1136       /* corner shared by three elements */
1137       else if (*num == 3)
1138         {
1139           num++; reduce++;
1140           PCTFS_rvec_add (base,vals+map[1]*step,step);
1141           PCTFS_rvec_add (base,vals+map[2]*step,step);
1142           PCTFS_rvec_copy(vals+map[2]*step,base,step);
1143           PCTFS_rvec_copy(vals+map[1]*step,base,step);
1144         }
1145       /* corner shared by four elements */
1146       else if (*num == 4)
1147         {
1148           num++; reduce++;
1149           PCTFS_rvec_add (base,vals+map[1]*step,step);
1150           PCTFS_rvec_add (base,vals+map[2]*step,step);
1151           PCTFS_rvec_add (base,vals+map[3]*step,step);
1152           PCTFS_rvec_copy(vals+map[3]*step,base,step);
1153           PCTFS_rvec_copy(vals+map[2]*step,base,step);
1154           PCTFS_rvec_copy(vals+map[1]*step,base,step);
1155         }
1156       /* general case ... odd geoms ... 3D */
1157       else
1158         {
1159           num++;
1160           while (*++map >= 0)
1161             {PCTFS_rvec_add (base,vals+*map*step,step);}
1162 
1163           map = *reduce;
1164           while (*++map >= 0)
1165             {PCTFS_rvec_copy(vals+*map*step,base,step);}
1166 
1167           reduce++;
1168         }
1169     }
1170   PetscFunctionReturn(0);
1171 }
1172 
1173 /******************************************************************************/
1174 static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus( PCTFS_gs_id *gs,  PetscScalar *vals, PetscInt step)
1175 {
1176    PetscInt  *num, *map, **reduce;
1177    PetscScalar *base;
1178   PetscFunctionBegin;
1179   num    = gs->num_gop_local_reduce;
1180   reduce = gs->gop_local_reduce;
1181   while ((map = *reduce++))
1182     {
1183       base = vals + map[0] * step;
1184 
1185       /* wall */
1186       if (*num == 2)
1187         {
1188           num ++;
1189           PCTFS_rvec_add(base,vals+map[1]*step,step);
1190         }
1191       /* corner shared by three elements */
1192       else if (*num == 3)
1193         {
1194           num ++;
1195           PCTFS_rvec_add(base,vals+map[1]*step,step);
1196           PCTFS_rvec_add(base,vals+map[2]*step,step);
1197         }
1198       /* corner shared by four elements */
1199       else if (*num == 4)
1200         {
1201           num ++;
1202           PCTFS_rvec_add(base,vals+map[1]*step,step);
1203           PCTFS_rvec_add(base,vals+map[2]*step,step);
1204           PCTFS_rvec_add(base,vals+map[3]*step,step);
1205         }
1206       /* general case ... odd geoms ... 3D*/
1207       else
1208         {
1209           num++;
1210           while (*++map >= 0)
1211             {PCTFS_rvec_add(base,vals+*map*step,step);}
1212         }
1213     }
1214   PetscFunctionReturn(0);
1215 }
1216 
1217 /******************************************************************************/
1218 static PetscErrorCode PCTFS_gs_gop_vec_local_out( PCTFS_gs_id *gs,  PetscScalar *vals, PetscInt step)
1219 {
1220    PetscInt *num, *map, **reduce;
1221    PetscScalar *base;
1222 
1223   PetscFunctionBegin;
1224   num    = gs->num_gop_local_reduce;
1225   reduce = gs->gop_local_reduce;
1226   while ((map = *reduce++))
1227     {
1228       base = vals + map[0] * step;
1229 
1230       /* wall */
1231       if (*num == 2)
1232         {
1233           num ++;
1234           PCTFS_rvec_copy(vals+map[1]*step,base,step);
1235         }
1236       /* corner shared by three elements */
1237       else if (*num == 3)
1238         {
1239           num ++;
1240           PCTFS_rvec_copy(vals+map[1]*step,base,step);
1241           PCTFS_rvec_copy(vals+map[2]*step,base,step);
1242         }
1243       /* corner shared by four elements */
1244       else if (*num == 4)
1245         {
1246           num ++;
1247           PCTFS_rvec_copy(vals+map[1]*step,base,step);
1248           PCTFS_rvec_copy(vals+map[2]*step,base,step);
1249           PCTFS_rvec_copy(vals+map[3]*step,base,step);
1250         }
1251       /* general case ... odd geoms ... 3D*/
1252       else
1253         {
1254           num++;
1255           while (*++map >= 0)
1256             {PCTFS_rvec_copy(vals+*map*step,base,step);}
1257         }
1258     }
1259   PetscFunctionReturn(0);
1260 }
1261 
1262 /******************************************************************************/
1263 static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus( PCTFS_gs_id *gs,  PetscScalar *in_vals, PetscInt step)
1264 {
1265   PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2;
1266   PetscInt *iptr, *msg_list, *msg_size, **msg_nodes;
1267   PetscInt *pw, *list, *size, **nodes;
1268   MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
1269   MPI_Status status;
1270   PetscBLASInt i1 = 1,dstep;
1271   PetscErrorCode ierr;
1272 
1273   PetscFunctionBegin;
1274   /* strip and load s */
1275   msg_list =list         = gs->pair_list;
1276   msg_size =size         = gs->msg_sizes;
1277   msg_nodes=nodes        = gs->node_list;
1278   iptr=pw                = gs->pw_elm_list;
1279   dptr1=dptr3            = gs->pw_vals;
1280   msg_ids_in  = ids_in   = gs->msg_ids_in;
1281   msg_ids_out = ids_out  = gs->msg_ids_out;
1282   dptr2                  = gs->out;
1283   in1=in2                = gs->in;
1284 
1285   /* post the receives */
1286   /*  msg_nodes=nodes; */
1287   do
1288     {
1289       /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
1290          second one *list and do list++ afterwards */
1291       ierr = MPI_Irecv(in1, *size *step, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in);CHKERRQ(ierr);
1292       list++;msg_ids_in++;
1293       in1 += *size++ *step;
1294     }
1295   while (*++msg_nodes);
1296   msg_nodes=nodes;
1297 
1298   /* load gs values into in out gs buffers */
1299   while (*iptr >= 0)
1300     {
1301       PCTFS_rvec_copy(dptr3,in_vals + *iptr*step,step);
1302       dptr3+=step;
1303       iptr++;
1304     }
1305 
1306   /* load out buffers and post the sends */
1307   while ((iptr = *msg_nodes++))
1308     {
1309       dptr3 = dptr2;
1310       while (*iptr >= 0)
1311         {
1312           PCTFS_rvec_copy(dptr2,dptr1 + *iptr*step,step);
1313           dptr2+=step;
1314           iptr++;
1315         }
1316       ierr = MPI_Isend(dptr3, *msg_size *step, MPIU_SCALAR, *msg_list, MSGTAG1+PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out);CHKERRQ(ierr);
1317       msg_size++; msg_list++;msg_ids_out++;
1318     }
1319 
1320   /* tree */
1321   if (gs->max_left_over)
1322     {PCTFS_gs_gop_vec_tree_plus(gs,in_vals,step);}
1323 
1324   /* process the received data */
1325   msg_nodes=nodes;
1326   while ((iptr = *nodes++)){
1327     PetscScalar d1 = 1.0;
1328       /* Should I check the return value of MPI_Wait() or status? */
1329       /* Can this loop be replaced by a call to MPI_Waitall()? */
1330       ierr = MPI_Wait(ids_in, &status);CHKERRQ(ierr);
1331       ids_in++;
1332       while (*iptr >= 0) {
1333 	dstep = PetscBLASIntCast(step);
1334         BLASaxpy_(&dstep,&d1,in2,&i1,dptr1 + *iptr*step,&i1);
1335 	in2+=step;
1336 	iptr++;
1337       }
1338   }
1339 
1340   /* replace vals */
1341   while (*pw >= 0)
1342     {
1343       PCTFS_rvec_copy(in_vals + *pw*step,dptr1,step);
1344       dptr1+=step;
1345       pw++;
1346     }
1347 
1348   /* clear isend message handles */
1349   /* This changed for clarity though it could be the same */
1350   while (*msg_nodes++)
1351     /* Should I check the return value of MPI_Wait() or status? */
1352     /* Can this loop be replaced by a call to MPI_Waitall()? */
1353     {ierr = MPI_Wait(ids_out, &status);CHKERRQ(ierr);ids_out++;}
1354 
1355 
1356   PetscFunctionReturn(0);
1357 }
1358 
1359 /******************************************************************************/
1360 static PetscErrorCode PCTFS_gs_gop_vec_tree_plus( PCTFS_gs_id *gs,  PetscScalar *vals,  PetscInt step)
1361 {
1362   PetscInt size, *in, *out;
1363   PetscScalar *buf, *work;
1364   PetscInt op[] = {GL_ADD,0};
1365   PetscBLASInt i1 = 1;
1366 
1367   PetscFunctionBegin;
1368   /* copy over to local variables */
1369   in   = gs->tree_map_in;
1370   out  = gs->tree_map_out;
1371   buf  = gs->tree_buf;
1372   work = gs->tree_work;
1373   size = gs->tree_nel*step;
1374 
1375   /* zero out collection buffer */
1376   PCTFS_rvec_zero(buf,size);
1377 
1378 
1379   /* copy over my contributions */
1380   while (*in >= 0)
1381     {
1382       PetscBLASInt dstep = PetscBLASIntCast(step);
1383       BLAScopy_(&dstep,vals + *in++*step,&i1,buf + *out++*step,&i1);
1384     }
1385 
1386   /* perform fan in/out on full buffer */
1387   /* must change PCTFS_grop to handle the blas */
1388   PCTFS_grop(buf,work,size,op);
1389 
1390   /* reset */
1391   in   = gs->tree_map_in;
1392   out  = gs->tree_map_out;
1393 
1394   /* get the portion of the results I need */
1395   while (*in >= 0)
1396     {
1397       PetscBLASInt dstep = PetscBLASIntCast(step);
1398       BLAScopy_(&dstep,buf + *out++*step,&i1,vals + *in++*step,&i1);
1399     }
1400   PetscFunctionReturn(0);
1401 }
1402 
1403 /******************************************************************************/
1404 PetscErrorCode PCTFS_gs_gop_hc( PCTFS_gs_id *gs,  PetscScalar *vals,  const char *op,  PetscInt dim)
1405 {
1406   PetscErrorCode ierr;
1407 
1408   PetscFunctionBegin;
1409   switch (*op) {
1410   case '+':
1411     PCTFS_gs_gop_plus_hc(gs,vals,dim);
1412     break;
1413   default:
1414     ierr = PetscInfo1(0,"PCTFS_gs_gop_hc() :: %c is not a valid op",op[0]);CHKERRQ(ierr);
1415     ierr = PetscInfo(0,"PCTFS_gs_gop_hc() :: default :: plus\n");CHKERRQ(ierr);
1416     PCTFS_gs_gop_plus_hc(gs,vals,dim);
1417     break;
1418   }
1419   PetscFunctionReturn(0);
1420 }
1421 
1422 /******************************************************************************/
1423 static PetscErrorCode PCTFS_gs_gop_plus_hc( PCTFS_gs_id *gs,  PetscScalar *vals, PetscInt dim)
1424 {
1425   PetscFunctionBegin;
1426   /* if there's nothing to do return */
1427   if (dim<=0)
1428     {  PetscFunctionReturn(0);}
1429 
1430   /* can't do more dimensions then exist */
1431   dim = PetscMin(dim,PCTFS_i_log2_num_nodes);
1432 
1433   /* local only operations!!! */
1434   if (gs->num_local)
1435     {PCTFS_gs_gop_local_plus(gs,vals);}
1436 
1437   /* if intersection tree/pairwise and local isn't empty */
1438   if (gs->num_local_gop)
1439     {
1440       PCTFS_gs_gop_local_in_plus(gs,vals);
1441 
1442       /* pairwise will do tree inside ... */
1443       if (gs->num_pairs)
1444         {PCTFS_gs_gop_pairwise_plus_hc(gs,vals,dim);}
1445 
1446       /* tree only */
1447       else if (gs->max_left_over)
1448         {PCTFS_gs_gop_tree_plus_hc(gs,vals,dim);}
1449 
1450       PCTFS_gs_gop_local_out(gs,vals);
1451     }
1452   /* if intersection tree/pairwise and local is empty */
1453   else
1454     {
1455       /* pairwise will do tree inside */
1456       if (gs->num_pairs)
1457         {PCTFS_gs_gop_pairwise_plus_hc(gs,vals,dim);}
1458 
1459       /* tree */
1460       else if (gs->max_left_over)
1461         {PCTFS_gs_gop_tree_plus_hc(gs,vals,dim);}
1462     }
1463   PetscFunctionReturn(0);
1464 }
1465 
1466 /******************************************************************************/
1467 static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc( PCTFS_gs_id *gs,  PetscScalar *in_vals, PetscInt dim)
1468 {
1469    PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2;
1470    PetscInt *iptr, *msg_list, *msg_size, **msg_nodes;
1471    PetscInt *pw, *list, *size, **nodes;
1472   MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
1473   MPI_Status status;
1474   PetscInt i, mask=1;
1475   PetscErrorCode ierr;
1476 
1477   PetscFunctionBegin;
1478   for (i=1; i<dim; i++)
1479     {mask<<=1; mask++;}
1480 
1481 
1482   /* strip and load s */
1483   msg_list =list         = gs->pair_list;
1484   msg_size =size         = gs->msg_sizes;
1485   msg_nodes=nodes        = gs->node_list;
1486   iptr=pw                = gs->pw_elm_list;
1487   dptr1=dptr3            = gs->pw_vals;
1488   msg_ids_in  = ids_in   = gs->msg_ids_in;
1489   msg_ids_out = ids_out  = gs->msg_ids_out;
1490   dptr2                  = gs->out;
1491   in1=in2                = gs->in;
1492 
1493   /* post the receives */
1494   /*  msg_nodes=nodes; */
1495   do
1496     {
1497       /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
1498          second one *list and do list++ afterwards */
1499       if ((PCTFS_my_id|mask)==(*list|mask))
1500         {
1501           ierr = MPI_Irecv(in1, *size, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in);CHKERRQ(ierr);
1502           list++; msg_ids_in++;in1 += *size++;
1503         }
1504       else
1505         {list++; size++;}
1506     }
1507   while (*++msg_nodes);
1508 
1509   /* load gs values into in out gs buffers */
1510   while (*iptr >= 0)
1511     {*dptr3++ = *(in_vals + *iptr++);}
1512 
1513   /* load out buffers and post the sends */
1514   msg_nodes=nodes;
1515   list = msg_list;
1516   while ((iptr = *msg_nodes++))
1517     {
1518       if ((PCTFS_my_id|mask)==(*list|mask))
1519         {
1520           dptr3 = dptr2;
1521           while (*iptr >= 0)
1522             {*dptr2++ = *(dptr1 + *iptr++);}
1523           /* CHECK PERSISTENT COMMS MODE FOR ALL THIS STUFF */
1524           /* is msg_ids_out++ correct? */
1525           ierr = MPI_Isend(dptr3, *msg_size, MPIU_SCALAR, *list, MSGTAG1+PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out);CHKERRQ(ierr);
1526           msg_size++;list++;msg_ids_out++;
1527         }
1528       else
1529         {list++; msg_size++;}
1530     }
1531 
1532   /* do the tree while we're waiting */
1533   if (gs->max_left_over)
1534     {PCTFS_gs_gop_tree_plus_hc(gs,in_vals,dim);}
1535 
1536   /* process the received data */
1537   msg_nodes=nodes;
1538   list = msg_list;
1539   while ((iptr = *nodes++))
1540     {
1541       if ((PCTFS_my_id|mask)==(*list|mask))
1542         {
1543           /* Should I check the return value of MPI_Wait() or status? */
1544           /* Can this loop be replaced by a call to MPI_Waitall()? */
1545           ierr = MPI_Wait(ids_in, &status);CHKERRQ(ierr);
1546           ids_in++;
1547           while (*iptr >= 0)
1548             {*(dptr1 + *iptr++) += *in2++;}
1549         }
1550       list++;
1551     }
1552 
1553   /* replace vals */
1554   while (*pw >= 0)
1555     {*(in_vals + *pw++) = *dptr1++;}
1556 
1557   /* clear isend message handles */
1558   /* This changed for clarity though it could be the same */
1559   while (*msg_nodes++)
1560     {
1561       if ((PCTFS_my_id|mask)==(*msg_list|mask))
1562         {
1563           /* Should I check the return value of MPI_Wait() or status? */
1564           /* Can this loop be replaced by a call to MPI_Waitall()? */
1565           ierr = MPI_Wait(ids_out, &status);CHKERRQ(ierr);
1566           ids_out++;
1567         }
1568       msg_list++;
1569     }
1570 
1571   PetscFunctionReturn(0);
1572 }
1573 
1574 /******************************************************************************/
1575 static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim)
1576 {
1577   PetscInt size;
1578   PetscInt *in, *out;
1579   PetscScalar *buf, *work;
1580   PetscInt op[] = {GL_ADD,0};
1581 
1582   PetscFunctionBegin;
1583   in   = gs->tree_map_in;
1584   out  = gs->tree_map_out;
1585   buf  = gs->tree_buf;
1586   work = gs->tree_work;
1587   size = gs->tree_nel;
1588 
1589   PCTFS_rvec_zero(buf,size);
1590 
1591   while (*in >= 0)
1592     {*(buf + *out++) = *(vals + *in++);}
1593 
1594   in   = gs->tree_map_in;
1595   out  = gs->tree_map_out;
1596 
1597   PCTFS_grop_hc(buf,work,size,op,dim);
1598 
1599   while (*in >= 0)
1600     {*(vals + *in++) = *(buf + *out++);}
1601   PetscFunctionReturn(0);
1602 }
1603 
1604 
1605 
1606