xref: /petsc/src/ksp/pc/impls/tfs/gs.c (revision cbd2c53bc8d4e32f1aab714f173fd0b4c821fa72)
1 
2 /***********************************gs.c***************************************
3 
4 Author: Henry M. Tufo III
5 
6 e-mail: hmt@cs.brown.edu
7 
8 snail-mail:
9 Division of Applied Mathematics
10 Brown University
11 Providence, RI 02912
12 
13 Last Modification:
14 6.21.97
15 ************************************gs.c**************************************/
16 
17 /***********************************gs.c***************************************
18 File Description:
19 -----------------
20 
21 ************************************gs.c**************************************/
22 
23 #include <../src/ksp/pc/impls/tfs/tfs.h>
24 
25 /* default length of number of items via tree - doubles if exceeded */
26 #define TREE_BUF_SZ 2048;
27 #define GS_VEC_SZ   1
28 
29 /***********************************gs.c***************************************
30 Type: struct gather_scatter_id
31 ------------------------------
32 
33 ************************************gs.c**************************************/
34 typedef struct gather_scatter_id {
35   PetscInt    id;
36   PetscInt    nel_min;
37   PetscInt    nel_max;
38   PetscInt    nel_sum;
39   PetscInt    negl;
40   PetscInt    gl_max;
41   PetscInt    gl_min;
42   PetscInt    repeats;
43   PetscInt    ordered;
44   PetscInt    positive;
45   PetscScalar *vals;
46 
47   /* bit mask info */
48   PetscInt *my_proc_mask;
49   PetscInt mask_sz;
50   PetscInt *ngh_buf;
51   PetscInt ngh_buf_sz;
52   PetscInt *nghs;
53   PetscInt num_nghs;
54   PetscInt max_nghs;
55   PetscInt *pw_nghs;
56   PetscInt num_pw_nghs;
57   PetscInt *tree_nghs;
58   PetscInt num_tree_nghs;
59 
60   PetscInt num_loads;
61 
62   /* repeats == true -> local info */
63   PetscInt nel;         /* number of unique elememts */
64   PetscInt *elms;       /* of size nel */
65   PetscInt nel_total;
66   PetscInt *local_elms; /* of size nel_total */
67   PetscInt *companion;  /* of size nel_total */
68 
69   /* local info */
70   PetscInt num_local_total;
71   PetscInt local_strength;
72   PetscInt num_local;
73   PetscInt *num_local_reduce;
74   PetscInt **local_reduce;
75   PetscInt num_local_gop;
76   PetscInt *num_gop_local_reduce;
77   PetscInt **gop_local_reduce;
78 
79   /* pairwise info */
80   PetscInt    level;
81   PetscInt    num_pairs;
82   PetscInt    max_pairs;
83   PetscInt    loc_node_pairs;
84   PetscInt    max_node_pairs;
85   PetscInt    min_node_pairs;
86   PetscInt    avg_node_pairs;
87   PetscInt    *pair_list;
88   PetscInt    *msg_sizes;
89   PetscInt    **node_list;
90   PetscInt    len_pw_list;
91   PetscInt    *pw_elm_list;
92   PetscScalar *pw_vals;
93 
94   MPI_Request *msg_ids_in;
95   MPI_Request *msg_ids_out;
96 
97   PetscScalar *out;
98   PetscScalar *in;
99   PetscInt    msg_total;
100 
101   /* tree - crystal accumulator info */
102   PetscInt max_left_over;
103   PetscInt *pre;
104   PetscInt *in_num;
105   PetscInt *out_num;
106   PetscInt **in_list;
107   PetscInt **out_list;
108 
109   /* new tree work*/
110   PetscInt    tree_nel;
111   PetscInt    *tree_elms;
112   PetscScalar *tree_buf;
113   PetscScalar *tree_work;
114 
115   PetscInt tree_map_sz;
116   PetscInt *tree_map_in;
117   PetscInt *tree_map_out;
118 
119   /* current memory status */
120   PetscInt gl_bss_min;
121   PetscInt gl_perm_min;
122 
123   /* max segment size for PCTFS_gs_gop_vec() */
124   PetscInt vec_sz;
125 
126   /* hack to make paul happy */
127   MPI_Comm PCTFS_gs_comm;
128 
129 } PCTFS_gs_id;
130 
131 static PCTFS_gs_id *gsi_check_args(PetscInt *elms, PetscInt nel, PetscInt level);
132 static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs);
133 static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs);
134 static PetscErrorCode set_pairwise(PCTFS_gs_id *gs);
135 static PCTFS_gs_id *gsi_new(void);
136 static PetscErrorCode set_tree(PCTFS_gs_id *gs);
137 
138 /* same for all but vector flavor */
139 static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs, PetscScalar *vals);
140 /* vector flavor */
141 static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
142 
143 static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step);
144 static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step);
145 static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
146 static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
147 static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
148 
149 static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs, PetscScalar *vals);
150 static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals);
151 
152 static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim);
153 static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim);
154 static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim);
155 
156 /* global vars */
157 /* from comm.c module */
158 
159 static PetscInt num_gs_ids = 0;
160 
161 /* should make this dynamic ... later */
162 static PetscInt msg_buf    =MAX_MSG_BUF;
163 static PetscInt vec_sz     =GS_VEC_SZ;
164 static PetscInt *tree_buf  =NULL;
165 static PetscInt tree_buf_sz=0;
166 static PetscInt ntree      =0;
167 
168 /***************************************************************************/
169 PetscErrorCode PCTFS_gs_init_vec_sz(PetscInt size)
170 {
171   PetscFunctionBegin;
172   vec_sz = size;
173   PetscFunctionReturn(0);
174 }
175 
176 /******************************************************************************/
177 PetscErrorCode PCTFS_gs_init_msg_buf_sz(PetscInt buf_size)
178 {
179   PetscFunctionBegin;
180   msg_buf = buf_size;
181   PetscFunctionReturn(0);
182 }
183 
184 /******************************************************************************/
185 PCTFS_gs_id *PCTFS_gs_init(PetscInt *elms, PetscInt nel, PetscInt level)
186 {
187   PCTFS_gs_id    *gs;
188   MPI_Group      PCTFS_gs_group;
189   MPI_Comm       PCTFS_gs_comm;
190   PetscErrorCode ierr;
191 
192   PetscFunctionBegin;
193   /* ensure that communication package has been initialized */
194   PCTFS_comm_init();
195 
196   /* determines if we have enough dynamic/semi-static memory */
197   /* checks input, allocs and sets gd_id template            */
198   gs = gsi_check_args(elms,nel,level);
199 
200   /* only bit mask version up and working for the moment    */
201   /* LATER :: get int list version working for sparse pblms */
202   ierr = gsi_via_bit_mask(gs);CHKERRABORT(PETSC_COMM_WORLD,ierr);
203 
204   ierr = MPI_Comm_group(MPI_COMM_WORLD,&PCTFS_gs_group);CHKERRABORT(PETSC_COMM_WORLD,ierr);
205   ierr = MPI_Comm_create(MPI_COMM_WORLD,PCTFS_gs_group,&PCTFS_gs_comm);CHKERRABORT(PETSC_COMM_WORLD,ierr);
206   ierr = MPI_Group_free(&PCTFS_gs_group);CHKERRABORT(PETSC_COMM_WORLD,ierr);
207 
208   gs->PCTFS_gs_comm=PCTFS_gs_comm;
209 
210   return(gs);
211 }
212 
213 /******************************************************************************/
214 static PCTFS_gs_id *gsi_new(void)
215 {
216   PetscErrorCode ierr;
217   PCTFS_gs_id    *gs;
218   gs   = (PCTFS_gs_id*) malloc(sizeof(PCTFS_gs_id));
219   ierr = PetscMemzero(gs,sizeof(PCTFS_gs_id));CHKERRABORT(PETSC_COMM_WORLD,ierr);
220   return(gs);
221 }
222 
223 /******************************************************************************/
224 static PCTFS_gs_id *gsi_check_args(PetscInt *in_elms, PetscInt nel, PetscInt level)
225 {
226   PetscInt       i, j, k, t2;
227   PetscInt       *companion, *elms, *unique, *iptr;
228   PetscInt       num_local=0, *num_to_reduce, **local_reduce;
229   PetscInt       oprs[]   = {NON_UNIFORM,GL_MIN,GL_MAX,GL_ADD,GL_MIN,GL_MAX,GL_MIN,GL_B_AND};
230   PetscInt       vals[sizeof(oprs)/sizeof(oprs[0])-1];
231   PetscInt       work[sizeof(oprs)/sizeof(oprs[0])-1];
232   PCTFS_gs_id    *gs;
233   PetscErrorCode ierr;
234 
235   if (!in_elms) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"elms point to nothing!!!\n");
236   if (nel<0)    SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"can't have fewer than 0 elms!!!\n");
237 
238   if (nel==0) { ierr = PetscInfo(0,"I don't have any elements!!!\n");CHKERRABORT(PETSC_COMM_WORLD,ierr); }
239 
240   /* get space for gs template */
241   gs     = gsi_new();
242   gs->id = ++num_gs_ids;
243 
244   /* hmt 6.4.99                                            */
245   /* caller can set global ids that don't participate to 0 */
246   /* PCTFS_gs_init ignores all zeros in elm list                 */
247   /* negative global ids are still invalid                 */
248   for (i=j=0; i<nel; i++) {
249     if (in_elms[i]!=0) j++;
250   }
251 
252   k=nel; nel=j;
253 
254   /* copy over in_elms list and create inverse map */
255   elms      = (PetscInt*) malloc((nel+1)*sizeof(PetscInt));
256   companion = (PetscInt*) malloc(nel*sizeof(PetscInt));
257 
258   for (i=j=0; i<k; i++) {
259     if (in_elms[i]!=0) { elms[j] = in_elms[i]; companion[j++] = i; }
260   }
261 
262   if (j!=nel) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"nel j mismatch!\n");
263 
264   /* pre-pass ... check to see if sorted */
265   elms[nel] = INT_MAX;
266   iptr      = elms;
267   unique    = elms+1;
268   j         =0;
269   while (*iptr!=INT_MAX) {
270     if (*iptr++>*unique++) { j=1; break; }
271   }
272 
273   /* set up inverse map */
274   if (j) {
275     ierr = PetscInfo(0,"gsi_check_args() :: elm list *not* sorted!\n");CHKERRABORT(PETSC_COMM_WORLD,ierr);
276     ierr = PCTFS_SMI_sort((void*)elms, (void*)companion, nel, SORT_INTEGER);CHKERRABORT(PETSC_COMM_WORLD,ierr);
277   } else { ierr = PetscInfo(0,"gsi_check_args() :: elm list sorted!\n");CHKERRABORT(PETSC_COMM_WORLD,ierr); }
278   elms[nel] = INT_MIN;
279 
280   /* first pass */
281   /* determine number of unique elements, check pd */
282   for (i=k=0; i<nel; i+=j) {
283     t2 = elms[i];
284     j  = ++i;
285 
286     /* clump 'em for now */
287     while (elms[j]==t2) j++;
288 
289     /* how many together and num local */
290     if (j-=i) { num_local++; k+=j; }
291   }
292 
293   /* how many unique elements? */
294   gs->repeats = k;
295   gs->nel     = nel-k;
296 
297   /* number of repeats? */
298   gs->num_local        = num_local;
299   num_local           += 2;
300   gs->local_reduce     = local_reduce=(PetscInt**)malloc(num_local*sizeof(PetscInt*));
301   gs->num_local_reduce = num_to_reduce=(PetscInt*) malloc(num_local*sizeof(PetscInt));
302 
303   unique         = (PetscInt*) malloc((gs->nel+1)*sizeof(PetscInt));
304   gs->elms       = unique;
305   gs->nel_total  = nel;
306   gs->local_elms = elms;
307   gs->companion  = companion;
308 
309   /* compess map as well as keep track of local ops */
310   for (num_local=i=j=0; i<gs->nel; i++) {
311     k            = j;
312     t2           = unique[i] = elms[j];
313     companion[i] = companion[j];
314 
315     while (elms[j]==t2) j++;
316 
317     if ((t2=(j-k))>1) {
318       /* number together */
319       num_to_reduce[num_local] = t2++;
320 
321       iptr = local_reduce[num_local++] = (PetscInt*)malloc(t2*sizeof(PetscInt));
322 
323       /* to use binary searching don't remap until we check intersection */
324       *iptr++ = i;
325 
326       /* note that we're skipping the first one */
327       while (++k<j) *(iptr++) = companion[k];
328       *iptr = -1;
329     }
330   }
331 
332   /* sentinel for ngh_buf */
333   unique[gs->nel]=INT_MAX;
334 
335   /* for two partition sort hack */
336   num_to_reduce[num_local]   = 0;
337   local_reduce[num_local]    = NULL;
338   num_to_reduce[++num_local] = 0;
339   local_reduce[num_local]    = NULL;
340 
341   /* load 'em up */
342   /* note one extra to hold NON_UNIFORM flag!!! */
343   vals[2] = vals[1] = vals[0] = nel;
344   if (gs->nel>0) {
345     vals[3] = unique[0];
346     vals[4] = unique[gs->nel-1];
347   } else {
348     vals[3] = INT_MAX;
349     vals[4] = INT_MIN;
350   }
351   vals[5] = level;
352   vals[6] = num_gs_ids;
353 
354   /* GLOBAL: send 'em out */
355   ierr = PCTFS_giop(vals,work,sizeof(oprs)/sizeof(oprs[0])-1,oprs);CHKERRABORT(PETSC_COMM_WORLD,ierr);
356 
357   /* must be semi-pos def - only pairwise depends on this */
358   /* LATER - remove this restriction */
359   if (vals[3]<0) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system not semi-pos def \n");
360   if (vals[4]==INT_MAX) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system ub too large !\n");
361 
362   gs->nel_min = vals[0];
363   gs->nel_max = vals[1];
364   gs->nel_sum = vals[2];
365   gs->gl_min  = vals[3];
366   gs->gl_max  = vals[4];
367   gs->negl    = vals[4]-vals[3]+1;
368 
369   if (gs->negl<=0) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system empty or neg :: %d\n");
370 
371   /* LATER :: add level == -1 -> program selects level */
372   if (vals[5]<0) vals[5]=0;
373   else if (vals[5]>PCTFS_num_nodes) vals[5]=PCTFS_num_nodes;
374   gs->level = vals[5];
375 
376   return(gs);
377 }
378 
379 /******************************************************************************/
380 static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs)
381 {
382   PetscInt       i, nel, *elms;
383   PetscInt       t1;
384   PetscInt       **reduce;
385   PetscInt       *map;
386   PetscErrorCode ierr;
387 
388   PetscFunctionBegin;
389   /* totally local removes ... PCTFS_ct_bits == 0 */
390   get_ngh_buf(gs);
391 
392   if (gs->level) set_pairwise(gs);
393   if (gs->max_left_over) set_tree(gs);
394 
395   /* intersection local and pairwise/tree? */
396   gs->num_local_total      = gs->num_local;
397   gs->gop_local_reduce     = gs->local_reduce;
398   gs->num_gop_local_reduce = gs->num_local_reduce;
399 
400   map = gs->companion;
401 
402   /* is there any local compression */
403   if (!gs->num_local) {
404     gs->local_strength = NONE;
405     gs->num_local_gop  = 0;
406   } else {
407     /* ok find intersection */
408     map    = gs->companion;
409     reduce = gs->local_reduce;
410     for (i=0, t1=0; i<gs->num_local; i++, reduce++) {
411       if ((PCTFS_ivec_binary_search(**reduce,gs->pw_elm_list,gs->len_pw_list)>=0) || PCTFS_ivec_binary_search(**reduce,gs->tree_map_in,gs->tree_map_sz)>=0) {
412         t1++;
413         if (gs->num_local_reduce[i]<=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"nobody in list?");
414         gs->num_local_reduce[i] *= -1;
415       }
416       **reduce=map[**reduce];
417     }
418 
419     /* intersection is empty */
420     if (!t1) {
421       gs->local_strength = FULL;
422       gs->num_local_gop  = 0;
423     } else { /* intersection not empty */
424       gs->local_strength = PARTIAL;
425 
426       ierr = PCTFS_SMI_sort((void*)gs->num_local_reduce, (void*)gs->local_reduce, gs->num_local + 1, SORT_INT_PTR);CHKERRQ(ierr);
427 
428       gs->num_local_gop        = t1;
429       gs->num_local_total      =  gs->num_local;
430       gs->num_local           -= t1;
431       gs->gop_local_reduce     = gs->local_reduce;
432       gs->num_gop_local_reduce = gs->num_local_reduce;
433 
434       for (i=0; i<t1; i++) {
435         if (gs->num_gop_local_reduce[i]>=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"they aren't negative?");
436         gs->num_gop_local_reduce[i] *= -1;
437         gs->local_reduce++;
438         gs->num_local_reduce++;
439       }
440       gs->local_reduce++;
441       gs->num_local_reduce++;
442     }
443   }
444 
445   elms = gs->pw_elm_list;
446   nel  = gs->len_pw_list;
447   for (i=0; i<nel; i++) elms[i] = map[elms[i]];
448 
449   elms = gs->tree_map_in;
450   nel  = gs->tree_map_sz;
451   for (i=0; i<nel; i++) elms[i] = map[elms[i]];
452 
453   /* clean up */
454   free((void*) gs->local_elms);
455   free((void*) gs->companion);
456   free((void*) gs->elms);
457   free((void*) gs->ngh_buf);
458   gs->local_elms = gs->companion = gs->elms = gs->ngh_buf = NULL;
459   PetscFunctionReturn(0);
460 }
461 
462 /******************************************************************************/
463 static PetscErrorCode place_in_tree(PetscInt elm)
464 {
465   PetscInt *tp, n;
466 
467   PetscFunctionBegin;
468   if (ntree==tree_buf_sz) {
469     if (tree_buf_sz) {
470       tp           = tree_buf;
471       n            = tree_buf_sz;
472       tree_buf_sz<<=1;
473       tree_buf     = (PetscInt*)malloc(tree_buf_sz*sizeof(PetscInt));
474       PCTFS_ivec_copy(tree_buf,tp,n);
475       free(tp);
476     } else {
477       tree_buf_sz = TREE_BUF_SZ;
478       tree_buf    = (PetscInt*)malloc(tree_buf_sz*sizeof(PetscInt));
479     }
480   }
481 
482   tree_buf[ntree++] = elm;
483   PetscFunctionReturn(0);
484 }
485 
486 /******************************************************************************/
487 static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs)
488 {
489   PetscInt       i, j, npw=0, ntree_map=0;
490   PetscInt       p_mask_size, ngh_buf_size, buf_size;
491   PetscInt       *p_mask, *sh_proc_mask, *pw_sh_proc_mask;
492   PetscInt       *ngh_buf, *buf1, *buf2;
493   PetscInt       offset, per_load, num_loads, or_ct, start, end;
494   PetscInt       *ptr1, *ptr2, i_start, negl, nel, *elms;
495   PetscInt       oper=GL_B_OR;
496   PetscInt       *ptr3, *t_mask, level, ct1, ct2;
497   PetscErrorCode ierr;
498 
499   PetscFunctionBegin;
500   /* to make life easier */
501   nel   = gs->nel;
502   elms  = gs->elms;
503   level = gs->level;
504 
505   /* det #bytes needed for processor bit masks and init w/mask cor. to PCTFS_my_id */
506   p_mask = (PetscInt*) malloc(p_mask_size=PCTFS_len_bit_mask(PCTFS_num_nodes));
507   ierr   = PCTFS_set_bit_mask(p_mask,p_mask_size,PCTFS_my_id);CHKERRQ(ierr);
508 
509   /* allocate space for masks and info bufs */
510   gs->nghs       = sh_proc_mask = (PetscInt*) malloc(p_mask_size);
511   gs->pw_nghs    = pw_sh_proc_mask = (PetscInt*) malloc(p_mask_size);
512   gs->ngh_buf_sz = ngh_buf_size = p_mask_size*nel;
513   t_mask         = (PetscInt*) malloc(p_mask_size);
514   gs->ngh_buf    = ngh_buf = (PetscInt*) malloc(ngh_buf_size);
515 
516   /* comm buffer size ... memory usage bounded by ~2*msg_buf */
517   /* had thought I could exploit rendezvous threshold */
518 
519   /* default is one pass */
520   per_load      = negl  = gs->negl;
521   gs->num_loads = num_loads = 1;
522   i             = p_mask_size*negl;
523 
524   /* possible overflow on buffer size */
525   /* overflow hack                    */
526   if (i<0) i=INT_MAX;
527 
528   buf_size = PetscMin(msg_buf,i);
529 
530   /* can we do it? */
531   if (p_mask_size>buf_size) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"get_ngh_buf() :: buf<pms :: %d>%d\n",p_mask_size,buf_size);
532 
533   /* get PCTFS_giop buf space ... make *only* one malloc */
534   buf1 = (PetscInt*) malloc(buf_size<<1);
535 
536   /* more than one gior exchange needed? */
537   if (buf_size!=i) {
538     per_load      = buf_size/p_mask_size;
539     buf_size      = per_load*p_mask_size;
540     gs->num_loads = num_loads = negl/per_load + (negl%per_load>0);
541   }
542 
543   /* convert buf sizes from #bytes to #ints - 32 bit only! */
544   p_mask_size/=sizeof(PetscInt); ngh_buf_size/=sizeof(PetscInt); buf_size/=sizeof(PetscInt);
545 
546   /* find PCTFS_giop work space */
547   buf2 = buf1+buf_size;
548 
549   /* hold #ints needed for processor masks */
550   gs->mask_sz=p_mask_size;
551 
552   /* init buffers */
553   ierr = PCTFS_ivec_zero(sh_proc_mask,p_mask_size);CHKERRQ(ierr);
554   ierr = PCTFS_ivec_zero(pw_sh_proc_mask,p_mask_size);CHKERRQ(ierr);
555   ierr = PCTFS_ivec_zero(ngh_buf,ngh_buf_size);CHKERRQ(ierr);
556 
557   /* HACK reset tree info */
558   tree_buf    = NULL;
559   tree_buf_sz = ntree = 0;
560 
561   /* ok do it */
562   for (ptr1=ngh_buf,ptr2=elms,end=gs->gl_min,or_ct=i=0; or_ct<num_loads; or_ct++) {
563     /* identity for bitwise or is 000...000 */
564     PCTFS_ivec_zero(buf1,buf_size);
565 
566     /* load msg buffer */
567     for (start=end,end+=per_load,i_start=i; (offset=*ptr2)<end; i++, ptr2++) {
568       offset = (offset-start)*p_mask_size;
569       PCTFS_ivec_copy(buf1+offset,p_mask,p_mask_size);
570     }
571 
572     /* GLOBAL: pass buffer */
573     ierr = PCTFS_giop(buf1,buf2,buf_size,&oper);CHKERRQ(ierr);
574 
575     /* unload buffer into ngh_buf */
576     ptr2=(elms+i_start);
577     for (ptr3=buf1,j=start; j<end; ptr3+=p_mask_size,j++) {
578       /* I own it ... may have to pairwise it */
579       if (j==*ptr2) {
580         /* do i share it w/anyone? */
581         ct1 = PCTFS_ct_bits((char*)ptr3,p_mask_size*sizeof(PetscInt));
582         /* guess not */
583         if (ct1<2) { ptr2++; ptr1+=p_mask_size; continue; }
584 
585         /* i do ... so keep info and turn off my bit */
586         PCTFS_ivec_copy(ptr1,ptr3,p_mask_size);
587         ierr = PCTFS_ivec_xor(ptr1,p_mask,p_mask_size);CHKERRQ(ierr);
588         ierr = PCTFS_ivec_or(sh_proc_mask,ptr1,p_mask_size);CHKERRQ(ierr);
589 
590         /* is it to be done pairwise? */
591         if (--ct1<=level) {
592           npw++;
593 
594           /* turn on high bit to indicate pw need to process */
595           *ptr2++ |= TOP_BIT;
596           ierr     = PCTFS_ivec_or(pw_sh_proc_mask,ptr1,p_mask_size);CHKERRQ(ierr);
597           ptr1    += p_mask_size;
598           continue;
599         }
600 
601         /* get set for next and note that I have a tree contribution */
602         /* could save exact elm index for tree here -> save a search */
603         ptr2++; ptr1+=p_mask_size; ntree_map++;
604       } else { /* i don't but still might be involved in tree */
605 
606         /* shared by how many? */
607         ct1 = PCTFS_ct_bits((char*)ptr3,p_mask_size*sizeof(PetscInt));
608 
609         /* none! */
610         if (ct1<2) continue;
611 
612         /* is it going to be done pairwise? but not by me of course!*/
613         if (--ct1<=level) continue;
614       }
615       /* LATER we're going to have to process it NOW */
616       /* nope ... tree it */
617       ierr = place_in_tree(j);CHKERRQ(ierr);
618     }
619   }
620 
621   free((void*)t_mask);
622   free((void*)buf1);
623 
624   gs->len_pw_list = npw;
625   gs->num_nghs    = PCTFS_ct_bits((char*)sh_proc_mask,p_mask_size*sizeof(PetscInt));
626 
627   /* expand from bit mask list to int list and save ngh list */
628   gs->nghs = (PetscInt*) malloc(gs->num_nghs * sizeof(PetscInt));
629   PCTFS_bm_to_proc((char*)sh_proc_mask,p_mask_size*sizeof(PetscInt),gs->nghs);
630 
631   gs->num_pw_nghs = PCTFS_ct_bits((char*)pw_sh_proc_mask,p_mask_size*sizeof(PetscInt));
632 
633   oper         = GL_MAX;
634   ct1          = gs->num_nghs;
635   ierr         = PCTFS_giop(&ct1,&ct2,1,&oper);CHKERRQ(ierr);
636   gs->max_nghs = ct1;
637 
638   gs->tree_map_sz  = ntree_map;
639   gs->max_left_over=ntree;
640 
641   free((void*)p_mask);
642   free((void*)sh_proc_mask);
643   PetscFunctionReturn(0);
644 }
645 
646 /******************************************************************************/
647 static PetscErrorCode set_pairwise(PCTFS_gs_id *gs)
648 {
649   PetscInt       i, j;
650   PetscInt       p_mask_size;
651   PetscInt       *p_mask, *sh_proc_mask, *tmp_proc_mask;
652   PetscInt       *ngh_buf, *buf2;
653   PetscInt       offset;
654   PetscInt       *msg_list, *msg_size, **msg_nodes, nprs;
655   PetscInt       *pairwise_elm_list, len_pair_list=0;
656   PetscInt       *iptr, t1, i_start, nel, *elms;
657   PetscInt       ct;
658   PetscErrorCode ierr;
659 
660   PetscFunctionBegin;
661   /* to make life easier */
662   nel          = gs->nel;
663   elms         = gs->elms;
664   ngh_buf      = gs->ngh_buf;
665   sh_proc_mask = gs->pw_nghs;
666 
667   /* need a few temp masks */
668   p_mask_size   = PCTFS_len_bit_mask(PCTFS_num_nodes);
669   p_mask        = (PetscInt*) malloc(p_mask_size);
670   tmp_proc_mask = (PetscInt*) malloc(p_mask_size);
671 
672   /* set mask to my PCTFS_my_id's bit mask */
673   ierr = PCTFS_set_bit_mask(p_mask,p_mask_size,PCTFS_my_id);CHKERRQ(ierr);
674 
675   p_mask_size /= sizeof(PetscInt);
676 
677   len_pair_list   = gs->len_pw_list;
678   gs->pw_elm_list = pairwise_elm_list=(PetscInt*)malloc((len_pair_list+1)*sizeof(PetscInt));
679 
680   /* how many processors (nghs) do we have to exchange with? */
681   nprs = gs->num_pairs = PCTFS_ct_bits((char*)sh_proc_mask,p_mask_size*sizeof(PetscInt));
682 
683   /* allocate space for PCTFS_gs_gop() info */
684   gs->pair_list = msg_list  = (PetscInt*)  malloc(sizeof(PetscInt)*nprs);
685   gs->msg_sizes = msg_size  = (PetscInt*)  malloc(sizeof(PetscInt)*nprs);
686   gs->node_list = msg_nodes = (PetscInt**) malloc(sizeof(PetscInt*)*(nprs+1));
687 
688   /* init msg_size list */
689   ierr = PCTFS_ivec_zero(msg_size,nprs);CHKERRQ(ierr);
690 
691   /* expand from bit mask list to int list */
692   ierr = PCTFS_bm_to_proc((char*)sh_proc_mask,p_mask_size*sizeof(PetscInt),msg_list);CHKERRQ(ierr);
693 
694   /* keep list of elements being handled pairwise */
695   for (i=j=0; i<nel; i++) {
696     if (elms[i] & TOP_BIT) { elms[i] ^= TOP_BIT; pairwise_elm_list[j++] = i; }
697   }
698   pairwise_elm_list[j] = -1;
699 
700   gs->msg_ids_out       = (MPI_Request*)  malloc(sizeof(MPI_Request)*(nprs+1));
701   gs->msg_ids_out[nprs] = MPI_REQUEST_NULL;
702   gs->msg_ids_in        = (MPI_Request*)  malloc(sizeof(MPI_Request)*(nprs+1));
703   gs->msg_ids_in[nprs]  = MPI_REQUEST_NULL;
704   gs->pw_vals           = (PetscScalar*) malloc(sizeof(PetscScalar)*len_pair_list*vec_sz);
705 
706   /* find who goes to each processor */
707   for (i_start=i=0; i<nprs; i++) {
708     /* processor i's mask */
709     ierr = PCTFS_set_bit_mask(p_mask,p_mask_size*sizeof(PetscInt),msg_list[i]);CHKERRQ(ierr);
710 
711     /* det # going to processor i */
712     for (ct=j=0; j<len_pair_list; j++) {
713       buf2 = ngh_buf+(pairwise_elm_list[j]*p_mask_size);
714       ierr = PCTFS_ivec_and3(tmp_proc_mask,p_mask,buf2,p_mask_size);CHKERRQ(ierr);
715       if (PCTFS_ct_bits((char*)tmp_proc_mask,p_mask_size*sizeof(PetscInt))) ct++;
716     }
717     msg_size[i] = ct;
718     i_start     = PetscMax(i_start,ct);
719 
720     /*space to hold nodes in message to first neighbor */
721     msg_nodes[i] = iptr = (PetscInt*) malloc(sizeof(PetscInt)*(ct+1));
722 
723     for (j=0;j<len_pair_list;j++) {
724       buf2 = ngh_buf+(pairwise_elm_list[j]*p_mask_size);
725       ierr = PCTFS_ivec_and3(tmp_proc_mask,p_mask,buf2,p_mask_size);CHKERRQ(ierr);
726       if (PCTFS_ct_bits((char*)tmp_proc_mask,p_mask_size*sizeof(PetscInt))) *iptr++ = j;
727     }
728     *iptr = -1;
729   }
730   msg_nodes[nprs] = NULL;
731 
732   j                  = gs->loc_node_pairs=i_start;
733   t1                 = GL_MAX;
734   ierr               = PCTFS_giop(&i_start,&offset,1,&t1);CHKERRQ(ierr);
735   gs->max_node_pairs = i_start;
736 
737   i_start            = j;
738   t1                 = GL_MIN;
739   ierr               = PCTFS_giop(&i_start,&offset,1,&t1);CHKERRQ(ierr);
740   gs->min_node_pairs = i_start;
741 
742   i_start            = j;
743   t1                 = GL_ADD;
744   ierr               = PCTFS_giop(&i_start,&offset,1,&t1);CHKERRQ(ierr);
745   gs->avg_node_pairs = i_start/PCTFS_num_nodes + 1;
746 
747   i_start = nprs;
748   t1      = GL_MAX;
749   PCTFS_giop(&i_start,&offset,1,&t1);
750   gs->max_pairs = i_start;
751 
752   /* remap pairwise in tail of gsi_via_bit_mask() */
753   gs->msg_total = PCTFS_ivec_sum(gs->msg_sizes,nprs);
754   gs->out       = (PetscScalar*) malloc(sizeof(PetscScalar)*gs->msg_total*vec_sz);
755   gs->in        = (PetscScalar*) malloc(sizeof(PetscScalar)*gs->msg_total*vec_sz);
756 
757   /* reset malloc pool */
758   free((void*)p_mask);
759   free((void*)tmp_proc_mask);
760   PetscFunctionReturn(0);
761 }
762 
763 /* to do pruned tree just save ngh buf copy for each one and decode here!
764 ******************************************************************************/
765 static PetscErrorCode set_tree(PCTFS_gs_id *gs)
766 {
767   PetscInt i, j, n, nel;
768   PetscInt *iptr_in, *iptr_out, *tree_elms, *elms;
769 
770   PetscFunctionBegin;
771   /* local work ptrs */
772   elms = gs->elms;
773   nel  = gs->nel;
774 
775   /* how many via tree */
776   gs->tree_nel     = n = ntree;
777   gs->tree_elms    = tree_elms = iptr_in = tree_buf;
778   gs->tree_buf     = (PetscScalar*) malloc(sizeof(PetscScalar)*n*vec_sz);
779   gs->tree_work    = (PetscScalar*) malloc(sizeof(PetscScalar)*n*vec_sz);
780   j                = gs->tree_map_sz;
781   gs->tree_map_in  = iptr_in  = (PetscInt*) malloc(sizeof(PetscInt)*(j+1));
782   gs->tree_map_out = iptr_out = (PetscInt*) malloc(sizeof(PetscInt)*(j+1));
783 
784   /* search the longer of the two lists */
785   /* note ... could save this info in get_ngh_buf and save searches */
786   if (n<=nel) {
787     /* bijective fct w/remap - search elm list */
788     for (i=0; i<n; i++) {
789       if ((j=PCTFS_ivec_binary_search(*tree_elms++,elms,nel))>=0) {*iptr_in++ = j; *iptr_out++ = i;}
790     }
791   } else {
792     for (i=0; i<nel; i++) {
793       if ((j=PCTFS_ivec_binary_search(*elms++,tree_elms,n))>=0) {*iptr_in++ = i; *iptr_out++ = j;}
794     }
795   }
796 
797   /* sentinel */
798   *iptr_in = *iptr_out = -1;
799   PetscFunctionReturn(0);
800 }
801 
802 /******************************************************************************/
803 static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs,  PetscScalar *vals)
804 {
805   PetscInt    *num, *map, **reduce;
806   PetscScalar tmp;
807 
808   PetscFunctionBegin;
809   num    = gs->num_gop_local_reduce;
810   reduce = gs->gop_local_reduce;
811   while ((map = *reduce++)) {
812     /* wall */
813     if (*num == 2) {
814       num++;
815       vals[map[1]] = vals[map[0]];
816     } else if (*num == 3) { /* corner shared by three elements */
817       num++;
818       vals[map[2]] = vals[map[1]] = vals[map[0]];
819     } else if (*num == 4) { /* corner shared by four elements */
820       num++;
821       vals[map[3]] = vals[map[2]] = vals[map[1]] = vals[map[0]];
822     } else { /* general case ... odd geoms ... 3D*/
823       num++;
824       tmp = *(vals + *map++);
825       while (*map >= 0) *(vals + *map++) = tmp;
826     }
827   }
828   PetscFunctionReturn(0);
829 }
830 
831 /******************************************************************************/
832 static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs,  PetscScalar *vals)
833 {
834   PetscInt    *num, *map, **reduce;
835   PetscScalar tmp;
836 
837   PetscFunctionBegin;
838   num    = gs->num_local_reduce;
839   reduce = gs->local_reduce;
840   while ((map = *reduce)) {
841     /* wall */
842     if (*num == 2) {
843       num++; reduce++;
844       vals[map[1]] = vals[map[0]] += vals[map[1]];
845     } else if (*num == 3) { /* corner shared by three elements */
846       num++; reduce++;
847       vals[map[2]]=vals[map[1]]=vals[map[0]]+=(vals[map[1]]+vals[map[2]]);
848     } else if (*num == 4) { /* corner shared by four elements */
849       num++; reduce++;
850       vals[map[1]]=vals[map[2]]=vals[map[3]]=vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]);
851     } else { /* general case ... odd geoms ... 3D*/
852       num++;
853       tmp = 0.0;
854       while (*map >= 0) tmp += *(vals + *map++);
855 
856       map = *reduce++;
857       while (*map >= 0) *(vals + *map++) = tmp;
858     }
859   }
860   PetscFunctionReturn(0);
861 }
862 
863 /******************************************************************************/
864 static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs,  PetscScalar *vals)
865 {
866   PetscInt    *num, *map, **reduce;
867   PetscScalar *base;
868 
869   PetscFunctionBegin;
870   num    = gs->num_gop_local_reduce;
871   reduce = gs->gop_local_reduce;
872   while ((map = *reduce++)) {
873     /* wall */
874     if (*num == 2) {
875       num++;
876       vals[map[0]] += vals[map[1]];
877     } else if (*num == 3) { /* corner shared by three elements */
878       num++;
879       vals[map[0]] += (vals[map[1]] + vals[map[2]]);
880     } else if (*num == 4) { /* corner shared by four elements */
881       num++;
882       vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]);
883     } else { /* general case ... odd geoms ... 3D*/
884       num++;
885       base = vals + *map++;
886       while (*map >= 0) *base += *(vals + *map++);
887     }
888   }
889   PetscFunctionReturn(0);
890 }
891 
892 /******************************************************************************/
893 PetscErrorCode PCTFS_gs_free(PCTFS_gs_id *gs)
894 {
895   PetscInt       i;
896   PetscErrorCode ierr;
897 
898   PetscFunctionBegin;
899   ierr = MPI_Comm_free(&gs->PCTFS_gs_comm);CHKERRMPI(ierr);
900   if (gs->nghs) free((void*) gs->nghs);
901   if (gs->pw_nghs) free((void*) gs->pw_nghs);
902 
903   /* tree */
904   if (gs->max_left_over) {
905     if (gs->tree_elms) free((void*) gs->tree_elms);
906     if (gs->tree_buf) free((void*) gs->tree_buf);
907     if (gs->tree_work) free((void*) gs->tree_work);
908     if (gs->tree_map_in) free((void*) gs->tree_map_in);
909     if (gs->tree_map_out) free((void*) gs->tree_map_out);
910   }
911 
912   /* pairwise info */
913   if (gs->num_pairs) {
914     /* should be NULL already */
915     if (gs->ngh_buf) free((void*) gs->ngh_buf);
916     if (gs->elms) free((void*) gs->elms);
917     if (gs->local_elms) free((void*) gs->local_elms);
918     if (gs->companion) free((void*) gs->companion);
919 
920     /* only set if pairwise */
921     if (gs->vals) free((void*) gs->vals);
922     if (gs->in) free((void*) gs->in);
923     if (gs->out) free((void*) gs->out);
924     if (gs->msg_ids_in) free((void*) gs->msg_ids_in);
925     if (gs->msg_ids_out) free((void*) gs->msg_ids_out);
926     if (gs->pw_vals) free((void*) gs->pw_vals);
927     if (gs->pw_elm_list) free((void*) gs->pw_elm_list);
928     if (gs->node_list) {
929       for (i=0;i<gs->num_pairs;i++) {
930         if (gs->node_list[i])  {
931           free((void*) gs->node_list[i]);
932         }
933       }
934       free((void*) gs->node_list);
935     }
936     if (gs->msg_sizes) free((void*) gs->msg_sizes);
937     if (gs->pair_list) free((void*) gs->pair_list);
938   }
939 
940   /* local info */
941   if (gs->num_local_total>=0) {
942     for (i=0;i<gs->num_local_total+1;i++) {
943       if (gs->num_gop_local_reduce[i]) free((void*) gs->gop_local_reduce[i]);
944     }
945   }
946 
947   /* if intersection tree/pairwise and local isn't empty */
948   if (gs->gop_local_reduce) free((void*) gs->gop_local_reduce);
949   if (gs->num_gop_local_reduce) free((void*) gs->num_gop_local_reduce);
950 
951   free((void*) gs);
952   PetscFunctionReturn(0);
953 }
954 
955 /******************************************************************************/
956 PetscErrorCode PCTFS_gs_gop_vec(PCTFS_gs_id *gs,  PetscScalar *vals,  const char *op,  PetscInt step)
957 {
958   PetscErrorCode ierr;
959 
960   PetscFunctionBegin;
961   switch (*op) {
962   case '+':
963     PCTFS_gs_gop_vec_plus(gs,vals,step);
964     break;
965   default:
966     ierr = PetscInfo1(0,"PCTFS_gs_gop_vec() :: %c is not a valid op\n",op[0]);CHKERRQ(ierr);
967     ierr = PetscInfo(0,"PCTFS_gs_gop_vec() :: default :: plus\n");CHKERRQ(ierr);
968     PCTFS_gs_gop_vec_plus(gs,vals,step);
969     break;
970   }
971   PetscFunctionReturn(0);
972 }
973 
974 /******************************************************************************/
975 static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs,  PetscScalar *vals,  PetscInt step)
976 {
977   PetscFunctionBegin;
978   if (!gs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"PCTFS_gs_gop_vec() passed NULL gs handle!!!");
979 
980   /* local only operations!!! */
981   if (gs->num_local) PCTFS_gs_gop_vec_local_plus(gs,vals,step);
982 
983   /* if intersection tree/pairwise and local isn't empty */
984   if (gs->num_local_gop) {
985     PCTFS_gs_gop_vec_local_in_plus(gs,vals,step);
986 
987     /* pairwise */
988     if (gs->num_pairs) PCTFS_gs_gop_vec_pairwise_plus(gs,vals,step);
989 
990     /* tree */
991     else if (gs->max_left_over) PCTFS_gs_gop_vec_tree_plus(gs,vals,step);
992 
993     PCTFS_gs_gop_vec_local_out(gs,vals,step);
994   } else { /* if intersection tree/pairwise and local is empty */
995     /* pairwise */
996     if (gs->num_pairs) PCTFS_gs_gop_vec_pairwise_plus(gs,vals,step);
997 
998     /* tree */
999     else if (gs->max_left_over) PCTFS_gs_gop_vec_tree_plus(gs,vals,step);
1000   }
1001   PetscFunctionReturn(0);
1002 }
1003 
1004 /******************************************************************************/
1005 static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs,  PetscScalar *vals, PetscInt step)
1006 {
1007   PetscInt    *num, *map, **reduce;
1008   PetscScalar *base;
1009 
1010   PetscFunctionBegin;
1011   num    = gs->num_local_reduce;
1012   reduce = gs->local_reduce;
1013   while ((map = *reduce)) {
1014     base = vals + map[0] * step;
1015 
1016     /* wall */
1017     if (*num == 2) {
1018       num++; reduce++;
1019       PCTFS_rvec_add (base,vals+map[1]*step,step);
1020       PCTFS_rvec_copy(vals+map[1]*step,base,step);
1021     } else if (*num == 3) { /* corner shared by three elements */
1022       num++; reduce++;
1023       PCTFS_rvec_add (base,vals+map[1]*step,step);
1024       PCTFS_rvec_add (base,vals+map[2]*step,step);
1025       PCTFS_rvec_copy(vals+map[2]*step,base,step);
1026       PCTFS_rvec_copy(vals+map[1]*step,base,step);
1027     } else if (*num == 4) { /* corner shared by four elements */
1028       num++; reduce++;
1029       PCTFS_rvec_add (base,vals+map[1]*step,step);
1030       PCTFS_rvec_add (base,vals+map[2]*step,step);
1031       PCTFS_rvec_add (base,vals+map[3]*step,step);
1032       PCTFS_rvec_copy(vals+map[3]*step,base,step);
1033       PCTFS_rvec_copy(vals+map[2]*step,base,step);
1034       PCTFS_rvec_copy(vals+map[1]*step,base,step);
1035     } else { /* general case ... odd geoms ... 3D */
1036       num++;
1037       while (*++map >= 0) PCTFS_rvec_add (base,vals+*map*step,step);
1038 
1039       map = *reduce;
1040       while (*++map >= 0) PCTFS_rvec_copy(vals+*map*step,base,step);
1041 
1042       reduce++;
1043     }
1044   }
1045   PetscFunctionReturn(0);
1046 }
1047 
1048 /******************************************************************************/
1049 static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs,  PetscScalar *vals, PetscInt step)
1050 {
1051   PetscInt    *num, *map, **reduce;
1052   PetscScalar *base;
1053 
1054   PetscFunctionBegin;
1055   num    = gs->num_gop_local_reduce;
1056   reduce = gs->gop_local_reduce;
1057   while ((map = *reduce++)) {
1058     base = vals + map[0] * step;
1059 
1060     /* wall */
1061     if (*num == 2) {
1062       num++;
1063       PCTFS_rvec_add(base,vals+map[1]*step,step);
1064     } else if (*num == 3) { /* corner shared by three elements */
1065       num++;
1066       PCTFS_rvec_add(base,vals+map[1]*step,step);
1067       PCTFS_rvec_add(base,vals+map[2]*step,step);
1068     } else if (*num == 4) { /* corner shared by four elements */
1069       num++;
1070       PCTFS_rvec_add(base,vals+map[1]*step,step);
1071       PCTFS_rvec_add(base,vals+map[2]*step,step);
1072       PCTFS_rvec_add(base,vals+map[3]*step,step);
1073     } else { /* general case ... odd geoms ... 3D*/
1074       num++;
1075       while (*++map >= 0) PCTFS_rvec_add(base,vals+*map*step,step);
1076     }
1077   }
1078   PetscFunctionReturn(0);
1079 }
1080 
1081 /******************************************************************************/
1082 static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs,  PetscScalar *vals, PetscInt step)
1083 {
1084   PetscInt    *num, *map, **reduce;
1085   PetscScalar *base;
1086 
1087   PetscFunctionBegin;
1088   num    = gs->num_gop_local_reduce;
1089   reduce = gs->gop_local_reduce;
1090   while ((map = *reduce++)) {
1091     base = vals + map[0] * step;
1092 
1093     /* wall */
1094     if (*num == 2) {
1095       num++;
1096       PCTFS_rvec_copy(vals+map[1]*step,base,step);
1097     } else if (*num == 3) { /* corner shared by three elements */
1098       num++;
1099       PCTFS_rvec_copy(vals+map[1]*step,base,step);
1100       PCTFS_rvec_copy(vals+map[2]*step,base,step);
1101     } else if (*num == 4) { /* corner shared by four elements */
1102       num++;
1103       PCTFS_rvec_copy(vals+map[1]*step,base,step);
1104       PCTFS_rvec_copy(vals+map[2]*step,base,step);
1105       PCTFS_rvec_copy(vals+map[3]*step,base,step);
1106     } else { /* general case ... odd geoms ... 3D*/
1107       num++;
1108       while (*++map >= 0) PCTFS_rvec_copy(vals+*map*step,base,step);
1109     }
1110   }
1111   PetscFunctionReturn(0);
1112 }
1113 
1114 /******************************************************************************/
1115 static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs,  PetscScalar *in_vals, PetscInt step)
1116 {
1117   PetscScalar    *dptr1, *dptr2, *dptr3, *in1, *in2;
1118   PetscInt       *iptr, *msg_list, *msg_size, **msg_nodes;
1119   PetscInt       *pw, *list, *size, **nodes;
1120   MPI_Request    *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
1121   MPI_Status     status;
1122   PetscBLASInt   i1 = 1,dstep;
1123   PetscErrorCode ierr;
1124 
1125   PetscFunctionBegin;
1126   /* strip and load s */
1127   msg_list    = list     = gs->pair_list;
1128   msg_size    = size     = gs->msg_sizes;
1129   msg_nodes   = nodes    = gs->node_list;
1130   iptr        = pw       = gs->pw_elm_list;
1131   dptr1       = dptr3    = gs->pw_vals;
1132   msg_ids_in  = ids_in   = gs->msg_ids_in;
1133   msg_ids_out = ids_out  = gs->msg_ids_out;
1134   dptr2                  = gs->out;
1135   in1=in2                = gs->in;
1136 
1137   /* post the receives */
1138   /*  msg_nodes=nodes; */
1139   do {
1140     /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
1141         second one *list and do list++ afterwards */
1142     ierr = MPI_Irecv(in1, *size *step, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in);CHKERRMPI(ierr);
1143     list++;msg_ids_in++;
1144     in1 += *size++ *step;
1145   } while (*++msg_nodes);
1146   msg_nodes=nodes;
1147 
1148   /* load gs values into in out gs buffers */
1149   while (*iptr >= 0) {
1150     PCTFS_rvec_copy(dptr3,in_vals + *iptr*step,step);
1151     dptr3+=step;
1152     iptr++;
1153   }
1154 
1155   /* load out buffers and post the sends */
1156   while ((iptr = *msg_nodes++)) {
1157     dptr3 = dptr2;
1158     while (*iptr >= 0) {
1159       PCTFS_rvec_copy(dptr2,dptr1 + *iptr*step,step);
1160       dptr2+=step;
1161       iptr++;
1162     }
1163     ierr = MPI_Isend(dptr3, *msg_size *step, MPIU_SCALAR, *msg_list, MSGTAG1+PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out);CHKERRMPI(ierr);
1164     msg_size++; msg_list++;msg_ids_out++;
1165   }
1166 
1167   /* tree */
1168   if (gs->max_left_over) PCTFS_gs_gop_vec_tree_plus(gs,in_vals,step);
1169 
1170   /* process the received data */
1171   msg_nodes=nodes;
1172   while ((iptr = *nodes++)) {
1173     PetscScalar d1 = 1.0;
1174 
1175     /* Should I check the return value of MPI_Wait() or status? */
1176     /* Can this loop be replaced by a call to MPI_Waitall()? */
1177     ierr = MPI_Wait(ids_in, &status);CHKERRMPI(ierr);
1178     ids_in++;
1179     while (*iptr >= 0) {
1180       ierr = PetscBLASIntCast(step,&dstep);CHKERRQ(ierr);
1181       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&dstep,&d1,in2,&i1,dptr1 + *iptr*step,&i1));
1182       in2+=step;
1183       iptr++;
1184     }
1185   }
1186 
1187   /* replace vals */
1188   while (*pw >= 0) {
1189     PCTFS_rvec_copy(in_vals + *pw*step,dptr1,step);
1190     dptr1+=step;
1191     pw++;
1192   }
1193 
1194   /* clear isend message handles */
1195   /* This changed for clarity though it could be the same */
1196 
1197   /* Should I check the return value of MPI_Wait() or status? */
1198   /* Can this loop be replaced by a call to MPI_Waitall()? */
1199   while (*msg_nodes++) {
1200     ierr = MPI_Wait(ids_out, &status);CHKERRMPI(ierr);
1201     ids_out++;
1202   }
1203   PetscFunctionReturn(0);
1204 }
1205 
1206 /******************************************************************************/
1207 static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs,  PetscScalar *vals,  PetscInt step)
1208 {
1209   PetscInt       size, *in, *out;
1210   PetscScalar    *buf, *work;
1211   PetscInt       op[] = {GL_ADD,0};
1212   PetscBLASInt   i1   = 1;
1213   PetscErrorCode ierr;
1214   PetscBLASInt   dstep;
1215 
1216   PetscFunctionBegin;
1217   /* copy over to local variables */
1218   in   = gs->tree_map_in;
1219   out  = gs->tree_map_out;
1220   buf  = gs->tree_buf;
1221   work = gs->tree_work;
1222   size = gs->tree_nel*step;
1223 
1224   /* zero out collection buffer */
1225   PCTFS_rvec_zero(buf,size);
1226 
1227   /* copy over my contributions */
1228   while (*in >= 0) {
1229     ierr = PetscBLASIntCast(step,&dstep);CHKERRQ(ierr);
1230     PetscStackCallBLAS("BLAScopy",BLAScopy_(&dstep,vals + *in++ * step,&i1,buf + *out++ * step,&i1));
1231   }
1232 
1233   /* perform fan in/out on full buffer */
1234   /* must change PCTFS_grop to handle the blas */
1235   PCTFS_grop(buf,work,size,op);
1236 
1237   /* reset */
1238   in  = gs->tree_map_in;
1239   out = gs->tree_map_out;
1240 
1241   /* get the portion of the results I need */
1242   while (*in >= 0) {
1243     ierr = PetscBLASIntCast(step,&dstep);CHKERRQ(ierr);
1244     PetscStackCallBLAS("BLAScopy",BLAScopy_(&dstep,buf + *out++ * step,&i1,vals + *in++ * step,&i1));
1245   }
1246   PetscFunctionReturn(0);
1247 }
1248 
1249 /******************************************************************************/
1250 PetscErrorCode PCTFS_gs_gop_hc(PCTFS_gs_id *gs,  PetscScalar *vals,  const char *op,  PetscInt dim)
1251 {
1252   PetscErrorCode ierr;
1253 
1254   PetscFunctionBegin;
1255   switch (*op) {
1256   case '+':
1257     PCTFS_gs_gop_plus_hc(gs,vals,dim);
1258     break;
1259   default:
1260     ierr = PetscInfo1(0,"PCTFS_gs_gop_hc() :: %c is not a valid op\n",op[0]);CHKERRQ(ierr);
1261     ierr = PetscInfo(0,"PCTFS_gs_gop_hc() :: default :: plus\n");CHKERRQ(ierr);
1262     PCTFS_gs_gop_plus_hc(gs,vals,dim);
1263     break;
1264   }
1265   PetscFunctionReturn(0);
1266 }
1267 
1268 /******************************************************************************/
1269 static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs,  PetscScalar *vals, PetscInt dim)
1270 {
1271   PetscFunctionBegin;
1272   /* if there's nothing to do return */
1273   if (dim<=0) PetscFunctionReturn(0);
1274 
1275   /* can't do more dimensions then exist */
1276   dim = PetscMin(dim,PCTFS_i_log2_num_nodes);
1277 
1278   /* local only operations!!! */
1279   if (gs->num_local) PCTFS_gs_gop_local_plus(gs,vals);
1280 
1281   /* if intersection tree/pairwise and local isn't empty */
1282   if (gs->num_local_gop) {
1283     PCTFS_gs_gop_local_in_plus(gs,vals);
1284 
1285     /* pairwise will do tree inside ... */
1286     if (gs->num_pairs) PCTFS_gs_gop_pairwise_plus_hc(gs,vals,dim); /* tree only */
1287     else if (gs->max_left_over) PCTFS_gs_gop_tree_plus_hc(gs,vals,dim);
1288 
1289     PCTFS_gs_gop_local_out(gs,vals);
1290   } else { /* if intersection tree/pairwise and local is empty */
1291     /* pairwise will do tree inside */
1292     if (gs->num_pairs) PCTFS_gs_gop_pairwise_plus_hc(gs,vals,dim); /* tree */
1293     else if (gs->max_left_over) PCTFS_gs_gop_tree_plus_hc(gs,vals,dim);
1294   }
1295   PetscFunctionReturn(0);
1296 }
1297 
1298 /******************************************************************************/
1299 static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs,  PetscScalar *in_vals, PetscInt dim)
1300 {
1301   PetscScalar    *dptr1, *dptr2, *dptr3, *in1, *in2;
1302   PetscInt       *iptr, *msg_list, *msg_size, **msg_nodes;
1303   PetscInt       *pw, *list, *size, **nodes;
1304   MPI_Request    *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
1305   MPI_Status     status;
1306   PetscInt       i, mask=1;
1307   PetscErrorCode ierr;
1308 
1309   PetscFunctionBegin;
1310   for (i=1; i<dim; i++) { mask<<=1; mask++; }
1311 
1312   /* strip and load s */
1313   msg_list    = list     = gs->pair_list;
1314   msg_size    = size     = gs->msg_sizes;
1315   msg_nodes   = nodes    = gs->node_list;
1316   iptr        = pw       = gs->pw_elm_list;
1317   dptr1       = dptr3    = gs->pw_vals;
1318   msg_ids_in  = ids_in   = gs->msg_ids_in;
1319   msg_ids_out = ids_out  = gs->msg_ids_out;
1320   dptr2       = gs->out;
1321   in1         = in2      = gs->in;
1322 
1323   /* post the receives */
1324   /*  msg_nodes=nodes; */
1325   do {
1326     /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
1327         second one *list and do list++ afterwards */
1328     if ((PCTFS_my_id|mask)==(*list|mask)) {
1329       ierr = MPI_Irecv(in1, *size, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in);CHKERRMPI(ierr);
1330       list++; msg_ids_in++;in1 += *size++;
1331     } else { list++; size++; }
1332   } while (*++msg_nodes);
1333 
1334   /* load gs values into in out gs buffers */
1335   while (*iptr >= 0) *dptr3++ = *(in_vals + *iptr++);
1336 
1337   /* load out buffers and post the sends */
1338   msg_nodes=nodes;
1339   list     = msg_list;
1340   while ((iptr = *msg_nodes++)) {
1341     if ((PCTFS_my_id|mask)==(*list|mask)) {
1342       dptr3 = dptr2;
1343       while (*iptr >= 0) *dptr2++ = *(dptr1 + *iptr++);
1344       /* CHECK PERSISTENT COMMS MODE FOR ALL THIS STUFF */
1345       /* is msg_ids_out++ correct? */
1346       ierr = MPI_Isend(dptr3, *msg_size, MPIU_SCALAR, *list, MSGTAG1+PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out);CHKERRMPI(ierr);
1347       msg_size++;list++;msg_ids_out++;
1348     } else {list++; msg_size++;}
1349   }
1350 
1351   /* do the tree while we're waiting */
1352   if (gs->max_left_over) PCTFS_gs_gop_tree_plus_hc(gs,in_vals,dim);
1353 
1354   /* process the received data */
1355   msg_nodes=nodes;
1356   list     = msg_list;
1357   while ((iptr = *nodes++)) {
1358     if ((PCTFS_my_id|mask)==(*list|mask)) {
1359       /* Should I check the return value of MPI_Wait() or status? */
1360       /* Can this loop be replaced by a call to MPI_Waitall()? */
1361       ierr = MPI_Wait(ids_in, &status);CHKERRMPI(ierr);
1362       ids_in++;
1363       while (*iptr >= 0) *(dptr1 + *iptr++) += *in2++;
1364     }
1365     list++;
1366   }
1367 
1368   /* replace vals */
1369   while (*pw >= 0) *(in_vals + *pw++) = *dptr1++;
1370 
1371   /* clear isend message handles */
1372   /* This changed for clarity though it could be the same */
1373   while (*msg_nodes++) {
1374     if ((PCTFS_my_id|mask)==(*msg_list|mask)) {
1375       /* Should I check the return value of MPI_Wait() or status? */
1376       /* Can this loop be replaced by a call to MPI_Waitall()? */
1377       ierr = MPI_Wait(ids_out, &status);CHKERRMPI(ierr);
1378       ids_out++;
1379     }
1380     msg_list++;
1381   }
1382   PetscFunctionReturn(0);
1383 }
1384 
1385 /******************************************************************************/
1386 static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim)
1387 {
1388   PetscInt    size;
1389   PetscInt    *in, *out;
1390   PetscScalar *buf, *work;
1391   PetscInt    op[] = {GL_ADD,0};
1392 
1393   PetscFunctionBegin;
1394   in   = gs->tree_map_in;
1395   out  = gs->tree_map_out;
1396   buf  = gs->tree_buf;
1397   work = gs->tree_work;
1398   size = gs->tree_nel;
1399 
1400   PCTFS_rvec_zero(buf,size);
1401 
1402   while (*in >= 0) *(buf + *out++) = *(vals + *in++);
1403 
1404   in  = gs->tree_map_in;
1405   out = gs->tree_map_out;
1406 
1407   PCTFS_grop_hc(buf,work,size,op,dim);
1408 
1409   while (*in >= 0) *(vals + *in++) = *(buf + *out++);
1410   PetscFunctionReturn(0);
1411 }
1412 
1413