xref: /petsc/src/ksp/pc/impls/tfs/gs.c (revision bebe2cf65d55febe21a5af8db2bd2e168caaa2e7)
1 
2 /***********************************gs.c***************************************
3 
4 Author: Henry M. Tufo III
5 
6 e-mail: hmt@cs.brown.edu
7 
8 snail-mail:
9 Division of Applied Mathematics
10 Brown University
11 Providence, RI 02912
12 
13 Last Modification:
14 6.21.97
15 ************************************gs.c**************************************/
16 
17 /***********************************gs.c***************************************
18 File Description:
19 -----------------
20 
21 ************************************gs.c**************************************/
22 
23 #include <../src/ksp/pc/impls/tfs/tfs.h>
24 
25 /* default length of number of items via tree - doubles if exceeded */
26 #define TREE_BUF_SZ 2048;
27 #define GS_VEC_SZ   1
28 
29 
30 
31 /***********************************gs.c***************************************
32 Type: struct gather_scatter_id
33 ------------------------------
34 
35 ************************************gs.c**************************************/
36 typedef struct gather_scatter_id {
37   PetscInt    id;
38   PetscInt    nel_min;
39   PetscInt    nel_max;
40   PetscInt    nel_sum;
41   PetscInt    negl;
42   PetscInt    gl_max;
43   PetscInt    gl_min;
44   PetscInt    repeats;
45   PetscInt    ordered;
46   PetscInt    positive;
47   PetscScalar *vals;
48 
49   /* bit mask info */
50   PetscInt *my_proc_mask;
51   PetscInt mask_sz;
52   PetscInt *ngh_buf;
53   PetscInt ngh_buf_sz;
54   PetscInt *nghs;
55   PetscInt num_nghs;
56   PetscInt max_nghs;
57   PetscInt *pw_nghs;
58   PetscInt num_pw_nghs;
59   PetscInt *tree_nghs;
60   PetscInt num_tree_nghs;
61 
62   PetscInt num_loads;
63 
64   /* repeats == true -> local info */
65   PetscInt nel;         /* number of unique elememts */
66   PetscInt *elms;       /* of size nel */
67   PetscInt nel_total;
68   PetscInt *local_elms; /* of size nel_total */
69   PetscInt *companion;  /* of size nel_total */
70 
71   /* local info */
72   PetscInt num_local_total;
73   PetscInt local_strength;
74   PetscInt num_local;
75   PetscInt *num_local_reduce;
76   PetscInt **local_reduce;
77   PetscInt num_local_gop;
78   PetscInt *num_gop_local_reduce;
79   PetscInt **gop_local_reduce;
80 
81   /* pairwise info */
82   PetscInt    level;
83   PetscInt    num_pairs;
84   PetscInt    max_pairs;
85   PetscInt    loc_node_pairs;
86   PetscInt    max_node_pairs;
87   PetscInt    min_node_pairs;
88   PetscInt    avg_node_pairs;
89   PetscInt    *pair_list;
90   PetscInt    *msg_sizes;
91   PetscInt    **node_list;
92   PetscInt    len_pw_list;
93   PetscInt    *pw_elm_list;
94   PetscScalar *pw_vals;
95 
96   MPI_Request *msg_ids_in;
97   MPI_Request *msg_ids_out;
98 
99   PetscScalar *out;
100   PetscScalar *in;
101   PetscInt    msg_total;
102 
103   /* tree - crystal accumulator info */
104   PetscInt max_left_over;
105   PetscInt *pre;
106   PetscInt *in_num;
107   PetscInt *out_num;
108   PetscInt **in_list;
109   PetscInt **out_list;
110 
111   /* new tree work*/
112   PetscInt    tree_nel;
113   PetscInt    *tree_elms;
114   PetscScalar *tree_buf;
115   PetscScalar *tree_work;
116 
117   PetscInt tree_map_sz;
118   PetscInt *tree_map_in;
119   PetscInt *tree_map_out;
120 
121   /* current memory status */
122   PetscInt gl_bss_min;
123   PetscInt gl_perm_min;
124 
125   /* max segment size for PCTFS_gs_gop_vec() */
126   PetscInt vec_sz;
127 
128   /* hack to make paul happy */
129   MPI_Comm PCTFS_gs_comm;
130 
131 } PCTFS_gs_id;
132 
133 static PCTFS_gs_id *gsi_check_args(PetscInt *elms, PetscInt nel, PetscInt level);
134 static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs);
135 static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs);
136 static PetscErrorCode set_pairwise(PCTFS_gs_id *gs);
137 static PCTFS_gs_id *gsi_new(void);
138 static PetscErrorCode set_tree(PCTFS_gs_id *gs);
139 
140 /* same for all but vector flavor */
141 static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs, PetscScalar *vals);
142 /* vector flavor */
143 static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
144 
145 static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step);
146 static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step);
147 static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
148 static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
149 static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
150 
151 
152 static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs, PetscScalar *vals);
153 static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals);
154 
155 static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim);
156 static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim);
157 static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim);
158 
159 /* global vars */
160 /* from comm.c module */
161 
162 static PetscInt num_gs_ids = 0;
163 
164 /* should make this dynamic ... later */
165 static PetscInt msg_buf    =MAX_MSG_BUF;
166 static PetscInt vec_sz     =GS_VEC_SZ;
167 static PetscInt *tree_buf  =NULL;
168 static PetscInt tree_buf_sz=0;
169 static PetscInt ntree      =0;
170 
171 /***************************************************************************/
172 PetscErrorCode PCTFS_gs_init_vec_sz(PetscInt size)
173 {
174   PetscFunctionBegin;
175   vec_sz = size;
176   PetscFunctionReturn(0);
177 }
178 
179 /******************************************************************************/
180 PetscErrorCode PCTFS_gs_init_msg_buf_sz(PetscInt buf_size)
181 {
182   PetscFunctionBegin;
183   msg_buf = buf_size;
184   PetscFunctionReturn(0);
185 }
186 
187 /******************************************************************************/
188 PCTFS_gs_id *PCTFS_gs_init(PetscInt *elms, PetscInt nel, PetscInt level)
189 {
190   PCTFS_gs_id    *gs;
191   MPI_Group      PCTFS_gs_group;
192   MPI_Comm       PCTFS_gs_comm;
193   PetscErrorCode ierr;
194 
195   PetscFunctionBegin;
196   /* ensure that communication package has been initialized */
197   PCTFS_comm_init();
198 
199 
200   /* determines if we have enough dynamic/semi-static memory */
201   /* checks input, allocs and sets gd_id template            */
202   gs = gsi_check_args(elms,nel,level);
203 
204   /* only bit mask version up and working for the moment    */
205   /* LATER :: get int list version working for sparse pblms */
206   ierr = gsi_via_bit_mask(gs);CHKERRABORT(PETSC_COMM_WORLD,ierr);
207 
208 
209   ierr = MPI_Comm_group(MPI_COMM_WORLD,&PCTFS_gs_group);CHKERRABORT(PETSC_COMM_WORLD,ierr);
210   ierr = MPI_Comm_create(MPI_COMM_WORLD,PCTFS_gs_group,&PCTFS_gs_comm);CHKERRABORT(PETSC_COMM_WORLD,ierr);
211 
212   gs->PCTFS_gs_comm=PCTFS_gs_comm;
213 
214   return(gs);
215 }
216 
217 /******************************************************************************/
218 static PCTFS_gs_id *gsi_new(void)
219 {
220   PetscErrorCode ierr;
221   PCTFS_gs_id    *gs;
222   gs   = (PCTFS_gs_id*) malloc(sizeof(PCTFS_gs_id));
223   ierr = PetscMemzero(gs,sizeof(PCTFS_gs_id));CHKERRABORT(PETSC_COMM_WORLD,ierr);
224   return(gs);
225 }
226 
227 /******************************************************************************/
228 static PCTFS_gs_id *gsi_check_args(PetscInt *in_elms, PetscInt nel, PetscInt level)
229 {
230   PetscInt       i, j, k, t2;
231   PetscInt       *companion, *elms, *unique, *iptr;
232   PetscInt       num_local=0, *num_to_reduce, **local_reduce;
233   PetscInt       oprs[]   = {NON_UNIFORM,GL_MIN,GL_MAX,GL_ADD,GL_MIN,GL_MAX,GL_MIN,GL_B_AND};
234   PetscInt       vals[sizeof(oprs)/sizeof(oprs[0])-1];
235   PetscInt       work[sizeof(oprs)/sizeof(oprs[0])-1];
236   PCTFS_gs_id    *gs;
237   PetscErrorCode ierr;
238 
239 
240   if (!in_elms) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"elms point to nothing!!!\n");
241   if (nel<0)    SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"can't have fewer than 0 elms!!!\n");
242 
243   if (nel==0) { ierr = PetscInfo(0,"I don't have any elements!!!\n");CHKERRABORT(PETSC_COMM_WORLD,ierr); }
244 
245   /* get space for gs template */
246   gs     = gsi_new();
247   gs->id = ++num_gs_ids;
248 
249   /* hmt 6.4.99                                            */
250   /* caller can set global ids that don't participate to 0 */
251   /* PCTFS_gs_init ignores all zeros in elm list                 */
252   /* negative global ids are still invalid                 */
253   for (i=j=0; i<nel; i++) {
254     if (in_elms[i]!=0) j++;
255   }
256 
257   k=nel; nel=j;
258 
259   /* copy over in_elms list and create inverse map */
260   elms      = (PetscInt*) malloc((nel+1)*sizeof(PetscInt));
261   companion = (PetscInt*) malloc(nel*sizeof(PetscInt));
262 
263   for (i=j=0; i<k; i++) {
264     if (in_elms[i]!=0) { elms[j] = in_elms[i]; companion[j++] = i; }
265   }
266 
267   if (j!=nel) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"nel j mismatch!\n");
268 
269   /* pre-pass ... check to see if sorted */
270   elms[nel] = INT_MAX;
271   iptr      = elms;
272   unique    = elms+1;
273   j         =0;
274   while (*iptr!=INT_MAX) {
275     if (*iptr++>*unique++) { j=1; break; }
276   }
277 
278   /* set up inverse map */
279   if (j) {
280     ierr = PetscInfo(0,"gsi_check_args() :: elm list *not* sorted!\n");CHKERRABORT(PETSC_COMM_WORLD,ierr);
281     ierr = PCTFS_SMI_sort((void*)elms, (void*)companion, nel, SORT_INTEGER);CHKERRABORT(PETSC_COMM_WORLD,ierr);
282   } else { ierr = PetscInfo(0,"gsi_check_args() :: elm list sorted!\n");CHKERRABORT(PETSC_COMM_WORLD,ierr); }
283   elms[nel] = INT_MIN;
284 
285   /* first pass */
286   /* determine number of unique elements, check pd */
287   for (i=k=0; i<nel; i+=j) {
288     t2 = elms[i];
289     j  = ++i;
290 
291     /* clump 'em for now */
292     while (elms[j]==t2) j++;
293 
294     /* how many together and num local */
295     if (j-=i) { num_local++; k+=j; }
296   }
297 
298   /* how many unique elements? */
299   gs->repeats = k;
300   gs->nel     = nel-k;
301 
302 
303   /* number of repeats? */
304   gs->num_local        = num_local;
305   num_local           += 2;
306   gs->local_reduce     = local_reduce=(PetscInt**)malloc(num_local*sizeof(PetscInt*));
307   gs->num_local_reduce = num_to_reduce=(PetscInt*) malloc(num_local*sizeof(PetscInt));
308 
309   unique         = (PetscInt*) malloc((gs->nel+1)*sizeof(PetscInt));
310   gs->elms       = unique;
311   gs->nel_total  = nel;
312   gs->local_elms = elms;
313   gs->companion  = companion;
314 
315   /* compess map as well as keep track of local ops */
316   for (num_local=i=j=0; i<gs->nel; i++) {
317     k            = j;
318     t2           = unique[i] = elms[j];
319     companion[i] = companion[j];
320 
321     while (elms[j]==t2) j++;
322 
323     if ((t2=(j-k))>1) {
324       /* number together */
325       num_to_reduce[num_local] = t2++;
326 
327       iptr = local_reduce[num_local++] = (PetscInt*)malloc(t2*sizeof(PetscInt));
328 
329       /* to use binary searching don't remap until we check intersection */
330       *iptr++ = i;
331 
332       /* note that we're skipping the first one */
333       while (++k<j) *(iptr++) = companion[k];
334       *iptr = -1;
335     }
336   }
337 
338   /* sentinel for ngh_buf */
339   unique[gs->nel]=INT_MAX;
340 
341   /* for two partition sort hack */
342   num_to_reduce[num_local]   = 0;
343   local_reduce[num_local]    = NULL;
344   num_to_reduce[++num_local] = 0;
345   local_reduce[num_local]    = NULL;
346 
347   /* load 'em up */
348   /* note one extra to hold NON_UNIFORM flag!!! */
349   vals[2] = vals[1] = vals[0] = nel;
350   if (gs->nel>0) {
351     vals[3] = unique[0];
352     vals[4] = unique[gs->nel-1];
353   } else {
354     vals[3] = INT_MAX;
355     vals[4] = INT_MIN;
356   }
357   vals[5] = level;
358   vals[6] = num_gs_ids;
359 
360   /* GLOBAL: send 'em out */
361   ierr = PCTFS_giop(vals,work,sizeof(oprs)/sizeof(oprs[0])-1,oprs);CHKERRABORT(PETSC_COMM_WORLD,ierr);
362 
363   /* must be semi-pos def - only pairwise depends on this */
364   /* LATER - remove this restriction */
365   if (vals[3]<0) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system not semi-pos def \n");
366   if (vals[4]==INT_MAX) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system ub too large !\n");
367 
368   gs->nel_min = vals[0];
369   gs->nel_max = vals[1];
370   gs->nel_sum = vals[2];
371   gs->gl_min  = vals[3];
372   gs->gl_max  = vals[4];
373   gs->negl    = vals[4]-vals[3]+1;
374 
375   if (gs->negl<=0) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system empty or neg :: %d\n");
376 
377   /* LATER :: add level == -1 -> program selects level */
378   if (vals[5]<0) vals[5]=0;
379   else if (vals[5]>PCTFS_num_nodes) vals[5]=PCTFS_num_nodes;
380   gs->level = vals[5];
381 
382   return(gs);
383 }
384 
385 /******************************************************************************/
386 static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs)
387 {
388   PetscInt       i, nel, *elms;
389   PetscInt       t1;
390   PetscInt       **reduce;
391   PetscInt       *map;
392   PetscErrorCode ierr;
393 
394   PetscFunctionBegin;
395   /* totally local removes ... PCTFS_ct_bits == 0 */
396   get_ngh_buf(gs);
397 
398   if (gs->level) set_pairwise(gs);
399   if (gs->max_left_over) set_tree(gs);
400 
401   /* intersection local and pairwise/tree? */
402   gs->num_local_total      = gs->num_local;
403   gs->gop_local_reduce     = gs->local_reduce;
404   gs->num_gop_local_reduce = gs->num_local_reduce;
405 
406   map = gs->companion;
407 
408   /* is there any local compression */
409   if (!gs->num_local) {
410     gs->local_strength = NONE;
411     gs->num_local_gop  = 0;
412   } else {
413     /* ok find intersection */
414     map    = gs->companion;
415     reduce = gs->local_reduce;
416     for (i=0, t1=0; i<gs->num_local; i++, reduce++)
417     {
418       if ((PCTFS_ivec_binary_search(**reduce,gs->pw_elm_list,gs->len_pw_list)>=0)
419           ||
420           PCTFS_ivec_binary_search(**reduce,gs->tree_map_in,gs->tree_map_sz)>=0) {
421         t1++;
422         if (gs->num_local_reduce[i]<=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"nobody in list?");
423         gs->num_local_reduce[i] *= -1;
424       }
425       **reduce=map[**reduce];
426     }
427 
428     /* intersection is empty */
429     if (!t1) {
430       gs->local_strength = FULL;
431       gs->num_local_gop  = 0;
432     } else { /* intersection not empty */
433       gs->local_strength = PARTIAL;
434 
435       ierr = PCTFS_SMI_sort((void*)gs->num_local_reduce, (void*)gs->local_reduce, gs->num_local + 1, SORT_INT_PTR);CHKERRQ(ierr);
436 
437       gs->num_local_gop        = t1;
438       gs->num_local_total      =  gs->num_local;
439       gs->num_local           -= t1;
440       gs->gop_local_reduce     = gs->local_reduce;
441       gs->num_gop_local_reduce = gs->num_local_reduce;
442 
443       for (i=0; i<t1; i++) {
444         if (gs->num_gop_local_reduce[i]>=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"they aren't negative?");
445         gs->num_gop_local_reduce[i] *= -1;
446         gs->local_reduce++;
447         gs->num_local_reduce++;
448       }
449       gs->local_reduce++;
450       gs->num_local_reduce++;
451     }
452   }
453 
454   elms = gs->pw_elm_list;
455   nel  = gs->len_pw_list;
456   for (i=0; i<nel; i++) elms[i] = map[elms[i]];
457 
458   elms = gs->tree_map_in;
459   nel  = gs->tree_map_sz;
460   for (i=0; i<nel; i++) elms[i] = map[elms[i]];
461 
462   /* clean up */
463   free((void*) gs->local_elms);
464   free((void*) gs->companion);
465   free((void*) gs->elms);
466   free((void*) gs->ngh_buf);
467   gs->local_elms = gs->companion = gs->elms = gs->ngh_buf = NULL;
468   PetscFunctionReturn(0);
469 }
470 
471 /******************************************************************************/
472 static PetscErrorCode place_in_tree(PetscInt elm)
473 {
474   PetscInt *tp, n;
475 
476   PetscFunctionBegin;
477   if (ntree==tree_buf_sz) {
478     if (tree_buf_sz) {
479       tp           = tree_buf;
480       n            = tree_buf_sz;
481       tree_buf_sz<<=1;
482       tree_buf     = (PetscInt*)malloc(tree_buf_sz*sizeof(PetscInt));
483       PCTFS_ivec_copy(tree_buf,tp,n);
484       free(tp);
485     } else {
486       tree_buf_sz = TREE_BUF_SZ;
487       tree_buf    = (PetscInt*)malloc(tree_buf_sz*sizeof(PetscInt));
488     }
489   }
490 
491   tree_buf[ntree++] = elm;
492   PetscFunctionReturn(0);
493 }
494 
495 /******************************************************************************/
496 static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs)
497 {
498   PetscInt       i, j, npw=0, ntree_map=0;
499   PetscInt       p_mask_size, ngh_buf_size, buf_size;
500   PetscInt       *p_mask, *sh_proc_mask, *pw_sh_proc_mask;
501   PetscInt       *ngh_buf, *buf1, *buf2;
502   PetscInt       offset, per_load, num_loads, or_ct, start, end;
503   PetscInt       *ptr1, *ptr2, i_start, negl, nel, *elms;
504   PetscInt       oper=GL_B_OR;
505   PetscInt       *ptr3, *t_mask, level, ct1, ct2;
506   PetscErrorCode ierr;
507 
508   PetscFunctionBegin;
509   /* to make life easier */
510   nel   = gs->nel;
511   elms  = gs->elms;
512   level = gs->level;
513 
514   /* det #bytes needed for processor bit masks and init w/mask cor. to PCTFS_my_id */
515   p_mask = (PetscInt*) malloc(p_mask_size=PCTFS_len_bit_mask(PCTFS_num_nodes));
516   ierr   = PCTFS_set_bit_mask(p_mask,p_mask_size,PCTFS_my_id);CHKERRQ(ierr);
517 
518   /* allocate space for masks and info bufs */
519   gs->nghs       = sh_proc_mask = (PetscInt*) malloc(p_mask_size);
520   gs->pw_nghs    = pw_sh_proc_mask = (PetscInt*) malloc(p_mask_size);
521   gs->ngh_buf_sz = ngh_buf_size = p_mask_size*nel;
522   t_mask         = (PetscInt*) malloc(p_mask_size);
523   gs->ngh_buf    = ngh_buf = (PetscInt*) malloc(ngh_buf_size);
524 
525   /* comm buffer size ... memory usage bounded by ~2*msg_buf */
526   /* had thought I could exploit rendezvous threshold */
527 
528   /* default is one pass */
529   per_load      = negl  = gs->negl;
530   gs->num_loads = num_loads = 1;
531   i             = p_mask_size*negl;
532 
533   /* possible overflow on buffer size */
534   /* overflow hack                    */
535   if (i<0) i=INT_MAX;
536 
537   buf_size = PetscMin(msg_buf,i);
538 
539   /* can we do it? */
540   if (p_mask_size>buf_size) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"get_ngh_buf() :: buf<pms :: %d>%d\n",p_mask_size,buf_size);
541 
542   /* get PCTFS_giop buf space ... make *only* one malloc */
543   buf1 = (PetscInt*) malloc(buf_size<<1);
544 
545   /* more than one gior exchange needed? */
546   if (buf_size!=i) {
547     per_load      = buf_size/p_mask_size;
548     buf_size      = per_load*p_mask_size;
549     gs->num_loads = num_loads = negl/per_load + (negl%per_load>0);
550   }
551 
552 
553   /* convert buf sizes from #bytes to #ints - 32 bit only! */
554   p_mask_size/=sizeof(PetscInt); ngh_buf_size/=sizeof(PetscInt); buf_size/=sizeof(PetscInt);
555 
556   /* find PCTFS_giop work space */
557   buf2 = buf1+buf_size;
558 
559   /* hold #ints needed for processor masks */
560   gs->mask_sz=p_mask_size;
561 
562   /* init buffers */
563   ierr = PCTFS_ivec_zero(sh_proc_mask,p_mask_size);CHKERRQ(ierr);
564   ierr = PCTFS_ivec_zero(pw_sh_proc_mask,p_mask_size);CHKERRQ(ierr);
565   ierr = PCTFS_ivec_zero(ngh_buf,ngh_buf_size);CHKERRQ(ierr);
566 
567   /* HACK reset tree info */
568   tree_buf    = NULL;
569   tree_buf_sz = ntree = 0;
570 
571   /* ok do it */
572   for (ptr1=ngh_buf,ptr2=elms,end=gs->gl_min,or_ct=i=0; or_ct<num_loads; or_ct++) {
573     /* identity for bitwise or is 000...000 */
574     PCTFS_ivec_zero(buf1,buf_size);
575 
576     /* load msg buffer */
577     for (start=end,end+=per_load,i_start=i; (offset=*ptr2)<end; i++, ptr2++) {
578       offset = (offset-start)*p_mask_size;
579       PCTFS_ivec_copy(buf1+offset,p_mask,p_mask_size);
580     }
581 
582     /* GLOBAL: pass buffer */
583     ierr = PCTFS_giop(buf1,buf2,buf_size,&oper);CHKERRQ(ierr);
584 
585 
586     /* unload buffer into ngh_buf */
587     ptr2=(elms+i_start);
588     for (ptr3=buf1,j=start; j<end; ptr3+=p_mask_size,j++) {
589       /* I own it ... may have to pairwise it */
590       if (j==*ptr2) {
591         /* do i share it w/anyone? */
592         ct1 = PCTFS_ct_bits((char*)ptr3,p_mask_size*sizeof(PetscInt));
593         /* guess not */
594         if (ct1<2) { ptr2++; ptr1+=p_mask_size; continue; }
595 
596         /* i do ... so keep info and turn off my bit */
597         PCTFS_ivec_copy(ptr1,ptr3,p_mask_size);
598         ierr = PCTFS_ivec_xor(ptr1,p_mask,p_mask_size);CHKERRQ(ierr);
599         ierr = PCTFS_ivec_or(sh_proc_mask,ptr1,p_mask_size);CHKERRQ(ierr);
600 
601         /* is it to be done pairwise? */
602         if (--ct1<=level) {
603           npw++;
604 
605           /* turn on high bit to indicate pw need to process */
606           *ptr2++ |= TOP_BIT;
607           ierr     = PCTFS_ivec_or(pw_sh_proc_mask,ptr1,p_mask_size);CHKERRQ(ierr);
608           ptr1    += p_mask_size;
609           continue;
610         }
611 
612         /* get set for next and note that I have a tree contribution */
613         /* could save exact elm index for tree here -> save a search */
614         ptr2++; ptr1+=p_mask_size; ntree_map++;
615       } else { /* i don't but still might be involved in tree */
616 
617         /* shared by how many? */
618         ct1 = PCTFS_ct_bits((char*)ptr3,p_mask_size*sizeof(PetscInt));
619 
620         /* none! */
621         if (ct1<2) continue;
622 
623         /* is it going to be done pairwise? but not by me of course!*/
624         if (--ct1<=level) continue;
625       }
626       /* LATER we're going to have to process it NOW */
627       /* nope ... tree it */
628       ierr = place_in_tree(j);CHKERRQ(ierr);
629     }
630   }
631 
632   free((void*)t_mask);
633   free((void*)buf1);
634 
635   gs->len_pw_list = npw;
636   gs->num_nghs    = PCTFS_ct_bits((char*)sh_proc_mask,p_mask_size*sizeof(PetscInt));
637 
638   /* expand from bit mask list to int list and save ngh list */
639   gs->nghs = (PetscInt*) malloc(gs->num_nghs * sizeof(PetscInt));
640   PCTFS_bm_to_proc((char*)sh_proc_mask,p_mask_size*sizeof(PetscInt),gs->nghs);
641 
642   gs->num_pw_nghs = PCTFS_ct_bits((char*)pw_sh_proc_mask,p_mask_size*sizeof(PetscInt));
643 
644   oper         = GL_MAX;
645   ct1          = gs->num_nghs;
646   ierr         = PCTFS_giop(&ct1,&ct2,1,&oper);CHKERRQ(ierr);
647   gs->max_nghs = ct1;
648 
649   gs->tree_map_sz  = ntree_map;
650   gs->max_left_over=ntree;
651 
652   free((void*)p_mask);
653   free((void*)sh_proc_mask);
654   PetscFunctionReturn(0);
655 }
656 
657 /******************************************************************************/
658 static PetscErrorCode set_pairwise(PCTFS_gs_id *gs)
659 {
660   PetscInt       i, j;
661   PetscInt       p_mask_size;
662   PetscInt       *p_mask, *sh_proc_mask, *tmp_proc_mask;
663   PetscInt       *ngh_buf, *buf2;
664   PetscInt       offset;
665   PetscInt       *msg_list, *msg_size, **msg_nodes, nprs;
666   PetscInt       *pairwise_elm_list, len_pair_list=0;
667   PetscInt       *iptr, t1, i_start, nel, *elms;
668   PetscInt       ct;
669   PetscErrorCode ierr;
670 
671   PetscFunctionBegin;
672   /* to make life easier */
673   nel          = gs->nel;
674   elms         = gs->elms;
675   ngh_buf      = gs->ngh_buf;
676   sh_proc_mask = gs->pw_nghs;
677 
678   /* need a few temp masks */
679   p_mask_size   = PCTFS_len_bit_mask(PCTFS_num_nodes);
680   p_mask        = (PetscInt*) malloc(p_mask_size);
681   tmp_proc_mask = (PetscInt*) malloc(p_mask_size);
682 
683   /* set mask to my PCTFS_my_id's bit mask */
684   ierr = PCTFS_set_bit_mask(p_mask,p_mask_size,PCTFS_my_id);CHKERRQ(ierr);
685 
686   p_mask_size /= sizeof(PetscInt);
687 
688   len_pair_list   = gs->len_pw_list;
689   gs->pw_elm_list = pairwise_elm_list=(PetscInt*)malloc((len_pair_list+1)*sizeof(PetscInt));
690 
691   /* how many processors (nghs) do we have to exchange with? */
692   nprs = gs->num_pairs = PCTFS_ct_bits((char*)sh_proc_mask,p_mask_size*sizeof(PetscInt));
693 
694 
695   /* allocate space for PCTFS_gs_gop() info */
696   gs->pair_list = msg_list  = (PetscInt*)  malloc(sizeof(PetscInt)*nprs);
697   gs->msg_sizes = msg_size  = (PetscInt*)  malloc(sizeof(PetscInt)*nprs);
698   gs->node_list = msg_nodes = (PetscInt**) malloc(sizeof(PetscInt*)*(nprs+1));
699 
700   /* init msg_size list */
701   ierr = PCTFS_ivec_zero(msg_size,nprs);CHKERRQ(ierr);
702 
703   /* expand from bit mask list to int list */
704   ierr = PCTFS_bm_to_proc((char*)sh_proc_mask,p_mask_size*sizeof(PetscInt),msg_list);CHKERRQ(ierr);
705 
706   /* keep list of elements being handled pairwise */
707   for (i=j=0; i<nel; i++) {
708     if (elms[i] & TOP_BIT) { elms[i] ^= TOP_BIT; pairwise_elm_list[j++] = i; }
709   }
710   pairwise_elm_list[j] = -1;
711 
712   gs->msg_ids_out       = (MPI_Request*)  malloc(sizeof(MPI_Request)*(nprs+1));
713   gs->msg_ids_out[nprs] = MPI_REQUEST_NULL;
714   gs->msg_ids_in        = (MPI_Request*)  malloc(sizeof(MPI_Request)*(nprs+1));
715   gs->msg_ids_in[nprs]  = MPI_REQUEST_NULL;
716   gs->pw_vals           = (PetscScalar*) malloc(sizeof(PetscScalar)*len_pair_list*vec_sz);
717 
718   /* find who goes to each processor */
719   for (i_start=i=0; i<nprs; i++) {
720     /* processor i's mask */
721     ierr = PCTFS_set_bit_mask(p_mask,p_mask_size*sizeof(PetscInt),msg_list[i]);CHKERRQ(ierr);
722 
723     /* det # going to processor i */
724     for (ct=j=0; j<len_pair_list; j++) {
725       buf2 = ngh_buf+(pairwise_elm_list[j]*p_mask_size);
726       ierr = PCTFS_ivec_and3(tmp_proc_mask,p_mask,buf2,p_mask_size);CHKERRQ(ierr);
727       if (PCTFS_ct_bits((char*)tmp_proc_mask,p_mask_size*sizeof(PetscInt))) ct++;
728     }
729     msg_size[i] = ct;
730     i_start     = PetscMax(i_start,ct);
731 
732     /*space to hold nodes in message to first neighbor */
733     msg_nodes[i] = iptr = (PetscInt*) malloc(sizeof(PetscInt)*(ct+1));
734 
735     for (j=0;j<len_pair_list;j++) {
736       buf2 = ngh_buf+(pairwise_elm_list[j]*p_mask_size);
737       ierr = PCTFS_ivec_and3(tmp_proc_mask,p_mask,buf2,p_mask_size);CHKERRQ(ierr);
738       if (PCTFS_ct_bits((char*)tmp_proc_mask,p_mask_size*sizeof(PetscInt))) *iptr++ = j;
739     }
740     *iptr = -1;
741   }
742   msg_nodes[nprs] = NULL;
743 
744   j                  = gs->loc_node_pairs=i_start;
745   t1                 = GL_MAX;
746   ierr               = PCTFS_giop(&i_start,&offset,1,&t1);CHKERRQ(ierr);
747   gs->max_node_pairs = i_start;
748 
749   i_start            = j;
750   t1                 = GL_MIN;
751   ierr               = PCTFS_giop(&i_start,&offset,1,&t1);CHKERRQ(ierr);
752   gs->min_node_pairs = i_start;
753 
754   i_start            = j;
755   t1                 = GL_ADD;
756   ierr               = PCTFS_giop(&i_start,&offset,1,&t1);CHKERRQ(ierr);
757   gs->avg_node_pairs = i_start/PCTFS_num_nodes + 1;
758 
759   i_start = nprs;
760   t1      = GL_MAX;
761   PCTFS_giop(&i_start,&offset,1,&t1);
762   gs->max_pairs = i_start;
763 
764 
765   /* remap pairwise in tail of gsi_via_bit_mask() */
766   gs->msg_total = PCTFS_ivec_sum(gs->msg_sizes,nprs);
767   gs->out       = (PetscScalar*) malloc(sizeof(PetscScalar)*gs->msg_total*vec_sz);
768   gs->in        = (PetscScalar*) malloc(sizeof(PetscScalar)*gs->msg_total*vec_sz);
769 
770   /* reset malloc pool */
771   free((void*)p_mask);
772   free((void*)tmp_proc_mask);
773   PetscFunctionReturn(0);
774 }
775 
776 /* to do pruned tree just save ngh buf copy for each one and decode here!
777 ******************************************************************************/
778 static PetscErrorCode set_tree(PCTFS_gs_id *gs)
779 {
780   PetscInt i, j, n, nel;
781   PetscInt *iptr_in, *iptr_out, *tree_elms, *elms;
782 
783   PetscFunctionBegin;
784   /* local work ptrs */
785   elms = gs->elms;
786   nel  = gs->nel;
787 
788   /* how many via tree */
789   gs->tree_nel     = n = ntree;
790   gs->tree_elms    = tree_elms = iptr_in = tree_buf;
791   gs->tree_buf     = (PetscScalar*) malloc(sizeof(PetscScalar)*n*vec_sz);
792   gs->tree_work    = (PetscScalar*) malloc(sizeof(PetscScalar)*n*vec_sz);
793   j                = gs->tree_map_sz;
794   gs->tree_map_in  = iptr_in  = (PetscInt*) malloc(sizeof(PetscInt)*(j+1));
795   gs->tree_map_out = iptr_out = (PetscInt*) malloc(sizeof(PetscInt)*(j+1));
796 
797   /* search the longer of the two lists */
798   /* note ... could save this info in get_ngh_buf and save searches */
799   if (n<=nel) {
800     /* bijective fct w/remap - search elm list */
801     for (i=0; i<n; i++) {
802       if ((j=PCTFS_ivec_binary_search(*tree_elms++,elms,nel))>=0) {*iptr_in++ = j; *iptr_out++ = i;}
803     }
804   } else {
805     for (i=0; i<nel; i++) {
806       if ((j=PCTFS_ivec_binary_search(*elms++,tree_elms,n))>=0) {*iptr_in++ = i; *iptr_out++ = j;}
807     }
808   }
809 
810   /* sentinel */
811   *iptr_in = *iptr_out = -1;
812   PetscFunctionReturn(0);
813 }
814 
815 /******************************************************************************/
816 static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs,  PetscScalar *vals)
817 {
818   PetscInt    *num, *map, **reduce;
819   PetscScalar tmp;
820 
821   PetscFunctionBegin;
822   num    = gs->num_gop_local_reduce;
823   reduce = gs->gop_local_reduce;
824   while ((map = *reduce++)) {
825     /* wall */
826     if (*num == 2) {
827       num++;
828       vals[map[1]] = vals[map[0]];
829     } else if (*num == 3) { /* corner shared by three elements */
830       num++;
831       vals[map[2]] = vals[map[1]] = vals[map[0]];
832     } else if (*num == 4) { /* corner shared by four elements */
833       num++;
834       vals[map[3]] = vals[map[2]] = vals[map[1]] = vals[map[0]];
835     } else { /* general case ... odd geoms ... 3D*/
836       num++;
837       tmp = *(vals + *map++);
838       while (*map >= 0) *(vals + *map++) = tmp;
839     }
840   }
841   PetscFunctionReturn(0);
842 }
843 
844 /******************************************************************************/
845 static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs,  PetscScalar *vals)
846 {
847   PetscInt    *num, *map, **reduce;
848   PetscScalar tmp;
849 
850   PetscFunctionBegin;
851   num    = gs->num_local_reduce;
852   reduce = gs->local_reduce;
853   while ((map = *reduce)) {
854     /* wall */
855     if (*num == 2) {
856       num++; reduce++;
857       vals[map[1]] = vals[map[0]] += vals[map[1]];
858     } else if (*num == 3) { /* corner shared by three elements */
859       num++; reduce++;
860       vals[map[2]]=vals[map[1]]=vals[map[0]]+=(vals[map[1]]+vals[map[2]]);
861     } else if (*num == 4) { /* corner shared by four elements */
862       num++; reduce++;
863       vals[map[1]]=vals[map[2]]=vals[map[3]]=vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]);
864     } else { /* general case ... odd geoms ... 3D*/
865       num++;
866       tmp = 0.0;
867       while (*map >= 0) tmp += *(vals + *map++);
868 
869       map = *reduce++;
870       while (*map >= 0) *(vals + *map++) = tmp;
871     }
872   }
873   PetscFunctionReturn(0);
874 }
875 
876 /******************************************************************************/
877 static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs,  PetscScalar *vals)
878 {
879   PetscInt    *num, *map, **reduce;
880   PetscScalar *base;
881 
882   PetscFunctionBegin;
883   num    = gs->num_gop_local_reduce;
884   reduce = gs->gop_local_reduce;
885   while ((map = *reduce++)) {
886     /* wall */
887     if (*num == 2) {
888       num++;
889       vals[map[0]] += vals[map[1]];
890     } else if (*num == 3) { /* corner shared by three elements */
891       num++;
892       vals[map[0]] += (vals[map[1]] + vals[map[2]]);
893     } else if (*num == 4) { /* corner shared by four elements */
894       num++;
895       vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]);
896     } else { /* general case ... odd geoms ... 3D*/
897       num++;
898       base = vals + *map++;
899       while (*map >= 0) *base += *(vals + *map++);
900     }
901   }
902   PetscFunctionReturn(0);
903 }
904 
905 /******************************************************************************/
906 PetscErrorCode PCTFS_gs_free(PCTFS_gs_id *gs)
907 {
908   PetscInt i;
909 
910   PetscFunctionBegin;
911   if (gs->nghs) free((void*) gs->nghs);
912   if (gs->pw_nghs) free((void*) gs->pw_nghs);
913 
914   /* tree */
915   if (gs->max_left_over) {
916     if (gs->tree_elms) free((void*) gs->tree_elms);
917     if (gs->tree_buf) free((void*) gs->tree_buf);
918     if (gs->tree_work) free((void*) gs->tree_work);
919     if (gs->tree_map_in) free((void*) gs->tree_map_in);
920     if (gs->tree_map_out) free((void*) gs->tree_map_out);
921   }
922 
923   /* pairwise info */
924   if (gs->num_pairs) {
925     /* should be NULL already */
926     if (gs->ngh_buf) free((void*) gs->ngh_buf);
927     if (gs->elms) free((void*) gs->elms);
928     if (gs->local_elms) free((void*) gs->local_elms);
929     if (gs->companion) free((void*) gs->companion);
930 
931     /* only set if pairwise */
932     if (gs->vals) free((void*) gs->vals);
933     if (gs->in) free((void*) gs->in);
934     if (gs->out) free((void*) gs->out);
935     if (gs->msg_ids_in) free((void*) gs->msg_ids_in);
936     if (gs->msg_ids_out) free((void*) gs->msg_ids_out);
937     if (gs->pw_vals) free((void*) gs->pw_vals);
938     if (gs->pw_elm_list) free((void*) gs->pw_elm_list);
939     if (gs->node_list) {
940       for (i=0;i<gs->num_pairs;i++) {
941         if (gs->node_list[i])  {
942           free((void*) gs->node_list[i]);
943         }
944       }
945       free((void*) gs->node_list);
946     }
947     if (gs->msg_sizes) free((void*) gs->msg_sizes);
948     if (gs->pair_list) free((void*) gs->pair_list);
949   }
950 
951   /* local info */
952   if (gs->num_local_total>=0) {
953     /*      for (i=0;i<gs->num_local_total;i++) */
954     for (i=0;i<gs->num_local_total+1;i++) {
955       if (gs->num_gop_local_reduce[i]) free((void*) gs->gop_local_reduce[i]);
956     }
957   }
958 
959   /* if intersection tree/pairwise and local isn't empty */
960   if (gs->gop_local_reduce) free((void*) gs->gop_local_reduce);
961   if (gs->num_gop_local_reduce) free((void*) gs->num_gop_local_reduce);
962 
963   free((void*) gs);
964   PetscFunctionReturn(0);
965 }
966 
967 /******************************************************************************/
968 PetscErrorCode PCTFS_gs_gop_vec(PCTFS_gs_id *gs,  PetscScalar *vals,  const char *op,  PetscInt step)
969 {
970   PetscErrorCode ierr;
971 
972   PetscFunctionBegin;
973   switch (*op) {
974   case '+':
975     PCTFS_gs_gop_vec_plus(gs,vals,step);
976     break;
977   default:
978     ierr = PetscInfo1(0,"PCTFS_gs_gop_vec() :: %c is not a valid op\n",op[0]);CHKERRQ(ierr);
979     ierr = PetscInfo(0,"PCTFS_gs_gop_vec() :: default :: plus\n");CHKERRQ(ierr);
980     PCTFS_gs_gop_vec_plus(gs,vals,step);
981     break;
982   }
983   PetscFunctionReturn(0);
984 }
985 
986 /******************************************************************************/
987 static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs,  PetscScalar *vals,  PetscInt step)
988 {
989   PetscFunctionBegin;
990   if (!gs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"PCTFS_gs_gop_vec() passed NULL gs handle!!!");
991 
992   /* local only operations!!! */
993   if (gs->num_local) PCTFS_gs_gop_vec_local_plus(gs,vals,step);
994 
995   /* if intersection tree/pairwise and local isn't empty */
996   if (gs->num_local_gop) {
997     PCTFS_gs_gop_vec_local_in_plus(gs,vals,step);
998 
999     /* pairwise */
1000     if (gs->num_pairs) PCTFS_gs_gop_vec_pairwise_plus(gs,vals,step);
1001 
1002     /* tree */
1003     else if (gs->max_left_over) PCTFS_gs_gop_vec_tree_plus(gs,vals,step);
1004 
1005     PCTFS_gs_gop_vec_local_out(gs,vals,step);
1006   } else { /* if intersection tree/pairwise and local is empty */
1007     /* pairwise */
1008     if (gs->num_pairs) PCTFS_gs_gop_vec_pairwise_plus(gs,vals,step);
1009 
1010     /* tree */
1011     else if (gs->max_left_over) PCTFS_gs_gop_vec_tree_plus(gs,vals,step);
1012   }
1013   PetscFunctionReturn(0);
1014 }
1015 
1016 /******************************************************************************/
1017 static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs,  PetscScalar *vals, PetscInt step)
1018 {
1019   PetscInt    *num, *map, **reduce;
1020   PetscScalar *base;
1021 
1022   PetscFunctionBegin;
1023   num    = gs->num_local_reduce;
1024   reduce = gs->local_reduce;
1025   while ((map = *reduce)) {
1026     base = vals + map[0] * step;
1027 
1028     /* wall */
1029     if (*num == 2) {
1030       num++; reduce++;
1031       PCTFS_rvec_add (base,vals+map[1]*step,step);
1032       PCTFS_rvec_copy(vals+map[1]*step,base,step);
1033     } else if (*num == 3) { /* corner shared by three elements */
1034       num++; reduce++;
1035       PCTFS_rvec_add (base,vals+map[1]*step,step);
1036       PCTFS_rvec_add (base,vals+map[2]*step,step);
1037       PCTFS_rvec_copy(vals+map[2]*step,base,step);
1038       PCTFS_rvec_copy(vals+map[1]*step,base,step);
1039     } else if (*num == 4) { /* corner shared by four elements */
1040       num++; reduce++;
1041       PCTFS_rvec_add (base,vals+map[1]*step,step);
1042       PCTFS_rvec_add (base,vals+map[2]*step,step);
1043       PCTFS_rvec_add (base,vals+map[3]*step,step);
1044       PCTFS_rvec_copy(vals+map[3]*step,base,step);
1045       PCTFS_rvec_copy(vals+map[2]*step,base,step);
1046       PCTFS_rvec_copy(vals+map[1]*step,base,step);
1047     } else { /* general case ... odd geoms ... 3D */
1048       num++;
1049       while (*++map >= 0) PCTFS_rvec_add (base,vals+*map*step,step);
1050 
1051       map = *reduce;
1052       while (*++map >= 0) PCTFS_rvec_copy(vals+*map*step,base,step);
1053 
1054       reduce++;
1055     }
1056   }
1057   PetscFunctionReturn(0);
1058 }
1059 
1060 /******************************************************************************/
1061 static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs,  PetscScalar *vals, PetscInt step)
1062 {
1063   PetscInt    *num, *map, **reduce;
1064   PetscScalar *base;
1065 
1066   PetscFunctionBegin;
1067   num    = gs->num_gop_local_reduce;
1068   reduce = gs->gop_local_reduce;
1069   while ((map = *reduce++)) {
1070     base = vals + map[0] * step;
1071 
1072     /* wall */
1073     if (*num == 2) {
1074       num++;
1075       PCTFS_rvec_add(base,vals+map[1]*step,step);
1076     } else if (*num == 3) { /* corner shared by three elements */
1077       num++;
1078       PCTFS_rvec_add(base,vals+map[1]*step,step);
1079       PCTFS_rvec_add(base,vals+map[2]*step,step);
1080     } else if (*num == 4) { /* corner shared by four elements */
1081       num++;
1082       PCTFS_rvec_add(base,vals+map[1]*step,step);
1083       PCTFS_rvec_add(base,vals+map[2]*step,step);
1084       PCTFS_rvec_add(base,vals+map[3]*step,step);
1085     } else { /* general case ... odd geoms ... 3D*/
1086       num++;
1087       while (*++map >= 0) PCTFS_rvec_add(base,vals+*map*step,step);
1088     }
1089   }
1090   PetscFunctionReturn(0);
1091 }
1092 
1093 /******************************************************************************/
1094 static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs,  PetscScalar *vals, PetscInt step)
1095 {
1096   PetscInt    *num, *map, **reduce;
1097   PetscScalar *base;
1098 
1099   PetscFunctionBegin;
1100   num    = gs->num_gop_local_reduce;
1101   reduce = gs->gop_local_reduce;
1102   while ((map = *reduce++)) {
1103     base = vals + map[0] * step;
1104 
1105     /* wall */
1106     if (*num == 2) {
1107       num++;
1108       PCTFS_rvec_copy(vals+map[1]*step,base,step);
1109     } else if (*num == 3) { /* corner shared by three elements */
1110       num++;
1111       PCTFS_rvec_copy(vals+map[1]*step,base,step);
1112       PCTFS_rvec_copy(vals+map[2]*step,base,step);
1113     } else if (*num == 4) { /* corner shared by four elements */
1114       num++;
1115       PCTFS_rvec_copy(vals+map[1]*step,base,step);
1116       PCTFS_rvec_copy(vals+map[2]*step,base,step);
1117       PCTFS_rvec_copy(vals+map[3]*step,base,step);
1118     } else { /* general case ... odd geoms ... 3D*/
1119       num++;
1120       while (*++map >= 0) PCTFS_rvec_copy(vals+*map*step,base,step);
1121     }
1122   }
1123   PetscFunctionReturn(0);
1124 }
1125 
1126 /******************************************************************************/
1127 static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs,  PetscScalar *in_vals, PetscInt step)
1128 {
1129   PetscScalar    *dptr1, *dptr2, *dptr3, *in1, *in2;
1130   PetscInt       *iptr, *msg_list, *msg_size, **msg_nodes;
1131   PetscInt       *pw, *list, *size, **nodes;
1132   MPI_Request    *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
1133   MPI_Status     status;
1134   PetscBLASInt   i1 = 1,dstep;
1135   PetscErrorCode ierr;
1136 
1137   PetscFunctionBegin;
1138   /* strip and load s */
1139   msg_list    = list     = gs->pair_list;
1140   msg_size    = size     = gs->msg_sizes;
1141   msg_nodes   = nodes    = gs->node_list;
1142   iptr        = pw       = gs->pw_elm_list;
1143   dptr1       = dptr3    = gs->pw_vals;
1144   msg_ids_in  = ids_in   = gs->msg_ids_in;
1145   msg_ids_out = ids_out  = gs->msg_ids_out;
1146   dptr2                  = gs->out;
1147   in1=in2                = gs->in;
1148 
1149   /* post the receives */
1150   /*  msg_nodes=nodes; */
1151   do {
1152     /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
1153         second one *list and do list++ afterwards */
1154     ierr = MPI_Irecv(in1, *size *step, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in);CHKERRQ(ierr);
1155     list++;msg_ids_in++;
1156     in1 += *size++ *step;
1157   } while (*++msg_nodes);
1158   msg_nodes=nodes;
1159 
1160   /* load gs values into in out gs buffers */
1161   while (*iptr >= 0) {
1162     PCTFS_rvec_copy(dptr3,in_vals + *iptr*step,step);
1163     dptr3+=step;
1164     iptr++;
1165   }
1166 
1167   /* load out buffers and post the sends */
1168   while ((iptr = *msg_nodes++)) {
1169     dptr3 = dptr2;
1170     while (*iptr >= 0) {
1171       PCTFS_rvec_copy(dptr2,dptr1 + *iptr*step,step);
1172       dptr2+=step;
1173       iptr++;
1174     }
1175     ierr = MPI_Isend(dptr3, *msg_size *step, MPIU_SCALAR, *msg_list, MSGTAG1+PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out);CHKERRQ(ierr);
1176     msg_size++; msg_list++;msg_ids_out++;
1177   }
1178 
1179   /* tree */
1180   if (gs->max_left_over) PCTFS_gs_gop_vec_tree_plus(gs,in_vals,step);
1181 
1182   /* process the received data */
1183   msg_nodes=nodes;
1184   while ((iptr = *nodes++)) {
1185     PetscScalar d1 = 1.0;
1186 
1187     /* Should I check the return value of MPI_Wait() or status? */
1188     /* Can this loop be replaced by a call to MPI_Waitall()? */
1189     ierr = MPI_Wait(ids_in, &status);CHKERRQ(ierr);
1190     ids_in++;
1191     while (*iptr >= 0) {
1192       ierr = PetscBLASIntCast(step,&dstep);CHKERRQ(ierr);
1193       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&dstep,&d1,in2,&i1,dptr1 + *iptr*step,&i1));
1194       in2+=step;
1195       iptr++;
1196     }
1197   }
1198 
1199   /* replace vals */
1200   while (*pw >= 0) {
1201     PCTFS_rvec_copy(in_vals + *pw*step,dptr1,step);
1202     dptr1+=step;
1203     pw++;
1204   }
1205 
1206   /* clear isend message handles */
1207   /* This changed for clarity though it could be the same */
1208 
1209   /* Should I check the return value of MPI_Wait() or status? */
1210   /* Can this loop be replaced by a call to MPI_Waitall()? */
1211   while (*msg_nodes++) {
1212     ierr = MPI_Wait(ids_out, &status);CHKERRQ(ierr);
1213     ids_out++;
1214   }
1215   PetscFunctionReturn(0);
1216 }
1217 
1218 /******************************************************************************/
1219 static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs,  PetscScalar *vals,  PetscInt step)
1220 {
1221   PetscInt       size, *in, *out;
1222   PetscScalar    *buf, *work;
1223   PetscInt       op[] = {GL_ADD,0};
1224   PetscBLASInt   i1   = 1;
1225   PetscErrorCode ierr;
1226   PetscBLASInt   dstep;
1227 
1228   PetscFunctionBegin;
1229   /* copy over to local variables */
1230   in   = gs->tree_map_in;
1231   out  = gs->tree_map_out;
1232   buf  = gs->tree_buf;
1233   work = gs->tree_work;
1234   size = gs->tree_nel*step;
1235 
1236   /* zero out collection buffer */
1237   PCTFS_rvec_zero(buf,size);
1238 
1239 
1240   /* copy over my contributions */
1241   while (*in >= 0) {
1242     ierr = PetscBLASIntCast(step,&dstep);CHKERRQ(ierr);
1243     PetscStackCallBLAS("BLAScopy",BLAScopy_(&dstep,vals + *in++ * step,&i1,buf + *out++ * step,&i1));
1244   }
1245 
1246   /* perform fan in/out on full buffer */
1247   /* must change PCTFS_grop to handle the blas */
1248   PCTFS_grop(buf,work,size,op);
1249 
1250   /* reset */
1251   in  = gs->tree_map_in;
1252   out = gs->tree_map_out;
1253 
1254   /* get the portion of the results I need */
1255   while (*in >= 0) {
1256     ierr = PetscBLASIntCast(step,&dstep);CHKERRQ(ierr);
1257     PetscStackCallBLAS("BLAScopy",BLAScopy_(&dstep,buf + *out++ * step,&i1,vals + *in++ * step,&i1));
1258   }
1259   PetscFunctionReturn(0);
1260 }
1261 
1262 /******************************************************************************/
1263 PetscErrorCode PCTFS_gs_gop_hc(PCTFS_gs_id *gs,  PetscScalar *vals,  const char *op,  PetscInt dim)
1264 {
1265   PetscErrorCode ierr;
1266 
1267   PetscFunctionBegin;
1268   switch (*op) {
1269   case '+':
1270     PCTFS_gs_gop_plus_hc(gs,vals,dim);
1271     break;
1272   default:
1273     ierr = PetscInfo1(0,"PCTFS_gs_gop_hc() :: %c is not a valid op\n",op[0]);CHKERRQ(ierr);
1274     ierr = PetscInfo(0,"PCTFS_gs_gop_hc() :: default :: plus\n");CHKERRQ(ierr);
1275     PCTFS_gs_gop_plus_hc(gs,vals,dim);
1276     break;
1277   }
1278   PetscFunctionReturn(0);
1279 }
1280 
1281 /******************************************************************************/
1282 static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs,  PetscScalar *vals, PetscInt dim)
1283 {
1284   PetscFunctionBegin;
1285   /* if there's nothing to do return */
1286   if (dim<=0) PetscFunctionReturn(0);
1287 
1288   /* can't do more dimensions then exist */
1289   dim = PetscMin(dim,PCTFS_i_log2_num_nodes);
1290 
1291   /* local only operations!!! */
1292   if (gs->num_local) PCTFS_gs_gop_local_plus(gs,vals);
1293 
1294   /* if intersection tree/pairwise and local isn't empty */
1295   if (gs->num_local_gop) {
1296     PCTFS_gs_gop_local_in_plus(gs,vals);
1297 
1298     /* pairwise will do tree inside ... */
1299     if (gs->num_pairs) PCTFS_gs_gop_pairwise_plus_hc(gs,vals,dim); /* tree only */
1300     else if (gs->max_left_over) PCTFS_gs_gop_tree_plus_hc(gs,vals,dim);
1301 
1302     PCTFS_gs_gop_local_out(gs,vals);
1303   } else { /* if intersection tree/pairwise and local is empty */
1304     /* pairwise will do tree inside */
1305     if (gs->num_pairs) PCTFS_gs_gop_pairwise_plus_hc(gs,vals,dim); /* tree */
1306     else if (gs->max_left_over) PCTFS_gs_gop_tree_plus_hc(gs,vals,dim);
1307   }
1308   PetscFunctionReturn(0);
1309 }
1310 
1311 /******************************************************************************/
1312 static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs,  PetscScalar *in_vals, PetscInt dim)
1313 {
1314   PetscScalar    *dptr1, *dptr2, *dptr3, *in1, *in2;
1315   PetscInt       *iptr, *msg_list, *msg_size, **msg_nodes;
1316   PetscInt       *pw, *list, *size, **nodes;
1317   MPI_Request    *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
1318   MPI_Status     status;
1319   PetscInt       i, mask=1;
1320   PetscErrorCode ierr;
1321 
1322   PetscFunctionBegin;
1323   for (i=1; i<dim; i++) { mask<<=1; mask++; }
1324 
1325   /* strip and load s */
1326   msg_list    = list     = gs->pair_list;
1327   msg_size    = size     = gs->msg_sizes;
1328   msg_nodes   = nodes    = gs->node_list;
1329   iptr        = pw       = gs->pw_elm_list;
1330   dptr1       = dptr3    = gs->pw_vals;
1331   msg_ids_in  = ids_in   = gs->msg_ids_in;
1332   msg_ids_out = ids_out  = gs->msg_ids_out;
1333   dptr2       = gs->out;
1334   in1         = in2      = gs->in;
1335 
1336   /* post the receives */
1337   /*  msg_nodes=nodes; */
1338   do {
1339     /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
1340         second one *list and do list++ afterwards */
1341     if ((PCTFS_my_id|mask)==(*list|mask)) {
1342       ierr = MPI_Irecv(in1, *size, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in);CHKERRQ(ierr);
1343       list++; msg_ids_in++;in1 += *size++;
1344     } else { list++; size++; }
1345   } while (*++msg_nodes);
1346 
1347   /* load gs values into in out gs buffers */
1348   while (*iptr >= 0) *dptr3++ = *(in_vals + *iptr++);
1349 
1350   /* load out buffers and post the sends */
1351   msg_nodes=nodes;
1352   list     = msg_list;
1353   while ((iptr = *msg_nodes++)) {
1354     if ((PCTFS_my_id|mask)==(*list|mask)) {
1355       dptr3 = dptr2;
1356       while (*iptr >= 0) *dptr2++ = *(dptr1 + *iptr++);
1357       /* CHECK PERSISTENT COMMS MODE FOR ALL THIS STUFF */
1358       /* is msg_ids_out++ correct? */
1359       ierr = MPI_Isend(dptr3, *msg_size, MPIU_SCALAR, *list, MSGTAG1+PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out);CHKERRQ(ierr);
1360       msg_size++;list++;msg_ids_out++;
1361     } else {list++; msg_size++;}
1362   }
1363 
1364   /* do the tree while we're waiting */
1365   if (gs->max_left_over) PCTFS_gs_gop_tree_plus_hc(gs,in_vals,dim);
1366 
1367   /* process the received data */
1368   msg_nodes=nodes;
1369   list     = msg_list;
1370   while ((iptr = *nodes++)) {
1371     if ((PCTFS_my_id|mask)==(*list|mask)) {
1372       /* Should I check the return value of MPI_Wait() or status? */
1373       /* Can this loop be replaced by a call to MPI_Waitall()? */
1374       ierr = MPI_Wait(ids_in, &status);CHKERRQ(ierr);
1375       ids_in++;
1376       while (*iptr >= 0) *(dptr1 + *iptr++) += *in2++;
1377     }
1378     list++;
1379   }
1380 
1381   /* replace vals */
1382   while (*pw >= 0) *(in_vals + *pw++) = *dptr1++;
1383 
1384   /* clear isend message handles */
1385   /* This changed for clarity though it could be the same */
1386   while (*msg_nodes++) {
1387     if ((PCTFS_my_id|mask)==(*msg_list|mask)) {
1388       /* Should I check the return value of MPI_Wait() or status? */
1389       /* Can this loop be replaced by a call to MPI_Waitall()? */
1390       ierr = MPI_Wait(ids_out, &status);CHKERRQ(ierr);
1391       ids_out++;
1392     }
1393     msg_list++;
1394   }
1395   PetscFunctionReturn(0);
1396 }
1397 
1398 /******************************************************************************/
1399 static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim)
1400 {
1401   PetscInt    size;
1402   PetscInt    *in, *out;
1403   PetscScalar *buf, *work;
1404   PetscInt    op[] = {GL_ADD,0};
1405 
1406   PetscFunctionBegin;
1407   in   = gs->tree_map_in;
1408   out  = gs->tree_map_out;
1409   buf  = gs->tree_buf;
1410   work = gs->tree_work;
1411   size = gs->tree_nel;
1412 
1413   PCTFS_rvec_zero(buf,size);
1414 
1415   while (*in >= 0) *(buf + *out++) = *(vals + *in++);
1416 
1417   in  = gs->tree_map_in;
1418   out = gs->tree_map_out;
1419 
1420   PCTFS_grop_hc(buf,work,size,op,dim);
1421 
1422   while (*in >= 0) *(vals + *in++) = *(buf + *out++);
1423   PetscFunctionReturn(0);
1424 }
1425 
1426 
1427 
1428