xref: /petsc/src/ksp/pc/impls/tfs/gs.c (revision 58d68138c660dfb4e9f5b03334792cd4f2ffd7cc)
1 
2 /***********************************gs.c***************************************
3 
4 Author: Henry M. Tufo III
5 
6 e-mail: hmt@cs.brown.edu
7 
8 snail-mail:
9 Division of Applied Mathematics
10 Brown University
11 Providence, RI 02912
12 
13 Last Modification:
14 6.21.97
15 ************************************gs.c**************************************/
16 
17 /***********************************gs.c***************************************
18 File Description:
19 -----------------
20 
21 ************************************gs.c**************************************/
22 
23 #include <../src/ksp/pc/impls/tfs/tfs.h>
24 
25 /* default length of number of items via tree - doubles if exceeded */
26 #define TREE_BUF_SZ 2048;
27 #define GS_VEC_SZ   1
28 
29 /***********************************gs.c***************************************
30 Type: struct gather_scatter_id
31 ------------------------------
32 
33 ************************************gs.c**************************************/
34 typedef struct gather_scatter_id {
35   PetscInt     id;
36   PetscInt     nel_min;
37   PetscInt     nel_max;
38   PetscInt     nel_sum;
39   PetscInt     negl;
40   PetscInt     gl_max;
41   PetscInt     gl_min;
42   PetscInt     repeats;
43   PetscInt     ordered;
44   PetscInt     positive;
45   PetscScalar *vals;
46 
47   /* bit mask info */
48   PetscInt *my_proc_mask;
49   PetscInt  mask_sz;
50   PetscInt *ngh_buf;
51   PetscInt  ngh_buf_sz;
52   PetscInt *nghs;
53   PetscInt  num_nghs;
54   PetscInt  max_nghs;
55   PetscInt *pw_nghs;
56   PetscInt  num_pw_nghs;
57   PetscInt *tree_nghs;
58   PetscInt  num_tree_nghs;
59 
60   PetscInt num_loads;
61 
62   /* repeats == true -> local info */
63   PetscInt  nel;  /* number of unique elememts */
64   PetscInt *elms; /* of size nel */
65   PetscInt  nel_total;
66   PetscInt *local_elms; /* of size nel_total */
67   PetscInt *companion;  /* of size nel_total */
68 
69   /* local info */
70   PetscInt   num_local_total;
71   PetscInt   local_strength;
72   PetscInt   num_local;
73   PetscInt  *num_local_reduce;
74   PetscInt **local_reduce;
75   PetscInt   num_local_gop;
76   PetscInt  *num_gop_local_reduce;
77   PetscInt **gop_local_reduce;
78 
79   /* pairwise info */
80   PetscInt     level;
81   PetscInt     num_pairs;
82   PetscInt     max_pairs;
83   PetscInt     loc_node_pairs;
84   PetscInt     max_node_pairs;
85   PetscInt     min_node_pairs;
86   PetscInt     avg_node_pairs;
87   PetscInt    *pair_list;
88   PetscInt    *msg_sizes;
89   PetscInt   **node_list;
90   PetscInt     len_pw_list;
91   PetscInt    *pw_elm_list;
92   PetscScalar *pw_vals;
93 
94   MPI_Request *msg_ids_in;
95   MPI_Request *msg_ids_out;
96 
97   PetscScalar *out;
98   PetscScalar *in;
99   PetscInt     msg_total;
100 
101   /* tree - crystal accumulator info */
102   PetscInt   max_left_over;
103   PetscInt  *pre;
104   PetscInt  *in_num;
105   PetscInt  *out_num;
106   PetscInt **in_list;
107   PetscInt **out_list;
108 
109   /* new tree work*/
110   PetscInt     tree_nel;
111   PetscInt    *tree_elms;
112   PetscScalar *tree_buf;
113   PetscScalar *tree_work;
114 
115   PetscInt  tree_map_sz;
116   PetscInt *tree_map_in;
117   PetscInt *tree_map_out;
118 
119   /* current memory status */
120   PetscInt gl_bss_min;
121   PetscInt gl_perm_min;
122 
123   /* max segment size for PCTFS_gs_gop_vec() */
124   PetscInt vec_sz;
125 
126   /* hack to make paul happy */
127   MPI_Comm PCTFS_gs_comm;
128 
129 } PCTFS_gs_id;
130 
131 static PCTFS_gs_id   *gsi_check_args(PetscInt *elms, PetscInt nel, PetscInt level);
132 static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs);
133 static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs);
134 static PetscErrorCode set_pairwise(PCTFS_gs_id *gs);
135 static PCTFS_gs_id   *gsi_new(void);
136 static PetscErrorCode set_tree(PCTFS_gs_id *gs);
137 
138 /* same for all but vector flavor */
139 static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs, PetscScalar *vals);
140 /* vector flavor */
141 static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
142 
143 static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step);
144 static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step);
145 static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
146 static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
147 static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
148 
149 static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs, PetscScalar *vals);
150 static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals);
151 
152 static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim);
153 static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim);
154 static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim);
155 
156 /* global vars */
157 /* from comm.c module */
158 
159 static PetscInt num_gs_ids = 0;
160 
161 /* should make this dynamic ... later */
162 static PetscInt  msg_buf     = MAX_MSG_BUF;
163 static PetscInt  vec_sz      = GS_VEC_SZ;
164 static PetscInt *tree_buf    = NULL;
165 static PetscInt  tree_buf_sz = 0;
166 static PetscInt  ntree       = 0;
167 
168 /***************************************************************************/
169 PetscErrorCode PCTFS_gs_init_vec_sz(PetscInt size) {
170   PetscFunctionBegin;
171   vec_sz = size;
172   PetscFunctionReturn(0);
173 }
174 
175 /******************************************************************************/
176 PetscErrorCode PCTFS_gs_init_msg_buf_sz(PetscInt buf_size) {
177   PetscFunctionBegin;
178   msg_buf = buf_size;
179   PetscFunctionReturn(0);
180 }
181 
182 /******************************************************************************/
183 PCTFS_gs_id *PCTFS_gs_init(PetscInt *elms, PetscInt nel, PetscInt level) {
184   PCTFS_gs_id *gs;
185   MPI_Group    PCTFS_gs_group;
186   MPI_Comm     PCTFS_gs_comm;
187 
188   /* ensure that communication package has been initialized */
189   PCTFS_comm_init();
190 
191   /* determines if we have enough dynamic/semi-static memory */
192   /* checks input, allocs and sets gd_id template            */
193   gs = gsi_check_args(elms, nel, level);
194 
195   /* only bit mask version up and working for the moment    */
196   /* LATER :: get int list version working for sparse pblms */
197   PetscCallAbort(PETSC_COMM_WORLD, gsi_via_bit_mask(gs));
198 
199   PetscCallAbort(PETSC_COMM_WORLD, MPI_Comm_group(MPI_COMM_WORLD, &PCTFS_gs_group));
200   PetscCallAbort(PETSC_COMM_WORLD, MPI_Comm_create(MPI_COMM_WORLD, PCTFS_gs_group, &PCTFS_gs_comm));
201   PetscCallAbort(PETSC_COMM_WORLD, MPI_Group_free(&PCTFS_gs_group));
202 
203   gs->PCTFS_gs_comm = PCTFS_gs_comm;
204 
205   return (gs);
206 }
207 
208 /******************************************************************************/
209 static PCTFS_gs_id *gsi_new(void) {
210   PCTFS_gs_id *gs;
211   gs = (PCTFS_gs_id *)malloc(sizeof(PCTFS_gs_id));
212   PetscCallAbort(PETSC_COMM_WORLD, PetscMemzero(gs, sizeof(PCTFS_gs_id)));
213   return (gs);
214 }
215 
216 /******************************************************************************/
217 static PCTFS_gs_id *gsi_check_args(PetscInt *in_elms, PetscInt nel, PetscInt level) {
218   PetscInt     i, j, k, t2;
219   PetscInt    *companion, *elms, *unique, *iptr;
220   PetscInt     num_local = 0, *num_to_reduce, **local_reduce;
221   PetscInt     oprs[]    = {NON_UNIFORM, GL_MIN, GL_MAX, GL_ADD, GL_MIN, GL_MAX, GL_MIN, GL_B_AND};
222   PetscInt     vals[PETSC_STATIC_ARRAY_LENGTH(oprs) - 1];
223   PetscInt     work[PETSC_STATIC_ARRAY_LENGTH(oprs) - 1];
224   PCTFS_gs_id *gs;
225 
226   if (!in_elms) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "elms point to nothing!!!\n");
227   if (nel < 0) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "can't have fewer than 0 elms!!!\n");
228 
229   if (nel == 0) PetscCallAbort(PETSC_COMM_WORLD, PetscInfo(0, "I don't have any elements!!!\n"));
230 
231   /* get space for gs template */
232   gs     = gsi_new();
233   gs->id = ++num_gs_ids;
234 
235   /* hmt 6.4.99                                            */
236   /* caller can set global ids that don't participate to 0 */
237   /* PCTFS_gs_init ignores all zeros in elm list                 */
238   /* negative global ids are still invalid                 */
239   for (i = j = 0; i < nel; i++) {
240     if (in_elms[i] != 0) j++;
241   }
242 
243   k   = nel;
244   nel = j;
245 
246   /* copy over in_elms list and create inverse map */
247   elms      = (PetscInt *)malloc((nel + 1) * sizeof(PetscInt));
248   companion = (PetscInt *)malloc(nel * sizeof(PetscInt));
249 
250   for (i = j = 0; i < k; i++) {
251     if (in_elms[i] != 0) {
252       elms[j]        = in_elms[i];
253       companion[j++] = i;
254     }
255   }
256 
257   if (j != nel) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "nel j mismatch!\n");
258 
259   /* pre-pass ... check to see if sorted */
260   elms[nel] = INT_MAX;
261   iptr      = elms;
262   unique    = elms + 1;
263   j         = 0;
264   while (*iptr != INT_MAX) {
265     if (*iptr++ > *unique++) {
266       j = 1;
267       break;
268     }
269   }
270 
271   /* set up inverse map */
272   if (j) {
273     PetscCallAbort(PETSC_COMM_WORLD, PetscInfo(0, "gsi_check_args() :: elm list *not* sorted!\n"));
274     PetscCallAbort(PETSC_COMM_WORLD, PCTFS_SMI_sort((void *)elms, (void *)companion, nel, SORT_INTEGER));
275   } else PetscCallAbort(PETSC_COMM_WORLD, PetscInfo(0, "gsi_check_args() :: elm list sorted!\n"));
276   elms[nel] = INT_MIN;
277 
278   /* first pass */
279   /* determine number of unique elements, check pd */
280   for (i = k = 0; i < nel; i += j) {
281     t2 = elms[i];
282     j  = ++i;
283 
284     /* clump 'em for now */
285     while (elms[j] == t2) j++;
286 
287     /* how many together and num local */
288     if (j -= i) {
289       num_local++;
290       k += j;
291     }
292   }
293 
294   /* how many unique elements? */
295   gs->repeats = k;
296   gs->nel     = nel - k;
297 
298   /* number of repeats? */
299   gs->num_local = num_local;
300   num_local += 2;
301   gs->local_reduce = local_reduce = (PetscInt **)malloc(num_local * sizeof(PetscInt *));
302   gs->num_local_reduce = num_to_reduce = (PetscInt *)malloc(num_local * sizeof(PetscInt));
303 
304   unique         = (PetscInt *)malloc((gs->nel + 1) * sizeof(PetscInt));
305   gs->elms       = unique;
306   gs->nel_total  = nel;
307   gs->local_elms = elms;
308   gs->companion  = companion;
309 
310   /* compess map as well as keep track of local ops */
311   for (num_local = i = j = 0; i < gs->nel; i++) {
312     k  = j;
313     t2 = unique[i] = elms[j];
314     companion[i]   = companion[j];
315 
316     while (elms[j] == t2) j++;
317 
318     if ((t2 = (j - k)) > 1) {
319       /* number together */
320       num_to_reduce[num_local] = t2++;
321 
322       iptr = local_reduce[num_local++] = (PetscInt *)malloc(t2 * sizeof(PetscInt));
323 
324       /* to use binary searching don't remap until we check intersection */
325       *iptr++ = i;
326 
327       /* note that we're skipping the first one */
328       while (++k < j) *(iptr++) = companion[k];
329       *iptr = -1;
330     }
331   }
332 
333   /* sentinel for ngh_buf */
334   unique[gs->nel] = INT_MAX;
335 
336   /* for two partition sort hack */
337   num_to_reduce[num_local]   = 0;
338   local_reduce[num_local]    = NULL;
339   num_to_reduce[++num_local] = 0;
340   local_reduce[num_local]    = NULL;
341 
342   /* load 'em up */
343   /* note one extra to hold NON_UNIFORM flag!!! */
344   vals[2] = vals[1] = vals[0] = nel;
345   if (gs->nel > 0) {
346     vals[3] = unique[0];
347     vals[4] = unique[gs->nel - 1];
348   } else {
349     vals[3] = INT_MAX;
350     vals[4] = INT_MIN;
351   }
352   vals[5] = level;
353   vals[6] = num_gs_ids;
354 
355   /* GLOBAL: send 'em out */
356   PetscCallAbort(PETSC_COMM_WORLD, PCTFS_giop(vals, work, PETSC_STATIC_ARRAY_LENGTH(oprs) - 1, oprs));
357 
358   /* must be semi-pos def - only pairwise depends on this */
359   /* LATER - remove this restriction */
360   if (vals[3] < 0) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "gsi_check_args() :: system not semi-pos def \n");
361   if (vals[4] == INT_MAX) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "gsi_check_args() :: system ub too large !\n");
362 
363   gs->nel_min = vals[0];
364   gs->nel_max = vals[1];
365   gs->nel_sum = vals[2];
366   gs->gl_min  = vals[3];
367   gs->gl_max  = vals[4];
368   gs->negl    = vals[4] - vals[3] + 1;
369 
370   if (gs->negl <= 0) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "gsi_check_args() :: system empty or neg :: %" PetscInt_FMT "\n", gs->negl);
371 
372   /* LATER :: add level == -1 -> program selects level */
373   if (vals[5] < 0) vals[5] = 0;
374   else if (vals[5] > PCTFS_num_nodes) vals[5] = PCTFS_num_nodes;
375   gs->level = vals[5];
376 
377   return (gs);
378 }
379 
380 /******************************************************************************/
381 static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs) {
382   PetscInt   i, nel, *elms;
383   PetscInt   t1;
384   PetscInt **reduce;
385   PetscInt  *map;
386 
387   PetscFunctionBegin;
388   /* totally local removes ... PCTFS_ct_bits == 0 */
389   get_ngh_buf(gs);
390 
391   if (gs->level) set_pairwise(gs);
392   if (gs->max_left_over) set_tree(gs);
393 
394   /* intersection local and pairwise/tree? */
395   gs->num_local_total      = gs->num_local;
396   gs->gop_local_reduce     = gs->local_reduce;
397   gs->num_gop_local_reduce = gs->num_local_reduce;
398 
399   map = gs->companion;
400 
401   /* is there any local compression */
402   if (!gs->num_local) {
403     gs->local_strength = NONE;
404     gs->num_local_gop  = 0;
405   } else {
406     /* ok find intersection */
407     map    = gs->companion;
408     reduce = gs->local_reduce;
409     for (i = 0, t1 = 0; i < gs->num_local; i++, reduce++) {
410       if ((PCTFS_ivec_binary_search(**reduce, gs->pw_elm_list, gs->len_pw_list) >= 0) || PCTFS_ivec_binary_search(**reduce, gs->tree_map_in, gs->tree_map_sz) >= 0) {
411         t1++;
412         PetscCheck(gs->num_local_reduce[i] > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "nobody in list?");
413         gs->num_local_reduce[i] *= -1;
414       }
415       **reduce = map[**reduce];
416     }
417 
418     /* intersection is empty */
419     if (!t1) {
420       gs->local_strength = FULL;
421       gs->num_local_gop  = 0;
422     } else { /* intersection not empty */
423       gs->local_strength = PARTIAL;
424 
425       PetscCall(PCTFS_SMI_sort((void *)gs->num_local_reduce, (void *)gs->local_reduce, gs->num_local + 1, SORT_INT_PTR));
426 
427       gs->num_local_gop   = t1;
428       gs->num_local_total = gs->num_local;
429       gs->num_local -= t1;
430       gs->gop_local_reduce     = gs->local_reduce;
431       gs->num_gop_local_reduce = gs->num_local_reduce;
432 
433       for (i = 0; i < t1; i++) {
434         PetscCheck(gs->num_gop_local_reduce[i] < 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "they aren't negative?");
435         gs->num_gop_local_reduce[i] *= -1;
436         gs->local_reduce++;
437         gs->num_local_reduce++;
438       }
439       gs->local_reduce++;
440       gs->num_local_reduce++;
441     }
442   }
443 
444   elms = gs->pw_elm_list;
445   nel  = gs->len_pw_list;
446   for (i = 0; i < nel; i++) elms[i] = map[elms[i]];
447 
448   elms = gs->tree_map_in;
449   nel  = gs->tree_map_sz;
450   for (i = 0; i < nel; i++) elms[i] = map[elms[i]];
451 
452   /* clean up */
453   free((void *)gs->local_elms);
454   free((void *)gs->companion);
455   free((void *)gs->elms);
456   free((void *)gs->ngh_buf);
457   gs->local_elms = gs->companion = gs->elms = gs->ngh_buf = NULL;
458   PetscFunctionReturn(0);
459 }
460 
461 /******************************************************************************/
462 static PetscErrorCode place_in_tree(PetscInt elm) {
463   PetscInt *tp, n;
464 
465   PetscFunctionBegin;
466   if (ntree == tree_buf_sz) {
467     if (tree_buf_sz) {
468       tp = tree_buf;
469       n  = tree_buf_sz;
470       tree_buf_sz <<= 1;
471       tree_buf = (PetscInt *)malloc(tree_buf_sz * sizeof(PetscInt));
472       PCTFS_ivec_copy(tree_buf, tp, n);
473       free(tp);
474     } else {
475       tree_buf_sz = TREE_BUF_SZ;
476       tree_buf    = (PetscInt *)malloc(tree_buf_sz * sizeof(PetscInt));
477     }
478   }
479 
480   tree_buf[ntree++] = elm;
481   PetscFunctionReturn(0);
482 }
483 
484 /******************************************************************************/
485 static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs) {
486   PetscInt  i, j, npw = 0, ntree_map = 0;
487   PetscInt  p_mask_size, ngh_buf_size, buf_size;
488   PetscInt *p_mask, *sh_proc_mask, *pw_sh_proc_mask;
489   PetscInt *ngh_buf, *buf1, *buf2;
490   PetscInt  offset, per_load, num_loads, or_ct, start, end;
491   PetscInt *ptr1, *ptr2, i_start, negl, nel, *elms;
492   PetscInt  oper = GL_B_OR;
493   PetscInt *ptr3, *t_mask, level, ct1, ct2;
494 
495   PetscFunctionBegin;
496   /* to make life easier */
497   nel   = gs->nel;
498   elms  = gs->elms;
499   level = gs->level;
500 
501   /* det #bytes needed for processor bit masks and init w/mask cor. to PCTFS_my_id */
502   p_mask = (PetscInt *)malloc(p_mask_size = PCTFS_len_bit_mask(PCTFS_num_nodes));
503   PetscCall(PCTFS_set_bit_mask(p_mask, p_mask_size, PCTFS_my_id));
504 
505   /* allocate space for masks and info bufs */
506   gs->nghs = sh_proc_mask = (PetscInt *)malloc(p_mask_size);
507   gs->pw_nghs = pw_sh_proc_mask = (PetscInt *)malloc(p_mask_size);
508   gs->ngh_buf_sz = ngh_buf_size = p_mask_size * nel;
509   t_mask                        = (PetscInt *)malloc(p_mask_size);
510   gs->ngh_buf = ngh_buf = (PetscInt *)malloc(ngh_buf_size);
511 
512   /* comm buffer size ... memory usage bounded by ~2*msg_buf */
513   /* had thought I could exploit rendezvous threshold */
514 
515   /* default is one pass */
516   per_load = negl = gs->negl;
517   gs->num_loads = num_loads = 1;
518   i                         = p_mask_size * negl;
519 
520   /* possible overflow on buffer size */
521   /* overflow hack                    */
522   if (i < 0) i = INT_MAX;
523 
524   buf_size = PetscMin(msg_buf, i);
525 
526   /* can we do it? */
527   PetscCheck(p_mask_size <= buf_size, PETSC_COMM_SELF, PETSC_ERR_PLIB, "get_ngh_buf() :: buf<pms :: %" PetscInt_FMT ">%" PetscInt_FMT, p_mask_size, buf_size);
528 
529   /* get PCTFS_giop buf space ... make *only* one malloc */
530   buf1 = (PetscInt *)malloc(buf_size << 1);
531 
532   /* more than one gior exchange needed? */
533   if (buf_size != i) {
534     per_load      = buf_size / p_mask_size;
535     buf_size      = per_load * p_mask_size;
536     gs->num_loads = num_loads = negl / per_load + (negl % per_load > 0);
537   }
538 
539   /* convert buf sizes from #bytes to #ints - 32 bit only! */
540   p_mask_size /= sizeof(PetscInt);
541   ngh_buf_size /= sizeof(PetscInt);
542   buf_size /= sizeof(PetscInt);
543 
544   /* find PCTFS_giop work space */
545   buf2 = buf1 + buf_size;
546 
547   /* hold #ints needed for processor masks */
548   gs->mask_sz = p_mask_size;
549 
550   /* init buffers */
551   PetscCall(PCTFS_ivec_zero(sh_proc_mask, p_mask_size));
552   PetscCall(PCTFS_ivec_zero(pw_sh_proc_mask, p_mask_size));
553   PetscCall(PCTFS_ivec_zero(ngh_buf, ngh_buf_size));
554 
555   /* HACK reset tree info */
556   tree_buf    = NULL;
557   tree_buf_sz = ntree = 0;
558 
559   /* ok do it */
560   for (ptr1 = ngh_buf, ptr2 = elms, end = gs->gl_min, or_ct = i = 0; or_ct < num_loads; or_ct++) {
561     /* identity for bitwise or is 000...000 */
562     PCTFS_ivec_zero(buf1, buf_size);
563 
564     /* load msg buffer */
565     for (start = end, end += per_load, i_start = i; (offset = *ptr2) < end; i++, ptr2++) {
566       offset = (offset - start) * p_mask_size;
567       PCTFS_ivec_copy(buf1 + offset, p_mask, p_mask_size);
568     }
569 
570     /* GLOBAL: pass buffer */
571     PetscCall(PCTFS_giop(buf1, buf2, buf_size, &oper));
572 
573     /* unload buffer into ngh_buf */
574     ptr2 = (elms + i_start);
575     for (ptr3 = buf1, j = start; j < end; ptr3 += p_mask_size, j++) {
576       /* I own it ... may have to pairwise it */
577       if (j == *ptr2) {
578         /* do i share it w/anyone? */
579         ct1 = PCTFS_ct_bits((char *)ptr3, p_mask_size * sizeof(PetscInt));
580         /* guess not */
581         if (ct1 < 2) {
582           ptr2++;
583           ptr1 += p_mask_size;
584           continue;
585         }
586 
587         /* i do ... so keep info and turn off my bit */
588         PCTFS_ivec_copy(ptr1, ptr3, p_mask_size);
589         PetscCall(PCTFS_ivec_xor(ptr1, p_mask, p_mask_size));
590         PetscCall(PCTFS_ivec_or(sh_proc_mask, ptr1, p_mask_size));
591 
592         /* is it to be done pairwise? */
593         if (--ct1 <= level) {
594           npw++;
595 
596           /* turn on high bit to indicate pw need to process */
597           *ptr2++ |= TOP_BIT;
598           PetscCall(PCTFS_ivec_or(pw_sh_proc_mask, ptr1, p_mask_size));
599           ptr1 += p_mask_size;
600           continue;
601         }
602 
603         /* get set for next and note that I have a tree contribution */
604         /* could save exact elm index for tree here -> save a search */
605         ptr2++;
606         ptr1 += p_mask_size;
607         ntree_map++;
608       } else { /* i don't but still might be involved in tree */
609 
610         /* shared by how many? */
611         ct1 = PCTFS_ct_bits((char *)ptr3, p_mask_size * sizeof(PetscInt));
612 
613         /* none! */
614         if (ct1 < 2) continue;
615 
616         /* is it going to be done pairwise? but not by me of course!*/
617         if (--ct1 <= level) continue;
618       }
619       /* LATER we're going to have to process it NOW */
620       /* nope ... tree it */
621       PetscCall(place_in_tree(j));
622     }
623   }
624 
625   free((void *)t_mask);
626   free((void *)buf1);
627 
628   gs->len_pw_list = npw;
629   gs->num_nghs    = PCTFS_ct_bits((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt));
630 
631   /* expand from bit mask list to int list and save ngh list */
632   gs->nghs = (PetscInt *)malloc(gs->num_nghs * sizeof(PetscInt));
633   PCTFS_bm_to_proc((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt), gs->nghs);
634 
635   gs->num_pw_nghs = PCTFS_ct_bits((char *)pw_sh_proc_mask, p_mask_size * sizeof(PetscInt));
636 
637   oper = GL_MAX;
638   ct1  = gs->num_nghs;
639   PetscCall(PCTFS_giop(&ct1, &ct2, 1, &oper));
640   gs->max_nghs = ct1;
641 
642   gs->tree_map_sz   = ntree_map;
643   gs->max_left_over = ntree;
644 
645   free((void *)p_mask);
646   free((void *)sh_proc_mask);
647   PetscFunctionReturn(0);
648 }
649 
650 /******************************************************************************/
651 static PetscErrorCode set_pairwise(PCTFS_gs_id *gs) {
652   PetscInt  i, j;
653   PetscInt  p_mask_size;
654   PetscInt *p_mask, *sh_proc_mask, *tmp_proc_mask;
655   PetscInt *ngh_buf, *buf2;
656   PetscInt  offset;
657   PetscInt *msg_list, *msg_size, **msg_nodes, nprs;
658   PetscInt *pairwise_elm_list, len_pair_list = 0;
659   PetscInt *iptr, t1, i_start, nel, *elms;
660   PetscInt  ct;
661 
662   PetscFunctionBegin;
663   /* to make life easier */
664   nel          = gs->nel;
665   elms         = gs->elms;
666   ngh_buf      = gs->ngh_buf;
667   sh_proc_mask = gs->pw_nghs;
668 
669   /* need a few temp masks */
670   p_mask_size   = PCTFS_len_bit_mask(PCTFS_num_nodes);
671   p_mask        = (PetscInt *)malloc(p_mask_size);
672   tmp_proc_mask = (PetscInt *)malloc(p_mask_size);
673 
674   /* set mask to my PCTFS_my_id's bit mask */
675   PetscCall(PCTFS_set_bit_mask(p_mask, p_mask_size, PCTFS_my_id));
676 
677   p_mask_size /= sizeof(PetscInt);
678 
679   len_pair_list   = gs->len_pw_list;
680   gs->pw_elm_list = pairwise_elm_list = (PetscInt *)malloc((len_pair_list + 1) * sizeof(PetscInt));
681 
682   /* how many processors (nghs) do we have to exchange with? */
683   nprs = gs->num_pairs = PCTFS_ct_bits((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt));
684 
685   /* allocate space for PCTFS_gs_gop() info */
686   gs->pair_list = msg_list = (PetscInt *)malloc(sizeof(PetscInt) * nprs);
687   gs->msg_sizes = msg_size = (PetscInt *)malloc(sizeof(PetscInt) * nprs);
688   gs->node_list = msg_nodes = (PetscInt **)malloc(sizeof(PetscInt *) * (nprs + 1));
689 
690   /* init msg_size list */
691   PetscCall(PCTFS_ivec_zero(msg_size, nprs));
692 
693   /* expand from bit mask list to int list */
694   PetscCall(PCTFS_bm_to_proc((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt), msg_list));
695 
696   /* keep list of elements being handled pairwise */
697   for (i = j = 0; i < nel; i++) {
698     if (elms[i] & TOP_BIT) {
699       elms[i] ^= TOP_BIT;
700       pairwise_elm_list[j++] = i;
701     }
702   }
703   pairwise_elm_list[j] = -1;
704 
705   gs->msg_ids_out       = (MPI_Request *)malloc(sizeof(MPI_Request) * (nprs + 1));
706   gs->msg_ids_out[nprs] = MPI_REQUEST_NULL;
707   gs->msg_ids_in        = (MPI_Request *)malloc(sizeof(MPI_Request) * (nprs + 1));
708   gs->msg_ids_in[nprs]  = MPI_REQUEST_NULL;
709   gs->pw_vals           = (PetscScalar *)malloc(sizeof(PetscScalar) * len_pair_list * vec_sz);
710 
711   /* find who goes to each processor */
712   for (i_start = i = 0; i < nprs; i++) {
713     /* processor i's mask */
714     PetscCall(PCTFS_set_bit_mask(p_mask, p_mask_size * sizeof(PetscInt), msg_list[i]));
715 
716     /* det # going to processor i */
717     for (ct = j = 0; j < len_pair_list; j++) {
718       buf2 = ngh_buf + (pairwise_elm_list[j] * p_mask_size);
719       PetscCall(PCTFS_ivec_and3(tmp_proc_mask, p_mask, buf2, p_mask_size));
720       if (PCTFS_ct_bits((char *)tmp_proc_mask, p_mask_size * sizeof(PetscInt))) ct++;
721     }
722     msg_size[i] = ct;
723     i_start     = PetscMax(i_start, ct);
724 
725     /*space to hold nodes in message to first neighbor */
726     msg_nodes[i] = iptr = (PetscInt *)malloc(sizeof(PetscInt) * (ct + 1));
727 
728     for (j = 0; j < len_pair_list; j++) {
729       buf2 = ngh_buf + (pairwise_elm_list[j] * p_mask_size);
730       PetscCall(PCTFS_ivec_and3(tmp_proc_mask, p_mask, buf2, p_mask_size));
731       if (PCTFS_ct_bits((char *)tmp_proc_mask, p_mask_size * sizeof(PetscInt))) *iptr++ = j;
732     }
733     *iptr = -1;
734   }
735   msg_nodes[nprs] = NULL;
736 
737   j = gs->loc_node_pairs = i_start;
738   t1                     = GL_MAX;
739   PetscCall(PCTFS_giop(&i_start, &offset, 1, &t1));
740   gs->max_node_pairs = i_start;
741 
742   i_start = j;
743   t1      = GL_MIN;
744   PetscCall(PCTFS_giop(&i_start, &offset, 1, &t1));
745   gs->min_node_pairs = i_start;
746 
747   i_start = j;
748   t1      = GL_ADD;
749   PetscCall(PCTFS_giop(&i_start, &offset, 1, &t1));
750   gs->avg_node_pairs = i_start / PCTFS_num_nodes + 1;
751 
752   i_start = nprs;
753   t1      = GL_MAX;
754   PCTFS_giop(&i_start, &offset, 1, &t1);
755   gs->max_pairs = i_start;
756 
757   /* remap pairwise in tail of gsi_via_bit_mask() */
758   gs->msg_total = PCTFS_ivec_sum(gs->msg_sizes, nprs);
759   gs->out       = (PetscScalar *)malloc(sizeof(PetscScalar) * gs->msg_total * vec_sz);
760   gs->in        = (PetscScalar *)malloc(sizeof(PetscScalar) * gs->msg_total * vec_sz);
761 
762   /* reset malloc pool */
763   free((void *)p_mask);
764   free((void *)tmp_proc_mask);
765   PetscFunctionReturn(0);
766 }
767 
768 /* to do pruned tree just save ngh buf copy for each one and decode here!
769 ******************************************************************************/
770 static PetscErrorCode set_tree(PCTFS_gs_id *gs) {
771   PetscInt  i, j, n, nel;
772   PetscInt *iptr_in, *iptr_out, *tree_elms, *elms;
773 
774   PetscFunctionBegin;
775   /* local work ptrs */
776   elms = gs->elms;
777   nel  = gs->nel;
778 
779   /* how many via tree */
780   gs->tree_nel = n = ntree;
781   gs->tree_elms = tree_elms = iptr_in = tree_buf;
782   gs->tree_buf                        = (PetscScalar *)malloc(sizeof(PetscScalar) * n * vec_sz);
783   gs->tree_work                       = (PetscScalar *)malloc(sizeof(PetscScalar) * n * vec_sz);
784   j                                   = gs->tree_map_sz;
785   gs->tree_map_in = iptr_in = (PetscInt *)malloc(sizeof(PetscInt) * (j + 1));
786   gs->tree_map_out = iptr_out = (PetscInt *)malloc(sizeof(PetscInt) * (j + 1));
787 
788   /* search the longer of the two lists */
789   /* note ... could save this info in get_ngh_buf and save searches */
790   if (n <= nel) {
791     /* bijective fct w/remap - search elm list */
792     for (i = 0; i < n; i++) {
793       if ((j = PCTFS_ivec_binary_search(*tree_elms++, elms, nel)) >= 0) {
794         *iptr_in++  = j;
795         *iptr_out++ = i;
796       }
797     }
798   } else {
799     for (i = 0; i < nel; i++) {
800       if ((j = PCTFS_ivec_binary_search(*elms++, tree_elms, n)) >= 0) {
801         *iptr_in++  = i;
802         *iptr_out++ = j;
803       }
804     }
805   }
806 
807   /* sentinel */
808   *iptr_in = *iptr_out = -1;
809   PetscFunctionReturn(0);
810 }
811 
812 /******************************************************************************/
813 static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs, PetscScalar *vals) {
814   PetscInt   *num, *map, **reduce;
815   PetscScalar tmp;
816 
817   PetscFunctionBegin;
818   num    = gs->num_gop_local_reduce;
819   reduce = gs->gop_local_reduce;
820   while ((map = *reduce++)) {
821     /* wall */
822     if (*num == 2) {
823       num++;
824       vals[map[1]] = vals[map[0]];
825     } else if (*num == 3) { /* corner shared by three elements */
826       num++;
827       vals[map[2]] = vals[map[1]] = vals[map[0]];
828     } else if (*num == 4) { /* corner shared by four elements */
829       num++;
830       vals[map[3]] = vals[map[2]] = vals[map[1]] = vals[map[0]];
831     } else { /* general case ... odd geoms ... 3D*/
832       num++;
833       tmp = *(vals + *map++);
834       while (*map >= 0) *(vals + *map++) = tmp;
835     }
836   }
837   PetscFunctionReturn(0);
838 }
839 
840 /******************************************************************************/
841 static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs, PetscScalar *vals) {
842   PetscInt   *num, *map, **reduce;
843   PetscScalar tmp;
844 
845   PetscFunctionBegin;
846   num    = gs->num_local_reduce;
847   reduce = gs->local_reduce;
848   while ((map = *reduce)) {
849     /* wall */
850     if (*num == 2) {
851       num++;
852       reduce++;
853       vals[map[1]] = vals[map[0]] += vals[map[1]];
854     } else if (*num == 3) { /* corner shared by three elements */
855       num++;
856       reduce++;
857       vals[map[2]] = vals[map[1]] = vals[map[0]] += (vals[map[1]] + vals[map[2]]);
858     } else if (*num == 4) { /* corner shared by four elements */
859       num++;
860       reduce++;
861       vals[map[1]] = vals[map[2]] = vals[map[3]] = vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]);
862     } else { /* general case ... odd geoms ... 3D*/
863       num++;
864       tmp = 0.0;
865       while (*map >= 0) tmp += *(vals + *map++);
866 
867       map = *reduce++;
868       while (*map >= 0) *(vals + *map++) = tmp;
869     }
870   }
871   PetscFunctionReturn(0);
872 }
873 
874 /******************************************************************************/
875 static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals) {
876   PetscInt    *num, *map, **reduce;
877   PetscScalar *base;
878 
879   PetscFunctionBegin;
880   num    = gs->num_gop_local_reduce;
881   reduce = gs->gop_local_reduce;
882   while ((map = *reduce++)) {
883     /* wall */
884     if (*num == 2) {
885       num++;
886       vals[map[0]] += vals[map[1]];
887     } else if (*num == 3) { /* corner shared by three elements */
888       num++;
889       vals[map[0]] += (vals[map[1]] + vals[map[2]]);
890     } else if (*num == 4) { /* corner shared by four elements */
891       num++;
892       vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]);
893     } else { /* general case ... odd geoms ... 3D*/
894       num++;
895       base = vals + *map++;
896       while (*map >= 0) *base += *(vals + *map++);
897     }
898   }
899   PetscFunctionReturn(0);
900 }
901 
902 /******************************************************************************/
903 PetscErrorCode PCTFS_gs_free(PCTFS_gs_id *gs) {
904   PetscInt i;
905 
906   PetscFunctionBegin;
907   PetscCallMPI(MPI_Comm_free(&gs->PCTFS_gs_comm));
908   if (gs->nghs) free((void *)gs->nghs);
909   if (gs->pw_nghs) free((void *)gs->pw_nghs);
910 
911   /* tree */
912   if (gs->max_left_over) {
913     if (gs->tree_elms) free((void *)gs->tree_elms);
914     if (gs->tree_buf) free((void *)gs->tree_buf);
915     if (gs->tree_work) free((void *)gs->tree_work);
916     if (gs->tree_map_in) free((void *)gs->tree_map_in);
917     if (gs->tree_map_out) free((void *)gs->tree_map_out);
918   }
919 
920   /* pairwise info */
921   if (gs->num_pairs) {
922     /* should be NULL already */
923     if (gs->ngh_buf) free((void *)gs->ngh_buf);
924     if (gs->elms) free((void *)gs->elms);
925     if (gs->local_elms) free((void *)gs->local_elms);
926     if (gs->companion) free((void *)gs->companion);
927 
928     /* only set if pairwise */
929     if (gs->vals) free((void *)gs->vals);
930     if (gs->in) free((void *)gs->in);
931     if (gs->out) free((void *)gs->out);
932     if (gs->msg_ids_in) free((void *)gs->msg_ids_in);
933     if (gs->msg_ids_out) free((void *)gs->msg_ids_out);
934     if (gs->pw_vals) free((void *)gs->pw_vals);
935     if (gs->pw_elm_list) free((void *)gs->pw_elm_list);
936     if (gs->node_list) {
937       for (i = 0; i < gs->num_pairs; i++) {
938         if (gs->node_list[i]) { free((void *)gs->node_list[i]); }
939       }
940       free((void *)gs->node_list);
941     }
942     if (gs->msg_sizes) free((void *)gs->msg_sizes);
943     if (gs->pair_list) free((void *)gs->pair_list);
944   }
945 
946   /* local info */
947   if (gs->num_local_total >= 0) {
948     for (i = 0; i < gs->num_local_total + 1; i++) {
949       if (gs->num_gop_local_reduce[i]) free((void *)gs->gop_local_reduce[i]);
950     }
951   }
952 
953   /* if intersection tree/pairwise and local isn't empty */
954   if (gs->gop_local_reduce) free((void *)gs->gop_local_reduce);
955   if (gs->num_gop_local_reduce) free((void *)gs->num_gop_local_reduce);
956 
957   free((void *)gs);
958   PetscFunctionReturn(0);
959 }
960 
961 /******************************************************************************/
962 PetscErrorCode PCTFS_gs_gop_vec(PCTFS_gs_id *gs, PetscScalar *vals, const char *op, PetscInt step) {
963   PetscFunctionBegin;
964   switch (*op) {
965   case '+': PCTFS_gs_gop_vec_plus(gs, vals, step); break;
966   default:
967     PetscCall(PetscInfo(0, "PCTFS_gs_gop_vec() :: %c is not a valid op\n", op[0]));
968     PetscCall(PetscInfo(0, "PCTFS_gs_gop_vec() :: default :: plus\n"));
969     PCTFS_gs_gop_vec_plus(gs, vals, step);
970     break;
971   }
972   PetscFunctionReturn(0);
973 }
974 
975 /******************************************************************************/
976 static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step) {
977   PetscFunctionBegin;
978   PetscCheck(gs, PETSC_COMM_SELF, PETSC_ERR_PLIB, "PCTFS_gs_gop_vec() passed NULL gs handle!!!");
979 
980   /* local only operations!!! */
981   if (gs->num_local) PCTFS_gs_gop_vec_local_plus(gs, vals, step);
982 
983   /* if intersection tree/pairwise and local isn't empty */
984   if (gs->num_local_gop) {
985     PCTFS_gs_gop_vec_local_in_plus(gs, vals, step);
986 
987     /* pairwise */
988     if (gs->num_pairs) PCTFS_gs_gop_vec_pairwise_plus(gs, vals, step);
989 
990     /* tree */
991     else if (gs->max_left_over) PCTFS_gs_gop_vec_tree_plus(gs, vals, step);
992 
993     PCTFS_gs_gop_vec_local_out(gs, vals, step);
994   } else { /* if intersection tree/pairwise and local is empty */
995     /* pairwise */
996     if (gs->num_pairs) PCTFS_gs_gop_vec_pairwise_plus(gs, vals, step);
997 
998     /* tree */
999     else if (gs->max_left_over) PCTFS_gs_gop_vec_tree_plus(gs, vals, step);
1000   }
1001   PetscFunctionReturn(0);
1002 }
1003 
1004 /******************************************************************************/
1005 static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step) {
1006   PetscInt    *num, *map, **reduce;
1007   PetscScalar *base;
1008 
1009   PetscFunctionBegin;
1010   num    = gs->num_local_reduce;
1011   reduce = gs->local_reduce;
1012   while ((map = *reduce)) {
1013     base = vals + map[0] * step;
1014 
1015     /* wall */
1016     if (*num == 2) {
1017       num++;
1018       reduce++;
1019       PCTFS_rvec_add(base, vals + map[1] * step, step);
1020       PCTFS_rvec_copy(vals + map[1] * step, base, step);
1021     } else if (*num == 3) { /* corner shared by three elements */
1022       num++;
1023       reduce++;
1024       PCTFS_rvec_add(base, vals + map[1] * step, step);
1025       PCTFS_rvec_add(base, vals + map[2] * step, step);
1026       PCTFS_rvec_copy(vals + map[2] * step, base, step);
1027       PCTFS_rvec_copy(vals + map[1] * step, base, step);
1028     } else if (*num == 4) { /* corner shared by four elements */
1029       num++;
1030       reduce++;
1031       PCTFS_rvec_add(base, vals + map[1] * step, step);
1032       PCTFS_rvec_add(base, vals + map[2] * step, step);
1033       PCTFS_rvec_add(base, vals + map[3] * step, step);
1034       PCTFS_rvec_copy(vals + map[3] * step, base, step);
1035       PCTFS_rvec_copy(vals + map[2] * step, base, step);
1036       PCTFS_rvec_copy(vals + map[1] * step, base, step);
1037     } else { /* general case ... odd geoms ... 3D */
1038       num++;
1039       while (*++map >= 0) PCTFS_rvec_add(base, vals + *map * step, step);
1040 
1041       map = *reduce;
1042       while (*++map >= 0) PCTFS_rvec_copy(vals + *map * step, base, step);
1043 
1044       reduce++;
1045     }
1046   }
1047   PetscFunctionReturn(0);
1048 }
1049 
1050 /******************************************************************************/
1051 static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step) {
1052   PetscInt    *num, *map, **reduce;
1053   PetscScalar *base;
1054 
1055   PetscFunctionBegin;
1056   num    = gs->num_gop_local_reduce;
1057   reduce = gs->gop_local_reduce;
1058   while ((map = *reduce++)) {
1059     base = vals + map[0] * step;
1060 
1061     /* wall */
1062     if (*num == 2) {
1063       num++;
1064       PCTFS_rvec_add(base, vals + map[1] * step, step);
1065     } else if (*num == 3) { /* corner shared by three elements */
1066       num++;
1067       PCTFS_rvec_add(base, vals + map[1] * step, step);
1068       PCTFS_rvec_add(base, vals + map[2] * step, step);
1069     } else if (*num == 4) { /* corner shared by four elements */
1070       num++;
1071       PCTFS_rvec_add(base, vals + map[1] * step, step);
1072       PCTFS_rvec_add(base, vals + map[2] * step, step);
1073       PCTFS_rvec_add(base, vals + map[3] * step, step);
1074     } else { /* general case ... odd geoms ... 3D*/
1075       num++;
1076       while (*++map >= 0) PCTFS_rvec_add(base, vals + *map * step, step);
1077     }
1078   }
1079   PetscFunctionReturn(0);
1080 }
1081 
1082 /******************************************************************************/
1083 static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step) {
1084   PetscInt    *num, *map, **reduce;
1085   PetscScalar *base;
1086 
1087   PetscFunctionBegin;
1088   num    = gs->num_gop_local_reduce;
1089   reduce = gs->gop_local_reduce;
1090   while ((map = *reduce++)) {
1091     base = vals + map[0] * step;
1092 
1093     /* wall */
1094     if (*num == 2) {
1095       num++;
1096       PCTFS_rvec_copy(vals + map[1] * step, base, step);
1097     } else if (*num == 3) { /* corner shared by three elements */
1098       num++;
1099       PCTFS_rvec_copy(vals + map[1] * step, base, step);
1100       PCTFS_rvec_copy(vals + map[2] * step, base, step);
1101     } else if (*num == 4) { /* corner shared by four elements */
1102       num++;
1103       PCTFS_rvec_copy(vals + map[1] * step, base, step);
1104       PCTFS_rvec_copy(vals + map[2] * step, base, step);
1105       PCTFS_rvec_copy(vals + map[3] * step, base, step);
1106     } else { /* general case ... odd geoms ... 3D*/
1107       num++;
1108       while (*++map >= 0) PCTFS_rvec_copy(vals + *map * step, base, step);
1109     }
1110   }
1111   PetscFunctionReturn(0);
1112 }
1113 
1114 /******************************************************************************/
1115 static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step) {
1116   PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2;
1117   PetscInt    *iptr, *msg_list, *msg_size, **msg_nodes;
1118   PetscInt    *pw, *list, *size, **nodes;
1119   MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
1120   MPI_Status   status;
1121   PetscBLASInt i1 = 1, dstep;
1122 
1123   PetscFunctionBegin;
1124   /* strip and load s */
1125   msg_list = list = gs->pair_list;
1126   msg_size = size = gs->msg_sizes;
1127   msg_nodes = nodes = gs->node_list;
1128   iptr = pw = gs->pw_elm_list;
1129   dptr1 = dptr3 = gs->pw_vals;
1130   msg_ids_in = ids_in = gs->msg_ids_in;
1131   msg_ids_out = ids_out = gs->msg_ids_out;
1132   dptr2                 = gs->out;
1133   in1 = in2 = gs->in;
1134 
1135   /* post the receives */
1136   /*  msg_nodes=nodes; */
1137   do {
1138     /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
1139         second one *list and do list++ afterwards */
1140     PetscCallMPI(MPI_Irecv(in1, *size * step, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in));
1141     list++;
1142     msg_ids_in++;
1143     in1 += *size++ * step;
1144   } while (*++msg_nodes);
1145   msg_nodes = nodes;
1146 
1147   /* load gs values into in out gs buffers */
1148   while (*iptr >= 0) {
1149     PCTFS_rvec_copy(dptr3, in_vals + *iptr * step, step);
1150     dptr3 += step;
1151     iptr++;
1152   }
1153 
1154   /* load out buffers and post the sends */
1155   while ((iptr = *msg_nodes++)) {
1156     dptr3 = dptr2;
1157     while (*iptr >= 0) {
1158       PCTFS_rvec_copy(dptr2, dptr1 + *iptr * step, step);
1159       dptr2 += step;
1160       iptr++;
1161     }
1162     PetscCallMPI(MPI_Isend(dptr3, *msg_size * step, MPIU_SCALAR, *msg_list, MSGTAG1 + PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out));
1163     msg_size++;
1164     msg_list++;
1165     msg_ids_out++;
1166   }
1167 
1168   /* tree */
1169   if (gs->max_left_over) PCTFS_gs_gop_vec_tree_plus(gs, in_vals, step);
1170 
1171   /* process the received data */
1172   msg_nodes = nodes;
1173   while ((iptr = *nodes++)) {
1174     PetscScalar d1 = 1.0;
1175 
1176     /* Should I check the return value of MPI_Wait() or status? */
1177     /* Can this loop be replaced by a call to MPI_Waitall()? */
1178     PetscCallMPI(MPI_Wait(ids_in, &status));
1179     ids_in++;
1180     while (*iptr >= 0) {
1181       PetscCall(PetscBLASIntCast(step, &dstep));
1182       PetscCallBLAS("BLASaxpy", BLASaxpy_(&dstep, &d1, in2, &i1, dptr1 + *iptr * step, &i1));
1183       in2 += step;
1184       iptr++;
1185     }
1186   }
1187 
1188   /* replace vals */
1189   while (*pw >= 0) {
1190     PCTFS_rvec_copy(in_vals + *pw * step, dptr1, step);
1191     dptr1 += step;
1192     pw++;
1193   }
1194 
1195   /* clear isend message handles */
1196   /* This changed for clarity though it could be the same */
1197 
1198   /* Should I check the return value of MPI_Wait() or status? */
1199   /* Can this loop be replaced by a call to MPI_Waitall()? */
1200   while (*msg_nodes++) {
1201     PetscCallMPI(MPI_Wait(ids_out, &status));
1202     ids_out++;
1203   }
1204   PetscFunctionReturn(0);
1205 }
1206 
1207 /******************************************************************************/
1208 static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step) {
1209   PetscInt     size, *in, *out;
1210   PetscScalar *buf, *work;
1211   PetscInt     op[] = {GL_ADD, 0};
1212   PetscBLASInt i1   = 1;
1213   PetscBLASInt dstep;
1214 
1215   PetscFunctionBegin;
1216   /* copy over to local variables */
1217   in   = gs->tree_map_in;
1218   out  = gs->tree_map_out;
1219   buf  = gs->tree_buf;
1220   work = gs->tree_work;
1221   size = gs->tree_nel * step;
1222 
1223   /* zero out collection buffer */
1224   PCTFS_rvec_zero(buf, size);
1225 
1226   /* copy over my contributions */
1227   while (*in >= 0) {
1228     PetscCall(PetscBLASIntCast(step, &dstep));
1229     PetscCallBLAS("BLAScopy", BLAScopy_(&dstep, vals + *in++ * step, &i1, buf + *out++ * step, &i1));
1230   }
1231 
1232   /* perform fan in/out on full buffer */
1233   /* must change PCTFS_grop to handle the blas */
1234   PCTFS_grop(buf, work, size, op);
1235 
1236   /* reset */
1237   in  = gs->tree_map_in;
1238   out = gs->tree_map_out;
1239 
1240   /* get the portion of the results I need */
1241   while (*in >= 0) {
1242     PetscCall(PetscBLASIntCast(step, &dstep));
1243     PetscCallBLAS("BLAScopy", BLAScopy_(&dstep, buf + *out++ * step, &i1, vals + *in++ * step, &i1));
1244   }
1245   PetscFunctionReturn(0);
1246 }
1247 
1248 /******************************************************************************/
1249 PetscErrorCode PCTFS_gs_gop_hc(PCTFS_gs_id *gs, PetscScalar *vals, const char *op, PetscInt dim) {
1250   PetscFunctionBegin;
1251   switch (*op) {
1252   case '+': PCTFS_gs_gop_plus_hc(gs, vals, dim); break;
1253   default:
1254     PetscCall(PetscInfo(0, "PCTFS_gs_gop_hc() :: %c is not a valid op\n", op[0]));
1255     PetscCall(PetscInfo(0, "PCTFS_gs_gop_hc() :: default :: plus\n"));
1256     PCTFS_gs_gop_plus_hc(gs, vals, dim);
1257     break;
1258   }
1259   PetscFunctionReturn(0);
1260 }
1261 
1262 /******************************************************************************/
1263 static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim) {
1264   PetscFunctionBegin;
1265   /* if there's nothing to do return */
1266   if (dim <= 0) PetscFunctionReturn(0);
1267 
1268   /* can't do more dimensions then exist */
1269   dim = PetscMin(dim, PCTFS_i_log2_num_nodes);
1270 
1271   /* local only operations!!! */
1272   if (gs->num_local) PCTFS_gs_gop_local_plus(gs, vals);
1273 
1274   /* if intersection tree/pairwise and local isn't empty */
1275   if (gs->num_local_gop) {
1276     PCTFS_gs_gop_local_in_plus(gs, vals);
1277 
1278     /* pairwise will do tree inside ... */
1279     if (gs->num_pairs) PCTFS_gs_gop_pairwise_plus_hc(gs, vals, dim); /* tree only */
1280     else if (gs->max_left_over) PCTFS_gs_gop_tree_plus_hc(gs, vals, dim);
1281 
1282     PCTFS_gs_gop_local_out(gs, vals);
1283   } else { /* if intersection tree/pairwise and local is empty */
1284     /* pairwise will do tree inside */
1285     if (gs->num_pairs) PCTFS_gs_gop_pairwise_plus_hc(gs, vals, dim); /* tree */
1286     else if (gs->max_left_over) PCTFS_gs_gop_tree_plus_hc(gs, vals, dim);
1287   }
1288   PetscFunctionReturn(0);
1289 }
1290 
1291 /******************************************************************************/
1292 static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim) {
1293   PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2;
1294   PetscInt    *iptr, *msg_list, *msg_size, **msg_nodes;
1295   PetscInt    *pw, *list, *size, **nodes;
1296   MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
1297   MPI_Status   status;
1298   PetscInt     i, mask = 1;
1299 
1300   PetscFunctionBegin;
1301   for (i = 1; i < dim; i++) {
1302     mask <<= 1;
1303     mask++;
1304   }
1305 
1306   /* strip and load s */
1307   msg_list = list = gs->pair_list;
1308   msg_size = size = gs->msg_sizes;
1309   msg_nodes = nodes = gs->node_list;
1310   iptr = pw = gs->pw_elm_list;
1311   dptr1 = dptr3 = gs->pw_vals;
1312   msg_ids_in = ids_in = gs->msg_ids_in;
1313   msg_ids_out = ids_out = gs->msg_ids_out;
1314   dptr2                 = gs->out;
1315   in1 = in2 = gs->in;
1316 
1317   /* post the receives */
1318   /*  msg_nodes=nodes; */
1319   do {
1320     /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
1321         second one *list and do list++ afterwards */
1322     if ((PCTFS_my_id | mask) == (*list | mask)) {
1323       PetscCallMPI(MPI_Irecv(in1, *size, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in));
1324       list++;
1325       msg_ids_in++;
1326       in1 += *size++;
1327     } else {
1328       list++;
1329       size++;
1330     }
1331   } while (*++msg_nodes);
1332 
1333   /* load gs values into in out gs buffers */
1334   while (*iptr >= 0) *dptr3++ = *(in_vals + *iptr++);
1335 
1336   /* load out buffers and post the sends */
1337   msg_nodes = nodes;
1338   list      = msg_list;
1339   while ((iptr = *msg_nodes++)) {
1340     if ((PCTFS_my_id | mask) == (*list | mask)) {
1341       dptr3 = dptr2;
1342       while (*iptr >= 0) *dptr2++ = *(dptr1 + *iptr++);
1343       /* CHECK PERSISTENT COMMS MODE FOR ALL THIS STUFF */
1344       /* is msg_ids_out++ correct? */
1345       PetscCallMPI(MPI_Isend(dptr3, *msg_size, MPIU_SCALAR, *list, MSGTAG1 + PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out));
1346       msg_size++;
1347       list++;
1348       msg_ids_out++;
1349     } else {
1350       list++;
1351       msg_size++;
1352     }
1353   }
1354 
1355   /* do the tree while we're waiting */
1356   if (gs->max_left_over) PCTFS_gs_gop_tree_plus_hc(gs, in_vals, dim);
1357 
1358   /* process the received data */
1359   msg_nodes = nodes;
1360   list      = msg_list;
1361   while ((iptr = *nodes++)) {
1362     if ((PCTFS_my_id | mask) == (*list | mask)) {
1363       /* Should I check the return value of MPI_Wait() or status? */
1364       /* Can this loop be replaced by a call to MPI_Waitall()? */
1365       PetscCallMPI(MPI_Wait(ids_in, &status));
1366       ids_in++;
1367       while (*iptr >= 0) *(dptr1 + *iptr++) += *in2++;
1368     }
1369     list++;
1370   }
1371 
1372   /* replace vals */
1373   while (*pw >= 0) *(in_vals + *pw++) = *dptr1++;
1374 
1375   /* clear isend message handles */
1376   /* This changed for clarity though it could be the same */
1377   while (*msg_nodes++) {
1378     if ((PCTFS_my_id | mask) == (*msg_list | mask)) {
1379       /* Should I check the return value of MPI_Wait() or status? */
1380       /* Can this loop be replaced by a call to MPI_Waitall()? */
1381       PetscCallMPI(MPI_Wait(ids_out, &status));
1382       ids_out++;
1383     }
1384     msg_list++;
1385   }
1386   PetscFunctionReturn(0);
1387 }
1388 
1389 /******************************************************************************/
1390 static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim) {
1391   PetscInt     size;
1392   PetscInt    *in, *out;
1393   PetscScalar *buf, *work;
1394   PetscInt     op[] = {GL_ADD, 0};
1395 
1396   PetscFunctionBegin;
1397   in   = gs->tree_map_in;
1398   out  = gs->tree_map_out;
1399   buf  = gs->tree_buf;
1400   work = gs->tree_work;
1401   size = gs->tree_nel;
1402 
1403   PCTFS_rvec_zero(buf, size);
1404 
1405   while (*in >= 0) *(buf + *out++) = *(vals + *in++);
1406 
1407   in  = gs->tree_map_in;
1408   out = gs->tree_map_out;
1409 
1410   PCTFS_grop_hc(buf, work, size, op, dim);
1411 
1412   while (*in >= 0) *(vals + *in++) = *(buf + *out++);
1413   PetscFunctionReturn(0);
1414 }
1415