xref: /petsc/src/ksp/pc/impls/tfs/gs.c (revision 2ff79c18c26c94ed8cb599682f680f231dca6444)
1 /***********************************gs.c***************************************
2 
3 Author: Henry M. Tufo III
4 
5 e-mail: hmt@cs.brown.edu
6 
7 snail-mail:
8 Division of Applied Mathematics
9 Brown University
10 Providence, RI 02912
11 
12 Last Modification:
13 6.21.97
14 ************************************gs.c**************************************/
15 
16 /***********************************gs.c***************************************
17 File Description:
18 -----------------
19 
20 ************************************gs.c**************************************/
21 
22 #include <../src/ksp/pc/impls/tfs/tfs.h>
23 
24 /* default length of number of items via tree - doubles if exceeded */
25 #define TREE_BUF_SZ 2048
26 #define GS_VEC_SZ   1
27 
28 /***********************************gs.c***************************************
29 Type: struct gather_scatter_id
30 ------------------------------
31 
32 ************************************gs.c**************************************/
33 typedef struct gather_scatter_id {
34   PetscInt     id;
35   PetscInt     nel_min;
36   PetscInt     nel_max;
37   PetscInt     nel_sum;
38   PetscInt     negl;
39   PetscInt     gl_max;
40   PetscInt     gl_min;
41   PetscInt     repeats;
42   PetscInt     ordered;
43   PetscInt     positive;
44   PetscScalar *vals;
45 
46   /* bit mask info */
47   PetscInt *my_proc_mask;
48   PetscInt  mask_sz;
49   PetscInt *ngh_buf;
50   PetscInt  ngh_buf_sz;
51   PetscInt *nghs;
52   PetscInt  num_nghs;
53   PetscInt  max_nghs;
54   PetscInt *pw_nghs;
55   PetscInt  num_pw_nghs;
56   PetscInt *tree_nghs;
57   PetscInt  num_tree_nghs;
58 
59   PetscInt num_loads;
60 
61   /* repeats == true -> local info */
62   PetscInt  nel;  /* number of unique elements */
63   PetscInt *elms; /* of size nel */
64   PetscInt  nel_total;
65   PetscInt *local_elms; /* of size nel_total */
66   PetscInt *companion;  /* of size nel_total */
67 
68   /* local info */
69   PetscInt   num_local_total;
70   PetscInt   local_strength;
71   PetscInt   num_local;
72   PetscInt  *num_local_reduce;
73   PetscInt **local_reduce;
74   PetscInt   num_local_gop;
75   PetscInt  *num_gop_local_reduce;
76   PetscInt **gop_local_reduce;
77 
78   /* pairwise info */
79   PetscInt     level;
80   PetscInt     num_pairs;
81   PetscInt     max_pairs;
82   PetscInt     loc_node_pairs;
83   PetscInt     max_node_pairs;
84   PetscInt     min_node_pairs;
85   PetscInt     avg_node_pairs;
86   PetscInt    *pair_list;
87   PetscInt    *msg_sizes;
88   PetscInt   **node_list;
89   PetscInt     len_pw_list;
90   PetscInt    *pw_elm_list;
91   PetscScalar *pw_vals;
92 
93   MPI_Request *msg_ids_in;
94   MPI_Request *msg_ids_out;
95 
96   PetscScalar *out;
97   PetscScalar *in;
98   PetscInt     msg_total;
99 
100   /* tree - crystal accumulator info */
101   PetscInt   max_left_over;
102   PetscInt  *pre;
103   PetscInt  *in_num;
104   PetscInt  *out_num;
105   PetscInt **in_list;
106   PetscInt **out_list;
107 
108   /* new tree work*/
109   PetscInt     tree_nel;
110   PetscInt    *tree_elms;
111   PetscScalar *tree_buf;
112   PetscScalar *tree_work;
113 
114   PetscInt  tree_map_sz;
115   PetscInt *tree_map_in;
116   PetscInt *tree_map_out;
117 
118   /* current memory status */
119   PetscInt gl_bss_min;
120   PetscInt gl_perm_min;
121 
122   /* max segment size for PCTFS_gs_gop_vec() */
123   PetscInt vec_sz;
124 
125   /* hack to make paul happy */
126   MPI_Comm PCTFS_gs_comm;
127 
128 } PCTFS_gs_id;
129 
130 static PCTFS_gs_id   *gsi_check_args(PetscInt *elms, PetscInt nel, PetscInt level);
131 static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs);
132 static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs);
133 static PetscErrorCode set_pairwise(PCTFS_gs_id *gs);
134 static PCTFS_gs_id   *gsi_new(void);
135 static PetscErrorCode set_tree(PCTFS_gs_id *gs);
136 
137 /* same for all but vector flavor */
138 static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs, PetscScalar *vals);
139 /* vector flavor */
140 static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
141 
142 static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step);
143 static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step);
144 static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
145 static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
146 static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
147 
148 static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs, PetscScalar *vals);
149 static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals);
150 
151 static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim);
152 static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim);
153 static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim);
154 
155 /* global vars */
156 /* from comm.c module */
157 
158 static PetscInt num_gs_ids = 0;
159 
160 /* should make this dynamic ... later */
161 static PetscInt  msg_buf     = MAX_MSG_BUF;
162 static PetscInt  vec_sz      = GS_VEC_SZ;
163 static PetscInt *tree_buf    = NULL;
164 static PetscInt  tree_buf_sz = 0;
165 static PetscInt  ntree       = 0;
166 
167 /***************************************************************************/
168 PetscErrorCode PCTFS_gs_init_vec_sz(PetscInt size)
169 {
170   PetscFunctionBegin;
171   vec_sz = size;
172   PetscFunctionReturn(PETSC_SUCCESS);
173 }
174 
175 /******************************************************************************/
176 PetscErrorCode PCTFS_gs_init_msg_buf_sz(PetscInt buf_size)
177 {
178   PetscFunctionBegin;
179   msg_buf = buf_size;
180   PetscFunctionReturn(PETSC_SUCCESS);
181 }
182 
183 /******************************************************************************/
184 PCTFS_gs_id *PCTFS_gs_init(PetscInt *elms, PetscInt nel, PetscInt level)
185 {
186   PCTFS_gs_id *gs;
187   MPI_Group    PCTFS_gs_group;
188   MPI_Comm     PCTFS_gs_comm;
189 
190   /* ensure that communication package has been initialized */
191   PetscCallAbort(PETSC_COMM_SELF, PCTFS_comm_init());
192 
193   /* determines if we have enough dynamic/semi-static memory */
194   /* checks input, allocs and sets gd_id template            */
195   gs = gsi_check_args(elms, nel, level);
196 
197   /* only bit mask version up and working for the moment    */
198   /* LATER :: get int list version working for sparse pblms */
199   PetscCallAbort(PETSC_COMM_WORLD, gsi_via_bit_mask(gs));
200 
201   PetscCallAbort(PETSC_COMM_WORLD, MPI_Comm_group(MPI_COMM_WORLD, &PCTFS_gs_group) ? PETSC_ERR_MPI : PETSC_SUCCESS);
202   PetscCallAbort(PETSC_COMM_WORLD, MPI_Comm_create(MPI_COMM_WORLD, PCTFS_gs_group, &PCTFS_gs_comm) ? PETSC_ERR_MPI : PETSC_SUCCESS);
203   PetscCallAbort(PETSC_COMM_WORLD, MPI_Group_free(&PCTFS_gs_group) ? PETSC_ERR_MPI : PETSC_SUCCESS);
204 
205   gs->PCTFS_gs_comm = PCTFS_gs_comm;
206 
207   return gs;
208 }
209 
210 /******************************************************************************/
211 static PCTFS_gs_id *gsi_new(void)
212 {
213   PCTFS_gs_id *gs;
214   gs = (PCTFS_gs_id *)malloc(sizeof(PCTFS_gs_id));
215   PetscCallAbort(PETSC_COMM_WORLD, PetscMemzero(gs, sizeof(PCTFS_gs_id)));
216   return gs;
217 }
218 
219 /******************************************************************************/
220 static PCTFS_gs_id *gsi_check_args(PetscInt *in_elms, PetscInt nel, PetscInt level)
221 {
222   PetscInt     i, j, k, t2;
223   PetscInt    *companion, *elms, *unique, *iptr;
224   PetscInt     num_local = 0, *num_to_reduce, **local_reduce;
225   PetscInt     oprs[]    = {NON_UNIFORM, GL_MIN, GL_MAX, GL_ADD, GL_MIN, GL_MAX, GL_MIN, GL_B_AND};
226   PetscInt     vals[PETSC_STATIC_ARRAY_LENGTH(oprs) - 1];
227   PetscInt     work[PETSC_STATIC_ARRAY_LENGTH(oprs) - 1];
228   PCTFS_gs_id *gs;
229 
230   if (!in_elms) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "elms point to nothing!!!\n");
231   if (nel < 0) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "can't have fewer than 0 elms!!!\n");
232 
233   if (nel == 0) PetscCallAbort(PETSC_COMM_WORLD, PetscInfo(0, "I don't have any elements!!!\n"));
234 
235   /* get space for gs template */
236   gs     = gsi_new();
237   gs->id = ++num_gs_ids;
238 
239   /* hmt 6.4.99                                            */
240   /* caller can set global ids that don't participate to 0 */
241   /* PCTFS_gs_init ignores all zeros in elm list                 */
242   /* negative global ids are still invalid                 */
243   for (i = j = 0; i < nel; i++) {
244     if (in_elms[i] != 0) j++;
245   }
246 
247   k   = nel;
248   nel = j;
249 
250   /* copy over in_elms list and create inverse map */
251   elms      = (PetscInt *)malloc((nel + 1) * sizeof(PetscInt));
252   companion = (PetscInt *)malloc(nel * sizeof(PetscInt));
253 
254   for (i = j = 0; i < k; i++) {
255     if (in_elms[i] != 0) {
256       elms[j]        = in_elms[i];
257       companion[j++] = i;
258     }
259   }
260 
261   if (j != nel) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "nel j mismatch!\n");
262 
263   /* pre-pass ... check to see if sorted */
264   elms[nel] = INT_MAX;
265   iptr      = elms;
266   unique    = elms + 1;
267   j         = 0;
268   while (*iptr != INT_MAX) {
269     if (*iptr++ > *unique++) {
270       j = 1;
271       break;
272     }
273   }
274 
275   /* set up inverse map */
276   if (j) {
277     PetscCallAbort(PETSC_COMM_WORLD, PetscInfo(0, "gsi_check_args() :: elm list *not* sorted!\n"));
278     PetscCallAbort(PETSC_COMM_WORLD, PCTFS_SMI_sort((void *)elms, (void *)companion, nel, SORT_INTEGER));
279   } else PetscCallAbort(PETSC_COMM_WORLD, PetscInfo(0, "gsi_check_args() :: elm list sorted!\n"));
280   elms[nel] = INT_MIN;
281 
282   /* first pass */
283   /* determine number of unique elements, check pd */
284   for (i = k = 0; i < nel; i += j) {
285     t2 = elms[i];
286     j  = ++i;
287 
288     /* clump 'em for now */
289     while (elms[j] == t2) j++;
290 
291     /* how many together and num local */
292     if (j -= i) {
293       num_local++;
294       k += j;
295     }
296   }
297 
298   /* how many unique elements? */
299   gs->repeats = k;
300   gs->nel     = nel - k;
301 
302   /* number of repeats? */
303   gs->num_local = num_local;
304   num_local += 2;
305   gs->local_reduce = local_reduce = (PetscInt **)malloc(num_local * sizeof(PetscInt *));
306   gs->num_local_reduce = num_to_reduce = (PetscInt *)malloc(num_local * sizeof(PetscInt));
307 
308   unique         = (PetscInt *)malloc((gs->nel + 1) * sizeof(PetscInt));
309   gs->elms       = unique;
310   gs->nel_total  = nel;
311   gs->local_elms = elms;
312   gs->companion  = companion;
313 
314   /* compess map as well as keep track of local ops */
315   for (num_local = i = j = 0; i < gs->nel; i++) {
316     k  = j;
317     t2 = unique[i] = elms[j];
318     companion[i]   = companion[j];
319 
320     while (elms[j] == t2) j++;
321 
322     if ((t2 = (j - k)) > 1) {
323       /* number together */
324       num_to_reduce[num_local] = t2++;
325 
326       iptr = local_reduce[num_local++] = (PetscInt *)malloc(t2 * sizeof(PetscInt));
327 
328       /* to use binary searching don't remap until we check intersection */
329       *iptr++ = i;
330 
331       /* note that we're skipping the first one */
332       while (++k < j) *(iptr++) = companion[k];
333       *iptr = -1;
334     }
335   }
336 
337   /* sentinel for ngh_buf */
338   unique[gs->nel] = INT_MAX;
339 
340   /* for two partition sort hack */
341   num_to_reduce[num_local]   = 0;
342   local_reduce[num_local]    = NULL;
343   num_to_reduce[++num_local] = 0;
344   local_reduce[num_local]    = NULL;
345 
346   /* load 'em up */
347   /* note one extra to hold NON_UNIFORM flag!!! */
348   vals[2] = vals[1] = vals[0] = nel;
349   if (gs->nel > 0) {
350     vals[3] = unique[0];
351     vals[4] = unique[gs->nel - 1];
352   } else {
353     vals[3] = INT_MAX;
354     vals[4] = INT_MIN;
355   }
356   vals[5] = level;
357   vals[6] = num_gs_ids;
358 
359   /* GLOBAL: send 'em out */
360   PetscCallAbort(PETSC_COMM_WORLD, PCTFS_giop(vals, work, PETSC_STATIC_ARRAY_LENGTH(oprs) - 1, oprs));
361 
362   /* must be semi-pos def - only pairwise depends on this */
363   /* LATER - remove this restriction */
364   if (vals[3] < 0) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "gsi_check_args() :: system not semi-pos def \n");
365   if (vals[4] == INT_MAX) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "gsi_check_args() :: system ub too large !\n");
366 
367   gs->nel_min = vals[0];
368   gs->nel_max = vals[1];
369   gs->nel_sum = vals[2];
370   gs->gl_min  = vals[3];
371   gs->gl_max  = vals[4];
372   gs->negl    = vals[4] - vals[3] + 1;
373 
374   if (gs->negl <= 0) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "gsi_check_args() :: system empty or neg :: %" PetscInt_FMT "\n", gs->negl);
375 
376   /* LATER :: add level == -1 -> program selects level */
377   if (vals[5] < 0) vals[5] = 0;
378   else if (vals[5] > PCTFS_num_nodes) vals[5] = PCTFS_num_nodes;
379   gs->level = vals[5];
380 
381   return gs;
382 }
383 
384 /******************************************************************************/
385 static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs)
386 {
387   PetscInt   i, nel, *elms;
388   PetscInt   t1;
389   PetscInt **reduce;
390   PetscInt  *map;
391 
392   PetscFunctionBegin;
393   /* totally local removes ... PCTFS_ct_bits == 0 */
394   PetscCall(get_ngh_buf(gs));
395 
396   if (gs->level) PetscCall(set_pairwise(gs));
397   if (gs->max_left_over) PetscCall(set_tree(gs));
398 
399   /* intersection local and pairwise/tree? */
400   gs->num_local_total      = gs->num_local;
401   gs->gop_local_reduce     = gs->local_reduce;
402   gs->num_gop_local_reduce = gs->num_local_reduce;
403 
404   map = gs->companion;
405 
406   /* is there any local compression */
407   if (!gs->num_local) {
408     gs->local_strength = NONE;
409     gs->num_local_gop  = 0;
410   } else {
411     /* ok find intersection */
412     map    = gs->companion;
413     reduce = gs->local_reduce;
414     for (i = 0, t1 = 0; i < gs->num_local; i++, reduce++) {
415       if ((PCTFS_ivec_binary_search(**reduce, gs->pw_elm_list, gs->len_pw_list) >= 0) || PCTFS_ivec_binary_search(**reduce, gs->tree_map_in, gs->tree_map_sz) >= 0) {
416         t1++;
417         PetscCheck(gs->num_local_reduce[i] > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "nobody in list?");
418         gs->num_local_reduce[i] *= -1;
419       }
420       **reduce = map[**reduce];
421     }
422 
423     /* intersection is empty */
424     if (!t1) {
425       gs->local_strength = FULL;
426       gs->num_local_gop  = 0;
427     } else { /* intersection not empty */
428       gs->local_strength = PARTIAL;
429 
430       PetscCall(PCTFS_SMI_sort((void *)gs->num_local_reduce, (void *)gs->local_reduce, gs->num_local + 1, SORT_INT_PTR));
431 
432       gs->num_local_gop   = t1;
433       gs->num_local_total = gs->num_local;
434       gs->num_local -= t1;
435       gs->gop_local_reduce     = gs->local_reduce;
436       gs->num_gop_local_reduce = gs->num_local_reduce;
437 
438       for (i = 0; i < t1; i++) {
439         PetscCheck(gs->num_gop_local_reduce[i] < 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "they aren't negative?");
440         gs->num_gop_local_reduce[i] *= -1;
441         gs->local_reduce++;
442         gs->num_local_reduce++;
443       }
444       gs->local_reduce++;
445       gs->num_local_reduce++;
446     }
447   }
448 
449   elms = gs->pw_elm_list;
450   nel  = gs->len_pw_list;
451   for (i = 0; i < nel; i++) elms[i] = map[elms[i]];
452 
453   elms = gs->tree_map_in;
454   nel  = gs->tree_map_sz;
455   for (i = 0; i < nel; i++) elms[i] = map[elms[i]];
456 
457   /* clean up */
458   free((void *)gs->local_elms);
459   free((void *)gs->companion);
460   free((void *)gs->elms);
461   free((void *)gs->ngh_buf);
462   gs->local_elms = gs->companion = gs->elms = gs->ngh_buf = NULL;
463   PetscFunctionReturn(PETSC_SUCCESS);
464 }
465 
466 /******************************************************************************/
467 static PetscErrorCode place_in_tree(PetscInt elm)
468 {
469   PetscInt *tp, n;
470 
471   PetscFunctionBegin;
472   if (ntree == tree_buf_sz) {
473     if (tree_buf_sz) {
474       tp = tree_buf;
475       n  = tree_buf_sz;
476       tree_buf_sz <<= 1;
477       tree_buf = (PetscInt *)malloc(tree_buf_sz * sizeof(PetscInt));
478       PCTFS_ivec_copy(tree_buf, tp, n);
479       free(tp);
480     } else {
481       tree_buf_sz = TREE_BUF_SZ;
482       tree_buf    = (PetscInt *)malloc(tree_buf_sz * sizeof(PetscInt));
483     }
484   }
485 
486   tree_buf[ntree++] = elm;
487   PetscFunctionReturn(PETSC_SUCCESS);
488 }
489 
490 /******************************************************************************/
491 static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs)
492 {
493   PetscInt  i, j, npw = 0, ntree_map = 0;
494   PetscInt  p_mask_size, ngh_buf_size, buf_size;
495   PetscInt *p_mask, *sh_proc_mask, *pw_sh_proc_mask;
496   PetscInt *ngh_buf, *buf1, *buf2;
497   PetscInt  offset, per_load, num_loads, or_ct, start, end;
498   PetscInt *ptr1, *ptr2, i_start, negl, nel, *elms;
499   PetscInt  oper = GL_B_OR;
500   PetscInt *ptr3, *t_mask, level, ct1, ct2;
501 
502   PetscFunctionBegin;
503   /* to make life easier */
504   nel   = gs->nel;
505   elms  = gs->elms;
506   level = gs->level;
507 
508   /* det #bytes needed for processor bit masks and init w/mask cor. to PCTFS_my_id */
509   p_mask = (PetscInt *)malloc(p_mask_size = PCTFS_len_bit_mask(PCTFS_num_nodes));
510   PetscCall(PCTFS_set_bit_mask(p_mask, p_mask_size, PCTFS_my_id));
511 
512   /* allocate space for masks and info bufs */
513   gs->nghs = sh_proc_mask = (PetscInt *)malloc(p_mask_size);
514   gs->pw_nghs = pw_sh_proc_mask = (PetscInt *)malloc(p_mask_size);
515   gs->ngh_buf_sz = ngh_buf_size = p_mask_size * nel;
516   t_mask                        = (PetscInt *)malloc(p_mask_size);
517   gs->ngh_buf = ngh_buf = (PetscInt *)malloc(ngh_buf_size);
518 
519   /* comm buffer size ... memory usage bounded by ~2*msg_buf */
520   /* had thought I could exploit rendezvous threshold */
521 
522   /* default is one pass */
523   per_load = negl = gs->negl;
524   gs->num_loads = num_loads = 1;
525   i                         = p_mask_size * negl;
526 
527   /* possible overflow on buffer size */
528   /* overflow hack                    */
529   if (i < 0) i = INT_MAX;
530 
531   buf_size = PetscMin(msg_buf, i);
532 
533   /* can we do it? */
534   PetscCheck(p_mask_size <= buf_size, PETSC_COMM_SELF, PETSC_ERR_PLIB, "get_ngh_buf() :: buf<pms :: %" PetscInt_FMT ">%" PetscInt_FMT, p_mask_size, buf_size);
535 
536   /* get PCTFS_giop buf space ... make *only* one malloc */
537   buf1 = (PetscInt *)malloc(buf_size << 1);
538 
539   /* more than one gior exchange needed? */
540   if (buf_size != i) {
541     per_load      = buf_size / p_mask_size;
542     buf_size      = per_load * p_mask_size;
543     gs->num_loads = num_loads = negl / per_load + (negl % per_load > 0);
544   }
545 
546   /* convert buf sizes from #bytes to #ints - 32-bit only! */
547   p_mask_size /= sizeof(PetscInt);
548   ngh_buf_size /= sizeof(PetscInt);
549   buf_size /= sizeof(PetscInt);
550 
551   /* find PCTFS_giop work space */
552   buf2 = buf1 + buf_size;
553 
554   /* hold #ints needed for processor masks */
555   gs->mask_sz = p_mask_size;
556 
557   /* init buffers */
558   PetscCall(PCTFS_ivec_zero(sh_proc_mask, p_mask_size));
559   PetscCall(PCTFS_ivec_zero(pw_sh_proc_mask, p_mask_size));
560   PetscCall(PCTFS_ivec_zero(ngh_buf, ngh_buf_size));
561 
562   /* HACK reset tree info */
563   tree_buf    = NULL;
564   tree_buf_sz = ntree = 0;
565 
566   /* ok do it */
567   for (ptr1 = ngh_buf, ptr2 = elms, end = gs->gl_min, or_ct = i = 0; or_ct < num_loads; or_ct++) {
568     /* identity for bitwise or is 000...000 */
569     PetscCall(PCTFS_ivec_zero(buf1, buf_size));
570 
571     /* load msg buffer */
572     for (start = end, end += per_load, i_start = i; (offset = *ptr2) < end; i++, ptr2++) {
573       offset = (offset - start) * p_mask_size;
574       PCTFS_ivec_copy(buf1 + offset, p_mask, p_mask_size);
575     }
576 
577     /* GLOBAL: pass buffer */
578     PetscCall(PCTFS_giop(buf1, buf2, buf_size, &oper));
579 
580     /* unload buffer into ngh_buf */
581     ptr2 = (elms + i_start);
582     for (ptr3 = buf1, j = start; j < end; ptr3 += p_mask_size, j++) {
583       /* I own it ... may have to pairwise it */
584       if (j == *ptr2) {
585         /* do i share it w/anyone? */
586         ct1 = PCTFS_ct_bits((char *)ptr3, p_mask_size * sizeof(PetscInt));
587         /* guess not */
588         if (ct1 < 2) {
589           ptr2++;
590           ptr1 += p_mask_size;
591           continue;
592         }
593 
594         /* i do ... so keep info and turn off my bit */
595         PCTFS_ivec_copy(ptr1, ptr3, p_mask_size);
596         PetscCall(PCTFS_ivec_xor(ptr1, p_mask, p_mask_size));
597         PetscCall(PCTFS_ivec_or(sh_proc_mask, ptr1, p_mask_size));
598 
599         /* is it to be done pairwise? */
600         if (--ct1 <= level) {
601           npw++;
602 
603           /* turn on high bit to indicate pw need to process */
604           *ptr2++ |= TOP_BIT;
605           PetscCall(PCTFS_ivec_or(pw_sh_proc_mask, ptr1, p_mask_size));
606           ptr1 += p_mask_size;
607           continue;
608         }
609 
610         /* get set for next and note that I have a tree contribution */
611         /* could save exact elm index for tree here -> save a search */
612         ptr2++;
613         ptr1 += p_mask_size;
614         ntree_map++;
615       } else { /* i don't but still might be involved in tree */
616 
617         /* shared by how many? */
618         ct1 = PCTFS_ct_bits((char *)ptr3, p_mask_size * sizeof(PetscInt));
619 
620         /* none! */
621         if (ct1 < 2) continue;
622 
623         /* is it going to be done pairwise? but not by me of course!*/
624         if (--ct1 <= level) continue;
625       }
626       /* LATER we're going to have to process it NOW */
627       /* nope ... tree it */
628       PetscCall(place_in_tree(j));
629     }
630   }
631 
632   free((void *)t_mask);
633   free((void *)buf1);
634 
635   gs->len_pw_list = npw;
636   gs->num_nghs    = PCTFS_ct_bits((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt));
637 
638   /* expand from bit mask list to int list and save ngh list */
639   gs->nghs = (PetscInt *)malloc(gs->num_nghs * sizeof(PetscInt));
640   PetscCall(PCTFS_bm_to_proc((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt), gs->nghs));
641 
642   gs->num_pw_nghs = PCTFS_ct_bits((char *)pw_sh_proc_mask, p_mask_size * sizeof(PetscInt));
643 
644   oper = GL_MAX;
645   ct1  = gs->num_nghs;
646   PetscCall(PCTFS_giop(&ct1, &ct2, 1, &oper));
647   gs->max_nghs = ct1;
648 
649   gs->tree_map_sz   = ntree_map;
650   gs->max_left_over = ntree;
651 
652   free((void *)p_mask);
653   free((void *)sh_proc_mask);
654   PetscFunctionReturn(PETSC_SUCCESS);
655 }
656 
657 /******************************************************************************/
658 static PetscErrorCode set_pairwise(PCTFS_gs_id *gs)
659 {
660   PetscInt  i, j;
661   PetscInt  p_mask_size;
662   PetscInt *p_mask, *sh_proc_mask, *tmp_proc_mask;
663   PetscInt *ngh_buf, *buf2;
664   PetscInt  offset;
665   PetscInt *msg_list, *msg_size, **msg_nodes, nprs;
666   PetscInt *pairwise_elm_list, len_pair_list = 0;
667   PetscInt *iptr, t1, i_start, nel, *elms;
668   PetscInt  ct;
669 
670   PetscFunctionBegin;
671   /* to make life easier */
672   nel          = gs->nel;
673   elms         = gs->elms;
674   ngh_buf      = gs->ngh_buf;
675   sh_proc_mask = gs->pw_nghs;
676 
677   /* need a few temp masks */
678   p_mask_size   = PCTFS_len_bit_mask(PCTFS_num_nodes);
679   p_mask        = (PetscInt *)malloc(p_mask_size);
680   tmp_proc_mask = (PetscInt *)malloc(p_mask_size);
681 
682   /* set mask to my PCTFS_my_id's bit mask */
683   PetscCall(PCTFS_set_bit_mask(p_mask, p_mask_size, PCTFS_my_id));
684 
685   p_mask_size /= sizeof(PetscInt);
686 
687   len_pair_list   = gs->len_pw_list;
688   gs->pw_elm_list = pairwise_elm_list = (PetscInt *)malloc((len_pair_list + 1) * sizeof(PetscInt));
689 
690   /* how many processors (nghs) do we have to exchange with? */
691   nprs = gs->num_pairs = PCTFS_ct_bits((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt));
692 
693   /* allocate space for PCTFS_gs_gop() info */
694   gs->pair_list = msg_list = (PetscInt *)malloc(sizeof(PetscInt) * nprs);
695   gs->msg_sizes = msg_size = (PetscInt *)malloc(sizeof(PetscInt) * nprs);
696   gs->node_list = msg_nodes = (PetscInt **)malloc(sizeof(PetscInt *) * (nprs + 1));
697 
698   /* init msg_size list */
699   PetscCall(PCTFS_ivec_zero(msg_size, nprs));
700 
701   /* expand from bit mask list to int list */
702   PetscCall(PCTFS_bm_to_proc((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt), msg_list));
703 
704   /* keep list of elements being handled pairwise */
705   for (i = j = 0; i < nel; i++) {
706     if (elms[i] & TOP_BIT) {
707       elms[i] ^= TOP_BIT;
708       pairwise_elm_list[j++] = i;
709     }
710   }
711   pairwise_elm_list[j] = -1;
712 
713   gs->msg_ids_out       = (MPI_Request *)malloc(sizeof(MPI_Request) * (nprs + 1));
714   gs->msg_ids_out[nprs] = MPI_REQUEST_NULL;
715   gs->msg_ids_in        = (MPI_Request *)malloc(sizeof(MPI_Request) * (nprs + 1));
716   gs->msg_ids_in[nprs]  = MPI_REQUEST_NULL;
717   gs->pw_vals           = (PetscScalar *)malloc(sizeof(PetscScalar) * len_pair_list * vec_sz);
718 
719   /* find who goes to each processor */
720   for (i_start = i = 0; i < nprs; i++) {
721     /* processor i's mask */
722     PetscCall(PCTFS_set_bit_mask(p_mask, p_mask_size * sizeof(PetscInt), msg_list[i]));
723 
724     /* det # going to processor i */
725     for (ct = j = 0; j < len_pair_list; j++) {
726       buf2 = ngh_buf + (pairwise_elm_list[j] * p_mask_size);
727       PetscCall(PCTFS_ivec_and3(tmp_proc_mask, p_mask, buf2, p_mask_size));
728       if (PCTFS_ct_bits((char *)tmp_proc_mask, p_mask_size * sizeof(PetscInt))) ct++;
729     }
730     msg_size[i] = ct;
731     i_start     = PetscMax(i_start, ct);
732 
733     /*space to hold nodes in message to first neighbor */
734     msg_nodes[i] = iptr = (PetscInt *)malloc(sizeof(PetscInt) * (ct + 1));
735 
736     for (j = 0; j < len_pair_list; j++) {
737       buf2 = ngh_buf + (pairwise_elm_list[j] * p_mask_size);
738       PetscCall(PCTFS_ivec_and3(tmp_proc_mask, p_mask, buf2, p_mask_size));
739       if (PCTFS_ct_bits((char *)tmp_proc_mask, p_mask_size * sizeof(PetscInt))) *iptr++ = j;
740     }
741     *iptr = -1;
742   }
743   msg_nodes[nprs] = NULL;
744 
745   j = gs->loc_node_pairs = i_start;
746   t1                     = GL_MAX;
747   PetscCall(PCTFS_giop(&i_start, &offset, 1, &t1));
748   gs->max_node_pairs = i_start;
749 
750   i_start = j;
751   t1      = GL_MIN;
752   PetscCall(PCTFS_giop(&i_start, &offset, 1, &t1));
753   gs->min_node_pairs = i_start;
754 
755   i_start = j;
756   t1      = GL_ADD;
757   PetscCall(PCTFS_giop(&i_start, &offset, 1, &t1));
758   gs->avg_node_pairs = i_start / PCTFS_num_nodes + 1;
759 
760   i_start = nprs;
761   t1      = GL_MAX;
762   PetscCall(PCTFS_giop(&i_start, &offset, 1, &t1));
763   gs->max_pairs = i_start;
764 
765   /* remap pairwise in tail of gsi_via_bit_mask() */
766   gs->msg_total = PCTFS_ivec_sum(gs->msg_sizes, nprs);
767   gs->out       = (PetscScalar *)malloc(sizeof(PetscScalar) * gs->msg_total * vec_sz);
768   gs->in        = (PetscScalar *)malloc(sizeof(PetscScalar) * gs->msg_total * vec_sz);
769 
770   /* reset malloc pool */
771   free((void *)p_mask);
772   free((void *)tmp_proc_mask);
773   PetscFunctionReturn(PETSC_SUCCESS);
774 }
775 
776 /* to do pruned tree just save ngh buf copy for each one and decode here!
777 ******************************************************************************/
778 static PetscErrorCode set_tree(PCTFS_gs_id *gs)
779 {
780   PetscInt  i, j, n, nel;
781   PetscInt *iptr_in, *iptr_out, *tree_elms, *elms;
782 
783   PetscFunctionBegin;
784   /* local work ptrs */
785   elms = gs->elms;
786   nel  = gs->nel;
787 
788   /* how many via tree */
789   gs->tree_nel = n = ntree;
790   gs->tree_elms = tree_elms = iptr_in = tree_buf;
791   gs->tree_buf                        = (PetscScalar *)malloc(sizeof(PetscScalar) * n * vec_sz);
792   gs->tree_work                       = (PetscScalar *)malloc(sizeof(PetscScalar) * n * vec_sz);
793   j                                   = gs->tree_map_sz;
794   gs->tree_map_in = iptr_in = (PetscInt *)malloc(sizeof(PetscInt) * (j + 1));
795   gs->tree_map_out = iptr_out = (PetscInt *)malloc(sizeof(PetscInt) * (j + 1));
796 
797   /* search the longer of the two lists */
798   /* note ... could save this info in get_ngh_buf and save searches */
799   if (n <= nel) {
800     /* bijective fct w/remap - search elm list */
801     for (i = 0; i < n; i++) {
802       if ((j = PCTFS_ivec_binary_search(*tree_elms++, elms, nel)) >= 0) {
803         *iptr_in++  = j;
804         *iptr_out++ = i;
805       }
806     }
807   } else {
808     for (i = 0; i < nel; i++) {
809       if ((j = PCTFS_ivec_binary_search(*elms++, tree_elms, n)) >= 0) {
810         *iptr_in++  = i;
811         *iptr_out++ = j;
812       }
813     }
814   }
815 
816   /* sentinel */
817   *iptr_in = *iptr_out = -1;
818   PetscFunctionReturn(PETSC_SUCCESS);
819 }
820 
821 /******************************************************************************/
822 static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs, PetscScalar *vals)
823 {
824   PetscInt   *num, *map, **reduce;
825   PetscScalar tmp;
826 
827   PetscFunctionBegin;
828   num    = gs->num_gop_local_reduce;
829   reduce = gs->gop_local_reduce;
830   while ((map = *reduce++)) {
831     /* wall */
832     if (*num == 2) {
833       num++;
834       vals[map[1]] = vals[map[0]];
835     } else if (*num == 3) { /* corner shared by three elements */
836       num++;
837       vals[map[2]] = vals[map[1]] = vals[map[0]];
838     } else if (*num == 4) { /* corner shared by four elements */
839       num++;
840       vals[map[3]] = vals[map[2]] = vals[map[1]] = vals[map[0]];
841     } else { /* general case ... odd geoms ... 3D*/
842       num++;
843       tmp = *(vals + *map++);
844       while (*map >= 0) *(vals + *map++) = tmp;
845     }
846   }
847   PetscFunctionReturn(PETSC_SUCCESS);
848 }
849 
850 /******************************************************************************/
851 static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs, PetscScalar *vals)
852 {
853   PetscInt   *num, *map, **reduce;
854   PetscScalar tmp;
855 
856   PetscFunctionBegin;
857   num    = gs->num_local_reduce;
858   reduce = gs->local_reduce;
859   while ((map = *reduce)) {
860     /* wall */
861     if (*num == 2) {
862       num++;
863       reduce++;
864       vals[map[1]] = vals[map[0]] += vals[map[1]];
865     } else if (*num == 3) { /* corner shared by three elements */
866       num++;
867       reduce++;
868       vals[map[2]] = vals[map[1]] = vals[map[0]] += (vals[map[1]] + vals[map[2]]);
869     } else if (*num == 4) { /* corner shared by four elements */
870       num++;
871       reduce++;
872       vals[map[1]] = vals[map[2]] = vals[map[3]] = vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]);
873     } else { /* general case ... odd geoms ... 3D*/
874       num++;
875       tmp = 0.0;
876       while (*map >= 0) tmp += *(vals + *map++);
877 
878       map = *reduce++;
879       while (*map >= 0) *(vals + *map++) = tmp;
880     }
881   }
882   PetscFunctionReturn(PETSC_SUCCESS);
883 }
884 
885 /******************************************************************************/
886 static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals)
887 {
888   PetscInt    *num, *map, **reduce;
889   PetscScalar *base;
890 
891   PetscFunctionBegin;
892   num    = gs->num_gop_local_reduce;
893   reduce = gs->gop_local_reduce;
894   while ((map = *reduce++)) {
895     /* wall */
896     if (*num == 2) {
897       num++;
898       vals[map[0]] += vals[map[1]];
899     } else if (*num == 3) { /* corner shared by three elements */
900       num++;
901       vals[map[0]] += (vals[map[1]] + vals[map[2]]);
902     } else if (*num == 4) { /* corner shared by four elements */
903       num++;
904       vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]);
905     } else { /* general case ... odd geoms ... 3D*/
906       num++;
907       base = vals + *map++;
908       while (*map >= 0) *base += *(vals + *map++);
909     }
910   }
911   PetscFunctionReturn(PETSC_SUCCESS);
912 }
913 
914 /******************************************************************************/
915 PetscErrorCode PCTFS_gs_free(PCTFS_gs_id *gs)
916 {
917   PetscInt i;
918 
919   PetscFunctionBegin;
920   PetscCallMPI(MPI_Comm_free(&gs->PCTFS_gs_comm));
921   if (gs->nghs) free((void *)gs->nghs);
922   if (gs->pw_nghs) free((void *)gs->pw_nghs);
923 
924   /* tree */
925   if (gs->max_left_over) {
926     if (gs->tree_elms) free((void *)gs->tree_elms);
927     if (gs->tree_buf) free((void *)gs->tree_buf);
928     if (gs->tree_work) free((void *)gs->tree_work);
929     if (gs->tree_map_in) free((void *)gs->tree_map_in);
930     if (gs->tree_map_out) free((void *)gs->tree_map_out);
931   }
932 
933   /* pairwise info */
934   if (gs->num_pairs) {
935     /* should be NULL already */
936     if (gs->ngh_buf) free((void *)gs->ngh_buf);
937     if (gs->elms) free((void *)gs->elms);
938     if (gs->local_elms) free((void *)gs->local_elms);
939     if (gs->companion) free((void *)gs->companion);
940 
941     /* only set if pairwise */
942     if (gs->vals) free((void *)gs->vals);
943     if (gs->in) free((void *)gs->in);
944     if (gs->out) free((void *)gs->out);
945     if (gs->msg_ids_in) free((void *)gs->msg_ids_in);
946     if (gs->msg_ids_out) free((void *)gs->msg_ids_out);
947     if (gs->pw_vals) free((void *)gs->pw_vals);
948     if (gs->pw_elm_list) free((void *)gs->pw_elm_list);
949     if (gs->node_list) {
950       for (i = 0; i < gs->num_pairs; i++) {
951         if (gs->node_list[i]) free((void *)gs->node_list[i]);
952       }
953       free((void *)gs->node_list);
954     }
955     if (gs->msg_sizes) free((void *)gs->msg_sizes);
956     if (gs->pair_list) free((void *)gs->pair_list);
957   }
958 
959   /* local info */
960   if (gs->num_local_total >= 0) {
961     for (i = 0; i < gs->num_local_total + 1; i++) {
962       if (gs->num_gop_local_reduce[i]) free((void *)gs->gop_local_reduce[i]);
963     }
964   }
965 
966   /* if intersection tree/pairwise and local isn't empty */
967   if (gs->gop_local_reduce) free((void *)gs->gop_local_reduce);
968   if (gs->num_gop_local_reduce) free((void *)gs->num_gop_local_reduce);
969 
970   free((void *)gs);
971   PetscFunctionReturn(PETSC_SUCCESS);
972 }
973 
974 /******************************************************************************/
975 PetscErrorCode PCTFS_gs_gop_vec(PCTFS_gs_id *gs, PetscScalar *vals, const char *op, PetscInt step)
976 {
977   PetscFunctionBegin;
978   switch (*op) {
979   case '+':
980     PetscCall(PCTFS_gs_gop_vec_plus(gs, vals, step));
981     break;
982   default:
983     PetscCall(PetscInfo(0, "PCTFS_gs_gop_vec() :: %c is not a valid op\n", op[0]));
984     PetscCall(PetscInfo(0, "PCTFS_gs_gop_vec() :: default :: plus\n"));
985     PetscCall(PCTFS_gs_gop_vec_plus(gs, vals, step));
986     break;
987   }
988   PetscFunctionReturn(PETSC_SUCCESS);
989 }
990 
991 /******************************************************************************/
992 static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
993 {
994   PetscFunctionBegin;
995   PetscCheck(gs, PETSC_COMM_SELF, PETSC_ERR_PLIB, "PCTFS_gs_gop_vec() passed NULL gs handle!!!");
996 
997   /* local only operations!!! */
998   if (gs->num_local) PetscCall(PCTFS_gs_gop_vec_local_plus(gs, vals, step));
999 
1000   /* if intersection tree/pairwise and local isn't empty */
1001   if (gs->num_local_gop) {
1002     PetscCall(PCTFS_gs_gop_vec_local_in_plus(gs, vals, step));
1003 
1004     /* pairwise */
1005     if (gs->num_pairs) PetscCall(PCTFS_gs_gop_vec_pairwise_plus(gs, vals, step));
1006 
1007     /* tree */
1008     else if (gs->max_left_over) PetscCall(PCTFS_gs_gop_vec_tree_plus(gs, vals, step));
1009 
1010     PetscCall(PCTFS_gs_gop_vec_local_out(gs, vals, step));
1011   } else { /* if intersection tree/pairwise and local is empty */
1012     /* pairwise */
1013     if (gs->num_pairs) PetscCall(PCTFS_gs_gop_vec_pairwise_plus(gs, vals, step));
1014 
1015     /* tree */
1016     else if (gs->max_left_over) PetscCall(PCTFS_gs_gop_vec_tree_plus(gs, vals, step));
1017   }
1018   PetscFunctionReturn(PETSC_SUCCESS);
1019 }
1020 
1021 /******************************************************************************/
1022 static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
1023 {
1024   PetscInt    *num, *map, **reduce;
1025   PetscScalar *base;
1026 
1027   PetscFunctionBegin;
1028   num    = gs->num_local_reduce;
1029   reduce = gs->local_reduce;
1030   while ((map = *reduce)) {
1031     base = vals + map[0] * step;
1032 
1033     /* wall */
1034     if (*num == 2) {
1035       num++;
1036       reduce++;
1037       PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step));
1038       PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step));
1039     } else if (*num == 3) { /* corner shared by three elements */
1040       num++;
1041       reduce++;
1042       PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step));
1043       PetscCall(PCTFS_rvec_add(base, vals + map[2] * step, step));
1044       PetscCall(PCTFS_rvec_copy(vals + map[2] * step, base, step));
1045       PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step));
1046     } else if (*num == 4) { /* corner shared by four elements */
1047       num++;
1048       reduce++;
1049       PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step));
1050       PetscCall(PCTFS_rvec_add(base, vals + map[2] * step, step));
1051       PetscCall(PCTFS_rvec_add(base, vals + map[3] * step, step));
1052       PetscCall(PCTFS_rvec_copy(vals + map[3] * step, base, step));
1053       PetscCall(PCTFS_rvec_copy(vals + map[2] * step, base, step));
1054       PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step));
1055     } else { /* general case ... odd geoms ... 3D */
1056       num++;
1057       while (*++map >= 0) PetscCall(PCTFS_rvec_add(base, vals + *map * step, step));
1058 
1059       map = *reduce;
1060       while (*++map >= 0) PetscCall(PCTFS_rvec_copy(vals + *map * step, base, step));
1061 
1062       reduce++;
1063     }
1064   }
1065   PetscFunctionReturn(PETSC_SUCCESS);
1066 }
1067 
1068 /******************************************************************************/
1069 static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
1070 {
1071   PetscInt    *num, *map, **reduce;
1072   PetscScalar *base;
1073 
1074   PetscFunctionBegin;
1075   num    = gs->num_gop_local_reduce;
1076   reduce = gs->gop_local_reduce;
1077   while ((map = *reduce++)) {
1078     base = vals + map[0] * step;
1079 
1080     /* wall */
1081     if (*num == 2) {
1082       num++;
1083       PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step));
1084     } else if (*num == 3) { /* corner shared by three elements */
1085       num++;
1086       PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step));
1087       PetscCall(PCTFS_rvec_add(base, vals + map[2] * step, step));
1088     } else if (*num == 4) { /* corner shared by four elements */
1089       num++;
1090       PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step));
1091       PetscCall(PCTFS_rvec_add(base, vals + map[2] * step, step));
1092       PetscCall(PCTFS_rvec_add(base, vals + map[3] * step, step));
1093     } else { /* general case ... odd geoms ... 3D*/
1094       num++;
1095       while (*++map >= 0) PetscCall(PCTFS_rvec_add(base, vals + *map * step, step));
1096     }
1097   }
1098   PetscFunctionReturn(PETSC_SUCCESS);
1099 }
1100 
1101 /******************************************************************************/
1102 static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
1103 {
1104   PetscInt    *num, *map, **reduce;
1105   PetscScalar *base;
1106 
1107   PetscFunctionBegin;
1108   num    = gs->num_gop_local_reduce;
1109   reduce = gs->gop_local_reduce;
1110   while ((map = *reduce++)) {
1111     base = vals + map[0] * step;
1112 
1113     /* wall */
1114     if (*num == 2) {
1115       num++;
1116       PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step));
1117     } else if (*num == 3) { /* corner shared by three elements */
1118       num++;
1119       PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step));
1120       PetscCall(PCTFS_rvec_copy(vals + map[2] * step, base, step));
1121     } else if (*num == 4) { /* corner shared by four elements */
1122       num++;
1123       PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step));
1124       PetscCall(PCTFS_rvec_copy(vals + map[2] * step, base, step));
1125       PetscCall(PCTFS_rvec_copy(vals + map[3] * step, base, step));
1126     } else { /* general case ... odd geoms ... 3D*/
1127       num++;
1128       while (*++map >= 0) PetscCall(PCTFS_rvec_copy(vals + *map * step, base, step));
1129     }
1130   }
1131   PetscFunctionReturn(PETSC_SUCCESS);
1132 }
1133 
1134 /******************************************************************************/
1135 static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step)
1136 {
1137   PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2;
1138   PetscInt    *iptr, *msg_list, *msg_size, **msg_nodes;
1139   PetscInt    *pw, *list, *size, **nodes;
1140   MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
1141   MPI_Status   status;
1142   PetscBLASInt i1 = 1, dstep;
1143 
1144   PetscFunctionBegin;
1145   /* strip and load s */
1146   msg_list = list = gs->pair_list;
1147   msg_size = size = gs->msg_sizes;
1148   msg_nodes = nodes = gs->node_list;
1149   iptr = pw = gs->pw_elm_list;
1150   dptr1 = dptr3 = gs->pw_vals;
1151   msg_ids_in = ids_in = gs->msg_ids_in;
1152   msg_ids_out = ids_out = gs->msg_ids_out;
1153   dptr2                 = gs->out;
1154   in1 = in2 = gs->in;
1155 
1156   /* post the receives */
1157   /*  msg_nodes=nodes; */
1158   do {
1159     /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
1160         second one *list and do list++ afterwards */
1161     PetscCallMPI(MPIU_Irecv(in1, *size * step, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in));
1162     list++;
1163     msg_ids_in++;
1164     in1 += *size++ * step;
1165   } while (*++msg_nodes);
1166   msg_nodes = nodes;
1167 
1168   /* load gs values into in out gs buffers */
1169   while (*iptr >= 0) {
1170     PetscCall(PCTFS_rvec_copy(dptr3, in_vals + *iptr * step, step));
1171     dptr3 += step;
1172     iptr++;
1173   }
1174 
1175   /* load out buffers and post the sends */
1176   while ((iptr = *msg_nodes++)) {
1177     dptr3 = dptr2;
1178     while (*iptr >= 0) {
1179       PetscCall(PCTFS_rvec_copy(dptr2, dptr1 + *iptr * step, step));
1180       dptr2 += step;
1181       iptr++;
1182     }
1183     PetscCallMPI(MPIU_Isend(dptr3, *msg_size * step, MPIU_SCALAR, *msg_list, MSGTAG1 + PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out));
1184     msg_size++;
1185     msg_list++;
1186     msg_ids_out++;
1187   }
1188 
1189   /* tree */
1190   if (gs->max_left_over) PetscCall(PCTFS_gs_gop_vec_tree_plus(gs, in_vals, step));
1191 
1192   /* process the received data */
1193   msg_nodes = nodes;
1194   while ((iptr = *nodes++)) {
1195     PetscScalar d1 = 1.0;
1196 
1197     /* Should I check the return value of MPI_Wait() or status? */
1198     /* Can this loop be replaced by a call to MPI_Waitall()? */
1199     PetscCallMPI(MPI_Wait(ids_in, &status));
1200     ids_in++;
1201     while (*iptr >= 0) {
1202       PetscCall(PetscBLASIntCast(step, &dstep));
1203       PetscCallBLAS("BLASaxpy", BLASaxpy_(&dstep, &d1, in2, &i1, dptr1 + *iptr * step, &i1));
1204       in2 += step;
1205       iptr++;
1206     }
1207   }
1208 
1209   /* replace vals */
1210   while (*pw >= 0) {
1211     PetscCall(PCTFS_rvec_copy(in_vals + *pw * step, dptr1, step));
1212     dptr1 += step;
1213     pw++;
1214   }
1215 
1216   /* clear isend message handles */
1217   /* This changed for clarity though it could be the same */
1218 
1219   /* Should I check the return value of MPI_Wait() or status? */
1220   /* Can this loop be replaced by a call to MPI_Waitall()? */
1221   while (*msg_nodes++) {
1222     PetscCallMPI(MPI_Wait(ids_out, &status));
1223     ids_out++;
1224   }
1225   PetscFunctionReturn(PETSC_SUCCESS);
1226 }
1227 
1228 /******************************************************************************/
1229 static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
1230 {
1231   PetscInt     size, *in, *out;
1232   PetscScalar *buf, *work;
1233   PetscInt     op[] = {GL_ADD, 0};
1234   PetscBLASInt i1   = 1;
1235   PetscBLASInt dstep;
1236 
1237   PetscFunctionBegin;
1238   /* copy over to local variables */
1239   in   = gs->tree_map_in;
1240   out  = gs->tree_map_out;
1241   buf  = gs->tree_buf;
1242   work = gs->tree_work;
1243   size = gs->tree_nel * step;
1244 
1245   /* zero out collection buffer */
1246   PetscCall(PCTFS_rvec_zero(buf, size));
1247 
1248   /* copy over my contributions */
1249   while (*in >= 0) {
1250     PetscCall(PetscBLASIntCast(step, &dstep));
1251     PetscCallBLAS("BLAScopy", BLAScopy_(&dstep, vals + *in++ * step, &i1, buf + *out++ * step, &i1));
1252   }
1253 
1254   /* perform fan in/out on full buffer */
1255   /* must change PCTFS_grop to handle the blas */
1256   PetscCall(PCTFS_grop(buf, work, size, op));
1257 
1258   /* reset */
1259   in  = gs->tree_map_in;
1260   out = gs->tree_map_out;
1261 
1262   /* get the portion of the results I need */
1263   while (*in >= 0) {
1264     PetscCall(PetscBLASIntCast(step, &dstep));
1265     PetscCallBLAS("BLAScopy", BLAScopy_(&dstep, buf + *out++ * step, &i1, vals + *in++ * step, &i1));
1266   }
1267   PetscFunctionReturn(PETSC_SUCCESS);
1268 }
1269 
1270 /******************************************************************************/
1271 PetscErrorCode PCTFS_gs_gop_hc(PCTFS_gs_id *gs, PetscScalar *vals, const char *op, PetscInt dim)
1272 {
1273   PetscFunctionBegin;
1274   switch (*op) {
1275   case '+':
1276     PetscCall(PCTFS_gs_gop_plus_hc(gs, vals, dim));
1277     break;
1278   default:
1279     PetscCall(PetscInfo(0, "PCTFS_gs_gop_hc() :: %c is not a valid op\n", op[0]));
1280     PetscCall(PetscInfo(0, "PCTFS_gs_gop_hc() :: default :: plus\n"));
1281     PetscCall(PCTFS_gs_gop_plus_hc(gs, vals, dim));
1282     break;
1283   }
1284   PetscFunctionReturn(PETSC_SUCCESS);
1285 }
1286 
1287 /******************************************************************************/
1288 static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim)
1289 {
1290   PetscFunctionBegin;
1291   /* if there's nothing to do return */
1292   if (dim <= 0) PetscFunctionReturn(PETSC_SUCCESS);
1293 
1294   /* can't do more dimensions then exist */
1295   dim = PetscMin(dim, PCTFS_i_log2_num_nodes);
1296 
1297   /* local only operations!!! */
1298   if (gs->num_local) PetscCall(PCTFS_gs_gop_local_plus(gs, vals));
1299 
1300   /* if intersection tree/pairwise and local isn't empty */
1301   if (gs->num_local_gop) {
1302     PetscCall(PCTFS_gs_gop_local_in_plus(gs, vals));
1303 
1304     /* pairwise will do tree inside ... */
1305     if (gs->num_pairs) PetscCall(PCTFS_gs_gop_pairwise_plus_hc(gs, vals, dim)); /* tree only */
1306     else if (gs->max_left_over) PetscCall(PCTFS_gs_gop_tree_plus_hc(gs, vals, dim));
1307 
1308     PetscCall(PCTFS_gs_gop_local_out(gs, vals));
1309   } else { /* if intersection tree/pairwise and local is empty */
1310     /* pairwise will do tree inside */
1311     if (gs->num_pairs) PetscCall(PCTFS_gs_gop_pairwise_plus_hc(gs, vals, dim)); /* tree */
1312     else if (gs->max_left_over) PetscCall(PCTFS_gs_gop_tree_plus_hc(gs, vals, dim));
1313   }
1314   PetscFunctionReturn(PETSC_SUCCESS);
1315 }
1316 
1317 /******************************************************************************/
1318 static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim)
1319 {
1320   PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2;
1321   PetscInt    *iptr, *msg_list, *msg_size, **msg_nodes;
1322   PetscInt    *pw, *list, *size, **nodes;
1323   MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
1324   MPI_Status   status;
1325   PetscInt     i, mask = 1;
1326 
1327   PetscFunctionBegin;
1328   for (i = 1; i < dim; i++) {
1329     mask <<= 1;
1330     mask++;
1331   }
1332 
1333   /* strip and load s */
1334   msg_list = list = gs->pair_list;
1335   msg_size = size = gs->msg_sizes;
1336   msg_nodes = nodes = gs->node_list;
1337   iptr = pw = gs->pw_elm_list;
1338   dptr1 = dptr3 = gs->pw_vals;
1339   msg_ids_in = ids_in = gs->msg_ids_in;
1340   msg_ids_out = ids_out = gs->msg_ids_out;
1341   dptr2                 = gs->out;
1342   in1 = in2 = gs->in;
1343 
1344   /* post the receives */
1345   /*  msg_nodes=nodes; */
1346   do {
1347     /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
1348         second one *list and do list++ afterwards */
1349     if ((PCTFS_my_id | mask) == (*list | mask)) {
1350       PetscCallMPI(MPIU_Irecv(in1, *size, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in));
1351       list++;
1352       msg_ids_in++;
1353       in1 += *size++;
1354     } else {
1355       list++;
1356       size++;
1357     }
1358   } while (*++msg_nodes);
1359 
1360   /* load gs values into in out gs buffers */
1361   while (*iptr >= 0) *dptr3++ = *(in_vals + *iptr++);
1362 
1363   /* load out buffers and post the sends */
1364   msg_nodes = nodes;
1365   list      = msg_list;
1366   while ((iptr = *msg_nodes++)) {
1367     if ((PCTFS_my_id | mask) == (*list | mask)) {
1368       dptr3 = dptr2;
1369       while (*iptr >= 0) *dptr2++ = *(dptr1 + *iptr++);
1370       /* CHECK PERSISTENT COMMS MODE FOR ALL THIS STUFF */
1371       /* is msg_ids_out++ correct? */
1372       PetscCallMPI(MPIU_Isend(dptr3, *msg_size, MPIU_SCALAR, *list, MSGTAG1 + PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out));
1373       msg_size++;
1374       list++;
1375       msg_ids_out++;
1376     } else {
1377       list++;
1378       msg_size++;
1379     }
1380   }
1381 
1382   /* do the tree while we're waiting */
1383   if (gs->max_left_over) PetscCall(PCTFS_gs_gop_tree_plus_hc(gs, in_vals, dim));
1384 
1385   /* process the received data */
1386   msg_nodes = nodes;
1387   list      = msg_list;
1388   while ((iptr = *nodes++)) {
1389     if ((PCTFS_my_id | mask) == (*list | mask)) {
1390       /* Should I check the return value of MPI_Wait() or status? */
1391       /* Can this loop be replaced by a call to MPI_Waitall()? */
1392       PetscCallMPI(MPI_Wait(ids_in, &status));
1393       ids_in++;
1394       while (*iptr >= 0) *(dptr1 + *iptr++) += *in2++;
1395     }
1396     list++;
1397   }
1398 
1399   /* replace vals */
1400   while (*pw >= 0) *(in_vals + *pw++) = *dptr1++;
1401 
1402   /* clear isend message handles */
1403   /* This changed for clarity though it could be the same */
1404   while (*msg_nodes++) {
1405     if ((PCTFS_my_id | mask) == (*msg_list | mask)) {
1406       /* Should I check the return value of MPI_Wait() or status? */
1407       /* Can this loop be replaced by a call to MPI_Waitall()? */
1408       PetscCallMPI(MPI_Wait(ids_out, &status));
1409       ids_out++;
1410     }
1411     msg_list++;
1412   }
1413   PetscFunctionReturn(PETSC_SUCCESS);
1414 }
1415 
1416 /******************************************************************************/
1417 static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim)
1418 {
1419   PetscInt     size;
1420   PetscInt    *in, *out;
1421   PetscScalar *buf, *work;
1422   PetscInt     op[] = {GL_ADD, 0};
1423 
1424   PetscFunctionBegin;
1425   in   = gs->tree_map_in;
1426   out  = gs->tree_map_out;
1427   buf  = gs->tree_buf;
1428   work = gs->tree_work;
1429   size = gs->tree_nel;
1430 
1431   PetscCall(PCTFS_rvec_zero(buf, size));
1432 
1433   while (*in >= 0) *(buf + *out++) = *(vals + *in++);
1434 
1435   in  = gs->tree_map_in;
1436   out = gs->tree_map_out;
1437 
1438   PetscCall(PCTFS_grop_hc(buf, work, size, op, dim));
1439 
1440   while (*in >= 0) *(vals + *in++) = *(buf + *out++);
1441   PetscFunctionReturn(PETSC_SUCCESS);
1442 }
1443