xref: /petsc/src/ksp/pc/impls/tfs/gs.c (revision bcee047adeeb73090d7e36cc71e39fc287cdbb97)
1 
2 /***********************************gs.c***************************************
3 
4 Author: Henry M. Tufo III
5 
6 e-mail: hmt@cs.brown.edu
7 
8 snail-mail:
9 Division of Applied Mathematics
10 Brown University
11 Providence, RI 02912
12 
13 Last Modification:
14 6.21.97
15 ************************************gs.c**************************************/
16 
17 /***********************************gs.c***************************************
18 File Description:
19 -----------------
20 
21 ************************************gs.c**************************************/
22 
23 #include <../src/ksp/pc/impls/tfs/tfs.h>
24 
25 /* default length of number of items via tree - doubles if exceeded */
26 #define TREE_BUF_SZ 2048;
27 #define GS_VEC_SZ   1
28 
29 /***********************************gs.c***************************************
30 Type: struct gather_scatter_id
31 ------------------------------
32 
33 ************************************gs.c**************************************/
34 typedef struct gather_scatter_id {
35   PetscInt     id;
36   PetscInt     nel_min;
37   PetscInt     nel_max;
38   PetscInt     nel_sum;
39   PetscInt     negl;
40   PetscInt     gl_max;
41   PetscInt     gl_min;
42   PetscInt     repeats;
43   PetscInt     ordered;
44   PetscInt     positive;
45   PetscScalar *vals;
46 
47   /* bit mask info */
48   PetscInt *my_proc_mask;
49   PetscInt  mask_sz;
50   PetscInt *ngh_buf;
51   PetscInt  ngh_buf_sz;
52   PetscInt *nghs;
53   PetscInt  num_nghs;
54   PetscInt  max_nghs;
55   PetscInt *pw_nghs;
56   PetscInt  num_pw_nghs;
57   PetscInt *tree_nghs;
58   PetscInt  num_tree_nghs;
59 
60   PetscInt num_loads;
61 
62   /* repeats == true -> local info */
63   PetscInt  nel;  /* number of unique elememts */
64   PetscInt *elms; /* of size nel */
65   PetscInt  nel_total;
66   PetscInt *local_elms; /* of size nel_total */
67   PetscInt *companion;  /* of size nel_total */
68 
69   /* local info */
70   PetscInt   num_local_total;
71   PetscInt   local_strength;
72   PetscInt   num_local;
73   PetscInt  *num_local_reduce;
74   PetscInt **local_reduce;
75   PetscInt   num_local_gop;
76   PetscInt  *num_gop_local_reduce;
77   PetscInt **gop_local_reduce;
78 
79   /* pairwise info */
80   PetscInt     level;
81   PetscInt     num_pairs;
82   PetscInt     max_pairs;
83   PetscInt     loc_node_pairs;
84   PetscInt     max_node_pairs;
85   PetscInt     min_node_pairs;
86   PetscInt     avg_node_pairs;
87   PetscInt    *pair_list;
88   PetscInt    *msg_sizes;
89   PetscInt   **node_list;
90   PetscInt     len_pw_list;
91   PetscInt    *pw_elm_list;
92   PetscScalar *pw_vals;
93 
94   MPI_Request *msg_ids_in;
95   MPI_Request *msg_ids_out;
96 
97   PetscScalar *out;
98   PetscScalar *in;
99   PetscInt     msg_total;
100 
101   /* tree - crystal accumulator info */
102   PetscInt   max_left_over;
103   PetscInt  *pre;
104   PetscInt  *in_num;
105   PetscInt  *out_num;
106   PetscInt **in_list;
107   PetscInt **out_list;
108 
109   /* new tree work*/
110   PetscInt     tree_nel;
111   PetscInt    *tree_elms;
112   PetscScalar *tree_buf;
113   PetscScalar *tree_work;
114 
115   PetscInt  tree_map_sz;
116   PetscInt *tree_map_in;
117   PetscInt *tree_map_out;
118 
119   /* current memory status */
120   PetscInt gl_bss_min;
121   PetscInt gl_perm_min;
122 
123   /* max segment size for PCTFS_gs_gop_vec() */
124   PetscInt vec_sz;
125 
126   /* hack to make paul happy */
127   MPI_Comm PCTFS_gs_comm;
128 
129 } PCTFS_gs_id;
130 
131 static PCTFS_gs_id   *gsi_check_args(PetscInt *elms, PetscInt nel, PetscInt level);
132 static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs);
133 static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs);
134 static PetscErrorCode set_pairwise(PCTFS_gs_id *gs);
135 static PCTFS_gs_id   *gsi_new(void);
136 static PetscErrorCode set_tree(PCTFS_gs_id *gs);
137 
138 /* same for all but vector flavor */
139 static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs, PetscScalar *vals);
140 /* vector flavor */
141 static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
142 
143 static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step);
144 static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step);
145 static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
146 static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
147 static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
148 
149 static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs, PetscScalar *vals);
150 static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals);
151 
152 static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim);
153 static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim);
154 static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim);
155 
156 /* global vars */
157 /* from comm.c module */
158 
159 static PetscInt num_gs_ids = 0;
160 
161 /* should make this dynamic ... later */
162 static PetscInt  msg_buf     = MAX_MSG_BUF;
163 static PetscInt  vec_sz      = GS_VEC_SZ;
164 static PetscInt *tree_buf    = NULL;
165 static PetscInt  tree_buf_sz = 0;
166 static PetscInt  ntree       = 0;
167 
168 /***************************************************************************/
169 PetscErrorCode PCTFS_gs_init_vec_sz(PetscInt size)
170 {
171   PetscFunctionBegin;
172   vec_sz = size;
173   PetscFunctionReturn(PETSC_SUCCESS);
174 }
175 
176 /******************************************************************************/
177 PetscErrorCode PCTFS_gs_init_msg_buf_sz(PetscInt buf_size)
178 {
179   PetscFunctionBegin;
180   msg_buf = buf_size;
181   PetscFunctionReturn(PETSC_SUCCESS);
182 }
183 
184 /******************************************************************************/
185 PCTFS_gs_id *PCTFS_gs_init(PetscInt *elms, PetscInt nel, PetscInt level)
186 {
187   PCTFS_gs_id *gs;
188   MPI_Group    PCTFS_gs_group;
189   MPI_Comm     PCTFS_gs_comm;
190 
191   /* ensure that communication package has been initialized */
192   PetscCallAbort(PETSC_COMM_SELF, PCTFS_comm_init());
193 
194   /* determines if we have enough dynamic/semi-static memory */
195   /* checks input, allocs and sets gd_id template            */
196   gs = gsi_check_args(elms, nel, level);
197 
198   /* only bit mask version up and working for the moment    */
199   /* LATER :: get int list version working for sparse pblms */
200   PetscCallAbort(PETSC_COMM_WORLD, gsi_via_bit_mask(gs));
201 
202   PetscCallAbort(PETSC_COMM_WORLD, MPI_Comm_group(MPI_COMM_WORLD, &PCTFS_gs_group) ? PETSC_ERR_MPI : PETSC_SUCCESS);
203   PetscCallAbort(PETSC_COMM_WORLD, MPI_Comm_create(MPI_COMM_WORLD, PCTFS_gs_group, &PCTFS_gs_comm) ? PETSC_ERR_MPI : PETSC_SUCCESS);
204   PetscCallAbort(PETSC_COMM_WORLD, MPI_Group_free(&PCTFS_gs_group) ? PETSC_ERR_MPI : PETSC_SUCCESS);
205 
206   gs->PCTFS_gs_comm = PCTFS_gs_comm;
207 
208   return (gs);
209 }
210 
211 /******************************************************************************/
212 static PCTFS_gs_id *gsi_new(void)
213 {
214   PCTFS_gs_id *gs;
215   gs = (PCTFS_gs_id *)malloc(sizeof(PCTFS_gs_id));
216   PetscCallAbort(PETSC_COMM_WORLD, PetscMemzero(gs, sizeof(PCTFS_gs_id)));
217   return (gs);
218 }
219 
220 /******************************************************************************/
221 static PCTFS_gs_id *gsi_check_args(PetscInt *in_elms, PetscInt nel, PetscInt level)
222 {
223   PetscInt     i, j, k, t2;
224   PetscInt    *companion, *elms, *unique, *iptr;
225   PetscInt     num_local = 0, *num_to_reduce, **local_reduce;
226   PetscInt     oprs[]    = {NON_UNIFORM, GL_MIN, GL_MAX, GL_ADD, GL_MIN, GL_MAX, GL_MIN, GL_B_AND};
227   PetscInt     vals[PETSC_STATIC_ARRAY_LENGTH(oprs) - 1];
228   PetscInt     work[PETSC_STATIC_ARRAY_LENGTH(oprs) - 1];
229   PCTFS_gs_id *gs;
230 
231   if (!in_elms) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "elms point to nothing!!!\n");
232   if (nel < 0) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "can't have fewer than 0 elms!!!\n");
233 
234   if (nel == 0) PetscCallAbort(PETSC_COMM_WORLD, PetscInfo(0, "I don't have any elements!!!\n"));
235 
236   /* get space for gs template */
237   gs     = gsi_new();
238   gs->id = ++num_gs_ids;
239 
240   /* hmt 6.4.99                                            */
241   /* caller can set global ids that don't participate to 0 */
242   /* PCTFS_gs_init ignores all zeros in elm list                 */
243   /* negative global ids are still invalid                 */
244   for (i = j = 0; i < nel; i++) {
245     if (in_elms[i] != 0) j++;
246   }
247 
248   k   = nel;
249   nel = j;
250 
251   /* copy over in_elms list and create inverse map */
252   elms      = (PetscInt *)malloc((nel + 1) * sizeof(PetscInt));
253   companion = (PetscInt *)malloc(nel * sizeof(PetscInt));
254 
255   for (i = j = 0; i < k; i++) {
256     if (in_elms[i] != 0) {
257       elms[j]        = in_elms[i];
258       companion[j++] = i;
259     }
260   }
261 
262   if (j != nel) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "nel j mismatch!\n");
263 
264   /* pre-pass ... check to see if sorted */
265   elms[nel] = INT_MAX;
266   iptr      = elms;
267   unique    = elms + 1;
268   j         = 0;
269   while (*iptr != INT_MAX) {
270     if (*iptr++ > *unique++) {
271       j = 1;
272       break;
273     }
274   }
275 
276   /* set up inverse map */
277   if (j) {
278     PetscCallAbort(PETSC_COMM_WORLD, PetscInfo(0, "gsi_check_args() :: elm list *not* sorted!\n"));
279     PetscCallAbort(PETSC_COMM_WORLD, PCTFS_SMI_sort((void *)elms, (void *)companion, nel, SORT_INTEGER));
280   } else PetscCallAbort(PETSC_COMM_WORLD, PetscInfo(0, "gsi_check_args() :: elm list sorted!\n"));
281   elms[nel] = INT_MIN;
282 
283   /* first pass */
284   /* determine number of unique elements, check pd */
285   for (i = k = 0; i < nel; i += j) {
286     t2 = elms[i];
287     j  = ++i;
288 
289     /* clump 'em for now */
290     while (elms[j] == t2) j++;
291 
292     /* how many together and num local */
293     if (j -= i) {
294       num_local++;
295       k += j;
296     }
297   }
298 
299   /* how many unique elements? */
300   gs->repeats = k;
301   gs->nel     = nel - k;
302 
303   /* number of repeats? */
304   gs->num_local = num_local;
305   num_local += 2;
306   gs->local_reduce = local_reduce = (PetscInt **)malloc(num_local * sizeof(PetscInt *));
307   gs->num_local_reduce = num_to_reduce = (PetscInt *)malloc(num_local * sizeof(PetscInt));
308 
309   unique         = (PetscInt *)malloc((gs->nel + 1) * sizeof(PetscInt));
310   gs->elms       = unique;
311   gs->nel_total  = nel;
312   gs->local_elms = elms;
313   gs->companion  = companion;
314 
315   /* compess map as well as keep track of local ops */
316   for (num_local = i = j = 0; i < gs->nel; i++) {
317     k  = j;
318     t2 = unique[i] = elms[j];
319     companion[i]   = companion[j];
320 
321     while (elms[j] == t2) j++;
322 
323     if ((t2 = (j - k)) > 1) {
324       /* number together */
325       num_to_reduce[num_local] = t2++;
326 
327       iptr = local_reduce[num_local++] = (PetscInt *)malloc(t2 * sizeof(PetscInt));
328 
329       /* to use binary searching don't remap until we check intersection */
330       *iptr++ = i;
331 
332       /* note that we're skipping the first one */
333       while (++k < j) *(iptr++) = companion[k];
334       *iptr = -1;
335     }
336   }
337 
338   /* sentinel for ngh_buf */
339   unique[gs->nel] = INT_MAX;
340 
341   /* for two partition sort hack */
342   num_to_reduce[num_local]   = 0;
343   local_reduce[num_local]    = NULL;
344   num_to_reduce[++num_local] = 0;
345   local_reduce[num_local]    = NULL;
346 
347   /* load 'em up */
348   /* note one extra to hold NON_UNIFORM flag!!! */
349   vals[2] = vals[1] = vals[0] = nel;
350   if (gs->nel > 0) {
351     vals[3] = unique[0];
352     vals[4] = unique[gs->nel - 1];
353   } else {
354     vals[3] = INT_MAX;
355     vals[4] = INT_MIN;
356   }
357   vals[5] = level;
358   vals[6] = num_gs_ids;
359 
360   /* GLOBAL: send 'em out */
361   PetscCallAbort(PETSC_COMM_WORLD, PCTFS_giop(vals, work, PETSC_STATIC_ARRAY_LENGTH(oprs) - 1, oprs));
362 
363   /* must be semi-pos def - only pairwise depends on this */
364   /* LATER - remove this restriction */
365   if (vals[3] < 0) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "gsi_check_args() :: system not semi-pos def \n");
366   if (vals[4] == INT_MAX) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "gsi_check_args() :: system ub too large !\n");
367 
368   gs->nel_min = vals[0];
369   gs->nel_max = vals[1];
370   gs->nel_sum = vals[2];
371   gs->gl_min  = vals[3];
372   gs->gl_max  = vals[4];
373   gs->negl    = vals[4] - vals[3] + 1;
374 
375   if (gs->negl <= 0) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "gsi_check_args() :: system empty or neg :: %" PetscInt_FMT "\n", gs->negl);
376 
377   /* LATER :: add level == -1 -> program selects level */
378   if (vals[5] < 0) vals[5] = 0;
379   else if (vals[5] > PCTFS_num_nodes) vals[5] = PCTFS_num_nodes;
380   gs->level = vals[5];
381 
382   return (gs);
383 }
384 
385 /******************************************************************************/
386 static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs)
387 {
388   PetscInt   i, nel, *elms;
389   PetscInt   t1;
390   PetscInt **reduce;
391   PetscInt  *map;
392 
393   PetscFunctionBegin;
394   /* totally local removes ... PCTFS_ct_bits == 0 */
395   PetscCall(get_ngh_buf(gs));
396 
397   if (gs->level) PetscCall(set_pairwise(gs));
398   if (gs->max_left_over) PetscCall(set_tree(gs));
399 
400   /* intersection local and pairwise/tree? */
401   gs->num_local_total      = gs->num_local;
402   gs->gop_local_reduce     = gs->local_reduce;
403   gs->num_gop_local_reduce = gs->num_local_reduce;
404 
405   map = gs->companion;
406 
407   /* is there any local compression */
408   if (!gs->num_local) {
409     gs->local_strength = NONE;
410     gs->num_local_gop  = 0;
411   } else {
412     /* ok find intersection */
413     map    = gs->companion;
414     reduce = gs->local_reduce;
415     for (i = 0, t1 = 0; i < gs->num_local; i++, reduce++) {
416       if ((PCTFS_ivec_binary_search(**reduce, gs->pw_elm_list, gs->len_pw_list) >= 0) || PCTFS_ivec_binary_search(**reduce, gs->tree_map_in, gs->tree_map_sz) >= 0) {
417         t1++;
418         PetscCheck(gs->num_local_reduce[i] > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "nobody in list?");
419         gs->num_local_reduce[i] *= -1;
420       }
421       **reduce = map[**reduce];
422     }
423 
424     /* intersection is empty */
425     if (!t1) {
426       gs->local_strength = FULL;
427       gs->num_local_gop  = 0;
428     } else { /* intersection not empty */
429       gs->local_strength = PARTIAL;
430 
431       PetscCall(PCTFS_SMI_sort((void *)gs->num_local_reduce, (void *)gs->local_reduce, gs->num_local + 1, SORT_INT_PTR));
432 
433       gs->num_local_gop   = t1;
434       gs->num_local_total = gs->num_local;
435       gs->num_local -= t1;
436       gs->gop_local_reduce     = gs->local_reduce;
437       gs->num_gop_local_reduce = gs->num_local_reduce;
438 
439       for (i = 0; i < t1; i++) {
440         PetscCheck(gs->num_gop_local_reduce[i] < 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "they aren't negative?");
441         gs->num_gop_local_reduce[i] *= -1;
442         gs->local_reduce++;
443         gs->num_local_reduce++;
444       }
445       gs->local_reduce++;
446       gs->num_local_reduce++;
447     }
448   }
449 
450   elms = gs->pw_elm_list;
451   nel  = gs->len_pw_list;
452   for (i = 0; i < nel; i++) elms[i] = map[elms[i]];
453 
454   elms = gs->tree_map_in;
455   nel  = gs->tree_map_sz;
456   for (i = 0; i < nel; i++) elms[i] = map[elms[i]];
457 
458   /* clean up */
459   free((void *)gs->local_elms);
460   free((void *)gs->companion);
461   free((void *)gs->elms);
462   free((void *)gs->ngh_buf);
463   gs->local_elms = gs->companion = gs->elms = gs->ngh_buf = NULL;
464   PetscFunctionReturn(PETSC_SUCCESS);
465 }
466 
467 /******************************************************************************/
468 static PetscErrorCode place_in_tree(PetscInt elm)
469 {
470   PetscInt *tp, n;
471 
472   PetscFunctionBegin;
473   if (ntree == tree_buf_sz) {
474     if (tree_buf_sz) {
475       tp = tree_buf;
476       n  = tree_buf_sz;
477       tree_buf_sz <<= 1;
478       tree_buf = (PetscInt *)malloc(tree_buf_sz * sizeof(PetscInt));
479       PCTFS_ivec_copy(tree_buf, tp, n);
480       free(tp);
481     } else {
482       tree_buf_sz = TREE_BUF_SZ;
483       tree_buf    = (PetscInt *)malloc(tree_buf_sz * sizeof(PetscInt));
484     }
485   }
486 
487   tree_buf[ntree++] = elm;
488   PetscFunctionReturn(PETSC_SUCCESS);
489 }
490 
491 /******************************************************************************/
492 static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs)
493 {
494   PetscInt  i, j, npw = 0, ntree_map = 0;
495   PetscInt  p_mask_size, ngh_buf_size, buf_size;
496   PetscInt *p_mask, *sh_proc_mask, *pw_sh_proc_mask;
497   PetscInt *ngh_buf, *buf1, *buf2;
498   PetscInt  offset, per_load, num_loads, or_ct, start, end;
499   PetscInt *ptr1, *ptr2, i_start, negl, nel, *elms;
500   PetscInt  oper = GL_B_OR;
501   PetscInt *ptr3, *t_mask, level, ct1, ct2;
502 
503   PetscFunctionBegin;
504   /* to make life easier */
505   nel   = gs->nel;
506   elms  = gs->elms;
507   level = gs->level;
508 
509   /* det #bytes needed for processor bit masks and init w/mask cor. to PCTFS_my_id */
510   p_mask = (PetscInt *)malloc(p_mask_size = PCTFS_len_bit_mask(PCTFS_num_nodes));
511   PetscCall(PCTFS_set_bit_mask(p_mask, p_mask_size, PCTFS_my_id));
512 
513   /* allocate space for masks and info bufs */
514   gs->nghs = sh_proc_mask = (PetscInt *)malloc(p_mask_size);
515   gs->pw_nghs = pw_sh_proc_mask = (PetscInt *)malloc(p_mask_size);
516   gs->ngh_buf_sz = ngh_buf_size = p_mask_size * nel;
517   t_mask                        = (PetscInt *)malloc(p_mask_size);
518   gs->ngh_buf = ngh_buf = (PetscInt *)malloc(ngh_buf_size);
519 
520   /* comm buffer size ... memory usage bounded by ~2*msg_buf */
521   /* had thought I could exploit rendezvous threshold */
522 
523   /* default is one pass */
524   per_load = negl = gs->negl;
525   gs->num_loads = num_loads = 1;
526   i                         = p_mask_size * negl;
527 
528   /* possible overflow on buffer size */
529   /* overflow hack                    */
530   if (i < 0) i = INT_MAX;
531 
532   buf_size = PetscMin(msg_buf, i);
533 
534   /* can we do it? */
535   PetscCheck(p_mask_size <= buf_size, PETSC_COMM_SELF, PETSC_ERR_PLIB, "get_ngh_buf() :: buf<pms :: %" PetscInt_FMT ">%" PetscInt_FMT, p_mask_size, buf_size);
536 
537   /* get PCTFS_giop buf space ... make *only* one malloc */
538   buf1 = (PetscInt *)malloc(buf_size << 1);
539 
540   /* more than one gior exchange needed? */
541   if (buf_size != i) {
542     per_load      = buf_size / p_mask_size;
543     buf_size      = per_load * p_mask_size;
544     gs->num_loads = num_loads = negl / per_load + (negl % per_load > 0);
545   }
546 
547   /* convert buf sizes from #bytes to #ints - 32-bit only! */
548   p_mask_size /= sizeof(PetscInt);
549   ngh_buf_size /= sizeof(PetscInt);
550   buf_size /= sizeof(PetscInt);
551 
552   /* find PCTFS_giop work space */
553   buf2 = buf1 + buf_size;
554 
555   /* hold #ints needed for processor masks */
556   gs->mask_sz = p_mask_size;
557 
558   /* init buffers */
559   PetscCall(PCTFS_ivec_zero(sh_proc_mask, p_mask_size));
560   PetscCall(PCTFS_ivec_zero(pw_sh_proc_mask, p_mask_size));
561   PetscCall(PCTFS_ivec_zero(ngh_buf, ngh_buf_size));
562 
563   /* HACK reset tree info */
564   tree_buf    = NULL;
565   tree_buf_sz = ntree = 0;
566 
567   /* ok do it */
568   for (ptr1 = ngh_buf, ptr2 = elms, end = gs->gl_min, or_ct = i = 0; or_ct < num_loads; or_ct++) {
569     /* identity for bitwise or is 000...000 */
570     PetscCall(PCTFS_ivec_zero(buf1, buf_size));
571 
572     /* load msg buffer */
573     for (start = end, end += per_load, i_start = i; (offset = *ptr2) < end; i++, ptr2++) {
574       offset = (offset - start) * p_mask_size;
575       PCTFS_ivec_copy(buf1 + offset, p_mask, p_mask_size);
576     }
577 
578     /* GLOBAL: pass buffer */
579     PetscCall(PCTFS_giop(buf1, buf2, buf_size, &oper));
580 
581     /* unload buffer into ngh_buf */
582     ptr2 = (elms + i_start);
583     for (ptr3 = buf1, j = start; j < end; ptr3 += p_mask_size, j++) {
584       /* I own it ... may have to pairwise it */
585       if (j == *ptr2) {
586         /* do i share it w/anyone? */
587         ct1 = PCTFS_ct_bits((char *)ptr3, p_mask_size * sizeof(PetscInt));
588         /* guess not */
589         if (ct1 < 2) {
590           ptr2++;
591           ptr1 += p_mask_size;
592           continue;
593         }
594 
595         /* i do ... so keep info and turn off my bit */
596         PCTFS_ivec_copy(ptr1, ptr3, p_mask_size);
597         PetscCall(PCTFS_ivec_xor(ptr1, p_mask, p_mask_size));
598         PetscCall(PCTFS_ivec_or(sh_proc_mask, ptr1, p_mask_size));
599 
600         /* is it to be done pairwise? */
601         if (--ct1 <= level) {
602           npw++;
603 
604           /* turn on high bit to indicate pw need to process */
605           *ptr2++ |= TOP_BIT;
606           PetscCall(PCTFS_ivec_or(pw_sh_proc_mask, ptr1, p_mask_size));
607           ptr1 += p_mask_size;
608           continue;
609         }
610 
611         /* get set for next and note that I have a tree contribution */
612         /* could save exact elm index for tree here -> save a search */
613         ptr2++;
614         ptr1 += p_mask_size;
615         ntree_map++;
616       } else { /* i don't but still might be involved in tree */
617 
618         /* shared by how many? */
619         ct1 = PCTFS_ct_bits((char *)ptr3, p_mask_size * sizeof(PetscInt));
620 
621         /* none! */
622         if (ct1 < 2) continue;
623 
624         /* is it going to be done pairwise? but not by me of course!*/
625         if (--ct1 <= level) continue;
626       }
627       /* LATER we're going to have to process it NOW */
628       /* nope ... tree it */
629       PetscCall(place_in_tree(j));
630     }
631   }
632 
633   free((void *)t_mask);
634   free((void *)buf1);
635 
636   gs->len_pw_list = npw;
637   gs->num_nghs    = PCTFS_ct_bits((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt));
638 
639   /* expand from bit mask list to int list and save ngh list */
640   gs->nghs = (PetscInt *)malloc(gs->num_nghs * sizeof(PetscInt));
641   PetscCall(PCTFS_bm_to_proc((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt), gs->nghs));
642 
643   gs->num_pw_nghs = PCTFS_ct_bits((char *)pw_sh_proc_mask, p_mask_size * sizeof(PetscInt));
644 
645   oper = GL_MAX;
646   ct1  = gs->num_nghs;
647   PetscCall(PCTFS_giop(&ct1, &ct2, 1, &oper));
648   gs->max_nghs = ct1;
649 
650   gs->tree_map_sz   = ntree_map;
651   gs->max_left_over = ntree;
652 
653   free((void *)p_mask);
654   free((void *)sh_proc_mask);
655   PetscFunctionReturn(PETSC_SUCCESS);
656 }
657 
658 /******************************************************************************/
659 static PetscErrorCode set_pairwise(PCTFS_gs_id *gs)
660 {
661   PetscInt  i, j;
662   PetscInt  p_mask_size;
663   PetscInt *p_mask, *sh_proc_mask, *tmp_proc_mask;
664   PetscInt *ngh_buf, *buf2;
665   PetscInt  offset;
666   PetscInt *msg_list, *msg_size, **msg_nodes, nprs;
667   PetscInt *pairwise_elm_list, len_pair_list = 0;
668   PetscInt *iptr, t1, i_start, nel, *elms;
669   PetscInt  ct;
670 
671   PetscFunctionBegin;
672   /* to make life easier */
673   nel          = gs->nel;
674   elms         = gs->elms;
675   ngh_buf      = gs->ngh_buf;
676   sh_proc_mask = gs->pw_nghs;
677 
678   /* need a few temp masks */
679   p_mask_size   = PCTFS_len_bit_mask(PCTFS_num_nodes);
680   p_mask        = (PetscInt *)malloc(p_mask_size);
681   tmp_proc_mask = (PetscInt *)malloc(p_mask_size);
682 
683   /* set mask to my PCTFS_my_id's bit mask */
684   PetscCall(PCTFS_set_bit_mask(p_mask, p_mask_size, PCTFS_my_id));
685 
686   p_mask_size /= sizeof(PetscInt);
687 
688   len_pair_list   = gs->len_pw_list;
689   gs->pw_elm_list = pairwise_elm_list = (PetscInt *)malloc((len_pair_list + 1) * sizeof(PetscInt));
690 
691   /* how many processors (nghs) do we have to exchange with? */
692   nprs = gs->num_pairs = PCTFS_ct_bits((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt));
693 
694   /* allocate space for PCTFS_gs_gop() info */
695   gs->pair_list = msg_list = (PetscInt *)malloc(sizeof(PetscInt) * nprs);
696   gs->msg_sizes = msg_size = (PetscInt *)malloc(sizeof(PetscInt) * nprs);
697   gs->node_list = msg_nodes = (PetscInt **)malloc(sizeof(PetscInt *) * (nprs + 1));
698 
699   /* init msg_size list */
700   PetscCall(PCTFS_ivec_zero(msg_size, nprs));
701 
702   /* expand from bit mask list to int list */
703   PetscCall(PCTFS_bm_to_proc((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt), msg_list));
704 
705   /* keep list of elements being handled pairwise */
706   for (i = j = 0; i < nel; i++) {
707     if (elms[i] & TOP_BIT) {
708       elms[i] ^= TOP_BIT;
709       pairwise_elm_list[j++] = i;
710     }
711   }
712   pairwise_elm_list[j] = -1;
713 
714   gs->msg_ids_out       = (MPI_Request *)malloc(sizeof(MPI_Request) * (nprs + 1));
715   gs->msg_ids_out[nprs] = MPI_REQUEST_NULL;
716   gs->msg_ids_in        = (MPI_Request *)malloc(sizeof(MPI_Request) * (nprs + 1));
717   gs->msg_ids_in[nprs]  = MPI_REQUEST_NULL;
718   gs->pw_vals           = (PetscScalar *)malloc(sizeof(PetscScalar) * len_pair_list * vec_sz);
719 
720   /* find who goes to each processor */
721   for (i_start = i = 0; i < nprs; i++) {
722     /* processor i's mask */
723     PetscCall(PCTFS_set_bit_mask(p_mask, p_mask_size * sizeof(PetscInt), msg_list[i]));
724 
725     /* det # going to processor i */
726     for (ct = j = 0; j < len_pair_list; j++) {
727       buf2 = ngh_buf + (pairwise_elm_list[j] * p_mask_size);
728       PetscCall(PCTFS_ivec_and3(tmp_proc_mask, p_mask, buf2, p_mask_size));
729       if (PCTFS_ct_bits((char *)tmp_proc_mask, p_mask_size * sizeof(PetscInt))) ct++;
730     }
731     msg_size[i] = ct;
732     i_start     = PetscMax(i_start, ct);
733 
734     /*space to hold nodes in message to first neighbor */
735     msg_nodes[i] = iptr = (PetscInt *)malloc(sizeof(PetscInt) * (ct + 1));
736 
737     for (j = 0; j < len_pair_list; j++) {
738       buf2 = ngh_buf + (pairwise_elm_list[j] * p_mask_size);
739       PetscCall(PCTFS_ivec_and3(tmp_proc_mask, p_mask, buf2, p_mask_size));
740       if (PCTFS_ct_bits((char *)tmp_proc_mask, p_mask_size * sizeof(PetscInt))) *iptr++ = j;
741     }
742     *iptr = -1;
743   }
744   msg_nodes[nprs] = NULL;
745 
746   j = gs->loc_node_pairs = i_start;
747   t1                     = GL_MAX;
748   PetscCall(PCTFS_giop(&i_start, &offset, 1, &t1));
749   gs->max_node_pairs = i_start;
750 
751   i_start = j;
752   t1      = GL_MIN;
753   PetscCall(PCTFS_giop(&i_start, &offset, 1, &t1));
754   gs->min_node_pairs = i_start;
755 
756   i_start = j;
757   t1      = GL_ADD;
758   PetscCall(PCTFS_giop(&i_start, &offset, 1, &t1));
759   gs->avg_node_pairs = i_start / PCTFS_num_nodes + 1;
760 
761   i_start = nprs;
762   t1      = GL_MAX;
763   PetscCall(PCTFS_giop(&i_start, &offset, 1, &t1));
764   gs->max_pairs = i_start;
765 
766   /* remap pairwise in tail of gsi_via_bit_mask() */
767   gs->msg_total = PCTFS_ivec_sum(gs->msg_sizes, nprs);
768   gs->out       = (PetscScalar *)malloc(sizeof(PetscScalar) * gs->msg_total * vec_sz);
769   gs->in        = (PetscScalar *)malloc(sizeof(PetscScalar) * gs->msg_total * vec_sz);
770 
771   /* reset malloc pool */
772   free((void *)p_mask);
773   free((void *)tmp_proc_mask);
774   PetscFunctionReturn(PETSC_SUCCESS);
775 }
776 
777 /* to do pruned tree just save ngh buf copy for each one and decode here!
778 ******************************************************************************/
779 static PetscErrorCode set_tree(PCTFS_gs_id *gs)
780 {
781   PetscInt  i, j, n, nel;
782   PetscInt *iptr_in, *iptr_out, *tree_elms, *elms;
783 
784   PetscFunctionBegin;
785   /* local work ptrs */
786   elms = gs->elms;
787   nel  = gs->nel;
788 
789   /* how many via tree */
790   gs->tree_nel = n = ntree;
791   gs->tree_elms = tree_elms = iptr_in = tree_buf;
792   gs->tree_buf                        = (PetscScalar *)malloc(sizeof(PetscScalar) * n * vec_sz);
793   gs->tree_work                       = (PetscScalar *)malloc(sizeof(PetscScalar) * n * vec_sz);
794   j                                   = gs->tree_map_sz;
795   gs->tree_map_in = iptr_in = (PetscInt *)malloc(sizeof(PetscInt) * (j + 1));
796   gs->tree_map_out = iptr_out = (PetscInt *)malloc(sizeof(PetscInt) * (j + 1));
797 
798   /* search the longer of the two lists */
799   /* note ... could save this info in get_ngh_buf and save searches */
800   if (n <= nel) {
801     /* bijective fct w/remap - search elm list */
802     for (i = 0; i < n; i++) {
803       if ((j = PCTFS_ivec_binary_search(*tree_elms++, elms, nel)) >= 0) {
804         *iptr_in++  = j;
805         *iptr_out++ = i;
806       }
807     }
808   } else {
809     for (i = 0; i < nel; i++) {
810       if ((j = PCTFS_ivec_binary_search(*elms++, tree_elms, n)) >= 0) {
811         *iptr_in++  = i;
812         *iptr_out++ = j;
813       }
814     }
815   }
816 
817   /* sentinel */
818   *iptr_in = *iptr_out = -1;
819   PetscFunctionReturn(PETSC_SUCCESS);
820 }
821 
822 /******************************************************************************/
823 static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs, PetscScalar *vals)
824 {
825   PetscInt   *num, *map, **reduce;
826   PetscScalar tmp;
827 
828   PetscFunctionBegin;
829   num    = gs->num_gop_local_reduce;
830   reduce = gs->gop_local_reduce;
831   while ((map = *reduce++)) {
832     /* wall */
833     if (*num == 2) {
834       num++;
835       vals[map[1]] = vals[map[0]];
836     } else if (*num == 3) { /* corner shared by three elements */
837       num++;
838       vals[map[2]] = vals[map[1]] = vals[map[0]];
839     } else if (*num == 4) { /* corner shared by four elements */
840       num++;
841       vals[map[3]] = vals[map[2]] = vals[map[1]] = vals[map[0]];
842     } else { /* general case ... odd geoms ... 3D*/
843       num++;
844       tmp = *(vals + *map++);
845       while (*map >= 0) *(vals + *map++) = tmp;
846     }
847   }
848   PetscFunctionReturn(PETSC_SUCCESS);
849 }
850 
851 /******************************************************************************/
852 static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs, PetscScalar *vals)
853 {
854   PetscInt   *num, *map, **reduce;
855   PetscScalar tmp;
856 
857   PetscFunctionBegin;
858   num    = gs->num_local_reduce;
859   reduce = gs->local_reduce;
860   while ((map = *reduce)) {
861     /* wall */
862     if (*num == 2) {
863       num++;
864       reduce++;
865       vals[map[1]] = vals[map[0]] += vals[map[1]];
866     } else if (*num == 3) { /* corner shared by three elements */
867       num++;
868       reduce++;
869       vals[map[2]] = vals[map[1]] = vals[map[0]] += (vals[map[1]] + vals[map[2]]);
870     } else if (*num == 4) { /* corner shared by four elements */
871       num++;
872       reduce++;
873       vals[map[1]] = vals[map[2]] = vals[map[3]] = vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]);
874     } else { /* general case ... odd geoms ... 3D*/
875       num++;
876       tmp = 0.0;
877       while (*map >= 0) tmp += *(vals + *map++);
878 
879       map = *reduce++;
880       while (*map >= 0) *(vals + *map++) = tmp;
881     }
882   }
883   PetscFunctionReturn(PETSC_SUCCESS);
884 }
885 
886 /******************************************************************************/
887 static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals)
888 {
889   PetscInt    *num, *map, **reduce;
890   PetscScalar *base;
891 
892   PetscFunctionBegin;
893   num    = gs->num_gop_local_reduce;
894   reduce = gs->gop_local_reduce;
895   while ((map = *reduce++)) {
896     /* wall */
897     if (*num == 2) {
898       num++;
899       vals[map[0]] += vals[map[1]];
900     } else if (*num == 3) { /* corner shared by three elements */
901       num++;
902       vals[map[0]] += (vals[map[1]] + vals[map[2]]);
903     } else if (*num == 4) { /* corner shared by four elements */
904       num++;
905       vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]);
906     } else { /* general case ... odd geoms ... 3D*/
907       num++;
908       base = vals + *map++;
909       while (*map >= 0) *base += *(vals + *map++);
910     }
911   }
912   PetscFunctionReturn(PETSC_SUCCESS);
913 }
914 
915 /******************************************************************************/
916 PetscErrorCode PCTFS_gs_free(PCTFS_gs_id *gs)
917 {
918   PetscInt i;
919 
920   PetscFunctionBegin;
921   PetscCallMPI(MPI_Comm_free(&gs->PCTFS_gs_comm));
922   if (gs->nghs) free((void *)gs->nghs);
923   if (gs->pw_nghs) free((void *)gs->pw_nghs);
924 
925   /* tree */
926   if (gs->max_left_over) {
927     if (gs->tree_elms) free((void *)gs->tree_elms);
928     if (gs->tree_buf) free((void *)gs->tree_buf);
929     if (gs->tree_work) free((void *)gs->tree_work);
930     if (gs->tree_map_in) free((void *)gs->tree_map_in);
931     if (gs->tree_map_out) free((void *)gs->tree_map_out);
932   }
933 
934   /* pairwise info */
935   if (gs->num_pairs) {
936     /* should be NULL already */
937     if (gs->ngh_buf) free((void *)gs->ngh_buf);
938     if (gs->elms) free((void *)gs->elms);
939     if (gs->local_elms) free((void *)gs->local_elms);
940     if (gs->companion) free((void *)gs->companion);
941 
942     /* only set if pairwise */
943     if (gs->vals) free((void *)gs->vals);
944     if (gs->in) free((void *)gs->in);
945     if (gs->out) free((void *)gs->out);
946     if (gs->msg_ids_in) free((void *)gs->msg_ids_in);
947     if (gs->msg_ids_out) free((void *)gs->msg_ids_out);
948     if (gs->pw_vals) free((void *)gs->pw_vals);
949     if (gs->pw_elm_list) free((void *)gs->pw_elm_list);
950     if (gs->node_list) {
951       for (i = 0; i < gs->num_pairs; i++) {
952         if (gs->node_list[i]) free((void *)gs->node_list[i]);
953       }
954       free((void *)gs->node_list);
955     }
956     if (gs->msg_sizes) free((void *)gs->msg_sizes);
957     if (gs->pair_list) free((void *)gs->pair_list);
958   }
959 
960   /* local info */
961   if (gs->num_local_total >= 0) {
962     for (i = 0; i < gs->num_local_total + 1; i++) {
963       if (gs->num_gop_local_reduce[i]) free((void *)gs->gop_local_reduce[i]);
964     }
965   }
966 
967   /* if intersection tree/pairwise and local isn't empty */
968   if (gs->gop_local_reduce) free((void *)gs->gop_local_reduce);
969   if (gs->num_gop_local_reduce) free((void *)gs->num_gop_local_reduce);
970 
971   free((void *)gs);
972   PetscFunctionReturn(PETSC_SUCCESS);
973 }
974 
975 /******************************************************************************/
976 PetscErrorCode PCTFS_gs_gop_vec(PCTFS_gs_id *gs, PetscScalar *vals, const char *op, PetscInt step)
977 {
978   PetscFunctionBegin;
979   switch (*op) {
980   case '+':
981     PetscCall(PCTFS_gs_gop_vec_plus(gs, vals, step));
982     break;
983   default:
984     PetscCall(PetscInfo(0, "PCTFS_gs_gop_vec() :: %c is not a valid op\n", op[0]));
985     PetscCall(PetscInfo(0, "PCTFS_gs_gop_vec() :: default :: plus\n"));
986     PetscCall(PCTFS_gs_gop_vec_plus(gs, vals, step));
987     break;
988   }
989   PetscFunctionReturn(PETSC_SUCCESS);
990 }
991 
992 /******************************************************************************/
993 static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
994 {
995   PetscFunctionBegin;
996   PetscCheck(gs, PETSC_COMM_SELF, PETSC_ERR_PLIB, "PCTFS_gs_gop_vec() passed NULL gs handle!!!");
997 
998   /* local only operations!!! */
999   if (gs->num_local) PetscCall(PCTFS_gs_gop_vec_local_plus(gs, vals, step));
1000 
1001   /* if intersection tree/pairwise and local isn't empty */
1002   if (gs->num_local_gop) {
1003     PetscCall(PCTFS_gs_gop_vec_local_in_plus(gs, vals, step));
1004 
1005     /* pairwise */
1006     if (gs->num_pairs) PetscCall(PCTFS_gs_gop_vec_pairwise_plus(gs, vals, step));
1007 
1008     /* tree */
1009     else if (gs->max_left_over) PetscCall(PCTFS_gs_gop_vec_tree_plus(gs, vals, step));
1010 
1011     PetscCall(PCTFS_gs_gop_vec_local_out(gs, vals, step));
1012   } else { /* if intersection tree/pairwise and local is empty */
1013     /* pairwise */
1014     if (gs->num_pairs) PetscCall(PCTFS_gs_gop_vec_pairwise_plus(gs, vals, step));
1015 
1016     /* tree */
1017     else if (gs->max_left_over) PetscCall(PCTFS_gs_gop_vec_tree_plus(gs, vals, step));
1018   }
1019   PetscFunctionReturn(PETSC_SUCCESS);
1020 }
1021 
1022 /******************************************************************************/
1023 static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
1024 {
1025   PetscInt    *num, *map, **reduce;
1026   PetscScalar *base;
1027 
1028   PetscFunctionBegin;
1029   num    = gs->num_local_reduce;
1030   reduce = gs->local_reduce;
1031   while ((map = *reduce)) {
1032     base = vals + map[0] * step;
1033 
1034     /* wall */
1035     if (*num == 2) {
1036       num++;
1037       reduce++;
1038       PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step));
1039       PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step));
1040     } else if (*num == 3) { /* corner shared by three elements */
1041       num++;
1042       reduce++;
1043       PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step));
1044       PetscCall(PCTFS_rvec_add(base, vals + map[2] * step, step));
1045       PetscCall(PCTFS_rvec_copy(vals + map[2] * step, base, step));
1046       PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step));
1047     } else if (*num == 4) { /* corner shared by four elements */
1048       num++;
1049       reduce++;
1050       PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step));
1051       PetscCall(PCTFS_rvec_add(base, vals + map[2] * step, step));
1052       PetscCall(PCTFS_rvec_add(base, vals + map[3] * step, step));
1053       PetscCall(PCTFS_rvec_copy(vals + map[3] * step, base, step));
1054       PetscCall(PCTFS_rvec_copy(vals + map[2] * step, base, step));
1055       PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step));
1056     } else { /* general case ... odd geoms ... 3D */
1057       num++;
1058       while (*++map >= 0) PetscCall(PCTFS_rvec_add(base, vals + *map * step, step));
1059 
1060       map = *reduce;
1061       while (*++map >= 0) PetscCall(PCTFS_rvec_copy(vals + *map * step, base, step));
1062 
1063       reduce++;
1064     }
1065   }
1066   PetscFunctionReturn(PETSC_SUCCESS);
1067 }
1068 
1069 /******************************************************************************/
1070 static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
1071 {
1072   PetscInt    *num, *map, **reduce;
1073   PetscScalar *base;
1074 
1075   PetscFunctionBegin;
1076   num    = gs->num_gop_local_reduce;
1077   reduce = gs->gop_local_reduce;
1078   while ((map = *reduce++)) {
1079     base = vals + map[0] * step;
1080 
1081     /* wall */
1082     if (*num == 2) {
1083       num++;
1084       PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step));
1085     } else if (*num == 3) { /* corner shared by three elements */
1086       num++;
1087       PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step));
1088       PetscCall(PCTFS_rvec_add(base, vals + map[2] * step, step));
1089     } else if (*num == 4) { /* corner shared by four elements */
1090       num++;
1091       PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step));
1092       PetscCall(PCTFS_rvec_add(base, vals + map[2] * step, step));
1093       PetscCall(PCTFS_rvec_add(base, vals + map[3] * step, step));
1094     } else { /* general case ... odd geoms ... 3D*/
1095       num++;
1096       while (*++map >= 0) PetscCall(PCTFS_rvec_add(base, vals + *map * step, step));
1097     }
1098   }
1099   PetscFunctionReturn(PETSC_SUCCESS);
1100 }
1101 
1102 /******************************************************************************/
1103 static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
1104 {
1105   PetscInt    *num, *map, **reduce;
1106   PetscScalar *base;
1107 
1108   PetscFunctionBegin;
1109   num    = gs->num_gop_local_reduce;
1110   reduce = gs->gop_local_reduce;
1111   while ((map = *reduce++)) {
1112     base = vals + map[0] * step;
1113 
1114     /* wall */
1115     if (*num == 2) {
1116       num++;
1117       PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step));
1118     } else if (*num == 3) { /* corner shared by three elements */
1119       num++;
1120       PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step));
1121       PetscCall(PCTFS_rvec_copy(vals + map[2] * step, base, step));
1122     } else if (*num == 4) { /* corner shared by four elements */
1123       num++;
1124       PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step));
1125       PetscCall(PCTFS_rvec_copy(vals + map[2] * step, base, step));
1126       PetscCall(PCTFS_rvec_copy(vals + map[3] * step, base, step));
1127     } else { /* general case ... odd geoms ... 3D*/
1128       num++;
1129       while (*++map >= 0) PetscCall(PCTFS_rvec_copy(vals + *map * step, base, step));
1130     }
1131   }
1132   PetscFunctionReturn(PETSC_SUCCESS);
1133 }
1134 
1135 /******************************************************************************/
1136 static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step)
1137 {
1138   PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2;
1139   PetscInt    *iptr, *msg_list, *msg_size, **msg_nodes;
1140   PetscInt    *pw, *list, *size, **nodes;
1141   MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
1142   MPI_Status   status;
1143   PetscBLASInt i1 = 1, dstep;
1144 
1145   PetscFunctionBegin;
1146   /* strip and load s */
1147   msg_list = list = gs->pair_list;
1148   msg_size = size = gs->msg_sizes;
1149   msg_nodes = nodes = gs->node_list;
1150   iptr = pw = gs->pw_elm_list;
1151   dptr1 = dptr3 = gs->pw_vals;
1152   msg_ids_in = ids_in = gs->msg_ids_in;
1153   msg_ids_out = ids_out = gs->msg_ids_out;
1154   dptr2                 = gs->out;
1155   in1 = in2 = gs->in;
1156 
1157   /* post the receives */
1158   /*  msg_nodes=nodes; */
1159   do {
1160     /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
1161         second one *list and do list++ afterwards */
1162     PetscCallMPI(MPI_Irecv(in1, *size * step, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in));
1163     list++;
1164     msg_ids_in++;
1165     in1 += *size++ * step;
1166   } while (*++msg_nodes);
1167   msg_nodes = nodes;
1168 
1169   /* load gs values into in out gs buffers */
1170   while (*iptr >= 0) {
1171     PetscCall(PCTFS_rvec_copy(dptr3, in_vals + *iptr * step, step));
1172     dptr3 += step;
1173     iptr++;
1174   }
1175 
1176   /* load out buffers and post the sends */
1177   while ((iptr = *msg_nodes++)) {
1178     dptr3 = dptr2;
1179     while (*iptr >= 0) {
1180       PetscCall(PCTFS_rvec_copy(dptr2, dptr1 + *iptr * step, step));
1181       dptr2 += step;
1182       iptr++;
1183     }
1184     PetscCallMPI(MPI_Isend(dptr3, *msg_size * step, MPIU_SCALAR, *msg_list, MSGTAG1 + PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out));
1185     msg_size++;
1186     msg_list++;
1187     msg_ids_out++;
1188   }
1189 
1190   /* tree */
1191   if (gs->max_left_over) PetscCall(PCTFS_gs_gop_vec_tree_plus(gs, in_vals, step));
1192 
1193   /* process the received data */
1194   msg_nodes = nodes;
1195   while ((iptr = *nodes++)) {
1196     PetscScalar d1 = 1.0;
1197 
1198     /* Should I check the return value of MPI_Wait() or status? */
1199     /* Can this loop be replaced by a call to MPI_Waitall()? */
1200     PetscCallMPI(MPI_Wait(ids_in, &status));
1201     ids_in++;
1202     while (*iptr >= 0) {
1203       PetscCall(PetscBLASIntCast(step, &dstep));
1204       PetscCallBLAS("BLASaxpy", BLASaxpy_(&dstep, &d1, in2, &i1, dptr1 + *iptr * step, &i1));
1205       in2 += step;
1206       iptr++;
1207     }
1208   }
1209 
1210   /* replace vals */
1211   while (*pw >= 0) {
1212     PetscCall(PCTFS_rvec_copy(in_vals + *pw * step, dptr1, step));
1213     dptr1 += step;
1214     pw++;
1215   }
1216 
1217   /* clear isend message handles */
1218   /* This changed for clarity though it could be the same */
1219 
1220   /* Should I check the return value of MPI_Wait() or status? */
1221   /* Can this loop be replaced by a call to MPI_Waitall()? */
1222   while (*msg_nodes++) {
1223     PetscCallMPI(MPI_Wait(ids_out, &status));
1224     ids_out++;
1225   }
1226   PetscFunctionReturn(PETSC_SUCCESS);
1227 }
1228 
1229 /******************************************************************************/
1230 static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
1231 {
1232   PetscInt     size, *in, *out;
1233   PetscScalar *buf, *work;
1234   PetscInt     op[] = {GL_ADD, 0};
1235   PetscBLASInt i1   = 1;
1236   PetscBLASInt dstep;
1237 
1238   PetscFunctionBegin;
1239   /* copy over to local variables */
1240   in   = gs->tree_map_in;
1241   out  = gs->tree_map_out;
1242   buf  = gs->tree_buf;
1243   work = gs->tree_work;
1244   size = gs->tree_nel * step;
1245 
1246   /* zero out collection buffer */
1247   PetscCall(PCTFS_rvec_zero(buf, size));
1248 
1249   /* copy over my contributions */
1250   while (*in >= 0) {
1251     PetscCall(PetscBLASIntCast(step, &dstep));
1252     PetscCallBLAS("BLAScopy", BLAScopy_(&dstep, vals + *in++ * step, &i1, buf + *out++ * step, &i1));
1253   }
1254 
1255   /* perform fan in/out on full buffer */
1256   /* must change PCTFS_grop to handle the blas */
1257   PetscCall(PCTFS_grop(buf, work, size, op));
1258 
1259   /* reset */
1260   in  = gs->tree_map_in;
1261   out = gs->tree_map_out;
1262 
1263   /* get the portion of the results I need */
1264   while (*in >= 0) {
1265     PetscCall(PetscBLASIntCast(step, &dstep));
1266     PetscCallBLAS("BLAScopy", BLAScopy_(&dstep, buf + *out++ * step, &i1, vals + *in++ * step, &i1));
1267   }
1268   PetscFunctionReturn(PETSC_SUCCESS);
1269 }
1270 
1271 /******************************************************************************/
1272 PetscErrorCode PCTFS_gs_gop_hc(PCTFS_gs_id *gs, PetscScalar *vals, const char *op, PetscInt dim)
1273 {
1274   PetscFunctionBegin;
1275   switch (*op) {
1276   case '+':
1277     PetscCall(PCTFS_gs_gop_plus_hc(gs, vals, dim));
1278     break;
1279   default:
1280     PetscCall(PetscInfo(0, "PCTFS_gs_gop_hc() :: %c is not a valid op\n", op[0]));
1281     PetscCall(PetscInfo(0, "PCTFS_gs_gop_hc() :: default :: plus\n"));
1282     PetscCall(PCTFS_gs_gop_plus_hc(gs, vals, dim));
1283     break;
1284   }
1285   PetscFunctionReturn(PETSC_SUCCESS);
1286 }
1287 
1288 /******************************************************************************/
1289 static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim)
1290 {
1291   PetscFunctionBegin;
1292   /* if there's nothing to do return */
1293   if (dim <= 0) PetscFunctionReturn(PETSC_SUCCESS);
1294 
1295   /* can't do more dimensions then exist */
1296   dim = PetscMin(dim, PCTFS_i_log2_num_nodes);
1297 
1298   /* local only operations!!! */
1299   if (gs->num_local) PetscCall(PCTFS_gs_gop_local_plus(gs, vals));
1300 
1301   /* if intersection tree/pairwise and local isn't empty */
1302   if (gs->num_local_gop) {
1303     PetscCall(PCTFS_gs_gop_local_in_plus(gs, vals));
1304 
1305     /* pairwise will do tree inside ... */
1306     if (gs->num_pairs) PetscCall(PCTFS_gs_gop_pairwise_plus_hc(gs, vals, dim)); /* tree only */
1307     else if (gs->max_left_over) PetscCall(PCTFS_gs_gop_tree_plus_hc(gs, vals, dim));
1308 
1309     PetscCall(PCTFS_gs_gop_local_out(gs, vals));
1310   } else { /* if intersection tree/pairwise and local is empty */
1311     /* pairwise will do tree inside */
1312     if (gs->num_pairs) PetscCall(PCTFS_gs_gop_pairwise_plus_hc(gs, vals, dim)); /* tree */
1313     else if (gs->max_left_over) PetscCall(PCTFS_gs_gop_tree_plus_hc(gs, vals, dim));
1314   }
1315   PetscFunctionReturn(PETSC_SUCCESS);
1316 }
1317 
1318 /******************************************************************************/
1319 static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim)
1320 {
1321   PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2;
1322   PetscInt    *iptr, *msg_list, *msg_size, **msg_nodes;
1323   PetscInt    *pw, *list, *size, **nodes;
1324   MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
1325   MPI_Status   status;
1326   PetscInt     i, mask = 1;
1327 
1328   PetscFunctionBegin;
1329   for (i = 1; i < dim; i++) {
1330     mask <<= 1;
1331     mask++;
1332   }
1333 
1334   /* strip and load s */
1335   msg_list = list = gs->pair_list;
1336   msg_size = size = gs->msg_sizes;
1337   msg_nodes = nodes = gs->node_list;
1338   iptr = pw = gs->pw_elm_list;
1339   dptr1 = dptr3 = gs->pw_vals;
1340   msg_ids_in = ids_in = gs->msg_ids_in;
1341   msg_ids_out = ids_out = gs->msg_ids_out;
1342   dptr2                 = gs->out;
1343   in1 = in2 = gs->in;
1344 
1345   /* post the receives */
1346   /*  msg_nodes=nodes; */
1347   do {
1348     /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
1349         second one *list and do list++ afterwards */
1350     if ((PCTFS_my_id | mask) == (*list | mask)) {
1351       PetscCallMPI(MPI_Irecv(in1, *size, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in));
1352       list++;
1353       msg_ids_in++;
1354       in1 += *size++;
1355     } else {
1356       list++;
1357       size++;
1358     }
1359   } while (*++msg_nodes);
1360 
1361   /* load gs values into in out gs buffers */
1362   while (*iptr >= 0) *dptr3++ = *(in_vals + *iptr++);
1363 
1364   /* load out buffers and post the sends */
1365   msg_nodes = nodes;
1366   list      = msg_list;
1367   while ((iptr = *msg_nodes++)) {
1368     if ((PCTFS_my_id | mask) == (*list | mask)) {
1369       dptr3 = dptr2;
1370       while (*iptr >= 0) *dptr2++ = *(dptr1 + *iptr++);
1371       /* CHECK PERSISTENT COMMS MODE FOR ALL THIS STUFF */
1372       /* is msg_ids_out++ correct? */
1373       PetscCallMPI(MPI_Isend(dptr3, *msg_size, MPIU_SCALAR, *list, MSGTAG1 + PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out));
1374       msg_size++;
1375       list++;
1376       msg_ids_out++;
1377     } else {
1378       list++;
1379       msg_size++;
1380     }
1381   }
1382 
1383   /* do the tree while we're waiting */
1384   if (gs->max_left_over) PetscCall(PCTFS_gs_gop_tree_plus_hc(gs, in_vals, dim));
1385 
1386   /* process the received data */
1387   msg_nodes = nodes;
1388   list      = msg_list;
1389   while ((iptr = *nodes++)) {
1390     if ((PCTFS_my_id | mask) == (*list | mask)) {
1391       /* Should I check the return value of MPI_Wait() or status? */
1392       /* Can this loop be replaced by a call to MPI_Waitall()? */
1393       PetscCallMPI(MPI_Wait(ids_in, &status));
1394       ids_in++;
1395       while (*iptr >= 0) *(dptr1 + *iptr++) += *in2++;
1396     }
1397     list++;
1398   }
1399 
1400   /* replace vals */
1401   while (*pw >= 0) *(in_vals + *pw++) = *dptr1++;
1402 
1403   /* clear isend message handles */
1404   /* This changed for clarity though it could be the same */
1405   while (*msg_nodes++) {
1406     if ((PCTFS_my_id | mask) == (*msg_list | mask)) {
1407       /* Should I check the return value of MPI_Wait() or status? */
1408       /* Can this loop be replaced by a call to MPI_Waitall()? */
1409       PetscCallMPI(MPI_Wait(ids_out, &status));
1410       ids_out++;
1411     }
1412     msg_list++;
1413   }
1414   PetscFunctionReturn(PETSC_SUCCESS);
1415 }
1416 
1417 /******************************************************************************/
1418 static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim)
1419 {
1420   PetscInt     size;
1421   PetscInt    *in, *out;
1422   PetscScalar *buf, *work;
1423   PetscInt     op[] = {GL_ADD, 0};
1424 
1425   PetscFunctionBegin;
1426   in   = gs->tree_map_in;
1427   out  = gs->tree_map_out;
1428   buf  = gs->tree_buf;
1429   work = gs->tree_work;
1430   size = gs->tree_nel;
1431 
1432   PetscCall(PCTFS_rvec_zero(buf, size));
1433 
1434   while (*in >= 0) *(buf + *out++) = *(vals + *in++);
1435 
1436   in  = gs->tree_map_in;
1437   out = gs->tree_map_out;
1438 
1439   PetscCall(PCTFS_grop_hc(buf, work, size, op, dim));
1440 
1441   while (*in >= 0) *(vals + *in++) = *(buf + *out++);
1442   PetscFunctionReturn(PETSC_SUCCESS);
1443 }
1444