xref: /petsc/src/ksp/pc/impls/tfs/gs.c (revision 03047865b8d8757cf1cf9cda45785c1537b01dc1)
1 /*
2 
3 Author: Henry M. Tufo III
4 
5 e-mail: hmt@cs.brown.edu
6 
7 snail-mail:
8 Division of Applied Mathematics
9 Brown University
10 Providence, RI 02912
11 
12 Last Modification:
13 6.21.97
14 
15 File Description:
16 -----------------
17 
18 */
19 
20 #include <../src/ksp/pc/impls/tfs/tfs.h>
21 
22 /* default length of number of items via tree - doubles if exceeded */
23 #define TREE_BUF_SZ 2048
24 #define GS_VEC_SZ   1
25 
26 /*
27 Type: struct gather_scatter_id
28 */
29 typedef struct gather_scatter_id {
30   PetscInt     id;
31   PetscInt     nel_min;
32   PetscInt     nel_max;
33   PetscInt     nel_sum;
34   PetscInt     negl;
35   PetscInt     gl_max;
36   PetscInt     gl_min;
37   PetscInt     repeats;
38   PetscInt     ordered;
39   PetscInt     positive;
40   PetscScalar *vals;
41 
42   /* bit mask info */
43   PetscInt *my_proc_mask;
44   PetscInt  mask_sz;
45   PetscInt *ngh_buf;
46   PetscInt  ngh_buf_sz;
47   PetscInt *nghs;
48   PetscInt  num_nghs;
49   PetscInt  max_nghs;
50   PetscInt *pw_nghs;
51   PetscInt  num_pw_nghs;
52   PetscInt *tree_nghs;
53   PetscInt  num_tree_nghs;
54 
55   PetscInt num_loads;
56 
57   /* repeats == true -> local info */
58   PetscInt  nel;  /* number of unique elements */
59   PetscInt *elms; /* of size nel */
60   PetscInt  nel_total;
61   PetscInt *local_elms; /* of size nel_total */
62   PetscInt *companion;  /* of size nel_total */
63 
64   /* local info */
65   PetscInt   num_local_total;
66   PetscInt   local_strength;
67   PetscInt   num_local;
68   PetscInt  *num_local_reduce;
69   PetscInt **local_reduce;
70   PetscInt   num_local_gop;
71   PetscInt  *num_gop_local_reduce;
72   PetscInt **gop_local_reduce;
73 
74   /* pairwise info */
75   PetscInt     level;
76   PetscInt     num_pairs;
77   PetscInt     max_pairs;
78   PetscInt     loc_node_pairs;
79   PetscInt     max_node_pairs;
80   PetscInt     min_node_pairs;
81   PetscInt     avg_node_pairs;
82   PetscInt    *pair_list;
83   PetscInt    *msg_sizes;
84   PetscInt   **node_list;
85   PetscInt     len_pw_list;
86   PetscInt    *pw_elm_list;
87   PetscScalar *pw_vals;
88 
89   MPI_Request *msg_ids_in;
90   MPI_Request *msg_ids_out;
91 
92   PetscScalar *out;
93   PetscScalar *in;
94   PetscInt     msg_total;
95 
96   /* tree - crystal accumulator info */
97   PetscInt   max_left_over;
98   PetscInt  *pre;
99   PetscInt  *in_num;
100   PetscInt  *out_num;
101   PetscInt **in_list;
102   PetscInt **out_list;
103 
104   /* new tree work*/
105   PetscInt     tree_nel;
106   PetscInt    *tree_elms;
107   PetscScalar *tree_buf;
108   PetscScalar *tree_work;
109 
110   PetscInt  tree_map_sz;
111   PetscInt *tree_map_in;
112   PetscInt *tree_map_out;
113 
114   /* current memory status */
115   PetscInt gl_bss_min;
116   PetscInt gl_perm_min;
117 
118   /* max segment size for PCTFS_gs_gop_vec() */
119   PetscInt vec_sz;
120 
121   /* hack to make paul happy */
122   MPI_Comm PCTFS_gs_comm;
123 
124 } PCTFS_gs_id;
125 
126 static PCTFS_gs_id   *gsi_check_args(PetscInt *elms, PetscInt nel, PetscInt level);
127 static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs);
128 static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs);
129 static PetscErrorCode set_pairwise(PCTFS_gs_id *gs);
130 static PCTFS_gs_id   *gsi_new(void);
131 static PetscErrorCode set_tree(PCTFS_gs_id *gs);
132 
133 /* same for all but vector flavor */
134 static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs, PetscScalar *vals);
135 /* vector flavor */
136 static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
137 
138 static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step);
139 static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step);
140 static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
141 static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
142 static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
143 
144 static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs, PetscScalar *vals);
145 static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals);
146 
147 static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim);
148 static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim);
149 static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim);
150 
151 /* global vars */
152 /* from comm.c module */
153 
154 static PetscInt num_gs_ids = 0;
155 
156 /* should make this dynamic ... later */
157 static PetscInt  msg_buf     = MAX_MSG_BUF;
158 static PetscInt  vec_sz      = GS_VEC_SZ;
159 static PetscInt *tree_buf    = NULL;
160 static PetscInt  tree_buf_sz = 0;
161 static PetscInt  ntree       = 0;
162 
163 /***************************************************************************/
/* Set the module-wide vector segment size used by PCTFS_gs_gop_vec();
   stored in the file-scope `vec_sz` (default GS_VEC_SZ) and read by the
   buffer-sizing mallocs in set_pairwise()/set_tree().  Call before
   PCTFS_gs_init() for it to affect a handle's buffers. */
PetscErrorCode PCTFS_gs_init_vec_sz(PetscInt size)
{
  PetscFunctionBegin;
  vec_sz = size;
  PetscFunctionReturn(PETSC_SUCCESS);
}
170 
171 /******************************************************************************/
/* Set the module-wide message buffer byte budget used by get_ngh_buf()
   (file-scope `msg_buf`, default MAX_MSG_BUF).  A smaller budget forces
   the neighbor-mask exchange to run in multiple PCTFS_giop() passes. */
PetscErrorCode PCTFS_gs_init_msg_buf_sz(PetscInt buf_size)
{
  PetscFunctionBegin;
  msg_buf = buf_size;
  PetscFunctionReturn(PETSC_SUCCESS);
}
178 
179 /******************************************************************************/
PCTFS_gs_init(PetscInt * elms,PetscInt nel,PetscInt level)180 PCTFS_gs_id *PCTFS_gs_init(PetscInt *elms, PetscInt nel, PetscInt level)
181 {
182   PCTFS_gs_id *gs;
183   MPI_Group    PCTFS_gs_group;
184   MPI_Comm     PCTFS_gs_comm;
185 
186   /* ensure that communication package has been initialized */
187   PetscCallAbort(PETSC_COMM_SELF, PCTFS_comm_init());
188 
189   /* determines if we have enough dynamic/semi-static memory */
190   /* checks input, allocs and sets gd_id template            */
191   gs = gsi_check_args(elms, nel, level);
192 
193   /* only bit mask version up and working for the moment    */
194   /* LATER :: get int list version working for sparse pblms */
195   PetscCallAbort(PETSC_COMM_WORLD, gsi_via_bit_mask(gs));
196 
197   PetscCallAbort(PETSC_COMM_WORLD, MPI_Comm_group(MPI_COMM_WORLD, &PCTFS_gs_group) ? PETSC_ERR_MPI : PETSC_SUCCESS);
198   PetscCallAbort(PETSC_COMM_WORLD, MPI_Comm_create(MPI_COMM_WORLD, PCTFS_gs_group, &PCTFS_gs_comm) ? PETSC_ERR_MPI : PETSC_SUCCESS);
199   PetscCallAbort(PETSC_COMM_WORLD, MPI_Group_free(&PCTFS_gs_group) ? PETSC_ERR_MPI : PETSC_SUCCESS);
200 
201   gs->PCTFS_gs_comm = PCTFS_gs_comm;
202 
203   return gs;
204 }
205 
206 /******************************************************************************/
gsi_new(void)207 static PCTFS_gs_id *gsi_new(void)
208 {
209   PCTFS_gs_id *gs;
210   gs = (PCTFS_gs_id *)malloc(sizeof(PCTFS_gs_id));
211   PetscCallAbort(PETSC_COMM_WORLD, PetscMemzero(gs, sizeof(PCTFS_gs_id)));
212   return gs;
213 }
214 
215 /******************************************************************************/
/*
   Validate the caller's element list and build the *local* half of the
   gather-scatter template:
     1. drop zero ids (callers mark non-participating entries with 0),
     2. copy the survivors and remember their original positions
        (the `companion` inverse map),
     3. sort if necessary,
     4. compress repeated ids into local-reduction lists,
     5. agree globally (one PCTFS_giop) on sizes/bounds/level.
   Aborts on any inconsistency; returns the partially filled template
   (the neighbor/pairwise/tree parts are filled later by gsi_via_bit_mask).
*/
static PCTFS_gs_id *gsi_check_args(PetscInt *in_elms, PetscInt nel, PetscInt level)
{
  PetscInt     i, j, k, t2;
  PetscInt    *companion, *elms, *unique, *iptr;
  PetscInt     num_local = 0, *num_to_reduce, **local_reduce;
  /* oprs[0] is the NON_UNIFORM flag; the remaining entries are the per-slot
     reduction ops applied to vals[0..6] by PCTFS_giop below */
  PetscInt     oprs[]    = {NON_UNIFORM, GL_MIN, GL_MAX, GL_ADD, GL_MIN, GL_MAX, GL_MIN, GL_B_AND};
  PetscInt     vals[PETSC_STATIC_ARRAY_LENGTH(oprs) - 1];
  PetscInt     work[PETSC_STATIC_ARRAY_LENGTH(oprs) - 1];
  PCTFS_gs_id *gs;

  if (!in_elms) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "elms point to nothing!!!\n");
  if (nel < 0) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "can't have fewer than 0 elms!!!\n");

  /* an empty local list is legal (this rank just relays), only log it */
  if (nel == 0) PetscCallAbort(PETSC_COMM_WORLD, PetscInfo(0, "I don't have any elements!!!\n"));

  /* get space for gs template */
  gs     = gsi_new();
  gs->id = ++num_gs_ids;

  /* hmt 6.4.99                                            */
  /* caller can set global ids that don't participate to 0 */
  /* PCTFS_gs_init ignores all zeros in elm list                 */
  /* negative global ids are still invalid                 */
  for (i = j = 0; i < nel; i++) {
    if (in_elms[i] != 0) j++;
  }

  /* k keeps the original length; nel shrinks to the nonzero count */
  k   = nel;
  nel = j;

  /* copy over in_elms list and create inverse map */
  /* elms gets one extra slot for the sentinel used below */
  elms      = (PetscInt *)malloc((nel + 1) * sizeof(PetscInt));
  companion = (PetscInt *)malloc(nel * sizeof(PetscInt));

  for (i = j = 0; i < k; i++) {
    if (in_elms[i] != 0) {
      elms[j]        = in_elms[i];
      companion[j++] = i; /* companion[j] = original index of elms[j] */
    }
  }

  if (j != nel) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "nel j mismatch!\n");

  /* pre-pass ... check to see if sorted */
  /* INT_MAX sentinel terminates the scan; j=1 means "not sorted" */
  elms[nel] = INT_MAX;
  iptr      = elms;
  unique    = elms + 1;
  j         = 0;
  while (*iptr != INT_MAX) {
    if (*iptr++ > *unique++) {
      j = 1;
      break;
    }
  }

  /* set up inverse map */
  /* sort elms and permute companion along with it so the map stays valid */
  if (j) {
    PetscCallAbort(PETSC_COMM_WORLD, PetscInfo(0, "gsi_check_args() :: elm list *not* sorted!\n"));
    PetscCallAbort(PETSC_COMM_WORLD, PCTFS_SMI_sort((void *)elms, (void *)companion, nel, SORT_INTEGER));
  } else PetscCallAbort(PETSC_COMM_WORLD, PetscInfo(0, "gsi_check_args() :: elm list sorted!\n"));
  /* switch the sentinel so duplicate-clumping loops below terminate */
  elms[nel] = INT_MIN;

  /* first pass */
  /* determine number of unique elements, check pd */
  /* k accumulates the total number of repeats; num_local counts the
     distinct ids that occur more than once */
  for (i = k = 0; i < nel; i += j) {
    t2 = elms[i];
    j  = ++i;

    /* clump 'em for now */
    while (elms[j] == t2) j++;

    /* how many together and num local */
    if (j -= i) {
      num_local++;
      k += j;
    }
  }

  /* how many unique elements? */
  gs->repeats = k;
  gs->nel     = nel - k;

  /* number of repeats? */
  gs->num_local = num_local;
  /* +2: room for the two NULL-terminated partitions set up below */
  num_local += 2;
  gs->local_reduce = local_reduce = (PetscInt **)malloc(num_local * sizeof(PetscInt *));
  gs->num_local_reduce = num_to_reduce = (PetscInt *)malloc(num_local * sizeof(PetscInt));

  unique         = (PetscInt *)malloc((gs->nel + 1) * sizeof(PetscInt));
  gs->elms       = unique;
  gs->nel_total  = nel;
  gs->local_elms = elms;
  gs->companion  = companion;

  /* compess map as well as keep track of local ops */
  /* walk clumps of equal ids: unique[i] gets the id, companion[i] its first
     original position; clumps of size>1 get a reduction list whose first
     entry is the compressed index i and whose tail holds the companions of
     the remaining duplicates, terminated by -1 */
  for (num_local = i = j = 0; i < gs->nel; i++) {
    k  = j;
    t2 = unique[i] = elms[j];
    companion[i]   = companion[j];

    while (elms[j] == t2) j++;

    if ((t2 = (j - k)) > 1) {
      /* number together */
      num_to_reduce[num_local] = t2++;

      iptr = local_reduce[num_local++] = (PetscInt *)malloc(t2 * sizeof(PetscInt));

      /* to use binary searching don't remap until we check intersection */
      *iptr++ = i;

      /* note that we're skipping the first one */
      while (++k < j) *(iptr++) = companion[k];
      *iptr = -1;
    }
  }

  /* sentinel for ngh_buf */
  unique[gs->nel] = INT_MAX;

  /* for two partition sort hack */
  num_to_reduce[num_local]   = 0;
  local_reduce[num_local]    = NULL;
  num_to_reduce[++num_local] = 0;
  local_reduce[num_local]    = NULL;

  /* load 'em up */
  /* note one extra to hold NON_UNIFORM flag!!! */
  /* vals[0..2]=nel (min/max/sum), vals[3..4]=gl bounds (min/max),
     vals[5]=level (min), vals[6]=num_gs_ids (bitwise-and — presumably a
     sanity check that all ranks created the same number of handles) */
  vals[2] = vals[1] = vals[0] = nel;
  if (gs->nel > 0) {
    vals[3] = unique[0];
    vals[4] = unique[gs->nel - 1];
  } else {
    /* identities for min/max so an empty rank doesn't affect the bounds */
    vals[3] = INT_MAX;
    vals[4] = INT_MIN;
  }
  vals[5] = level;
  vals[6] = num_gs_ids;

  /* GLOBAL: send 'em out */
  PetscCallAbort(PETSC_COMM_WORLD, PCTFS_giop(vals, work, PETSC_STATIC_ARRAY_LENGTH(oprs) - 1, oprs));

  /* must be semi-pos def - only pairwise depends on this */
  /* LATER - remove this restriction */
  if (vals[3] < 0) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "gsi_check_args() :: system not semi-pos def \n");
  if (vals[4] == INT_MAX) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "gsi_check_args() :: system ub too large !\n");

  gs->nel_min = vals[0];
  gs->nel_max = vals[1];
  gs->nel_sum = vals[2];
  gs->gl_min  = vals[3];
  gs->gl_max  = vals[4];
  gs->negl    = vals[4] - vals[3] + 1;

  if (gs->negl <= 0) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "gsi_check_args() :: system empty or neg :: %" PetscInt_FMT "\n", gs->negl);

  /* LATER :: add level == -1 -> program selects level */
  /* clamp the globally agreed (minimum) level into [0, PCTFS_num_nodes] */
  if (vals[5] < 0) vals[5] = 0;
  else if (vals[5] > PCTFS_num_nodes) vals[5] = PCTFS_num_nodes;
  gs->level = vals[5];

  return gs;
}
379 
380 /******************************************************************************/
/*
   Finish building the handle using the bit-mask neighbor exchange:
   discover neighbors (get_ngh_buf), set up the pairwise and/or tree
   stages, then split the local-reduction lists into those that also
   appear in a pairwise/tree exchange (gop_local_reduce) and the purely
   local ones, remap everything through the companion map, and free the
   construction-only scratch arrays.
*/
static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs)
{
  PetscInt   i, nel, *elms;
  PetscInt   t1;
  PetscInt **reduce;
  PetscInt  *map;

  PetscFunctionBegin;
  /* totally local removes ... PCTFS_ct_bits == 0 */
  PetscCall(get_ngh_buf(gs));

  /* level==0 means no pairwise stage; no tree leftovers means no tree stage */
  if (gs->level) PetscCall(set_pairwise(gs));
  if (gs->max_left_over) PetscCall(set_tree(gs));

  /* intersection local and pairwise/tree? */
  gs->num_local_total      = gs->num_local;
  gs->gop_local_reduce     = gs->local_reduce;
  gs->num_gop_local_reduce = gs->num_local_reduce;

  map = gs->companion;

  /* is there any local compression */
  if (!gs->num_local) {
    gs->local_strength = NONE;
    gs->num_local_gop  = 0;
  } else {
    /* ok find intersection */
    /* a reduction list whose head element is also handled pairwise or by
       the tree is flagged by negating its count (undone after sorting) */
    map    = gs->companion;
    reduce = gs->local_reduce;
    for (i = 0, t1 = 0; i < gs->num_local; i++, reduce++) {
      if ((PCTFS_ivec_binary_search(**reduce, gs->pw_elm_list, gs->len_pw_list) >= 0) || PCTFS_ivec_binary_search(**reduce, gs->tree_map_in, gs->tree_map_sz) >= 0) {
        t1++;
        PetscCheck(gs->num_local_reduce[i] > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "nobody in list?");
        gs->num_local_reduce[i] *= -1;
      }
      /* now it's safe to remap the head index through the companion map */
      **reduce = map[**reduce];
    }

    /* intersection is empty */
    if (!t1) {
      gs->local_strength = FULL;
      gs->num_local_gop  = 0;
    } else { /* intersection not empty */
      gs->local_strength = PARTIAL;

      /* sort by (negated) count so the t1 flagged lists come first */
      PetscCall(PCTFS_SMI_sort((void *)gs->num_local_reduce, (void *)gs->local_reduce, gs->num_local + 1, SORT_INT_PTR));

      gs->num_local_gop   = t1;
      gs->num_local_total = gs->num_local;
      gs->num_local -= t1;
      gs->gop_local_reduce     = gs->local_reduce;
      gs->num_gop_local_reduce = gs->num_local_reduce;

      /* restore the counts and advance the "purely local" pointers past
         the flagged prefix (plus the NULL separator) */
      for (i = 0; i < t1; i++) {
        PetscCheck(gs->num_gop_local_reduce[i] < 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "they aren't negative?");
        gs->num_gop_local_reduce[i] *= -1;
        gs->local_reduce++;
        gs->num_local_reduce++;
      }
      gs->local_reduce++;
      gs->num_local_reduce++;
    }
  }

  /* remap the pairwise and tree element lists through the companion map */
  elms = gs->pw_elm_list;
  nel  = gs->len_pw_list;
  for (i = 0; i < nel; i++) elms[i] = map[elms[i]];

  elms = gs->tree_map_in;
  nel  = gs->tree_map_sz;
  for (i = 0; i < nel; i++) elms[i] = map[elms[i]];

  /* clean up */
  /* construction scratch only; the reduction lists built from them survive */
  free((void *)gs->local_elms);
  free((void *)gs->companion);
  free((void *)gs->elms);
  free((void *)gs->ngh_buf);
  gs->local_elms = gs->companion = gs->elms = gs->ngh_buf = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
461 
462 /******************************************************************************/
place_in_tree(PetscInt elm)463 static PetscErrorCode place_in_tree(PetscInt elm)
464 {
465   PetscInt *tp, n;
466 
467   PetscFunctionBegin;
468   if (ntree == tree_buf_sz) {
469     if (tree_buf_sz) {
470       tp = tree_buf;
471       n  = tree_buf_sz;
472       tree_buf_sz <<= 1;
473       tree_buf = (PetscInt *)malloc(tree_buf_sz * sizeof(PetscInt));
474       PCTFS_ivec_copy(tree_buf, tp, n);
475       free(tp);
476     } else {
477       tree_buf_sz = TREE_BUF_SZ;
478       tree_buf    = (PetscInt *)malloc(tree_buf_sz * sizeof(PetscInt));
479     }
480   }
481 
482   tree_buf[ntree++] = elm;
483   PetscFunctionReturn(PETSC_SUCCESS);
484 }
485 
486 /******************************************************************************/
/*
   Discover which processors share each of our global ids, by OR-reducing
   processor bit masks over the global id range [gl_min, gl_max] with one
   or more PCTFS_giop() passes (more than one when msg_buf is too small to
   hold the whole range).  Each owned, shared id is then routed either to
   the pairwise exchange (<= level sharers) or to the tree accumulator;
   unowned ids shared by more than `level` processors still join the tree.
   Fills gs->{ngh_buf, nghs, num_nghs, max_nghs, pw_nghs, num_pw_nghs,
   len_pw_list, tree_map_sz, max_left_over, mask_sz, num_loads}.
   NOTE(review): all *_size variables are in BYTES until the
   "/= sizeof(PetscInt)" conversions partway down; mixing the two units is
   the main hazard when modifying this routine.
*/
static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs)
{
  PetscInt  i, j, npw = 0, ntree_map = 0;
  PetscInt  p_mask_size, ngh_buf_size, buf_size;
  PetscInt *p_mask, *sh_proc_mask, *pw_sh_proc_mask;
  PetscInt *ngh_buf, *buf1, *buf2;
  PetscInt  offset, per_load, num_loads, or_ct, start, end;
  PetscInt *ptr1, *ptr2, i_start, negl, nel, *elms;
  PetscInt  oper = GL_B_OR;
  PetscInt *ptr3, *t_mask, level, ct1, ct2;

  PetscFunctionBegin;
  /* to make life easier */
  nel   = gs->nel;
  elms  = gs->elms;
  level = gs->level;

  /* det #bytes needed for processor bit masks and init w/mask cor. to PCTFS_my_id */
  p_mask = (PetscInt *)malloc(p_mask_size = PCTFS_len_bit_mask(PCTFS_num_nodes));
  PetscCall(PCTFS_set_bit_mask(p_mask, p_mask_size, PCTFS_my_id));

  /* allocate space for masks and info bufs */
  /* gs->nghs temporarily aliases sh_proc_mask; it is replaced by the
     expanded int list (and sh_proc_mask freed) at the end of this routine */
  gs->nghs = sh_proc_mask = (PetscInt *)malloc(p_mask_size);
  gs->pw_nghs = pw_sh_proc_mask = (PetscInt *)malloc(p_mask_size);
  gs->ngh_buf_sz = ngh_buf_size = p_mask_size * nel;
  /* NOTE(review): t_mask is allocated and freed but never used here */
  t_mask                        = (PetscInt *)malloc(p_mask_size);
  gs->ngh_buf = ngh_buf = (PetscInt *)malloc(ngh_buf_size);

  /* comm buffer size ... memory usage bounded by ~2*msg_buf */
  /* had thought I could exploit rendezvous threshold */

  /* default is one pass */
  per_load = negl = gs->negl;
  gs->num_loads = num_loads = 1;
  i                         = p_mask_size * negl;

  /* possible overflow on buffer size */
  /* overflow hack                    */
  if (i < 0) i = INT_MAX;

  buf_size = PetscMin(msg_buf, i);

  /* can we do it? */
  PetscCheck(p_mask_size <= buf_size, PETSC_COMM_SELF, PETSC_ERR_PLIB, "get_ngh_buf() :: buf<pms :: %" PetscInt_FMT ">%" PetscInt_FMT, p_mask_size, buf_size);

  /* get PCTFS_giop buf space ... make *only* one malloc */
  buf1 = (PetscInt *)malloc(buf_size << 1);

  /* more than one gior exchange needed? */
  if (buf_size != i) {
    per_load      = buf_size / p_mask_size;
    buf_size      = per_load * p_mask_size;
    gs->num_loads = num_loads = negl / per_load + (negl % per_load > 0);
  }

  /* convert buf sizes from #bytes to #ints - 32-bit only! */
  p_mask_size /= sizeof(PetscInt);
  ngh_buf_size /= sizeof(PetscInt);
  buf_size /= sizeof(PetscInt);

  /* find PCTFS_giop work space */
  buf2 = buf1 + buf_size;

  /* hold #ints needed for processor masks */
  gs->mask_sz = p_mask_size;

  /* init buffers */
  PetscCall(PCTFS_ivec_zero(sh_proc_mask, p_mask_size));
  PetscCall(PCTFS_ivec_zero(pw_sh_proc_mask, p_mask_size));
  PetscCall(PCTFS_ivec_zero(ngh_buf, ngh_buf_size));

  /* HACK reset tree info */
  /* ownership of any previous tree_buf passed to set_tree(), so dropping
     the pointer here is not a leak */
  tree_buf    = NULL;
  tree_buf_sz = ntree = 0;

  /* ok do it */
  /* each pass covers global ids [start, end); the INT_MAX sentinel in elms
     stops the load loop from walking past our element list */
  for (ptr1 = ngh_buf, ptr2 = elms, end = gs->gl_min, or_ct = i = 0; or_ct < num_loads; or_ct++) {
    /* identity for bitwise or is 000...000 */
    PetscCall(PCTFS_ivec_zero(buf1, buf_size));

    /* load msg buffer */
    /* deposit my processor bit mask in the slot of every id I own */
    for (start = end, end += per_load, i_start = i; (offset = *ptr2) < end; i++, ptr2++) {
      offset = (offset - start) * p_mask_size;
      PCTFS_ivec_copy(buf1 + offset, p_mask, p_mask_size);
    }

    /* GLOBAL: pass buffer */
    PetscCall(PCTFS_giop(buf1, buf2, buf_size, &oper));

    /* unload buffer into ngh_buf */
    ptr2 = (elms + i_start);
    for (ptr3 = buf1, j = start; j < end; ptr3 += p_mask_size, j++) {
      /* I own it ... may have to pairwise it */
      if (j == *ptr2) {
        /* do i share it w/anyone? */
        ct1 = PCTFS_ct_bits((char *)ptr3, p_mask_size * sizeof(PetscInt));
        /* guess not */
        if (ct1 < 2) {
          ptr2++;
          ptr1 += p_mask_size;
          continue;
        }

        /* i do ... so keep info and turn off my bit */
        PCTFS_ivec_copy(ptr1, ptr3, p_mask_size);
        PetscCall(PCTFS_ivec_xor(ptr1, p_mask, p_mask_size));
        PetscCall(PCTFS_ivec_or(sh_proc_mask, ptr1, p_mask_size));

        /* is it to be done pairwise? */
        if (--ct1 <= level) {
          npw++;

          /* turn on high bit to indicate pw need to process */
          *ptr2++ |= TOP_BIT;
          PetscCall(PCTFS_ivec_or(pw_sh_proc_mask, ptr1, p_mask_size));
          ptr1 += p_mask_size;
          continue;
        }

        /* get set for next and note that I have a tree contribution */
        /* could save exact elm index for tree here -> save a search */
        ptr2++;
        ptr1 += p_mask_size;
        ntree_map++;
      } else { /* i don't but still might be involved in tree */

        /* shared by how many? */
        ct1 = PCTFS_ct_bits((char *)ptr3, p_mask_size * sizeof(PetscInt));

        /* none! */
        if (ct1 < 2) continue;

        /* is it going to be done pairwise? but not by me of course!*/
        if (--ct1 <= level) continue;
      }
      /* LATER we're going to have to process it NOW */
      /* nope ... tree it */
      PetscCall(place_in_tree(j));
    }
  }

  free((void *)t_mask);
  free((void *)buf1);

  gs->len_pw_list = npw;
  gs->num_nghs    = PCTFS_ct_bits((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt));

  /* expand from bit mask list to int list and save ngh list */
  gs->nghs = (PetscInt *)malloc(gs->num_nghs * sizeof(PetscInt));
  PetscCall(PCTFS_bm_to_proc((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt), gs->nghs));

  gs->num_pw_nghs = PCTFS_ct_bits((char *)pw_sh_proc_mask, p_mask_size * sizeof(PetscInt));

  /* global max of neighbor counts, for statistics */
  oper = GL_MAX;
  ct1  = gs->num_nghs;
  PetscCall(PCTFS_giop(&ct1, &ct2, 1, &oper));
  gs->max_nghs = ct1;

  gs->tree_map_sz   = ntree_map;
  gs->max_left_over = ntree;

  free((void *)p_mask);
  free((void *)sh_proc_mask);
  PetscFunctionReturn(PETSC_SUCCESS);
}
652 
653 /******************************************************************************/
/*
   Build the pairwise-exchange stage from the masks computed by
   get_ngh_buf(): collect the elements flagged with TOP_BIT into
   pw_elm_list, expand the pairwise neighbor mask into pair_list, and for
   every neighbor determine which of our pairwise elements it shares
   (node_list/msg_sizes).  Also allocates the send/recv value buffers and
   MPI_Request arrays, and gathers min/max/avg message statistics.
   NOTE(review): as in get_ngh_buf(), p_mask_size is in bytes until the
   "/= sizeof(PetscInt)" below; the "* sizeof(PetscInt)" arguments later
   convert back to bytes for the byte-oriented bit-mask helpers.
*/
static PetscErrorCode set_pairwise(PCTFS_gs_id *gs)
{
  PetscInt  i, j;
  PetscInt  p_mask_size;
  PetscInt *p_mask, *sh_proc_mask, *tmp_proc_mask;
  PetscInt *ngh_buf, *buf2;
  PetscInt  offset;
  PetscInt *msg_list, *msg_size, **msg_nodes, nprs;
  PetscInt *pairwise_elm_list, len_pair_list = 0;
  PetscInt *iptr, t1, i_start, nel, *elms;
  PetscInt  ct;

  PetscFunctionBegin;
  /* to make life easier */
  nel          = gs->nel;
  elms         = gs->elms;
  ngh_buf      = gs->ngh_buf;
  sh_proc_mask = gs->pw_nghs;

  /* need a few temp masks */
  p_mask_size   = PCTFS_len_bit_mask(PCTFS_num_nodes);
  p_mask        = (PetscInt *)malloc(p_mask_size);
  tmp_proc_mask = (PetscInt *)malloc(p_mask_size);

  /* set mask to my PCTFS_my_id's bit mask */
  p_mask_size /= sizeof(PetscInt);

  len_pair_list   = gs->len_pw_list;
  gs->pw_elm_list = pairwise_elm_list = (PetscInt *)malloc((len_pair_list + 1) * sizeof(PetscInt));

  /* how many processors (nghs) do we have to exchange with? */
  nprs = gs->num_pairs = PCTFS_ct_bits((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt));

  /* allocate space for PCTFS_gs_gop() info */
  gs->pair_list = msg_list = (PetscInt *)malloc(sizeof(PetscInt) * nprs);
  gs->msg_sizes = msg_size = (PetscInt *)malloc(sizeof(PetscInt) * nprs);
  gs->node_list = msg_nodes = (PetscInt **)malloc(sizeof(PetscInt *) * (nprs + 1));

  /* init msg_size list */
  PetscCall(PCTFS_ivec_zero(msg_size, nprs));

  /* expand from bit mask list to int list */
  PetscCall(PCTFS_bm_to_proc((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt), msg_list));

  /* keep list of elements being handled pairwise */
  /* clear the TOP_BIT flag set in get_ngh_buf() while collecting */
  for (i = j = 0; i < nel; i++) {
    if (elms[i] & TOP_BIT) {
      elms[i] ^= TOP_BIT;
      pairwise_elm_list[j++] = i;
    }
  }
  pairwise_elm_list[j] = -1;

  /* one extra request slot kept at MPI_REQUEST_NULL as a sentinel */
  gs->msg_ids_out       = (MPI_Request *)malloc(sizeof(MPI_Request) * (nprs + 1));
  gs->msg_ids_out[nprs] = MPI_REQUEST_NULL;
  gs->msg_ids_in        = (MPI_Request *)malloc(sizeof(MPI_Request) * (nprs + 1));
  gs->msg_ids_in[nprs]  = MPI_REQUEST_NULL;
  gs->pw_vals           = (PetscScalar *)malloc(sizeof(PetscScalar) * len_pair_list * vec_sz);

  /* find who goes to each processor */
  /* i_start doubles as a running max of per-neighbor message sizes */
  for (i_start = i = 0; i < nprs; i++) {
    /* processor i's mask */
    PetscCall(PCTFS_set_bit_mask(p_mask, p_mask_size * sizeof(PetscInt), msg_list[i]));

    /* det # going to processor i */
    for (ct = j = 0; j < len_pair_list; j++) {
      buf2 = ngh_buf + (pairwise_elm_list[j] * p_mask_size);
      PetscCall(PCTFS_ivec_and3(tmp_proc_mask, p_mask, buf2, p_mask_size));
      if (PCTFS_ct_bits((char *)tmp_proc_mask, p_mask_size * sizeof(PetscInt))) ct++;
    }
    msg_size[i] = ct;
    i_start     = PetscMax(i_start, ct);

    /*space to hold nodes in message to first neighbor */
    msg_nodes[i] = iptr = (PetscInt *)malloc(sizeof(PetscInt) * (ct + 1));

    /* second sweep records *which* pairwise slots neighbor i shares */
    for (j = 0; j < len_pair_list; j++) {
      buf2 = ngh_buf + (pairwise_elm_list[j] * p_mask_size);
      PetscCall(PCTFS_ivec_and3(tmp_proc_mask, p_mask, buf2, p_mask_size));
      if (PCTFS_ct_bits((char *)tmp_proc_mask, p_mask_size * sizeof(PetscInt))) *iptr++ = j;
    }
    *iptr = -1;
  }
  msg_nodes[nprs] = NULL;

  /* global statistics on per-neighbor message sizes: max, min, average */
  j = gs->loc_node_pairs = i_start;
  t1                     = GL_MAX;
  PetscCall(PCTFS_giop(&i_start, &offset, 1, &t1));
  gs->max_node_pairs = i_start;

  i_start = j;
  t1      = GL_MIN;
  PetscCall(PCTFS_giop(&i_start, &offset, 1, &t1));
  gs->min_node_pairs = i_start;

  i_start = j;
  t1      = GL_ADD;
  PetscCall(PCTFS_giop(&i_start, &offset, 1, &t1));
  gs->avg_node_pairs = i_start / PCTFS_num_nodes + 1;

  /* and the global max of neighbor counts */
  i_start = nprs;
  t1      = GL_MAX;
  PetscCall(PCTFS_giop(&i_start, &offset, 1, &t1));
  gs->max_pairs = i_start;

  /* remap pairwise in tail of gsi_via_bit_mask() */
  gs->msg_total = PCTFS_ivec_sum(gs->msg_sizes, nprs);
  gs->out       = (PetscScalar *)malloc(sizeof(PetscScalar) * gs->msg_total * vec_sz);
  gs->in        = (PetscScalar *)malloc(sizeof(PetscScalar) * gs->msg_total * vec_sz);

  /* reset malloc pool */
  free((void *)p_mask);
  free((void *)tmp_proc_mask);
  PetscFunctionReturn(PETSC_SUCCESS);
}
771 
772 /* to do pruned tree just save ngh buf copy for each one and decode here!
773 ******************************************************************************/
/*
   Build the tree (crystal-accumulator) stage: take ownership of the
   file-scope tree_buf (it becomes gs->tree_elms), allocate the tree work
   buffers, and construct the index maps between our local element list and
   the tree element list (tree_map_in = local indices, tree_map_out = tree
   indices), searching whichever list is longer with binary search on the
   shorter one.  Both maps end with a -1 sentinel.
*/
static PetscErrorCode set_tree(PCTFS_gs_id *gs)
{
  PetscInt  i, j, n, nel;
  PetscInt *iptr_in, *iptr_out, *tree_elms, *elms;

  PetscFunctionBegin;
  /* local work ptrs */
  elms = gs->elms;
  nel  = gs->nel;

  /* how many via tree */
  /* note: iptr_in's alias of tree_buf here is immediately overwritten below */
  gs->tree_nel = n = ntree;
  gs->tree_elms = tree_elms = iptr_in = tree_buf;
  gs->tree_buf                        = (PetscScalar *)malloc(sizeof(PetscScalar) * n * vec_sz);
  gs->tree_work                       = (PetscScalar *)malloc(sizeof(PetscScalar) * n * vec_sz);
  j                                   = gs->tree_map_sz;
  gs->tree_map_in = iptr_in = (PetscInt *)malloc(sizeof(PetscInt) * (j + 1));
  gs->tree_map_out = iptr_out = (PetscInt *)malloc(sizeof(PetscInt) * (j + 1));

  /* search the longer of the two lists */
  /* note ... could save this info in get_ngh_buf and save searches */
  if (n <= nel) {
    /* bijective fct w/remap - search elm list */
    /* for each tree element i, find its local index j (if we own it) */
    for (i = 0; i < n; i++) {
      if ((j = PCTFS_ivec_binary_search(*tree_elms++, elms, nel)) >= 0) {
        *iptr_in++  = j;
        *iptr_out++ = i;
      }
    }
  } else {
    /* for each local element i, find its tree index j (if it's in the tree) */
    for (i = 0; i < nel; i++) {
      if ((j = PCTFS_ivec_binary_search(*elms++, tree_elms, n)) >= 0) {
        *iptr_in++  = i;
        *iptr_out++ = j;
      }
    }
  }

  /* sentinel */
  *iptr_in = *iptr_out = -1;
  PetscFunctionReturn(PETSC_SUCCESS);
}
816 
817 /******************************************************************************/
PCTFS_gs_gop_local_out(PCTFS_gs_id * gs,PetscScalar * vals)818 static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs, PetscScalar *vals)
819 {
820   PetscInt   *num, *map, **reduce;
821   PetscScalar tmp;
822 
823   PetscFunctionBegin;
824   num    = gs->num_gop_local_reduce;
825   reduce = gs->gop_local_reduce;
826   while ((map = *reduce++)) {
827     /* wall */
828     if (*num == 2) {
829       num++;
830       vals[map[1]] = vals[map[0]];
831     } else if (*num == 3) { /* corner shared by three elements */
832       num++;
833       vals[map[2]] = vals[map[1]] = vals[map[0]];
834     } else if (*num == 4) { /* corner shared by four elements */
835       num++;
836       vals[map[3]] = vals[map[2]] = vals[map[1]] = vals[map[0]];
837     } else { /* general case ... odd geoms ... 3D*/
838       num++;
839       tmp = *(vals + *map++);
840       while (*map >= 0) *(vals + *map++) = tmp;
841     }
842   }
843   PetscFunctionReturn(PETSC_SUCCESS);
844 }
845 
846 /******************************************************************************/
/* Scalar local "+" reduction: for each on-process share group, sum the
   values of all members and write the sum back to every member.
   gs->num_local_reduce[k] holds group k's size; gs->local_reduce[k] its
   index list into vals (negative-terminated in the general case).
   Groups of size 2/3/4 (walls/corners) are special-cased. */
static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs, PetscScalar *vals)
{
  PetscInt   *num, *map, **reduce;
  PetscScalar tmp;

  PetscFunctionBegin;
  num    = gs->num_local_reduce;
  reduce = gs->local_reduce;
  while ((map = *reduce)) { /* NULL map terminates the group list */
    /* wall */
    if (*num == 2) {
      num++;
      reduce++;
      vals[map[1]] = vals[map[0]] += vals[map[1]];
    } else if (*num == 3) { /* corner shared by three elements */
      num++;
      reduce++;
      vals[map[2]] = vals[map[1]] = vals[map[0]] += (vals[map[1]] + vals[map[2]]);
    } else if (*num == 4) { /* corner shared by four elements */
      num++;
      reduce++;
      vals[map[1]] = vals[map[2]] = vals[map[3]] = vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]);
    } else { /* general case ... odd geoms ... 3D*/
      num++;
      tmp = 0.0;
      /* first pass: accumulate the whole group into tmp */
      while (*map >= 0) tmp += *(vals + *map++);

      map = *reduce++; /* rewind to the group's start; advance to next group */
      /* second pass: broadcast the sum to every member */
      while (*map >= 0) *(vals + *map++) = tmp;
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
880 
881 /******************************************************************************/
PCTFS_gs_gop_local_in_plus(PCTFS_gs_id * gs,PetscScalar * vals)882 static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals)
883 {
884   PetscInt    *num, *map, **reduce;
885   PetscScalar *base;
886 
887   PetscFunctionBegin;
888   num    = gs->num_gop_local_reduce;
889   reduce = gs->gop_local_reduce;
890   while ((map = *reduce++)) {
891     /* wall */
892     if (*num == 2) {
893       num++;
894       vals[map[0]] += vals[map[1]];
895     } else if (*num == 3) { /* corner shared by three elements */
896       num++;
897       vals[map[0]] += (vals[map[1]] + vals[map[2]]);
898     } else if (*num == 4) { /* corner shared by four elements */
899       num++;
900       vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]);
901     } else { /* general case ... odd geoms ... 3D*/
902       num++;
903       base = vals + *map++;
904       while (*map >= 0) *base += *(vals + *map++);
905     }
906   }
907   PetscFunctionReturn(PETSC_SUCCESS);
908 }
909 
910 /******************************************************************************/
PCTFS_gs_free(PCTFS_gs_id * gs)911 PetscErrorCode PCTFS_gs_free(PCTFS_gs_id *gs)
912 {
913   PetscInt i;
914 
915   PetscFunctionBegin;
916   PetscCallMPI(MPI_Comm_free(&gs->PCTFS_gs_comm));
917   if (gs->nghs) free((void *)gs->nghs);
918   if (gs->pw_nghs) free((void *)gs->pw_nghs);
919 
920   /* tree */
921   if (gs->max_left_over) {
922     if (gs->tree_elms) free((void *)gs->tree_elms);
923     if (gs->tree_buf) free((void *)gs->tree_buf);
924     if (gs->tree_work) free((void *)gs->tree_work);
925     if (gs->tree_map_in) free((void *)gs->tree_map_in);
926     if (gs->tree_map_out) free((void *)gs->tree_map_out);
927   }
928 
929   /* pairwise info */
930   if (gs->num_pairs) {
931     /* should be NULL already */
932     if (gs->ngh_buf) free((void *)gs->ngh_buf);
933     if (gs->elms) free((void *)gs->elms);
934     if (gs->local_elms) free((void *)gs->local_elms);
935     if (gs->companion) free((void *)gs->companion);
936 
937     /* only set if pairwise */
938     if (gs->vals) free((void *)gs->vals);
939     if (gs->in) free((void *)gs->in);
940     if (gs->out) free((void *)gs->out);
941     if (gs->msg_ids_in) free((void *)gs->msg_ids_in);
942     if (gs->msg_ids_out) free((void *)gs->msg_ids_out);
943     if (gs->pw_vals) free((void *)gs->pw_vals);
944     if (gs->pw_elm_list) free((void *)gs->pw_elm_list);
945     if (gs->node_list) {
946       for (i = 0; i < gs->num_pairs; i++) {
947         if (gs->node_list[i]) free((void *)gs->node_list[i]);
948       }
949       free((void *)gs->node_list);
950     }
951     if (gs->msg_sizes) free((void *)gs->msg_sizes);
952     if (gs->pair_list) free((void *)gs->pair_list);
953   }
954 
955   /* local info */
956   if (gs->num_local_total >= 0) {
957     for (i = 0; i < gs->num_local_total + 1; i++) {
958       if (gs->num_gop_local_reduce[i]) free((void *)gs->gop_local_reduce[i]);
959     }
960   }
961 
962   /* if intersection tree/pairwise and local isn't empty */
963   if (gs->gop_local_reduce) free((void *)gs->gop_local_reduce);
964   if (gs->num_gop_local_reduce) free((void *)gs->num_gop_local_reduce);
965 
966   free((void *)gs);
967   PetscFunctionReturn(PETSC_SUCCESS);
968 }
969 
970 /******************************************************************************/
PCTFS_gs_gop_vec(PCTFS_gs_id * gs,PetscScalar * vals,const char * op,PetscInt step)971 PetscErrorCode PCTFS_gs_gop_vec(PCTFS_gs_id *gs, PetscScalar *vals, const char *op, PetscInt step)
972 {
973   PetscFunctionBegin;
974   switch (*op) {
975   case '+':
976     PetscCall(PCTFS_gs_gop_vec_plus(gs, vals, step));
977     break;
978   default:
979     PetscCall(PetscInfo(0, "PCTFS_gs_gop_vec() :: %c is not a valid op\n", op[0]));
980     PetscCall(PetscInfo(0, "PCTFS_gs_gop_vec() :: default :: plus\n"));
981     PetscCall(PCTFS_gs_gop_vec_plus(gs, vals, step));
982     break;
983   }
984   PetscFunctionReturn(PETSC_SUCCESS);
985 }
986 
987 /******************************************************************************/
PCTFS_gs_gop_vec_plus(PCTFS_gs_id * gs,PetscScalar * vals,PetscInt step)988 static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
989 {
990   PetscFunctionBegin;
991   PetscCheck(gs, PETSC_COMM_SELF, PETSC_ERR_PLIB, "PCTFS_gs_gop_vec() passed NULL gs handle!!!");
992 
993   /* local only operations!!! */
994   if (gs->num_local) PetscCall(PCTFS_gs_gop_vec_local_plus(gs, vals, step));
995 
996   /* if intersection tree/pairwise and local isn't empty */
997   if (gs->num_local_gop) {
998     PetscCall(PCTFS_gs_gop_vec_local_in_plus(gs, vals, step));
999 
1000     /* pairwise */
1001     if (gs->num_pairs) PetscCall(PCTFS_gs_gop_vec_pairwise_plus(gs, vals, step));
1002 
1003     /* tree */
1004     else if (gs->max_left_over) PetscCall(PCTFS_gs_gop_vec_tree_plus(gs, vals, step));
1005 
1006     PetscCall(PCTFS_gs_gop_vec_local_out(gs, vals, step));
1007   } else { /* if intersection tree/pairwise and local is empty */
1008     /* pairwise */
1009     if (gs->num_pairs) PetscCall(PCTFS_gs_gop_vec_pairwise_plus(gs, vals, step));
1010 
1011     /* tree */
1012     else if (gs->max_left_over) PetscCall(PCTFS_gs_gop_vec_tree_plus(gs, vals, step));
1013   }
1014   PetscFunctionReturn(PETSC_SUCCESS);
1015 }
1016 
1017 /******************************************************************************/
/* Vector form of the local "+" reduction: every element owns a contiguous
   block of `step` scalars in vals.  For each local share group, sum all
   member blocks into the first member's block, then copy that sum back to
   every other member.  gs->num_local_reduce[k] is group k's size;
   gs->local_reduce[k] its index list (the general case relies on a
   negative sentinel terminating the list). */
static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
{
  PetscInt    *num, *map, **reduce;
  PetscScalar *base;

  PetscFunctionBegin;
  num    = gs->num_local_reduce;
  reduce = gs->local_reduce;
  while ((map = *reduce)) { /* NULL map terminates the group list */
    base = vals + map[0] * step; /* accumulation target: first member's block */

    /* wall */
    if (*num == 2) {
      num++;
      reduce++;
      PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step));
      PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step));
    } else if (*num == 3) { /* corner shared by three elements */
      num++;
      reduce++;
      PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step));
      PetscCall(PCTFS_rvec_add(base, vals + map[2] * step, step));
      PetscCall(PCTFS_rvec_copy(vals + map[2] * step, base, step));
      PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step));
    } else if (*num == 4) { /* corner shared by four elements */
      num++;
      reduce++;
      PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step));
      PetscCall(PCTFS_rvec_add(base, vals + map[2] * step, step));
      PetscCall(PCTFS_rvec_add(base, vals + map[3] * step, step));
      PetscCall(PCTFS_rvec_copy(vals + map[3] * step, base, step));
      PetscCall(PCTFS_rvec_copy(vals + map[2] * step, base, step));
      PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step));
    } else { /* general case ... odd geoms ... 3D */
      num++;
      /* accumulate blocks map[1], map[2], ... into base */
      while (*++map >= 0) PetscCall(PCTFS_rvec_add(base, vals + *map * step, step));

      map = *reduce; /* rewind to the start of this group's list */
      /* broadcast the summed block back to every other member */
      while (*++map >= 0) PetscCall(PCTFS_rvec_copy(vals + *map * step, base, step));

      reduce++;
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1063 
1064 /******************************************************************************/
PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id * gs,PetscScalar * vals,PetscInt step)1065 static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
1066 {
1067   PetscInt    *num, *map, **reduce;
1068   PetscScalar *base;
1069 
1070   PetscFunctionBegin;
1071   num    = gs->num_gop_local_reduce;
1072   reduce = gs->gop_local_reduce;
1073   while ((map = *reduce++)) {
1074     base = vals + map[0] * step;
1075 
1076     /* wall */
1077     if (*num == 2) {
1078       num++;
1079       PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step));
1080     } else if (*num == 3) { /* corner shared by three elements */
1081       num++;
1082       PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step));
1083       PetscCall(PCTFS_rvec_add(base, vals + map[2] * step, step));
1084     } else if (*num == 4) { /* corner shared by four elements */
1085       num++;
1086       PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step));
1087       PetscCall(PCTFS_rvec_add(base, vals + map[2] * step, step));
1088       PetscCall(PCTFS_rvec_add(base, vals + map[3] * step, step));
1089     } else { /* general case ... odd geoms ... 3D*/
1090       num++;
1091       while (*++map >= 0) PetscCall(PCTFS_rvec_add(base, vals + *map * step, step));
1092     }
1093   }
1094   PetscFunctionReturn(PETSC_SUCCESS);
1095 }
1096 
1097 /******************************************************************************/
PCTFS_gs_gop_vec_local_out(PCTFS_gs_id * gs,PetscScalar * vals,PetscInt step)1098 static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
1099 {
1100   PetscInt    *num, *map, **reduce;
1101   PetscScalar *base;
1102 
1103   PetscFunctionBegin;
1104   num    = gs->num_gop_local_reduce;
1105   reduce = gs->gop_local_reduce;
1106   while ((map = *reduce++)) {
1107     base = vals + map[0] * step;
1108 
1109     /* wall */
1110     if (*num == 2) {
1111       num++;
1112       PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step));
1113     } else if (*num == 3) { /* corner shared by three elements */
1114       num++;
1115       PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step));
1116       PetscCall(PCTFS_rvec_copy(vals + map[2] * step, base, step));
1117     } else if (*num == 4) { /* corner shared by four elements */
1118       num++;
1119       PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step));
1120       PetscCall(PCTFS_rvec_copy(vals + map[2] * step, base, step));
1121       PetscCall(PCTFS_rvec_copy(vals + map[3] * step, base, step));
1122     } else { /* general case ... odd geoms ... 3D*/
1123       num++;
1124       while (*++map >= 0) PetscCall(PCTFS_rvec_copy(vals + *map * step, base, step));
1125     }
1126   }
1127   PetscFunctionReturn(PETSC_SUCCESS);
1128 }
1129 
1130 /******************************************************************************/
PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id * gs,PetscScalar * in_vals,PetscInt step)1131 static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step)
1132 {
1133   PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2;
1134   PetscInt    *iptr, *msg_list, *msg_size, **msg_nodes;
1135   PetscInt    *pw, *list, *size, **nodes;
1136   MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
1137   MPI_Status   status;
1138   PetscBLASInt i1 = 1, dstep;
1139 
1140   PetscFunctionBegin;
1141   /* strip and load s */
1142   msg_list = list = gs->pair_list;
1143   msg_size = size = gs->msg_sizes;
1144   msg_nodes = nodes = gs->node_list;
1145   iptr = pw = gs->pw_elm_list;
1146   dptr1 = dptr3 = gs->pw_vals;
1147   msg_ids_in = ids_in = gs->msg_ids_in;
1148   msg_ids_out = ids_out = gs->msg_ids_out;
1149   dptr2                 = gs->out;
1150   in1 = in2 = gs->in;
1151 
1152   /* post the receives */
1153   /*  msg_nodes=nodes; */
1154   do {
1155     /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
1156         second one *list and do list++ afterwards */
1157     PetscCallMPI(MPIU_Irecv(in1, *size * step, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in));
1158     list++;
1159     msg_ids_in++;
1160     in1 += *size++ * step;
1161   } while (*++msg_nodes);
1162   msg_nodes = nodes;
1163 
1164   /* load gs values into in out gs buffers */
1165   while (*iptr >= 0) {
1166     PetscCall(PCTFS_rvec_copy(dptr3, in_vals + *iptr * step, step));
1167     dptr3 += step;
1168     iptr++;
1169   }
1170 
1171   /* load out buffers and post the sends */
1172   while ((iptr = *msg_nodes++)) {
1173     dptr3 = dptr2;
1174     while (*iptr >= 0) {
1175       PetscCall(PCTFS_rvec_copy(dptr2, dptr1 + *iptr * step, step));
1176       dptr2 += step;
1177       iptr++;
1178     }
1179     PetscCallMPI(MPIU_Isend(dptr3, *msg_size * step, MPIU_SCALAR, *msg_list, MSGTAG1 + PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out));
1180     msg_size++;
1181     msg_list++;
1182     msg_ids_out++;
1183   }
1184 
1185   /* tree */
1186   if (gs->max_left_over) PetscCall(PCTFS_gs_gop_vec_tree_plus(gs, in_vals, step));
1187 
1188   /* process the received data */
1189   msg_nodes = nodes;
1190   while ((iptr = *nodes++)) {
1191     PetscScalar d1 = 1.0;
1192 
1193     /* Should I check the return value of MPI_Wait() or status? */
1194     /* Can this loop be replaced by a call to MPI_Waitall()? */
1195     PetscCallMPI(MPI_Wait(ids_in, &status));
1196     ids_in++;
1197     while (*iptr >= 0) {
1198       PetscCall(PetscBLASIntCast(step, &dstep));
1199       PetscCallBLAS("BLASaxpy", BLASaxpy_(&dstep, &d1, in2, &i1, dptr1 + *iptr * step, &i1));
1200       in2 += step;
1201       iptr++;
1202     }
1203   }
1204 
1205   /* replace vals */
1206   while (*pw >= 0) {
1207     PetscCall(PCTFS_rvec_copy(in_vals + *pw * step, dptr1, step));
1208     dptr1 += step;
1209     pw++;
1210   }
1211 
1212   /* clear isend message handles */
1213   /* This changed for clarity though it could be the same */
1214 
1215   /* Should I check the return value of MPI_Wait() or status? */
1216   /* Can this loop be replaced by a call to MPI_Waitall()? */
1217   while (*msg_nodes++) {
1218     PetscCallMPI(MPI_Wait(ids_out, &status));
1219     ids_out++;
1220   }
1221   PetscFunctionReturn(PETSC_SUCCESS);
1222 }
1223 
1224 /******************************************************************************/
PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id * gs,PetscScalar * vals,PetscInt step)1225 static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
1226 {
1227   PetscInt     size, *in, *out;
1228   PetscScalar *buf, *work;
1229   PetscInt     op[] = {GL_ADD, 0};
1230   PetscBLASInt i1   = 1;
1231   PetscBLASInt dstep;
1232 
1233   PetscFunctionBegin;
1234   /* copy over to local variables */
1235   in   = gs->tree_map_in;
1236   out  = gs->tree_map_out;
1237   buf  = gs->tree_buf;
1238   work = gs->tree_work;
1239   size = gs->tree_nel * step;
1240 
1241   /* zero out collection buffer */
1242   PetscCall(PCTFS_rvec_zero(buf, size));
1243 
1244   /* copy over my contributions */
1245   while (*in >= 0) {
1246     PetscCall(PetscBLASIntCast(step, &dstep));
1247     PetscCallBLAS("BLAScopy", BLAScopy_(&dstep, vals + *in++ * step, &i1, buf + *out++ * step, &i1));
1248   }
1249 
1250   /* perform fan in/out on full buffer */
1251   /* must change PCTFS_grop to handle the blas */
1252   PetscCall(PCTFS_grop(buf, work, size, op));
1253 
1254   /* reset */
1255   in  = gs->tree_map_in;
1256   out = gs->tree_map_out;
1257 
1258   /* get the portion of the results I need */
1259   while (*in >= 0) {
1260     PetscCall(PetscBLASIntCast(step, &dstep));
1261     PetscCallBLAS("BLAScopy", BLAScopy_(&dstep, buf + *out++ * step, &i1, vals + *in++ * step, &i1));
1262   }
1263   PetscFunctionReturn(PETSC_SUCCESS);
1264 }
1265 
1266 /******************************************************************************/
PCTFS_gs_gop_hc(PCTFS_gs_id * gs,PetscScalar * vals,const char * op,PetscInt dim)1267 PetscErrorCode PCTFS_gs_gop_hc(PCTFS_gs_id *gs, PetscScalar *vals, const char *op, PetscInt dim)
1268 {
1269   PetscFunctionBegin;
1270   switch (*op) {
1271   case '+':
1272     PetscCall(PCTFS_gs_gop_plus_hc(gs, vals, dim));
1273     break;
1274   default:
1275     PetscCall(PetscInfo(0, "PCTFS_gs_gop_hc() :: %c is not a valid op\n", op[0]));
1276     PetscCall(PetscInfo(0, "PCTFS_gs_gop_hc() :: default :: plus\n"));
1277     PetscCall(PCTFS_gs_gop_plus_hc(gs, vals, dim));
1278     break;
1279   }
1280   PetscFunctionReturn(PETSC_SUCCESS);
1281 }
1282 
1283 /******************************************************************************/
PCTFS_gs_gop_plus_hc(PCTFS_gs_id * gs,PetscScalar * vals,PetscInt dim)1284 static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim)
1285 {
1286   PetscFunctionBegin;
1287   /* if there's nothing to do return */
1288   if (dim <= 0) PetscFunctionReturn(PETSC_SUCCESS);
1289 
1290   /* can't do more dimensions then exist */
1291   dim = PetscMin(dim, PCTFS_i_log2_num_nodes);
1292 
1293   /* local only operations!!! */
1294   if (gs->num_local) PetscCall(PCTFS_gs_gop_local_plus(gs, vals));
1295 
1296   /* if intersection tree/pairwise and local isn't empty */
1297   if (gs->num_local_gop) {
1298     PetscCall(PCTFS_gs_gop_local_in_plus(gs, vals));
1299 
1300     /* pairwise will do tree inside ... */
1301     if (gs->num_pairs) PetscCall(PCTFS_gs_gop_pairwise_plus_hc(gs, vals, dim)); /* tree only */
1302     else if (gs->max_left_over) PetscCall(PCTFS_gs_gop_tree_plus_hc(gs, vals, dim));
1303 
1304     PetscCall(PCTFS_gs_gop_local_out(gs, vals));
1305   } else { /* if intersection tree/pairwise and local is empty */
1306     /* pairwise will do tree inside */
1307     if (gs->num_pairs) PetscCall(PCTFS_gs_gop_pairwise_plus_hc(gs, vals, dim)); /* tree */
1308     else if (gs->max_left_over) PetscCall(PCTFS_gs_gop_tree_plus_hc(gs, vals, dim));
1309   }
1310   PetscFunctionReturn(PETSC_SUCCESS);
1311 }
1312 
1313 /******************************************************************************/
/* Hypercube-restricted pairwise stage of the scalar "+" gather-scatter.
   mask gets the low `dim` bits set (mask = 2^dim - 1), so the test
   (PCTFS_my_id | mask) == (*list | mask) selects exactly the partners that
   agree with this rank on all bits above the first dim -- i.e. partners in
   the same sub-cube.  Messages to/from other partners are skipped, but the
   list/size/request cursors are still advanced in lockstep.
   NOTE(review): assumes dim >= 1 so mask covers at least one bit; the
   caller guards dim <= 0. */
static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim)
{
  PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2;
  PetscInt    *iptr, *msg_list, *msg_size, **msg_nodes;
  PetscInt    *pw, *list, *size, **nodes;
  MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
  MPI_Status   status;
  PetscInt     i, mask = 1;

  PetscFunctionBegin;
  /* build mask = 2^dim - 1 (low dim bits set) */
  for (i = 1; i < dim; i++) {
    mask <<= 1;
    mask++;
  }

  /* strip and load s */
  msg_list = list = gs->pair_list;
  msg_size = size = gs->msg_sizes;
  msg_nodes = nodes = gs->node_list;
  iptr = pw = gs->pw_elm_list;
  dptr1 = dptr3 = gs->pw_vals;
  msg_ids_in = ids_in = gs->msg_ids_in;
  msg_ids_out = ids_out = gs->msg_ids_out;
  dptr2                 = gs->out;
  in1 = in2 = gs->in;

  /* post the receives */
  /*  msg_nodes=nodes; */
  do {
    /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
        second one *list and do list++ afterwards */
    if ((PCTFS_my_id | mask) == (*list | mask)) { /* partner is in my sub-cube */
      PetscCallMPI(MPIU_Irecv(in1, *size, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in));
      list++;
      msg_ids_in++;
      in1 += *size++;
    } else { /* outside the sub-cube: skip, but keep cursors in step */
      list++;
      size++;
    }
  } while (*++msg_nodes);

  /* load gs values into in out gs buffers */
  while (*iptr >= 0) *dptr3++ = *(in_vals + *iptr++);

  /* load out buffers and post the sends */
  msg_nodes = nodes;
  list      = msg_list;
  while ((iptr = *msg_nodes++)) {
    if ((PCTFS_my_id | mask) == (*list | mask)) { /* partner is in my sub-cube */
      dptr3 = dptr2; /* remember the start of this partner's packed region */
      while (*iptr >= 0) *dptr2++ = *(dptr1 + *iptr++);
      /* CHECK PERSISTENT COMMS MODE FOR ALL THIS STUFF */
      /* is msg_ids_out++ correct? */
      PetscCallMPI(MPIU_Isend(dptr3, *msg_size, MPIU_SCALAR, *list, MSGTAG1 + PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out));
      msg_size++;
      list++;
      msg_ids_out++;
    } else { /* outside the sub-cube: skip, but keep cursors in step */
      list++;
      msg_size++;
    }
  }

  /* do the tree while we're waiting */
  if (gs->max_left_over) PetscCall(PCTFS_gs_gop_tree_plus_hc(gs, in_vals, dim));

  /* process the received data */
  msg_nodes = nodes;
  list      = msg_list;
  while ((iptr = *nodes++)) {
    if ((PCTFS_my_id | mask) == (*list | mask)) {
      /* Should I check the return value of MPI_Wait() or status? */
      /* Can this loop be replaced by a call to MPI_Waitall()? */
      PetscCallMPI(MPI_Wait(ids_in, &status));
      ids_in++;
      /* accumulate the received buffer into the pairwise values */
      while (*iptr >= 0) *(dptr1 + *iptr++) += *in2++;
    }
    list++;
  }

  /* replace vals */
  while (*pw >= 0) *(in_vals + *pw++) = *dptr1++;

  /* clear isend message handles */
  /* This changed for clarity though it could be the same */
  while (*msg_nodes++) {
    if ((PCTFS_my_id | mask) == (*msg_list | mask)) {
      /* Should I check the return value of MPI_Wait() or status? */
      /* Can this loop be replaced by a call to MPI_Waitall()? */
      PetscCallMPI(MPI_Wait(ids_out, &status));
      ids_out++;
    }
    msg_list++;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1411 
1412 /******************************************************************************/
PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id * gs,PetscScalar * vals,PetscInt dim)1413 static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim)
1414 {
1415   PetscInt     size;
1416   PetscInt    *in, *out;
1417   PetscScalar *buf, *work;
1418   PetscInt     op[] = {GL_ADD, 0};
1419 
1420   PetscFunctionBegin;
1421   in   = gs->tree_map_in;
1422   out  = gs->tree_map_out;
1423   buf  = gs->tree_buf;
1424   work = gs->tree_work;
1425   size = gs->tree_nel;
1426 
1427   PetscCall(PCTFS_rvec_zero(buf, size));
1428 
1429   while (*in >= 0) *(buf + *out++) = *(vals + *in++);
1430 
1431   in  = gs->tree_map_in;
1432   out = gs->tree_map_out;
1433 
1434   PetscCall(PCTFS_grop_hc(buf, work, size, op, dim));
1435 
1436   while (*in >= 0) *(vals + *in++) = *(buf + *out++);
1437   PetscFunctionReturn(PETSC_SUCCESS);
1438 }
1439