1 /*
2
3 Author: Henry M. Tufo III
4
5 e-mail: hmt@cs.brown.edu
6
7 snail-mail:
8 Division of Applied Mathematics
9 Brown University
10 Providence, RI 02912
11
12 Last Modification:
13 6.21.97
14
15 File Description:
16 -----------------
17
18 */
19
20 #include <../src/ksp/pc/impls/tfs/tfs.h>
21
22 /* default length of number of items via tree - doubles if exceeded */
23 #define TREE_BUF_SZ 2048
24 #define GS_VEC_SZ 1
25
26 /*
27 Type: struct gather_scatter_id
28 */
29 typedef struct gather_scatter_id {
30 PetscInt id;
31 PetscInt nel_min;
32 PetscInt nel_max;
33 PetscInt nel_sum;
34 PetscInt negl;
35 PetscInt gl_max;
36 PetscInt gl_min;
37 PetscInt repeats;
38 PetscInt ordered;
39 PetscInt positive;
40 PetscScalar *vals;
41
42 /* bit mask info */
43 PetscInt *my_proc_mask;
44 PetscInt mask_sz;
45 PetscInt *ngh_buf;
46 PetscInt ngh_buf_sz;
47 PetscInt *nghs;
48 PetscInt num_nghs;
49 PetscInt max_nghs;
50 PetscInt *pw_nghs;
51 PetscInt num_pw_nghs;
52 PetscInt *tree_nghs;
53 PetscInt num_tree_nghs;
54
55 PetscInt num_loads;
56
57 /* repeats == true -> local info */
58 PetscInt nel; /* number of unique elements */
59 PetscInt *elms; /* of size nel */
60 PetscInt nel_total;
61 PetscInt *local_elms; /* of size nel_total */
62 PetscInt *companion; /* of size nel_total */
63
64 /* local info */
65 PetscInt num_local_total;
66 PetscInt local_strength;
67 PetscInt num_local;
68 PetscInt *num_local_reduce;
69 PetscInt **local_reduce;
70 PetscInt num_local_gop;
71 PetscInt *num_gop_local_reduce;
72 PetscInt **gop_local_reduce;
73
74 /* pairwise info */
75 PetscInt level;
76 PetscInt num_pairs;
77 PetscInt max_pairs;
78 PetscInt loc_node_pairs;
79 PetscInt max_node_pairs;
80 PetscInt min_node_pairs;
81 PetscInt avg_node_pairs;
82 PetscInt *pair_list;
83 PetscInt *msg_sizes;
84 PetscInt **node_list;
85 PetscInt len_pw_list;
86 PetscInt *pw_elm_list;
87 PetscScalar *pw_vals;
88
89 MPI_Request *msg_ids_in;
90 MPI_Request *msg_ids_out;
91
92 PetscScalar *out;
93 PetscScalar *in;
94 PetscInt msg_total;
95
96 /* tree - crystal accumulator info */
97 PetscInt max_left_over;
98 PetscInt *pre;
99 PetscInt *in_num;
100 PetscInt *out_num;
101 PetscInt **in_list;
102 PetscInt **out_list;
103
104 /* new tree work*/
105 PetscInt tree_nel;
106 PetscInt *tree_elms;
107 PetscScalar *tree_buf;
108 PetscScalar *tree_work;
109
110 PetscInt tree_map_sz;
111 PetscInt *tree_map_in;
112 PetscInt *tree_map_out;
113
114 /* current memory status */
115 PetscInt gl_bss_min;
116 PetscInt gl_perm_min;
117
118 /* max segment size for PCTFS_gs_gop_vec() */
119 PetscInt vec_sz;
120
121 /* hack to make paul happy */
122 MPI_Comm PCTFS_gs_comm;
123
124 } PCTFS_gs_id;
125
126 static PCTFS_gs_id *gsi_check_args(PetscInt *elms, PetscInt nel, PetscInt level);
127 static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs);
128 static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs);
129 static PetscErrorCode set_pairwise(PCTFS_gs_id *gs);
130 static PCTFS_gs_id *gsi_new(void);
131 static PetscErrorCode set_tree(PCTFS_gs_id *gs);
132
133 /* same for all but vector flavor */
134 static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs, PetscScalar *vals);
135 /* vector flavor */
136 static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
137
138 static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step);
139 static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step);
140 static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
141 static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
142 static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
143
144 static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs, PetscScalar *vals);
145 static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals);
146
147 static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim);
148 static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim);
149 static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim);
150
151 /* global vars */
152 /* from comm.c module */
153
154 static PetscInt num_gs_ids = 0;
155
156 /* should make this dynamic ... later */
157 static PetscInt msg_buf = MAX_MSG_BUF;
158 static PetscInt vec_sz = GS_VEC_SZ;
159 static PetscInt *tree_buf = NULL;
160 static PetscInt tree_buf_sz = 0;
161 static PetscInt ntree = 0;
162
163 /***************************************************************************/
PCTFS_gs_init_vec_sz(PetscInt size)164 PetscErrorCode PCTFS_gs_init_vec_sz(PetscInt size)
165 {
166 PetscFunctionBegin;
167 vec_sz = size;
168 PetscFunctionReturn(PETSC_SUCCESS);
169 }
170
171 /******************************************************************************/
PCTFS_gs_init_msg_buf_sz(PetscInt buf_size)172 PetscErrorCode PCTFS_gs_init_msg_buf_sz(PetscInt buf_size)
173 {
174 PetscFunctionBegin;
175 msg_buf = buf_size;
176 PetscFunctionReturn(PETSC_SUCCESS);
177 }
178
179 /******************************************************************************/
PCTFS_gs_init(PetscInt * elms,PetscInt nel,PetscInt level)180 PCTFS_gs_id *PCTFS_gs_init(PetscInt *elms, PetscInt nel, PetscInt level)
181 {
182 PCTFS_gs_id *gs;
183 MPI_Group PCTFS_gs_group;
184 MPI_Comm PCTFS_gs_comm;
185
186 /* ensure that communication package has been initialized */
187 PetscCallAbort(PETSC_COMM_SELF, PCTFS_comm_init());
188
189 /* determines if we have enough dynamic/semi-static memory */
190 /* checks input, allocs and sets gd_id template */
191 gs = gsi_check_args(elms, nel, level);
192
193 /* only bit mask version up and working for the moment */
194 /* LATER :: get int list version working for sparse pblms */
195 PetscCallAbort(PETSC_COMM_WORLD, gsi_via_bit_mask(gs));
196
197 PetscCallAbort(PETSC_COMM_WORLD, MPI_Comm_group(MPI_COMM_WORLD, &PCTFS_gs_group) ? PETSC_ERR_MPI : PETSC_SUCCESS);
198 PetscCallAbort(PETSC_COMM_WORLD, MPI_Comm_create(MPI_COMM_WORLD, PCTFS_gs_group, &PCTFS_gs_comm) ? PETSC_ERR_MPI : PETSC_SUCCESS);
199 PetscCallAbort(PETSC_COMM_WORLD, MPI_Group_free(&PCTFS_gs_group) ? PETSC_ERR_MPI : PETSC_SUCCESS);
200
201 gs->PCTFS_gs_comm = PCTFS_gs_comm;
202
203 return gs;
204 }
205
206 /******************************************************************************/
gsi_new(void)207 static PCTFS_gs_id *gsi_new(void)
208 {
209 PCTFS_gs_id *gs;
210 gs = (PCTFS_gs_id *)malloc(sizeof(PCTFS_gs_id));
211 PetscCallAbort(PETSC_COMM_WORLD, PetscMemzero(gs, sizeof(PCTFS_gs_id)));
212 return gs;
213 }
214
215 /******************************************************************************/
gsi_check_args(PetscInt * in_elms,PetscInt nel,PetscInt level)216 static PCTFS_gs_id *gsi_check_args(PetscInt *in_elms, PetscInt nel, PetscInt level)
217 {
218 PetscInt i, j, k, t2;
219 PetscInt *companion, *elms, *unique, *iptr;
220 PetscInt num_local = 0, *num_to_reduce, **local_reduce;
221 PetscInt oprs[] = {NON_UNIFORM, GL_MIN, GL_MAX, GL_ADD, GL_MIN, GL_MAX, GL_MIN, GL_B_AND};
222 PetscInt vals[PETSC_STATIC_ARRAY_LENGTH(oprs) - 1];
223 PetscInt work[PETSC_STATIC_ARRAY_LENGTH(oprs) - 1];
224 PCTFS_gs_id *gs;
225
226 if (!in_elms) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "elms point to nothing!!!\n");
227 if (nel < 0) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "can't have fewer than 0 elms!!!\n");
228
229 if (nel == 0) PetscCallAbort(PETSC_COMM_WORLD, PetscInfo(0, "I don't have any elements!!!\n"));
230
231 /* get space for gs template */
232 gs = gsi_new();
233 gs->id = ++num_gs_ids;
234
235 /* hmt 6.4.99 */
236 /* caller can set global ids that don't participate to 0 */
237 /* PCTFS_gs_init ignores all zeros in elm list */
238 /* negative global ids are still invalid */
239 for (i = j = 0; i < nel; i++) {
240 if (in_elms[i] != 0) j++;
241 }
242
243 k = nel;
244 nel = j;
245
246 /* copy over in_elms list and create inverse map */
247 elms = (PetscInt *)malloc((nel + 1) * sizeof(PetscInt));
248 companion = (PetscInt *)malloc(nel * sizeof(PetscInt));
249
250 for (i = j = 0; i < k; i++) {
251 if (in_elms[i] != 0) {
252 elms[j] = in_elms[i];
253 companion[j++] = i;
254 }
255 }
256
257 if (j != nel) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "nel j mismatch!\n");
258
259 /* pre-pass ... check to see if sorted */
260 elms[nel] = INT_MAX;
261 iptr = elms;
262 unique = elms + 1;
263 j = 0;
264 while (*iptr != INT_MAX) {
265 if (*iptr++ > *unique++) {
266 j = 1;
267 break;
268 }
269 }
270
271 /* set up inverse map */
272 if (j) {
273 PetscCallAbort(PETSC_COMM_WORLD, PetscInfo(0, "gsi_check_args() :: elm list *not* sorted!\n"));
274 PetscCallAbort(PETSC_COMM_WORLD, PCTFS_SMI_sort((void *)elms, (void *)companion, nel, SORT_INTEGER));
275 } else PetscCallAbort(PETSC_COMM_WORLD, PetscInfo(0, "gsi_check_args() :: elm list sorted!\n"));
276 elms[nel] = INT_MIN;
277
278 /* first pass */
279 /* determine number of unique elements, check pd */
280 for (i = k = 0; i < nel; i += j) {
281 t2 = elms[i];
282 j = ++i;
283
284 /* clump 'em for now */
285 while (elms[j] == t2) j++;
286
287 /* how many together and num local */
288 if (j -= i) {
289 num_local++;
290 k += j;
291 }
292 }
293
294 /* how many unique elements? */
295 gs->repeats = k;
296 gs->nel = nel - k;
297
298 /* number of repeats? */
299 gs->num_local = num_local;
300 num_local += 2;
301 gs->local_reduce = local_reduce = (PetscInt **)malloc(num_local * sizeof(PetscInt *));
302 gs->num_local_reduce = num_to_reduce = (PetscInt *)malloc(num_local * sizeof(PetscInt));
303
304 unique = (PetscInt *)malloc((gs->nel + 1) * sizeof(PetscInt));
305 gs->elms = unique;
306 gs->nel_total = nel;
307 gs->local_elms = elms;
308 gs->companion = companion;
309
310 /* compess map as well as keep track of local ops */
311 for (num_local = i = j = 0; i < gs->nel; i++) {
312 k = j;
313 t2 = unique[i] = elms[j];
314 companion[i] = companion[j];
315
316 while (elms[j] == t2) j++;
317
318 if ((t2 = (j - k)) > 1) {
319 /* number together */
320 num_to_reduce[num_local] = t2++;
321
322 iptr = local_reduce[num_local++] = (PetscInt *)malloc(t2 * sizeof(PetscInt));
323
324 /* to use binary searching don't remap until we check intersection */
325 *iptr++ = i;
326
327 /* note that we're skipping the first one */
328 while (++k < j) *(iptr++) = companion[k];
329 *iptr = -1;
330 }
331 }
332
333 /* sentinel for ngh_buf */
334 unique[gs->nel] = INT_MAX;
335
336 /* for two partition sort hack */
337 num_to_reduce[num_local] = 0;
338 local_reduce[num_local] = NULL;
339 num_to_reduce[++num_local] = 0;
340 local_reduce[num_local] = NULL;
341
342 /* load 'em up */
343 /* note one extra to hold NON_UNIFORM flag!!! */
344 vals[2] = vals[1] = vals[0] = nel;
345 if (gs->nel > 0) {
346 vals[3] = unique[0];
347 vals[4] = unique[gs->nel - 1];
348 } else {
349 vals[3] = INT_MAX;
350 vals[4] = INT_MIN;
351 }
352 vals[5] = level;
353 vals[6] = num_gs_ids;
354
355 /* GLOBAL: send 'em out */
356 PetscCallAbort(PETSC_COMM_WORLD, PCTFS_giop(vals, work, PETSC_STATIC_ARRAY_LENGTH(oprs) - 1, oprs));
357
358 /* must be semi-pos def - only pairwise depends on this */
359 /* LATER - remove this restriction */
360 if (vals[3] < 0) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "gsi_check_args() :: system not semi-pos def \n");
361 if (vals[4] == INT_MAX) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "gsi_check_args() :: system ub too large !\n");
362
363 gs->nel_min = vals[0];
364 gs->nel_max = vals[1];
365 gs->nel_sum = vals[2];
366 gs->gl_min = vals[3];
367 gs->gl_max = vals[4];
368 gs->negl = vals[4] - vals[3] + 1;
369
370 if (gs->negl <= 0) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "gsi_check_args() :: system empty or neg :: %" PetscInt_FMT "\n", gs->negl);
371
372 /* LATER :: add level == -1 -> program selects level */
373 if (vals[5] < 0) vals[5] = 0;
374 else if (vals[5] > PCTFS_num_nodes) vals[5] = PCTFS_num_nodes;
375 gs->level = vals[5];
376
377 return gs;
378 }
379
380 /******************************************************************************/
gsi_via_bit_mask(PCTFS_gs_id * gs)381 static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs)
382 {
383 PetscInt i, nel, *elms;
384 PetscInt t1;
385 PetscInt **reduce;
386 PetscInt *map;
387
388 PetscFunctionBegin;
389 /* totally local removes ... PCTFS_ct_bits == 0 */
390 PetscCall(get_ngh_buf(gs));
391
392 if (gs->level) PetscCall(set_pairwise(gs));
393 if (gs->max_left_over) PetscCall(set_tree(gs));
394
395 /* intersection local and pairwise/tree? */
396 gs->num_local_total = gs->num_local;
397 gs->gop_local_reduce = gs->local_reduce;
398 gs->num_gop_local_reduce = gs->num_local_reduce;
399
400 map = gs->companion;
401
402 /* is there any local compression */
403 if (!gs->num_local) {
404 gs->local_strength = NONE;
405 gs->num_local_gop = 0;
406 } else {
407 /* ok find intersection */
408 map = gs->companion;
409 reduce = gs->local_reduce;
410 for (i = 0, t1 = 0; i < gs->num_local; i++, reduce++) {
411 if ((PCTFS_ivec_binary_search(**reduce, gs->pw_elm_list, gs->len_pw_list) >= 0) || PCTFS_ivec_binary_search(**reduce, gs->tree_map_in, gs->tree_map_sz) >= 0) {
412 t1++;
413 PetscCheck(gs->num_local_reduce[i] > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "nobody in list?");
414 gs->num_local_reduce[i] *= -1;
415 }
416 **reduce = map[**reduce];
417 }
418
419 /* intersection is empty */
420 if (!t1) {
421 gs->local_strength = FULL;
422 gs->num_local_gop = 0;
423 } else { /* intersection not empty */
424 gs->local_strength = PARTIAL;
425
426 PetscCall(PCTFS_SMI_sort((void *)gs->num_local_reduce, (void *)gs->local_reduce, gs->num_local + 1, SORT_INT_PTR));
427
428 gs->num_local_gop = t1;
429 gs->num_local_total = gs->num_local;
430 gs->num_local -= t1;
431 gs->gop_local_reduce = gs->local_reduce;
432 gs->num_gop_local_reduce = gs->num_local_reduce;
433
434 for (i = 0; i < t1; i++) {
435 PetscCheck(gs->num_gop_local_reduce[i] < 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "they aren't negative?");
436 gs->num_gop_local_reduce[i] *= -1;
437 gs->local_reduce++;
438 gs->num_local_reduce++;
439 }
440 gs->local_reduce++;
441 gs->num_local_reduce++;
442 }
443 }
444
445 elms = gs->pw_elm_list;
446 nel = gs->len_pw_list;
447 for (i = 0; i < nel; i++) elms[i] = map[elms[i]];
448
449 elms = gs->tree_map_in;
450 nel = gs->tree_map_sz;
451 for (i = 0; i < nel; i++) elms[i] = map[elms[i]];
452
453 /* clean up */
454 free((void *)gs->local_elms);
455 free((void *)gs->companion);
456 free((void *)gs->elms);
457 free((void *)gs->ngh_buf);
458 gs->local_elms = gs->companion = gs->elms = gs->ngh_buf = NULL;
459 PetscFunctionReturn(PETSC_SUCCESS);
460 }
461
462 /******************************************************************************/
place_in_tree(PetscInt elm)463 static PetscErrorCode place_in_tree(PetscInt elm)
464 {
465 PetscInt *tp, n;
466
467 PetscFunctionBegin;
468 if (ntree == tree_buf_sz) {
469 if (tree_buf_sz) {
470 tp = tree_buf;
471 n = tree_buf_sz;
472 tree_buf_sz <<= 1;
473 tree_buf = (PetscInt *)malloc(tree_buf_sz * sizeof(PetscInt));
474 PCTFS_ivec_copy(tree_buf, tp, n);
475 free(tp);
476 } else {
477 tree_buf_sz = TREE_BUF_SZ;
478 tree_buf = (PetscInt *)malloc(tree_buf_sz * sizeof(PetscInt));
479 }
480 }
481
482 tree_buf[ntree++] = elm;
483 PetscFunctionReturn(PETSC_SUCCESS);
484 }
485
486 /******************************************************************************/
get_ngh_buf(PCTFS_gs_id * gs)487 static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs)
488 {
489 PetscInt i, j, npw = 0, ntree_map = 0;
490 PetscInt p_mask_size, ngh_buf_size, buf_size;
491 PetscInt *p_mask, *sh_proc_mask, *pw_sh_proc_mask;
492 PetscInt *ngh_buf, *buf1, *buf2;
493 PetscInt offset, per_load, num_loads, or_ct, start, end;
494 PetscInt *ptr1, *ptr2, i_start, negl, nel, *elms;
495 PetscInt oper = GL_B_OR;
496 PetscInt *ptr3, *t_mask, level, ct1, ct2;
497
498 PetscFunctionBegin;
499 /* to make life easier */
500 nel = gs->nel;
501 elms = gs->elms;
502 level = gs->level;
503
504 /* det #bytes needed for processor bit masks and init w/mask cor. to PCTFS_my_id */
505 p_mask = (PetscInt *)malloc(p_mask_size = PCTFS_len_bit_mask(PCTFS_num_nodes));
506 PetscCall(PCTFS_set_bit_mask(p_mask, p_mask_size, PCTFS_my_id));
507
508 /* allocate space for masks and info bufs */
509 gs->nghs = sh_proc_mask = (PetscInt *)malloc(p_mask_size);
510 gs->pw_nghs = pw_sh_proc_mask = (PetscInt *)malloc(p_mask_size);
511 gs->ngh_buf_sz = ngh_buf_size = p_mask_size * nel;
512 t_mask = (PetscInt *)malloc(p_mask_size);
513 gs->ngh_buf = ngh_buf = (PetscInt *)malloc(ngh_buf_size);
514
515 /* comm buffer size ... memory usage bounded by ~2*msg_buf */
516 /* had thought I could exploit rendezvous threshold */
517
518 /* default is one pass */
519 per_load = negl = gs->negl;
520 gs->num_loads = num_loads = 1;
521 i = p_mask_size * negl;
522
523 /* possible overflow on buffer size */
524 /* overflow hack */
525 if (i < 0) i = INT_MAX;
526
527 buf_size = PetscMin(msg_buf, i);
528
529 /* can we do it? */
530 PetscCheck(p_mask_size <= buf_size, PETSC_COMM_SELF, PETSC_ERR_PLIB, "get_ngh_buf() :: buf<pms :: %" PetscInt_FMT ">%" PetscInt_FMT, p_mask_size, buf_size);
531
532 /* get PCTFS_giop buf space ... make *only* one malloc */
533 buf1 = (PetscInt *)malloc(buf_size << 1);
534
535 /* more than one gior exchange needed? */
536 if (buf_size != i) {
537 per_load = buf_size / p_mask_size;
538 buf_size = per_load * p_mask_size;
539 gs->num_loads = num_loads = negl / per_load + (negl % per_load > 0);
540 }
541
542 /* convert buf sizes from #bytes to #ints - 32-bit only! */
543 p_mask_size /= sizeof(PetscInt);
544 ngh_buf_size /= sizeof(PetscInt);
545 buf_size /= sizeof(PetscInt);
546
547 /* find PCTFS_giop work space */
548 buf2 = buf1 + buf_size;
549
550 /* hold #ints needed for processor masks */
551 gs->mask_sz = p_mask_size;
552
553 /* init buffers */
554 PetscCall(PCTFS_ivec_zero(sh_proc_mask, p_mask_size));
555 PetscCall(PCTFS_ivec_zero(pw_sh_proc_mask, p_mask_size));
556 PetscCall(PCTFS_ivec_zero(ngh_buf, ngh_buf_size));
557
558 /* HACK reset tree info */
559 tree_buf = NULL;
560 tree_buf_sz = ntree = 0;
561
562 /* ok do it */
563 for (ptr1 = ngh_buf, ptr2 = elms, end = gs->gl_min, or_ct = i = 0; or_ct < num_loads; or_ct++) {
564 /* identity for bitwise or is 000...000 */
565 PetscCall(PCTFS_ivec_zero(buf1, buf_size));
566
567 /* load msg buffer */
568 for (start = end, end += per_load, i_start = i; (offset = *ptr2) < end; i++, ptr2++) {
569 offset = (offset - start) * p_mask_size;
570 PCTFS_ivec_copy(buf1 + offset, p_mask, p_mask_size);
571 }
572
573 /* GLOBAL: pass buffer */
574 PetscCall(PCTFS_giop(buf1, buf2, buf_size, &oper));
575
576 /* unload buffer into ngh_buf */
577 ptr2 = (elms + i_start);
578 for (ptr3 = buf1, j = start; j < end; ptr3 += p_mask_size, j++) {
579 /* I own it ... may have to pairwise it */
580 if (j == *ptr2) {
581 /* do i share it w/anyone? */
582 ct1 = PCTFS_ct_bits((char *)ptr3, p_mask_size * sizeof(PetscInt));
583 /* guess not */
584 if (ct1 < 2) {
585 ptr2++;
586 ptr1 += p_mask_size;
587 continue;
588 }
589
590 /* i do ... so keep info and turn off my bit */
591 PCTFS_ivec_copy(ptr1, ptr3, p_mask_size);
592 PetscCall(PCTFS_ivec_xor(ptr1, p_mask, p_mask_size));
593 PetscCall(PCTFS_ivec_or(sh_proc_mask, ptr1, p_mask_size));
594
595 /* is it to be done pairwise? */
596 if (--ct1 <= level) {
597 npw++;
598
599 /* turn on high bit to indicate pw need to process */
600 *ptr2++ |= TOP_BIT;
601 PetscCall(PCTFS_ivec_or(pw_sh_proc_mask, ptr1, p_mask_size));
602 ptr1 += p_mask_size;
603 continue;
604 }
605
606 /* get set for next and note that I have a tree contribution */
607 /* could save exact elm index for tree here -> save a search */
608 ptr2++;
609 ptr1 += p_mask_size;
610 ntree_map++;
611 } else { /* i don't but still might be involved in tree */
612
613 /* shared by how many? */
614 ct1 = PCTFS_ct_bits((char *)ptr3, p_mask_size * sizeof(PetscInt));
615
616 /* none! */
617 if (ct1 < 2) continue;
618
619 /* is it going to be done pairwise? but not by me of course!*/
620 if (--ct1 <= level) continue;
621 }
622 /* LATER we're going to have to process it NOW */
623 /* nope ... tree it */
624 PetscCall(place_in_tree(j));
625 }
626 }
627
628 free((void *)t_mask);
629 free((void *)buf1);
630
631 gs->len_pw_list = npw;
632 gs->num_nghs = PCTFS_ct_bits((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt));
633
634 /* expand from bit mask list to int list and save ngh list */
635 gs->nghs = (PetscInt *)malloc(gs->num_nghs * sizeof(PetscInt));
636 PetscCall(PCTFS_bm_to_proc((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt), gs->nghs));
637
638 gs->num_pw_nghs = PCTFS_ct_bits((char *)pw_sh_proc_mask, p_mask_size * sizeof(PetscInt));
639
640 oper = GL_MAX;
641 ct1 = gs->num_nghs;
642 PetscCall(PCTFS_giop(&ct1, &ct2, 1, &oper));
643 gs->max_nghs = ct1;
644
645 gs->tree_map_sz = ntree_map;
646 gs->max_left_over = ntree;
647
648 free((void *)p_mask);
649 free((void *)sh_proc_mask);
650 PetscFunctionReturn(PETSC_SUCCESS);
651 }
652
653 /******************************************************************************/
set_pairwise(PCTFS_gs_id * gs)654 static PetscErrorCode set_pairwise(PCTFS_gs_id *gs)
655 {
656 PetscInt i, j;
657 PetscInt p_mask_size;
658 PetscInt *p_mask, *sh_proc_mask, *tmp_proc_mask;
659 PetscInt *ngh_buf, *buf2;
660 PetscInt offset;
661 PetscInt *msg_list, *msg_size, **msg_nodes, nprs;
662 PetscInt *pairwise_elm_list, len_pair_list = 0;
663 PetscInt *iptr, t1, i_start, nel, *elms;
664 PetscInt ct;
665
666 PetscFunctionBegin;
667 /* to make life easier */
668 nel = gs->nel;
669 elms = gs->elms;
670 ngh_buf = gs->ngh_buf;
671 sh_proc_mask = gs->pw_nghs;
672
673 /* need a few temp masks */
674 p_mask_size = PCTFS_len_bit_mask(PCTFS_num_nodes);
675 p_mask = (PetscInt *)malloc(p_mask_size);
676 tmp_proc_mask = (PetscInt *)malloc(p_mask_size);
677
678 /* set mask to my PCTFS_my_id's bit mask */
679 PetscCall(PCTFS_set_bit_mask(p_mask, p_mask_size, PCTFS_my_id));
680
681 p_mask_size /= sizeof(PetscInt);
682
683 len_pair_list = gs->len_pw_list;
684 gs->pw_elm_list = pairwise_elm_list = (PetscInt *)malloc((len_pair_list + 1) * sizeof(PetscInt));
685
686 /* how many processors (nghs) do we have to exchange with? */
687 nprs = gs->num_pairs = PCTFS_ct_bits((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt));
688
689 /* allocate space for PCTFS_gs_gop() info */
690 gs->pair_list = msg_list = (PetscInt *)malloc(sizeof(PetscInt) * nprs);
691 gs->msg_sizes = msg_size = (PetscInt *)malloc(sizeof(PetscInt) * nprs);
692 gs->node_list = msg_nodes = (PetscInt **)malloc(sizeof(PetscInt *) * (nprs + 1));
693
694 /* init msg_size list */
695 PetscCall(PCTFS_ivec_zero(msg_size, nprs));
696
697 /* expand from bit mask list to int list */
698 PetscCall(PCTFS_bm_to_proc((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt), msg_list));
699
700 /* keep list of elements being handled pairwise */
701 for (i = j = 0; i < nel; i++) {
702 if (elms[i] & TOP_BIT) {
703 elms[i] ^= TOP_BIT;
704 pairwise_elm_list[j++] = i;
705 }
706 }
707 pairwise_elm_list[j] = -1;
708
709 gs->msg_ids_out = (MPI_Request *)malloc(sizeof(MPI_Request) * (nprs + 1));
710 gs->msg_ids_out[nprs] = MPI_REQUEST_NULL;
711 gs->msg_ids_in = (MPI_Request *)malloc(sizeof(MPI_Request) * (nprs + 1));
712 gs->msg_ids_in[nprs] = MPI_REQUEST_NULL;
713 gs->pw_vals = (PetscScalar *)malloc(sizeof(PetscScalar) * len_pair_list * vec_sz);
714
715 /* find who goes to each processor */
716 for (i_start = i = 0; i < nprs; i++) {
717 /* processor i's mask */
718 PetscCall(PCTFS_set_bit_mask(p_mask, p_mask_size * sizeof(PetscInt), msg_list[i]));
719
720 /* det # going to processor i */
721 for (ct = j = 0; j < len_pair_list; j++) {
722 buf2 = ngh_buf + (pairwise_elm_list[j] * p_mask_size);
723 PetscCall(PCTFS_ivec_and3(tmp_proc_mask, p_mask, buf2, p_mask_size));
724 if (PCTFS_ct_bits((char *)tmp_proc_mask, p_mask_size * sizeof(PetscInt))) ct++;
725 }
726 msg_size[i] = ct;
727 i_start = PetscMax(i_start, ct);
728
729 /*space to hold nodes in message to first neighbor */
730 msg_nodes[i] = iptr = (PetscInt *)malloc(sizeof(PetscInt) * (ct + 1));
731
732 for (j = 0; j < len_pair_list; j++) {
733 buf2 = ngh_buf + (pairwise_elm_list[j] * p_mask_size);
734 PetscCall(PCTFS_ivec_and3(tmp_proc_mask, p_mask, buf2, p_mask_size));
735 if (PCTFS_ct_bits((char *)tmp_proc_mask, p_mask_size * sizeof(PetscInt))) *iptr++ = j;
736 }
737 *iptr = -1;
738 }
739 msg_nodes[nprs] = NULL;
740
741 j = gs->loc_node_pairs = i_start;
742 t1 = GL_MAX;
743 PetscCall(PCTFS_giop(&i_start, &offset, 1, &t1));
744 gs->max_node_pairs = i_start;
745
746 i_start = j;
747 t1 = GL_MIN;
748 PetscCall(PCTFS_giop(&i_start, &offset, 1, &t1));
749 gs->min_node_pairs = i_start;
750
751 i_start = j;
752 t1 = GL_ADD;
753 PetscCall(PCTFS_giop(&i_start, &offset, 1, &t1));
754 gs->avg_node_pairs = i_start / PCTFS_num_nodes + 1;
755
756 i_start = nprs;
757 t1 = GL_MAX;
758 PetscCall(PCTFS_giop(&i_start, &offset, 1, &t1));
759 gs->max_pairs = i_start;
760
761 /* remap pairwise in tail of gsi_via_bit_mask() */
762 gs->msg_total = PCTFS_ivec_sum(gs->msg_sizes, nprs);
763 gs->out = (PetscScalar *)malloc(sizeof(PetscScalar) * gs->msg_total * vec_sz);
764 gs->in = (PetscScalar *)malloc(sizeof(PetscScalar) * gs->msg_total * vec_sz);
765
766 /* reset malloc pool */
767 free((void *)p_mask);
768 free((void *)tmp_proc_mask);
769 PetscFunctionReturn(PETSC_SUCCESS);
770 }
771
772 /* to do pruned tree just save ngh buf copy for each one and decode here!
773 ******************************************************************************/
set_tree(PCTFS_gs_id * gs)774 static PetscErrorCode set_tree(PCTFS_gs_id *gs)
775 {
776 PetscInt i, j, n, nel;
777 PetscInt *iptr_in, *iptr_out, *tree_elms, *elms;
778
779 PetscFunctionBegin;
780 /* local work ptrs */
781 elms = gs->elms;
782 nel = gs->nel;
783
784 /* how many via tree */
785 gs->tree_nel = n = ntree;
786 gs->tree_elms = tree_elms = iptr_in = tree_buf;
787 gs->tree_buf = (PetscScalar *)malloc(sizeof(PetscScalar) * n * vec_sz);
788 gs->tree_work = (PetscScalar *)malloc(sizeof(PetscScalar) * n * vec_sz);
789 j = gs->tree_map_sz;
790 gs->tree_map_in = iptr_in = (PetscInt *)malloc(sizeof(PetscInt) * (j + 1));
791 gs->tree_map_out = iptr_out = (PetscInt *)malloc(sizeof(PetscInt) * (j + 1));
792
793 /* search the longer of the two lists */
794 /* note ... could save this info in get_ngh_buf and save searches */
795 if (n <= nel) {
796 /* bijective fct w/remap - search elm list */
797 for (i = 0; i < n; i++) {
798 if ((j = PCTFS_ivec_binary_search(*tree_elms++, elms, nel)) >= 0) {
799 *iptr_in++ = j;
800 *iptr_out++ = i;
801 }
802 }
803 } else {
804 for (i = 0; i < nel; i++) {
805 if ((j = PCTFS_ivec_binary_search(*elms++, tree_elms, n)) >= 0) {
806 *iptr_in++ = i;
807 *iptr_out++ = j;
808 }
809 }
810 }
811
812 /* sentinel */
813 *iptr_in = *iptr_out = -1;
814 PetscFunctionReturn(PETSC_SUCCESS);
815 }
816
817 /******************************************************************************/
PCTFS_gs_gop_local_out(PCTFS_gs_id * gs,PetscScalar * vals)818 static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs, PetscScalar *vals)
819 {
820 PetscInt *num, *map, **reduce;
821 PetscScalar tmp;
822
823 PetscFunctionBegin;
824 num = gs->num_gop_local_reduce;
825 reduce = gs->gop_local_reduce;
826 while ((map = *reduce++)) {
827 /* wall */
828 if (*num == 2) {
829 num++;
830 vals[map[1]] = vals[map[0]];
831 } else if (*num == 3) { /* corner shared by three elements */
832 num++;
833 vals[map[2]] = vals[map[1]] = vals[map[0]];
834 } else if (*num == 4) { /* corner shared by four elements */
835 num++;
836 vals[map[3]] = vals[map[2]] = vals[map[1]] = vals[map[0]];
837 } else { /* general case ... odd geoms ... 3D*/
838 num++;
839 tmp = *(vals + *map++);
840 while (*map >= 0) *(vals + *map++) = tmp;
841 }
842 }
843 PetscFunctionReturn(PETSC_SUCCESS);
844 }
845
846 /******************************************************************************/
PCTFS_gs_gop_local_plus(PCTFS_gs_id * gs,PetscScalar * vals)847 static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs, PetscScalar *vals)
848 {
849 PetscInt *num, *map, **reduce;
850 PetscScalar tmp;
851
852 PetscFunctionBegin;
853 num = gs->num_local_reduce;
854 reduce = gs->local_reduce;
855 while ((map = *reduce)) {
856 /* wall */
857 if (*num == 2) {
858 num++;
859 reduce++;
860 vals[map[1]] = vals[map[0]] += vals[map[1]];
861 } else if (*num == 3) { /* corner shared by three elements */
862 num++;
863 reduce++;
864 vals[map[2]] = vals[map[1]] = vals[map[0]] += (vals[map[1]] + vals[map[2]]);
865 } else if (*num == 4) { /* corner shared by four elements */
866 num++;
867 reduce++;
868 vals[map[1]] = vals[map[2]] = vals[map[3]] = vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]);
869 } else { /* general case ... odd geoms ... 3D*/
870 num++;
871 tmp = 0.0;
872 while (*map >= 0) tmp += *(vals + *map++);
873
874 map = *reduce++;
875 while (*map >= 0) *(vals + *map++) = tmp;
876 }
877 }
878 PetscFunctionReturn(PETSC_SUCCESS);
879 }
880
881 /******************************************************************************/
PCTFS_gs_gop_local_in_plus(PCTFS_gs_id * gs,PetscScalar * vals)882 static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals)
883 {
884 PetscInt *num, *map, **reduce;
885 PetscScalar *base;
886
887 PetscFunctionBegin;
888 num = gs->num_gop_local_reduce;
889 reduce = gs->gop_local_reduce;
890 while ((map = *reduce++)) {
891 /* wall */
892 if (*num == 2) {
893 num++;
894 vals[map[0]] += vals[map[1]];
895 } else if (*num == 3) { /* corner shared by three elements */
896 num++;
897 vals[map[0]] += (vals[map[1]] + vals[map[2]]);
898 } else if (*num == 4) { /* corner shared by four elements */
899 num++;
900 vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]);
901 } else { /* general case ... odd geoms ... 3D*/
902 num++;
903 base = vals + *map++;
904 while (*map >= 0) *base += *(vals + *map++);
905 }
906 }
907 PetscFunctionReturn(PETSC_SUCCESS);
908 }
909
910 /******************************************************************************/
PCTFS_gs_free(PCTFS_gs_id * gs)911 PetscErrorCode PCTFS_gs_free(PCTFS_gs_id *gs)
912 {
913 PetscInt i;
914
915 PetscFunctionBegin;
916 PetscCallMPI(MPI_Comm_free(&gs->PCTFS_gs_comm));
917 if (gs->nghs) free((void *)gs->nghs);
918 if (gs->pw_nghs) free((void *)gs->pw_nghs);
919
920 /* tree */
921 if (gs->max_left_over) {
922 if (gs->tree_elms) free((void *)gs->tree_elms);
923 if (gs->tree_buf) free((void *)gs->tree_buf);
924 if (gs->tree_work) free((void *)gs->tree_work);
925 if (gs->tree_map_in) free((void *)gs->tree_map_in);
926 if (gs->tree_map_out) free((void *)gs->tree_map_out);
927 }
928
929 /* pairwise info */
930 if (gs->num_pairs) {
931 /* should be NULL already */
932 if (gs->ngh_buf) free((void *)gs->ngh_buf);
933 if (gs->elms) free((void *)gs->elms);
934 if (gs->local_elms) free((void *)gs->local_elms);
935 if (gs->companion) free((void *)gs->companion);
936
937 /* only set if pairwise */
938 if (gs->vals) free((void *)gs->vals);
939 if (gs->in) free((void *)gs->in);
940 if (gs->out) free((void *)gs->out);
941 if (gs->msg_ids_in) free((void *)gs->msg_ids_in);
942 if (gs->msg_ids_out) free((void *)gs->msg_ids_out);
943 if (gs->pw_vals) free((void *)gs->pw_vals);
944 if (gs->pw_elm_list) free((void *)gs->pw_elm_list);
945 if (gs->node_list) {
946 for (i = 0; i < gs->num_pairs; i++) {
947 if (gs->node_list[i]) free((void *)gs->node_list[i]);
948 }
949 free((void *)gs->node_list);
950 }
951 if (gs->msg_sizes) free((void *)gs->msg_sizes);
952 if (gs->pair_list) free((void *)gs->pair_list);
953 }
954
955 /* local info */
956 if (gs->num_local_total >= 0) {
957 for (i = 0; i < gs->num_local_total + 1; i++) {
958 if (gs->num_gop_local_reduce[i]) free((void *)gs->gop_local_reduce[i]);
959 }
960 }
961
962 /* if intersection tree/pairwise and local isn't empty */
963 if (gs->gop_local_reduce) free((void *)gs->gop_local_reduce);
964 if (gs->num_gop_local_reduce) free((void *)gs->num_gop_local_reduce);
965
966 free((void *)gs);
967 PetscFunctionReturn(PETSC_SUCCESS);
968 }
969
970 /******************************************************************************/
PCTFS_gs_gop_vec(PCTFS_gs_id * gs,PetscScalar * vals,const char * op,PetscInt step)971 PetscErrorCode PCTFS_gs_gop_vec(PCTFS_gs_id *gs, PetscScalar *vals, const char *op, PetscInt step)
972 {
973 PetscFunctionBegin;
974 switch (*op) {
975 case '+':
976 PetscCall(PCTFS_gs_gop_vec_plus(gs, vals, step));
977 break;
978 default:
979 PetscCall(PetscInfo(0, "PCTFS_gs_gop_vec() :: %c is not a valid op\n", op[0]));
980 PetscCall(PetscInfo(0, "PCTFS_gs_gop_vec() :: default :: plus\n"));
981 PetscCall(PCTFS_gs_gop_vec_plus(gs, vals, step));
982 break;
983 }
984 PetscFunctionReturn(PETSC_SUCCESS);
985 }
986
987 /******************************************************************************/
PCTFS_gs_gop_vec_plus(PCTFS_gs_id * gs,PetscScalar * vals,PetscInt step)988 static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
989 {
990 PetscFunctionBegin;
991 PetscCheck(gs, PETSC_COMM_SELF, PETSC_ERR_PLIB, "PCTFS_gs_gop_vec() passed NULL gs handle!!!");
992
993 /* local only operations!!! */
994 if (gs->num_local) PetscCall(PCTFS_gs_gop_vec_local_plus(gs, vals, step));
995
996 /* if intersection tree/pairwise and local isn't empty */
997 if (gs->num_local_gop) {
998 PetscCall(PCTFS_gs_gop_vec_local_in_plus(gs, vals, step));
999
1000 /* pairwise */
1001 if (gs->num_pairs) PetscCall(PCTFS_gs_gop_vec_pairwise_plus(gs, vals, step));
1002
1003 /* tree */
1004 else if (gs->max_left_over) PetscCall(PCTFS_gs_gop_vec_tree_plus(gs, vals, step));
1005
1006 PetscCall(PCTFS_gs_gop_vec_local_out(gs, vals, step));
1007 } else { /* if intersection tree/pairwise and local is empty */
1008 /* pairwise */
1009 if (gs->num_pairs) PetscCall(PCTFS_gs_gop_vec_pairwise_plus(gs, vals, step));
1010
1011 /* tree */
1012 else if (gs->max_left_over) PetscCall(PCTFS_gs_gop_vec_tree_plus(gs, vals, step));
1013 }
1014 PetscFunctionReturn(PETSC_SUCCESS);
1015 }
1016
1017 /******************************************************************************/
PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id * gs,PetscScalar * vals,PetscInt step)1018 static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
1019 {
1020 PetscInt *num, *map, **reduce;
1021 PetscScalar *base;
1022
1023 PetscFunctionBegin;
1024 num = gs->num_local_reduce;
1025 reduce = gs->local_reduce;
1026 while ((map = *reduce)) {
1027 base = vals + map[0] * step;
1028
1029 /* wall */
1030 if (*num == 2) {
1031 num++;
1032 reduce++;
1033 PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step));
1034 PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step));
1035 } else if (*num == 3) { /* corner shared by three elements */
1036 num++;
1037 reduce++;
1038 PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step));
1039 PetscCall(PCTFS_rvec_add(base, vals + map[2] * step, step));
1040 PetscCall(PCTFS_rvec_copy(vals + map[2] * step, base, step));
1041 PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step));
1042 } else if (*num == 4) { /* corner shared by four elements */
1043 num++;
1044 reduce++;
1045 PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step));
1046 PetscCall(PCTFS_rvec_add(base, vals + map[2] * step, step));
1047 PetscCall(PCTFS_rvec_add(base, vals + map[3] * step, step));
1048 PetscCall(PCTFS_rvec_copy(vals + map[3] * step, base, step));
1049 PetscCall(PCTFS_rvec_copy(vals + map[2] * step, base, step));
1050 PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step));
1051 } else { /* general case ... odd geoms ... 3D */
1052 num++;
1053 while (*++map >= 0) PetscCall(PCTFS_rvec_add(base, vals + *map * step, step));
1054
1055 map = *reduce;
1056 while (*++map >= 0) PetscCall(PCTFS_rvec_copy(vals + *map * step, base, step));
1057
1058 reduce++;
1059 }
1060 }
1061 PetscFunctionReturn(PETSC_SUCCESS);
1062 }
1063
1064 /******************************************************************************/
PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id * gs,PetscScalar * vals,PetscInt step)1065 static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
1066 {
1067 PetscInt *num, *map, **reduce;
1068 PetscScalar *base;
1069
1070 PetscFunctionBegin;
1071 num = gs->num_gop_local_reduce;
1072 reduce = gs->gop_local_reduce;
1073 while ((map = *reduce++)) {
1074 base = vals + map[0] * step;
1075
1076 /* wall */
1077 if (*num == 2) {
1078 num++;
1079 PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step));
1080 } else if (*num == 3) { /* corner shared by three elements */
1081 num++;
1082 PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step));
1083 PetscCall(PCTFS_rvec_add(base, vals + map[2] * step, step));
1084 } else if (*num == 4) { /* corner shared by four elements */
1085 num++;
1086 PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step));
1087 PetscCall(PCTFS_rvec_add(base, vals + map[2] * step, step));
1088 PetscCall(PCTFS_rvec_add(base, vals + map[3] * step, step));
1089 } else { /* general case ... odd geoms ... 3D*/
1090 num++;
1091 while (*++map >= 0) PetscCall(PCTFS_rvec_add(base, vals + *map * step, step));
1092 }
1093 }
1094 PetscFunctionReturn(PETSC_SUCCESS);
1095 }
1096
1097 /******************************************************************************/
PCTFS_gs_gop_vec_local_out(PCTFS_gs_id * gs,PetscScalar * vals,PetscInt step)1098 static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
1099 {
1100 PetscInt *num, *map, **reduce;
1101 PetscScalar *base;
1102
1103 PetscFunctionBegin;
1104 num = gs->num_gop_local_reduce;
1105 reduce = gs->gop_local_reduce;
1106 while ((map = *reduce++)) {
1107 base = vals + map[0] * step;
1108
1109 /* wall */
1110 if (*num == 2) {
1111 num++;
1112 PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step));
1113 } else if (*num == 3) { /* corner shared by three elements */
1114 num++;
1115 PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step));
1116 PetscCall(PCTFS_rvec_copy(vals + map[2] * step, base, step));
1117 } else if (*num == 4) { /* corner shared by four elements */
1118 num++;
1119 PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step));
1120 PetscCall(PCTFS_rvec_copy(vals + map[2] * step, base, step));
1121 PetscCall(PCTFS_rvec_copy(vals + map[3] * step, base, step));
1122 } else { /* general case ... odd geoms ... 3D*/
1123 num++;
1124 while (*++map >= 0) PetscCall(PCTFS_rvec_copy(vals + *map * step, base, step));
1125 }
1126 }
1127 PetscFunctionReturn(PETSC_SUCCESS);
1128 }
1129
1130 /******************************************************************************/
PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id * gs,PetscScalar * in_vals,PetscInt step)1131 static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step)
1132 {
1133 PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2;
1134 PetscInt *iptr, *msg_list, *msg_size, **msg_nodes;
1135 PetscInt *pw, *list, *size, **nodes;
1136 MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
1137 MPI_Status status;
1138 PetscBLASInt i1 = 1, dstep;
1139
1140 PetscFunctionBegin;
1141 /* strip and load s */
1142 msg_list = list = gs->pair_list;
1143 msg_size = size = gs->msg_sizes;
1144 msg_nodes = nodes = gs->node_list;
1145 iptr = pw = gs->pw_elm_list;
1146 dptr1 = dptr3 = gs->pw_vals;
1147 msg_ids_in = ids_in = gs->msg_ids_in;
1148 msg_ids_out = ids_out = gs->msg_ids_out;
1149 dptr2 = gs->out;
1150 in1 = in2 = gs->in;
1151
1152 /* post the receives */
1153 /* msg_nodes=nodes; */
1154 do {
1155 /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
1156 second one *list and do list++ afterwards */
1157 PetscCallMPI(MPIU_Irecv(in1, *size * step, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in));
1158 list++;
1159 msg_ids_in++;
1160 in1 += *size++ * step;
1161 } while (*++msg_nodes);
1162 msg_nodes = nodes;
1163
1164 /* load gs values into in out gs buffers */
1165 while (*iptr >= 0) {
1166 PetscCall(PCTFS_rvec_copy(dptr3, in_vals + *iptr * step, step));
1167 dptr3 += step;
1168 iptr++;
1169 }
1170
1171 /* load out buffers and post the sends */
1172 while ((iptr = *msg_nodes++)) {
1173 dptr3 = dptr2;
1174 while (*iptr >= 0) {
1175 PetscCall(PCTFS_rvec_copy(dptr2, dptr1 + *iptr * step, step));
1176 dptr2 += step;
1177 iptr++;
1178 }
1179 PetscCallMPI(MPIU_Isend(dptr3, *msg_size * step, MPIU_SCALAR, *msg_list, MSGTAG1 + PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out));
1180 msg_size++;
1181 msg_list++;
1182 msg_ids_out++;
1183 }
1184
1185 /* tree */
1186 if (gs->max_left_over) PetscCall(PCTFS_gs_gop_vec_tree_plus(gs, in_vals, step));
1187
1188 /* process the received data */
1189 msg_nodes = nodes;
1190 while ((iptr = *nodes++)) {
1191 PetscScalar d1 = 1.0;
1192
1193 /* Should I check the return value of MPI_Wait() or status? */
1194 /* Can this loop be replaced by a call to MPI_Waitall()? */
1195 PetscCallMPI(MPI_Wait(ids_in, &status));
1196 ids_in++;
1197 while (*iptr >= 0) {
1198 PetscCall(PetscBLASIntCast(step, &dstep));
1199 PetscCallBLAS("BLASaxpy", BLASaxpy_(&dstep, &d1, in2, &i1, dptr1 + *iptr * step, &i1));
1200 in2 += step;
1201 iptr++;
1202 }
1203 }
1204
1205 /* replace vals */
1206 while (*pw >= 0) {
1207 PetscCall(PCTFS_rvec_copy(in_vals + *pw * step, dptr1, step));
1208 dptr1 += step;
1209 pw++;
1210 }
1211
1212 /* clear isend message handles */
1213 /* This changed for clarity though it could be the same */
1214
1215 /* Should I check the return value of MPI_Wait() or status? */
1216 /* Can this loop be replaced by a call to MPI_Waitall()? */
1217 while (*msg_nodes++) {
1218 PetscCallMPI(MPI_Wait(ids_out, &status));
1219 ids_out++;
1220 }
1221 PetscFunctionReturn(PETSC_SUCCESS);
1222 }
1223
1224 /******************************************************************************/
PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id * gs,PetscScalar * vals,PetscInt step)1225 static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
1226 {
1227 PetscInt size, *in, *out;
1228 PetscScalar *buf, *work;
1229 PetscInt op[] = {GL_ADD, 0};
1230 PetscBLASInt i1 = 1;
1231 PetscBLASInt dstep;
1232
1233 PetscFunctionBegin;
1234 /* copy over to local variables */
1235 in = gs->tree_map_in;
1236 out = gs->tree_map_out;
1237 buf = gs->tree_buf;
1238 work = gs->tree_work;
1239 size = gs->tree_nel * step;
1240
1241 /* zero out collection buffer */
1242 PetscCall(PCTFS_rvec_zero(buf, size));
1243
1244 /* copy over my contributions */
1245 while (*in >= 0) {
1246 PetscCall(PetscBLASIntCast(step, &dstep));
1247 PetscCallBLAS("BLAScopy", BLAScopy_(&dstep, vals + *in++ * step, &i1, buf + *out++ * step, &i1));
1248 }
1249
1250 /* perform fan in/out on full buffer */
1251 /* must change PCTFS_grop to handle the blas */
1252 PetscCall(PCTFS_grop(buf, work, size, op));
1253
1254 /* reset */
1255 in = gs->tree_map_in;
1256 out = gs->tree_map_out;
1257
1258 /* get the portion of the results I need */
1259 while (*in >= 0) {
1260 PetscCall(PetscBLASIntCast(step, &dstep));
1261 PetscCallBLAS("BLAScopy", BLAScopy_(&dstep, buf + *out++ * step, &i1, vals + *in++ * step, &i1));
1262 }
1263 PetscFunctionReturn(PETSC_SUCCESS);
1264 }
1265
1266 /******************************************************************************/
PCTFS_gs_gop_hc(PCTFS_gs_id * gs,PetscScalar * vals,const char * op,PetscInt dim)1267 PetscErrorCode PCTFS_gs_gop_hc(PCTFS_gs_id *gs, PetscScalar *vals, const char *op, PetscInt dim)
1268 {
1269 PetscFunctionBegin;
1270 switch (*op) {
1271 case '+':
1272 PetscCall(PCTFS_gs_gop_plus_hc(gs, vals, dim));
1273 break;
1274 default:
1275 PetscCall(PetscInfo(0, "PCTFS_gs_gop_hc() :: %c is not a valid op\n", op[0]));
1276 PetscCall(PetscInfo(0, "PCTFS_gs_gop_hc() :: default :: plus\n"));
1277 PetscCall(PCTFS_gs_gop_plus_hc(gs, vals, dim));
1278 break;
1279 }
1280 PetscFunctionReturn(PETSC_SUCCESS);
1281 }
1282
1283 /******************************************************************************/
PCTFS_gs_gop_plus_hc(PCTFS_gs_id * gs,PetscScalar * vals,PetscInt dim)1284 static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim)
1285 {
1286 PetscFunctionBegin;
1287 /* if there's nothing to do return */
1288 if (dim <= 0) PetscFunctionReturn(PETSC_SUCCESS);
1289
1290 /* can't do more dimensions then exist */
1291 dim = PetscMin(dim, PCTFS_i_log2_num_nodes);
1292
1293 /* local only operations!!! */
1294 if (gs->num_local) PetscCall(PCTFS_gs_gop_local_plus(gs, vals));
1295
1296 /* if intersection tree/pairwise and local isn't empty */
1297 if (gs->num_local_gop) {
1298 PetscCall(PCTFS_gs_gop_local_in_plus(gs, vals));
1299
1300 /* pairwise will do tree inside ... */
1301 if (gs->num_pairs) PetscCall(PCTFS_gs_gop_pairwise_plus_hc(gs, vals, dim)); /* tree only */
1302 else if (gs->max_left_over) PetscCall(PCTFS_gs_gop_tree_plus_hc(gs, vals, dim));
1303
1304 PetscCall(PCTFS_gs_gop_local_out(gs, vals));
1305 } else { /* if intersection tree/pairwise and local is empty */
1306 /* pairwise will do tree inside */
1307 if (gs->num_pairs) PetscCall(PCTFS_gs_gop_pairwise_plus_hc(gs, vals, dim)); /* tree */
1308 else if (gs->max_left_over) PetscCall(PCTFS_gs_gop_tree_plus_hc(gs, vals, dim));
1309 }
1310 PetscFunctionReturn(PETSC_SUCCESS);
1311 }
1312
1313 /******************************************************************************/
PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id * gs,PetscScalar * in_vals,PetscInt dim)1314 static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim)
1315 {
1316 PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2;
1317 PetscInt *iptr, *msg_list, *msg_size, **msg_nodes;
1318 PetscInt *pw, *list, *size, **nodes;
1319 MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
1320 MPI_Status status;
1321 PetscInt i, mask = 1;
1322
1323 PetscFunctionBegin;
1324 for (i = 1; i < dim; i++) {
1325 mask <<= 1;
1326 mask++;
1327 }
1328
1329 /* strip and load s */
1330 msg_list = list = gs->pair_list;
1331 msg_size = size = gs->msg_sizes;
1332 msg_nodes = nodes = gs->node_list;
1333 iptr = pw = gs->pw_elm_list;
1334 dptr1 = dptr3 = gs->pw_vals;
1335 msg_ids_in = ids_in = gs->msg_ids_in;
1336 msg_ids_out = ids_out = gs->msg_ids_out;
1337 dptr2 = gs->out;
1338 in1 = in2 = gs->in;
1339
1340 /* post the receives */
1341 /* msg_nodes=nodes; */
1342 do {
1343 /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
1344 second one *list and do list++ afterwards */
1345 if ((PCTFS_my_id | mask) == (*list | mask)) {
1346 PetscCallMPI(MPIU_Irecv(in1, *size, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in));
1347 list++;
1348 msg_ids_in++;
1349 in1 += *size++;
1350 } else {
1351 list++;
1352 size++;
1353 }
1354 } while (*++msg_nodes);
1355
1356 /* load gs values into in out gs buffers */
1357 while (*iptr >= 0) *dptr3++ = *(in_vals + *iptr++);
1358
1359 /* load out buffers and post the sends */
1360 msg_nodes = nodes;
1361 list = msg_list;
1362 while ((iptr = *msg_nodes++)) {
1363 if ((PCTFS_my_id | mask) == (*list | mask)) {
1364 dptr3 = dptr2;
1365 while (*iptr >= 0) *dptr2++ = *(dptr1 + *iptr++);
1366 /* CHECK PERSISTENT COMMS MODE FOR ALL THIS STUFF */
1367 /* is msg_ids_out++ correct? */
1368 PetscCallMPI(MPIU_Isend(dptr3, *msg_size, MPIU_SCALAR, *list, MSGTAG1 + PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out));
1369 msg_size++;
1370 list++;
1371 msg_ids_out++;
1372 } else {
1373 list++;
1374 msg_size++;
1375 }
1376 }
1377
1378 /* do the tree while we're waiting */
1379 if (gs->max_left_over) PetscCall(PCTFS_gs_gop_tree_plus_hc(gs, in_vals, dim));
1380
1381 /* process the received data */
1382 msg_nodes = nodes;
1383 list = msg_list;
1384 while ((iptr = *nodes++)) {
1385 if ((PCTFS_my_id | mask) == (*list | mask)) {
1386 /* Should I check the return value of MPI_Wait() or status? */
1387 /* Can this loop be replaced by a call to MPI_Waitall()? */
1388 PetscCallMPI(MPI_Wait(ids_in, &status));
1389 ids_in++;
1390 while (*iptr >= 0) *(dptr1 + *iptr++) += *in2++;
1391 }
1392 list++;
1393 }
1394
1395 /* replace vals */
1396 while (*pw >= 0) *(in_vals + *pw++) = *dptr1++;
1397
1398 /* clear isend message handles */
1399 /* This changed for clarity though it could be the same */
1400 while (*msg_nodes++) {
1401 if ((PCTFS_my_id | mask) == (*msg_list | mask)) {
1402 /* Should I check the return value of MPI_Wait() or status? */
1403 /* Can this loop be replaced by a call to MPI_Waitall()? */
1404 PetscCallMPI(MPI_Wait(ids_out, &status));
1405 ids_out++;
1406 }
1407 msg_list++;
1408 }
1409 PetscFunctionReturn(PETSC_SUCCESS);
1410 }
1411
1412 /******************************************************************************/
PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id * gs,PetscScalar * vals,PetscInt dim)1413 static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim)
1414 {
1415 PetscInt size;
1416 PetscInt *in, *out;
1417 PetscScalar *buf, *work;
1418 PetscInt op[] = {GL_ADD, 0};
1419
1420 PetscFunctionBegin;
1421 in = gs->tree_map_in;
1422 out = gs->tree_map_out;
1423 buf = gs->tree_buf;
1424 work = gs->tree_work;
1425 size = gs->tree_nel;
1426
1427 PetscCall(PCTFS_rvec_zero(buf, size));
1428
1429 while (*in >= 0) *(buf + *out++) = *(vals + *in++);
1430
1431 in = gs->tree_map_in;
1432 out = gs->tree_map_out;
1433
1434 PetscCall(PCTFS_grop_hc(buf, work, size, op, dim));
1435
1436 while (*in >= 0) *(vals + *in++) = *(buf + *out++);
1437 PetscFunctionReturn(PETSC_SUCCESS);
1438 }
1439