1 2 /***********************************gs.c*************************************** 3 4 Author: Henry M. Tufo III 5 6 e-mail: hmt@cs.brown.edu 7 8 snail-mail: 9 Division of Applied Mathematics 10 Brown University 11 Providence, RI 02912 12 13 Last Modification: 14 6.21.97 15 ************************************gs.c**************************************/ 16 17 /***********************************gs.c*************************************** 18 File Description: 19 ----------------- 20 21 ************************************gs.c**************************************/ 22 23 #include <../src/ksp/pc/impls/tfs/tfs.h> 24 25 /* default length of number of items via tree - doubles if exceeded */ 26 #define TREE_BUF_SZ 2048; 27 #define GS_VEC_SZ 1 28 29 /***********************************gs.c*************************************** 30 Type: struct gather_scatter_id 31 ------------------------------ 32 33 ************************************gs.c**************************************/ 34 typedef struct gather_scatter_id { 35 PetscInt id; 36 PetscInt nel_min; 37 PetscInt nel_max; 38 PetscInt nel_sum; 39 PetscInt negl; 40 PetscInt gl_max; 41 PetscInt gl_min; 42 PetscInt repeats; 43 PetscInt ordered; 44 PetscInt positive; 45 PetscScalar *vals; 46 47 /* bit mask info */ 48 PetscInt *my_proc_mask; 49 PetscInt mask_sz; 50 PetscInt *ngh_buf; 51 PetscInt ngh_buf_sz; 52 PetscInt *nghs; 53 PetscInt num_nghs; 54 PetscInt max_nghs; 55 PetscInt *pw_nghs; 56 PetscInt num_pw_nghs; 57 PetscInt *tree_nghs; 58 PetscInt num_tree_nghs; 59 60 PetscInt num_loads; 61 62 /* repeats == true -> local info */ 63 PetscInt nel; /* number of unique elememts */ 64 PetscInt *elms; /* of size nel */ 65 PetscInt nel_total; 66 PetscInt *local_elms; /* of size nel_total */ 67 PetscInt *companion; /* of size nel_total */ 68 69 /* local info */ 70 PetscInt num_local_total; 71 PetscInt local_strength; 72 PetscInt num_local; 73 PetscInt *num_local_reduce; 74 PetscInt **local_reduce; 75 PetscInt num_local_gop; 76 
PetscInt *num_gop_local_reduce; 77 PetscInt **gop_local_reduce; 78 79 /* pairwise info */ 80 PetscInt level; 81 PetscInt num_pairs; 82 PetscInt max_pairs; 83 PetscInt loc_node_pairs; 84 PetscInt max_node_pairs; 85 PetscInt min_node_pairs; 86 PetscInt avg_node_pairs; 87 PetscInt *pair_list; 88 PetscInt *msg_sizes; 89 PetscInt **node_list; 90 PetscInt len_pw_list; 91 PetscInt *pw_elm_list; 92 PetscScalar *pw_vals; 93 94 MPI_Request *msg_ids_in; 95 MPI_Request *msg_ids_out; 96 97 PetscScalar *out; 98 PetscScalar *in; 99 PetscInt msg_total; 100 101 /* tree - crystal accumulator info */ 102 PetscInt max_left_over; 103 PetscInt *pre; 104 PetscInt *in_num; 105 PetscInt *out_num; 106 PetscInt **in_list; 107 PetscInt **out_list; 108 109 /* new tree work*/ 110 PetscInt tree_nel; 111 PetscInt *tree_elms; 112 PetscScalar *tree_buf; 113 PetscScalar *tree_work; 114 115 PetscInt tree_map_sz; 116 PetscInt *tree_map_in; 117 PetscInt *tree_map_out; 118 119 /* current memory status */ 120 PetscInt gl_bss_min; 121 PetscInt gl_perm_min; 122 123 /* max segment size for PCTFS_gs_gop_vec() */ 124 PetscInt vec_sz; 125 126 /* hack to make paul happy */ 127 MPI_Comm PCTFS_gs_comm; 128 129 } PCTFS_gs_id; 130 131 static PCTFS_gs_id *gsi_check_args(PetscInt *elms, PetscInt nel, PetscInt level); 132 static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs); 133 static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs); 134 static PetscErrorCode set_pairwise(PCTFS_gs_id *gs); 135 static PCTFS_gs_id *gsi_new(void); 136 static PetscErrorCode set_tree(PCTFS_gs_id *gs); 137 138 /* same for all but vector flavor */ 139 static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs, PetscScalar *vals); 140 /* vector flavor */ 141 static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step); 142 143 static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step); 144 static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs, 
PetscScalar *in_vals, PetscInt step);
static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);

static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs, PetscScalar *vals);
static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals);

static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim);
static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim);
static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim);

/* global vars */
/* from comm.c module */

/* running count of handles created; used to stamp gs->id in gsi_check_args() */
static PetscInt num_gs_ids = 0;

/* should make this dynamic ... later */
static PetscInt  msg_buf     = MAX_MSG_BUF; /* neighbor-mask exchange buffer size (bytes) */
static PetscInt  vec_sz      = GS_VEC_SZ;   /* segment size used when sizing vector work buffers */
static PetscInt *tree_buf    = NULL;        /* shared accumulator filled by place_in_tree() */
static PetscInt  tree_buf_sz = 0;
static PetscInt  ntree       = 0;

/***************************************************************************/
/* Set the segment size used when allocating vector (multi-dof) work buffers. */
PetscErrorCode PCTFS_gs_init_vec_sz(PetscInt size) {
  PetscFunctionBegin;
  vec_sz = size;
  PetscFunctionReturn(0);
}

/******************************************************************************/
/* Set the communication buffer size (in bytes) used by get_ngh_buf(). */
PetscErrorCode PCTFS_gs_init_msg_buf_sz(PetscInt buf_size) {
  PetscFunctionBegin;
  msg_buf = buf_size;
  PetscFunctionReturn(0);
}

/******************************************************************************/
/* Create a gather-scatter handle for the global ids elms[0..nel-1].
   level bounds how many sharers are handled pairwise (vs. via the tree).
   Aborts (rather than returning an error code) on failure since it returns
   the handle directly. */
PCTFS_gs_id *PCTFS_gs_init(PetscInt *elms, PetscInt nel, PetscInt level) {
  PCTFS_gs_id *gs;
  MPI_Group    PCTFS_gs_group;
  MPI_Comm     PCTFS_gs_comm;

  /* ensure that communication package has been initialized */
  PCTFS_comm_init();

  /* determines if we have enough dynamic/semi-static memory */
  /* checks input, allocs and sets gd_id template */
  gs = gsi_check_args(elms, nel, level);

  /* only bit mask version up and working for the moment */
  /* LATER :: get int list version working for sparse pblms */
  PetscCallAbort(PETSC_COMM_WORLD, gsi_via_bit_mask(gs));

  /* give the handle its own communicator (dup of MPI_COMM_WORLD's group) */
  PetscCallAbort(PETSC_COMM_WORLD, MPI_Comm_group(MPI_COMM_WORLD, &PCTFS_gs_group));
  PetscCallAbort(PETSC_COMM_WORLD, MPI_Comm_create(MPI_COMM_WORLD, PCTFS_gs_group, &PCTFS_gs_comm));
  PetscCallAbort(PETSC_COMM_WORLD, MPI_Group_free(&PCTFS_gs_group));

  gs->PCTFS_gs_comm = PCTFS_gs_comm;

  return (gs);
}

/******************************************************************************/
/* Allocate and zero-fill a fresh gather-scatter template. */
static PCTFS_gs_id *gsi_new(void) {
  PCTFS_gs_id *gs;
  gs = (PCTFS_gs_id *)malloc(sizeof(PCTFS_gs_id));
  PetscCallAbort(PETSC_COMM_WORLD, PetscMemzero(gs, sizeof(PCTFS_gs_id)));
  return (gs);
}

/******************************************************************************/
/* Validate the element list, drop zero ids, sort the survivors (keeping a
   companion map back to the caller's positions), build the local-reduction
   lists for repeated ids, and exchange global statistics via PCTFS_giop(). */
static PCTFS_gs_id *gsi_check_args(PetscInt *in_elms, PetscInt nel, PetscInt level) {
  PetscInt     i, j, k, t2;
  PetscInt    *companion, *elms, *unique, *iptr;
  PetscInt     num_local = 0, *num_to_reduce, **local_reduce;
  /* first entry flags the fused reduction below as NON_UNIFORM (per-slot ops) */
  PetscInt     oprs[] = {NON_UNIFORM, GL_MIN, GL_MAX, GL_ADD, GL_MIN, GL_MAX, GL_MIN, GL_B_AND};
  PetscInt     vals[PETSC_STATIC_ARRAY_LENGTH(oprs) - 1];
  PetscInt     work[PETSC_STATIC_ARRAY_LENGTH(oprs) - 1];
  PCTFS_gs_id *gs;

  if (!in_elms) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "elms point to nothing!!!\n");
  if (nel < 0) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "can't have fewer than 0 elms!!!\n");

  if (nel == 0) PetscCallAbort(PETSC_COMM_WORLD, PetscInfo(0, "I don't have any elements!!!\n"));

  /* get space for gs template */
  gs     = gsi_new();
  gs->id = ++num_gs_ids;

  /* hmt 6.4.99 */
  /* caller can set global ids that don't participate
to 0 */
  /* PCTFS_gs_init ignores all zeros in elm list */
  /* negative global ids are still invalid */
  for (i = j = 0; i < nel; i++) {
    if (in_elms[i] != 0) j++;
  }

  /* k = original length, nel = length after dropping zero ids */
  k   = nel;
  nel = j;

  /* copy over in_elms list and create inverse map */
  /* elms gets one extra slot for a sentinel value (set below) */
  elms      = (PetscInt *)malloc((nel + 1) * sizeof(PetscInt));
  companion = (PetscInt *)malloc(nel * sizeof(PetscInt));

  for (i = j = 0; i < k; i++) {
    if (in_elms[i] != 0) {
      elms[j]        = in_elms[i];
      companion[j++] = i;
    }
  }

  if (j != nel) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "nel j mismatch!\n");

  /* pre-pass ... check to see if sorted */
  /* INT_MAX sentinel terminates the scan; j flags "needs sorting" */
  elms[nel] = INT_MAX;
  iptr      = elms;
  unique    = elms + 1;
  j         = 0;
  while (*iptr != INT_MAX) {
    if (*iptr++ > *unique++) {
      j = 1;
      break;
    }
  }

  /* set up inverse map */
  if (j) {
    PetscCallAbort(PETSC_COMM_WORLD, PetscInfo(0, "gsi_check_args() :: elm list *not* sorted!\n"));
    /* sort elms, permuting companion alongside so it still maps to caller slots */
    PetscCallAbort(PETSC_COMM_WORLD, PCTFS_SMI_sort((void *)elms, (void *)companion, nel, SORT_INTEGER));
  } else PetscCallAbort(PETSC_COMM_WORLD, PetscInfo(0, "gsi_check_args() :: elm list sorted!\n"));
  /* switch sentinel so the duplicate-clumping loops below terminate */
  elms[nel] = INT_MIN;

  /* first pass */
  /* determine number of unique elements, check pd */
  for (i = k = 0; i < nel; i += j) {
    t2 = elms[i];
    j  = ++i;

    /* clump 'em for now */
    while (elms[j] == t2) j++;

    /* how many together and num local */
    if (j -= i) {
      num_local++;
      k += j;
    }
  }

  /* how many unique elements? */
  gs->repeats = k;
  gs->nel     = nel - k;

  /* number of repeats? */
  /* +2 slots: two NULL-terminated tails used by the two-partition sort hack */
  gs->num_local = num_local;
  num_local += 2;
  gs->local_reduce     = local_reduce = (PetscInt **)malloc(num_local * sizeof(PetscInt *));
  gs->num_local_reduce = num_to_reduce = (PetscInt *)malloc(num_local * sizeof(PetscInt));

  unique         = (PetscInt *)malloc((gs->nel + 1) * sizeof(PetscInt));
  gs->elms       = unique;
  gs->nel_total  = nel;
  gs->local_elms = elms;
  gs->companion  = companion;

  /* compess map as well as keep track of local ops */
  /* each repeated id gets a -1 terminated list: unique index first, then the
     companion (caller) slots of the remaining duplicates */
  for (num_local = i = j = 0; i < gs->nel; i++) {
    k            = j;
    t2 = unique[i] = elms[j];
    companion[i] = companion[j];

    while (elms[j] == t2) j++;

    if ((t2 = (j - k)) > 1) {
      /* number together */
      num_to_reduce[num_local] = t2++;

      iptr = local_reduce[num_local++] = (PetscInt *)malloc(t2 * sizeof(PetscInt));

      /* to use binary searching don't remap until we check intersection */
      *iptr++ = i;

      /* note that we're skipping the first one */
      while (++k < j) *(iptr++) = companion[k];
      *iptr = -1;
    }
  }

  /* sentinel for ngh_buf */
  unique[gs->nel] = INT_MAX;

  /* for two partition sort hack */
  num_to_reduce[num_local]   = 0;
  local_reduce[num_local]    = NULL;
  num_to_reduce[++num_local] = 0;
  local_reduce[num_local]    = NULL;

  /* load 'em up */
  /* note one extra to hold NON_UNIFORM flag!!! */
  vals[2] = vals[1] = vals[0] = nel;
  if (gs->nel > 0) {
    vals[3] = unique[0];
    vals[4] = unique[gs->nel - 1];
  } else {
    /* empty local list: identity values for the global MIN/MAX below */
    vals[3] = INT_MAX;
    vals[4] = INT_MIN;
  }
  vals[5] = level;
  vals[6] = num_gs_ids;

  /* GLOBAL: send 'em out */
  PetscCallAbort(PETSC_COMM_WORLD, PCTFS_giop(vals, work, PETSC_STATIC_ARRAY_LENGTH(oprs) - 1, oprs));

  /* must be semi-pos def - only pairwise depends on this */
  /* LATER - remove this restriction */
  if (vals[3] < 0) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "gsi_check_args() :: system not semi-pos def \n");
  if (vals[4] == INT_MAX) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "gsi_check_args() :: system ub too large !\n");

  gs->nel_min = vals[0];
  gs->nel_max = vals[1];
  gs->nel_sum = vals[2];
  gs->gl_min  = vals[3];
  gs->gl_max  = vals[4];
  gs->negl    = vals[4] - vals[3] + 1;

  if (gs->negl <= 0) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "gsi_check_args() :: system empty or neg :: %" PetscInt_FMT "\n", gs->negl);

  /* LATER :: add level == -1 -> program selects level */
  if (vals[5] < 0) vals[5] = 0;
  else if (vals[5] > PCTFS_num_nodes) vals[5] = PCTFS_num_nodes;
  gs->level = vals[5];

  return (gs);
}

/******************************************************************************/
/* Build the communication schedule: fill the neighbor bit-mask buffer
   (get_ngh_buf), set up pairwise and/or tree paths, split the local
   reduction lists into "purely local" and "overlaps pairwise/tree" parts,
   and remap everything from sorted-unique indices back to the caller's
   original slots via the companion map. */
static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs) {
  PetscInt   i, nel, *elms;
  PetscInt   t1;
  PetscInt **reduce;
  PetscInt  *map;

  PetscFunctionBegin;
  /* totally local removes ... PCTFS_ct_bits == 0 */
  /* NOTE(review): return value of get_ngh_buf() is ignored here while other
     call sites use PetscCall — confirm intentional */
  get_ngh_buf(gs);

  if (gs->level) set_pairwise(gs);
  if (gs->max_left_over) set_tree(gs);

  /* intersection local and pairwise/tree? */
  gs->num_local_total      = gs->num_local;
  gs->gop_local_reduce     = gs->local_reduce;
  gs->num_gop_local_reduce = gs->num_local_reduce;

  map = gs->companion;

  /* is there any local compression */
  if (!gs->num_local) {
    gs->local_strength = NONE;
    gs->num_local_gop  = 0;
  } else {
    /* ok find intersection */
    /* lists whose head element also appears in the pairwise or tree maps are
       tagged by negating their counts, then sorted to the front */
    map    = gs->companion;
    reduce = gs->local_reduce;
    for (i = 0, t1 = 0; i < gs->num_local; i++, reduce++) {
      if ((PCTFS_ivec_binary_search(**reduce, gs->pw_elm_list, gs->len_pw_list) >= 0) || PCTFS_ivec_binary_search(**reduce, gs->tree_map_in, gs->tree_map_sz) >= 0) {
        t1++;
        PetscCheck(gs->num_local_reduce[i] > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "nobody in list?");
        gs->num_local_reduce[i] *= -1;
      }
      /* remap list head from unique index to caller slot */
      **reduce = map[**reduce];
    }

    /* intersection is empty */
    if (!t1) {
      gs->local_strength = FULL;
      gs->num_local_gop  = 0;
    } else { /* intersection not empty */
      gs->local_strength = PARTIAL;

      /* negative (tagged) counts sort first, carrying their lists along */
      PetscCall(PCTFS_SMI_sort((void *)gs->num_local_reduce, (void *)gs->local_reduce, gs->num_local + 1, SORT_INT_PTR));

      gs->num_local_gop        = t1;
      gs->num_local_total      = gs->num_local;
      gs->num_local -= t1;
      gs->gop_local_reduce     = gs->local_reduce;
      gs->num_gop_local_reduce = gs->num_local_reduce;

      /* restore the tagged counts and advance the "purely local" pointers
         past the gop section (plus its NULL terminator) */
      for (i = 0; i < t1; i++) {
        PetscCheck(gs->num_gop_local_reduce[i] < 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "they aren't negative?");
        gs->num_gop_local_reduce[i] *= -1;
        gs->local_reduce++;
        gs->num_local_reduce++;
      }
      gs->local_reduce++;
      gs->num_local_reduce++;
    }
  }

  /* remap pairwise and tree element lists to caller slots */
  elms = gs->pw_elm_list;
  nel  = gs->len_pw_list;
  for (i = 0; i < nel; i++) elms[i] = map[elms[i]];

  elms = gs->tree_map_in;
  nel  = gs->tree_map_sz;
  for (i = 0; i < nel; i++) elms[i] = map[elms[i]];

  /* clean up */
  free((void *)gs->local_elms);
  free((void *)gs->companion);
  free((void *)gs->elms);
  free((void *)gs->ngh_buf);
gs->local_elms = gs->companion = gs->elms = gs->ngh_buf = NULL;
  PetscFunctionReturn(0);
}

/******************************************************************************/
/* Append elm to the file-scope tree accumulator (tree_buf / ntree),
   allocating on first use and doubling the buffer when it fills. */
static PetscErrorCode place_in_tree(PetscInt elm) {
  PetscInt *tp, n;

  PetscFunctionBegin;
  if (ntree == tree_buf_sz) {
    if (tree_buf_sz) {
      /* grow: double capacity and copy the old contents over */
      tp = tree_buf;
      n  = tree_buf_sz;
      tree_buf_sz <<= 1;
      tree_buf = (PetscInt *)malloc(tree_buf_sz * sizeof(PetscInt));
      PCTFS_ivec_copy(tree_buf, tp, n);
      free(tp);
    } else {
      tree_buf_sz = TREE_BUF_SZ;
      tree_buf    = (PetscInt *)malloc(tree_buf_sz * sizeof(PetscInt));
    }
  }

  tree_buf[ntree++] = elm;
  PetscFunctionReturn(0);
}

/******************************************************************************/
/* For every global id in range, OR together the bit masks of all processes
   that own it (banded passes bounded by msg_buf), record per-element sharing
   masks in gs->ngh_buf, classify each shared element as pairwise (<= level
   sharers) or tree, and collect the neighbor lists. */
static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs) {
  PetscInt  i, j, npw = 0, ntree_map = 0;
  PetscInt  p_mask_size, ngh_buf_size, buf_size;
  PetscInt *p_mask, *sh_proc_mask, *pw_sh_proc_mask;
  PetscInt *ngh_buf, *buf1, *buf2;
  PetscInt  offset, per_load, num_loads, or_ct, start, end;
  PetscInt *ptr1, *ptr2, i_start, negl, nel, *elms;
  PetscInt  oper = GL_B_OR;
  PetscInt *ptr3, *t_mask, level, ct1, ct2;

  PetscFunctionBegin;
  /* to make life easier */
  nel   = gs->nel;
  elms  = gs->elms;
  level = gs->level;

  /* det #bytes needed for processor bit masks and init w/mask cor. to PCTFS_my_id */
  /* NB: p_mask_size is in BYTES until the division by sizeof(PetscInt) below */
  p_mask = (PetscInt *)malloc(p_mask_size = PCTFS_len_bit_mask(PCTFS_num_nodes));
  PetscCall(PCTFS_set_bit_mask(p_mask, p_mask_size, PCTFS_my_id));

  /* allocate space for masks and info bufs */
  gs->nghs       = sh_proc_mask = (PetscInt *)malloc(p_mask_size);
  gs->pw_nghs    = pw_sh_proc_mask = (PetscInt *)malloc(p_mask_size);
  gs->ngh_buf_sz = ngh_buf_size = p_mask_size * nel;
  t_mask         = (PetscInt *)malloc(p_mask_size);
  gs->ngh_buf    = ngh_buf = (PetscInt *)malloc(ngh_buf_size);

  /* comm buffer size ... memory usage bounded by ~2*msg_buf */
  /* had thought I could exploit rendezvous threshold */

  /* default is one pass */
  per_load      = negl = gs->negl;
  gs->num_loads = num_loads = 1;
  i             = p_mask_size * negl;

  /* possible overflow on buffer size */
  /* overflow hack */
  if (i < 0) i = INT_MAX;

  buf_size = PetscMin(msg_buf, i);

  /* can we do it? */
  PetscCheck(p_mask_size <= buf_size, PETSC_COMM_SELF, PETSC_ERR_PLIB, "get_ngh_buf() :: buf<pms :: %" PetscInt_FMT ">%" PetscInt_FMT, p_mask_size, buf_size);

  /* get PCTFS_giop buf space ... make *only* one malloc */
  buf1 = (PetscInt *)malloc(buf_size << 1);

  /* more than one gior exchange needed? */
  if (buf_size != i) {
    per_load      = buf_size / p_mask_size;
    buf_size      = per_load * p_mask_size;
    gs->num_loads = num_loads = negl / per_load + (negl % per_load > 0);
  }

  /* convert buf sizes from #bytes to #ints - 32 bit only! */
  p_mask_size /= sizeof(PetscInt);
  ngh_buf_size /= sizeof(PetscInt);
  buf_size /= sizeof(PetscInt);

  /* find PCTFS_giop work space */
  buf2 = buf1 + buf_size;

  /* hold #ints needed for processor masks */
  gs->mask_sz = p_mask_size;

  /* init buffers */
  PetscCall(PCTFS_ivec_zero(sh_proc_mask, p_mask_size));
  PetscCall(PCTFS_ivec_zero(pw_sh_proc_mask, p_mask_size));
  PetscCall(PCTFS_ivec_zero(ngh_buf, ngh_buf_size));

  /* HACK reset tree info */
  tree_buf    = NULL;
  tree_buf_sz = ntree = 0;

  /* ok do it */
  /* band [start,end) of the global id range per pass; elms is sorted so ptr2
     advances monotonically through the local ids */
  for (ptr1 = ngh_buf, ptr2 = elms, end = gs->gl_min, or_ct = i = 0; or_ct < num_loads; or_ct++) {
    /* identity for bitwise or is 000...000 */
    PCTFS_ivec_zero(buf1, buf_size);

    /* load msg buffer */
    for (start = end, end += per_load, i_start = i; (offset = *ptr2) < end; i++, ptr2++) {
      offset = (offset - start) * p_mask_size;
      PCTFS_ivec_copy(buf1 + offset, p_mask, p_mask_size);
    }

    /* GLOBAL: pass buffer */
    PetscCall(PCTFS_giop(buf1, buf2, buf_size, &oper));

    /* unload buffer into ngh_buf */
    ptr2 = (elms + i_start);
    for (ptr3 = buf1, j = start; j < end; ptr3 += p_mask_size, j++) {
      /* I own it ... may have to pairwise it */
      if (j == *ptr2) {
        /* do i share it w/anyone? */
        ct1 = PCTFS_ct_bits((char *)ptr3, p_mask_size * sizeof(PetscInt));
        /* guess not */
        if (ct1 < 2) {
          ptr2++;
          ptr1 += p_mask_size;
          continue;
        }

        /* i do ... so keep info and turn off my bit */
        PCTFS_ivec_copy(ptr1, ptr3, p_mask_size);
        PetscCall(PCTFS_ivec_xor(ptr1, p_mask, p_mask_size));
        PetscCall(PCTFS_ivec_or(sh_proc_mask, ptr1, p_mask_size));

        /* is it to be done pairwise? */
        if (--ct1 <= level) {
          npw++;

          /* turn on high bit to indicate pw need to process */
          *ptr2++ |= TOP_BIT;
          PetscCall(PCTFS_ivec_or(pw_sh_proc_mask, ptr1, p_mask_size));
          ptr1 += p_mask_size;
          continue;
        }

        /* get set for next and note that I have a tree contribution */
        /* could save exact elm index for tree here -> save a search */
        ptr2++;
        ptr1 += p_mask_size;
        ntree_map++;
      } else { /* i don't but still might be involved in tree */

        /* shared by how many? */
        ct1 = PCTFS_ct_bits((char *)ptr3, p_mask_size * sizeof(PetscInt));

        /* none! */
        if (ct1 < 2) continue;

        /* is it going to be done pairwise? but not by me of course!*/
        if (--ct1 <= level) continue;
      }
      /* LATER we're going to have to process it NOW */
      /* nope ... tree it */
      PetscCall(place_in_tree(j));
    }
  }

  free((void *)t_mask);
  free((void *)buf1);

  gs->len_pw_list = npw;
  gs->num_nghs    = PCTFS_ct_bits((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt));

  /* expand from bit mask list to int list and save ngh list */
  /* note: gs->nghs is re-pointed here; the old mask (sh_proc_mask) is freed below */
  gs->nghs = (PetscInt *)malloc(gs->num_nghs * sizeof(PetscInt));
  PCTFS_bm_to_proc((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt), gs->nghs);

  gs->num_pw_nghs = PCTFS_ct_bits((char *)pw_sh_proc_mask, p_mask_size * sizeof(PetscInt));

  /* global max of the neighbor counts */
  oper = GL_MAX;
  ct1  = gs->num_nghs;
  PetscCall(PCTFS_giop(&ct1, &ct2, 1, &oper));
  gs->max_nghs = ct1;

  gs->tree_map_sz   = ntree_map;
  gs->max_left_over = ntree;

  free((void *)p_mask);
  free((void *)sh_proc_mask);
  PetscFunctionReturn(0);
}

/******************************************************************************/
/* Build the pairwise exchange schedule from the sharing masks recorded in
   gs->ngh_buf: neighbor list, per-neighbor message sizes and element lists,
   persistent-request slots, and global min/max/avg message statistics. */
static PetscErrorCode set_pairwise(PCTFS_gs_id *gs) {
  PetscInt   i, j;
  PetscInt   p_mask_size;
  PetscInt  *p_mask, *sh_proc_mask, *tmp_proc_mask;
  PetscInt  *ngh_buf, *buf2;
  PetscInt   offset;
  PetscInt  *msg_list, *msg_size, **msg_nodes, nprs;
  PetscInt  *pairwise_elm_list, len_pair_list = 0;
  PetscInt  *iptr, t1, i_start, nel, *elms;
  PetscInt   ct;

  PetscFunctionBegin;
  /* to make life easier */
  nel          = gs->nel;
  elms         = gs->elms;
  ngh_buf      = gs->ngh_buf;
  sh_proc_mask = gs->pw_nghs;

  /* need a few temp masks */
  /* p_mask_size is in bytes here; converted to ints below */
  p_mask_size   = PCTFS_len_bit_mask(PCTFS_num_nodes);
  p_mask        = (PetscInt *)malloc(p_mask_size);
  tmp_proc_mask = (PetscInt *)malloc(p_mask_size);

  /* set mask to my PCTFS_my_id's bit mask */
  PetscCall(PCTFS_set_bit_mask(p_mask, p_mask_size, PCTFS_my_id));

  p_mask_size /= sizeof(PetscInt);

  len_pair_list   = gs->len_pw_list;
  gs->pw_elm_list = pairwise_elm_list = (PetscInt *)malloc((len_pair_list + 1) * sizeof(PetscInt));

  /* how many processors (nghs) do we have
to exchange with? */
  nprs = gs->num_pairs = PCTFS_ct_bits((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt));

  /* allocate space for PCTFS_gs_gop() info */
  gs->pair_list = msg_list = (PetscInt *)malloc(sizeof(PetscInt) * nprs);
  gs->msg_sizes = msg_size = (PetscInt *)malloc(sizeof(PetscInt) * nprs);
  gs->node_list = msg_nodes = (PetscInt **)malloc(sizeof(PetscInt *) * (nprs + 1));

  /* init msg_size list */
  PetscCall(PCTFS_ivec_zero(msg_size, nprs));

  /* expand from bit mask list to int list */
  PetscCall(PCTFS_bm_to_proc((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt), msg_list));

  /* keep list of elements being handled pairwise */
  /* TOP_BIT was set by get_ngh_buf(); clear it and record the element index */
  for (i = j = 0; i < nel; i++) {
    if (elms[i] & TOP_BIT) {
      elms[i] ^= TOP_BIT;
      pairwise_elm_list[j++] = i;
    }
  }
  pairwise_elm_list[j] = -1;

  /* nprs+1 requests; the extra slot terminates with MPI_REQUEST_NULL */
  gs->msg_ids_out       = (MPI_Request *)malloc(sizeof(MPI_Request) * (nprs + 1));
  gs->msg_ids_out[nprs] = MPI_REQUEST_NULL;
  gs->msg_ids_in        = (MPI_Request *)malloc(sizeof(MPI_Request) * (nprs + 1));
  gs->msg_ids_in[nprs]  = MPI_REQUEST_NULL;
  gs->pw_vals           = (PetscScalar *)malloc(sizeof(PetscScalar) * len_pair_list * vec_sz);

  /* find who goes to each processor */
  for (i_start = i = 0; i < nprs; i++) {
    /* processor i's mask */
    PetscCall(PCTFS_set_bit_mask(p_mask, p_mask_size * sizeof(PetscInt), msg_list[i]));

    /* det # going to processor i */
    /* an element goes to neighbor i iff i's bit is set in its sharing mask */
    for (ct = j = 0; j < len_pair_list; j++) {
      buf2 = ngh_buf + (pairwise_elm_list[j] * p_mask_size);
      PetscCall(PCTFS_ivec_and3(tmp_proc_mask, p_mask, buf2, p_mask_size));
      if (PCTFS_ct_bits((char *)tmp_proc_mask, p_mask_size * sizeof(PetscInt))) ct++;
    }
    msg_size[i] = ct;
    i_start     = PetscMax(i_start, ct);

    /*space to hold nodes in message to first neighbor */
    msg_nodes[i] = iptr = (PetscInt *)malloc(sizeof(PetscInt) * (ct + 1));

    /* second sweep records the indices (into pw_elm_list), -1 terminated */
    for (j = 0; j < len_pair_list; j++) {
      buf2 = ngh_buf + (pairwise_elm_list[j] * p_mask_size);
      PetscCall(PCTFS_ivec_and3(tmp_proc_mask, p_mask, buf2, p_mask_size));
      if (PCTFS_ct_bits((char *)tmp_proc_mask, p_mask_size * sizeof(PetscInt))) *iptr++ = j;
    }
    *iptr = -1;
  }
  msg_nodes[nprs] = NULL;

  /* global statistics over the largest per-neighbor message size */
  j  = gs->loc_node_pairs = i_start;
  t1 = GL_MAX;
  PetscCall(PCTFS_giop(&i_start, &offset, 1, &t1));
  gs->max_node_pairs = i_start;

  i_start = j;
  t1      = GL_MIN;
  PetscCall(PCTFS_giop(&i_start, &offset, 1, &t1));
  gs->min_node_pairs = i_start;

  i_start = j;
  t1      = GL_ADD;
  PetscCall(PCTFS_giop(&i_start, &offset, 1, &t1));
  gs->avg_node_pairs = i_start / PCTFS_num_nodes + 1;

  i_start = nprs;
  t1      = GL_MAX;
  /* NOTE(review): unchecked return here, unlike the PetscCall-wrapped
     PCTFS_giop calls above — confirm intentional */
  PCTFS_giop(&i_start, &offset, 1, &t1);
  gs->max_pairs = i_start;

  /* remap pairwise in tail of gsi_via_bit_mask() */
  gs->msg_total = PCTFS_ivec_sum(gs->msg_sizes, nprs);
  gs->out       = (PetscScalar *)malloc(sizeof(PetscScalar) * gs->msg_total * vec_sz);
  gs->in        = (PetscScalar *)malloc(sizeof(PetscScalar) * gs->msg_total * vec_sz);

  /* reset malloc pool */
  free((void *)p_mask);
  free((void *)tmp_proc_mask);
  PetscFunctionReturn(0);
}

/* to do pruned tree just save ngh buf copy for each one and decode here!
******************************************************************************/
/* Finalize the tree path: adopt the file-scope tree_buf accumulated by
   place_in_tree() as gs->tree_elms, allocate tree work buffers, and build
   the -1 terminated in/out maps between local element indices and tree
   slots by binary-searching the shorter list against the longer one. */
static PetscErrorCode set_tree(PCTFS_gs_id *gs) {
  PetscInt  i, j, n, nel;
  PetscInt *iptr_in, *iptr_out, *tree_elms, *elms;

  PetscFunctionBegin;
  /* local work ptrs */
  elms = gs->elms;
  nel  = gs->nel;

  /* how many via tree */
  gs->tree_nel  = n = ntree;
  gs->tree_elms = tree_elms = iptr_in = tree_buf;
  gs->tree_buf  = (PetscScalar *)malloc(sizeof(PetscScalar) * n * vec_sz);
  gs->tree_work = (PetscScalar *)malloc(sizeof(PetscScalar) * n * vec_sz);
  j             = gs->tree_map_sz;
  gs->tree_map_in  = iptr_in = (PetscInt *)malloc(sizeof(PetscInt) * (j + 1));
  gs->tree_map_out = iptr_out = (PetscInt *)malloc(sizeof(PetscInt) * (j + 1));

  /* search the longer of the two lists */
  /* note ... could save this info in get_ngh_buf and save searches */
  if (n <= nel) {
    /* bijective fct w/remap - search elm list */
    for (i = 0; i < n; i++) {
      if ((j = PCTFS_ivec_binary_search(*tree_elms++, elms, nel)) >= 0) {
        *iptr_in++  = j;
        *iptr_out++ = i;
      }
    }
  } else {
    for (i = 0; i < nel; i++) {
      if ((j = PCTFS_ivec_binary_search(*elms++, tree_elms, n)) >= 0) {
        *iptr_in++  = i;
        *iptr_out++ = j;
      }
    }
  }

  /* sentinel */
  *iptr_in = *iptr_out = -1;
  PetscFunctionReturn(0);
}

/******************************************************************************/
/* Broadcast each reduced value back to its duplicate slots: for every
   NULL-terminated list in gop_local_reduce, copy vals[map[0]] into the
   remaining mapped positions. Sizes 2/3/4 are unrolled; the general case
   walks the -1 terminated list. */
static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs, PetscScalar *vals) {
  PetscInt   *num, *map, **reduce;
  PetscScalar tmp;

  PetscFunctionBegin;
  num    = gs->num_gop_local_reduce;
  reduce = gs->gop_local_reduce;
  while ((map = *reduce++)) {
    /* wall */
    if (*num == 2) {
      num++;
      vals[map[1]] = vals[map[0]];
    } else if (*num == 3) { /* corner shared by three elements */
      num++;
      vals[map[2]] = vals[map[1]] = vals[map[0]];
    } else if (*num == 4) { /* corner shared by four elements */
      num++;
      vals[map[3]] = vals[map[2]] = vals[map[1]] = vals[map[0]];
    } else { /* general case ... odd geoms ... 3D*/
      num++;
      tmp = *(vals + *map++);
      while (*map >= 0) *(vals + *map++) = tmp;
    }
  }
  PetscFunctionReturn(0);
}

/******************************************************************************/
/* Full local gather-scatter with '+': sum each duplicate group into its
   head slot AND write the sum back to every member. Unrolled for group
   sizes 2/3/4; general case sums then re-walks the list to scatter. */
static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs, PetscScalar *vals) {
  PetscInt   *num, *map, **reduce;
  PetscScalar tmp;

  PetscFunctionBegin;
  num    = gs->num_local_reduce;
  reduce = gs->local_reduce;
  while ((map = *reduce)) {
    /* wall */
    if (*num == 2) {
      num++;
      reduce++;
      vals[map[1]] = vals[map[0]] += vals[map[1]];
    } else if (*num == 3) { /* corner shared by three elements */
      num++;
      reduce++;
      vals[map[2]] = vals[map[1]] = vals[map[0]] += (vals[map[1]] + vals[map[2]]);
    } else if (*num == 4) { /* corner shared by four elements */
      num++;
      reduce++;
      vals[map[1]] = vals[map[2]] = vals[map[3]] = vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]);
    } else { /* general case ... odd geoms ...
3D*/
      num++;
      /* sum the whole -1 terminated group, then scatter the total back */
      tmp = 0.0;
      while (*map >= 0) tmp += *(vals + *map++);

      map = *reduce++;
      while (*map >= 0) *(vals + *map++) = tmp;
    }
  }
  PetscFunctionReturn(0);
}

/******************************************************************************/
/* Gather-only local '+': accumulate each duplicate group into its head slot
   (vals[map[0]]); duplicates are NOT updated (that is gop_local_out's job). */
static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals) {
  PetscInt    *num, *map, **reduce;
  PetscScalar *base;

  PetscFunctionBegin;
  num    = gs->num_gop_local_reduce;
  reduce = gs->gop_local_reduce;
  while ((map = *reduce++)) {
    /* wall */
    if (*num == 2) {
      num++;
      vals[map[0]] += vals[map[1]];
    } else if (*num == 3) { /* corner shared by three elements */
      num++;
      vals[map[0]] += (vals[map[1]] + vals[map[2]]);
    } else if (*num == 4) { /* corner shared by four elements */
      num++;
      vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]);
    } else { /* general case ... odd geoms ... 3D*/
      num++;
      base = vals + *map++;
      while (*map >= 0) *base += *(vals + *map++);
    }
  }
  PetscFunctionReturn(0);
}

/******************************************************************************/
/* Release all resources owned by a handle created with PCTFS_gs_init():
   communicator, neighbor lists, tree buffers, pairwise schedule, local
   reduction lists, and finally the handle itself. */
PetscErrorCode PCTFS_gs_free(PCTFS_gs_id *gs) {
  PetscInt i;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_free(&gs->PCTFS_gs_comm));
  if (gs->nghs) free((void *)gs->nghs);
  if (gs->pw_nghs) free((void *)gs->pw_nghs);

  /* tree */
  if (gs->max_left_over) {
    if (gs->tree_elms) free((void *)gs->tree_elms);
    if (gs->tree_buf) free((void *)gs->tree_buf);
    if (gs->tree_work) free((void *)gs->tree_work);
    if (gs->tree_map_in) free((void *)gs->tree_map_in);
    if (gs->tree_map_out) free((void *)gs->tree_map_out);
  }

  /* pairwise info */
  if (gs->num_pairs) {
    /* should be NULL already */
    /* (gsi_via_bit_mask() frees these and NULLs the pointers) */
    if (gs->ngh_buf) free((void *)gs->ngh_buf);
    if (gs->elms) free((void *)gs->elms);
    if (gs->local_elms) free((void *)gs->local_elms);
    if (gs->companion) free((void *)gs->companion);

    /* only set if pairwise */
    if (gs->vals) free((void *)gs->vals);
    if (gs->in) free((void *)gs->in);
    if (gs->out) free((void *)gs->out);
    if (gs->msg_ids_in) free((void *)gs->msg_ids_in);
    if (gs->msg_ids_out) free((void *)gs->msg_ids_out);
    if (gs->pw_vals) free((void *)gs->pw_vals);
    if (gs->pw_elm_list) free((void *)gs->pw_elm_list);
    if (gs->node_list) {
      for (i = 0; i < gs->num_pairs; i++) {
        if (gs->node_list[i]) { free((void *)gs->node_list[i]); }
      }
      free((void *)gs->node_list);
    }
    if (gs->msg_sizes) free((void *)gs->msg_sizes);
    if (gs->pair_list) free((void *)gs->pair_list);
  }

  /* local info */
  if (gs->num_local_total >= 0) {
    for (i = 0; i < gs->num_local_total + 1; i++) {
      if (gs->num_gop_local_reduce[i]) free((void *)gs->gop_local_reduce[i]);
    }
  }

  /* if intersection tree/pairwise and local isn't empty */
  if (gs->gop_local_reduce) free((void *)gs->gop_local_reduce);
  if (gs->num_gop_local_reduce) free((void *)gs->num_gop_local_reduce);

  free((void *)gs);
  PetscFunctionReturn(0);
}

/******************************************************************************/
/* Public vector gather-scatter entry point. Only '+' is implemented; any
   other op logs a message and falls back to plus. step is the number of
   values per element. */
PetscErrorCode PCTFS_gs_gop_vec(PCTFS_gs_id *gs, PetscScalar *vals, const char *op, PetscInt step) {
  PetscFunctionBegin;
  switch (*op) {
  case '+': PCTFS_gs_gop_vec_plus(gs, vals, step); break;
  default:
    PetscCall(PetscInfo(0, "PCTFS_gs_gop_vec() :: %c is not a valid op\n", op[0]));
    PetscCall(PetscInfo(0, "PCTFS_gs_gop_vec() :: default :: plus\n"));
    /* NOTE(review): PCTFS_gs_gop_vec_plus() returns PetscErrorCode but is
       not PetscCall-wrapped here — confirm intentional */
    PCTFS_gs_gop_vec_plus(gs, vals, step);
    break;
  }
  PetscFunctionReturn(0);
}

/******************************************************************************/
/* Vector '+' driver: purely-local reduction first, then (when the local
   lists overlap the pairwise/tree sets) gather-in, pairwise OR tree
   exchange, and scatter-out; otherwise just the pairwise or tree path. */
static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step) {
  PetscFunctionBegin;
  PetscCheck(gs, PETSC_COMM_SELF, PETSC_ERR_PLIB, "PCTFS_gs_gop_vec() passed NULL gs handle!!!");

  /* local only operations!!! */
  if (gs->num_local) PCTFS_gs_gop_vec_local_plus(gs, vals, step);

  /* if intersection tree/pairwise and local isn't empty */
  if (gs->num_local_gop) {
    PCTFS_gs_gop_vec_local_in_plus(gs, vals, step);

    /* pairwise */
    if (gs->num_pairs) PCTFS_gs_gop_vec_pairwise_plus(gs, vals, step);

    /* tree */
    else if (gs->max_left_over) PCTFS_gs_gop_vec_tree_plus(gs, vals, step);

    PCTFS_gs_gop_vec_local_out(gs, vals, step);
  } else { /* if intersection tree/pairwise and local is empty */
    /* pairwise */
    if (gs->num_pairs) PCTFS_gs_gop_vec_pairwise_plus(gs, vals, step);

    /* tree */
    else if (gs->max_left_over) PCTFS_gs_gop_vec_tree_plus(gs, vals, step);
  }
  PetscFunctionReturn(0);
}

/******************************************************************************/
/* Vector analogue of PCTFS_gs_gop_local_plus(): sum each duplicate group's
   step-length segments into the head segment and copy the result back.
   (Definition continues beyond this chunk of the file.) */
static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step) {
  PetscInt    *num, *map, **reduce;
  PetscScalar *base;

  PetscFunctionBegin;
  num    = gs->num_local_reduce;
  reduce = gs->local_reduce;
  while ((map = *reduce)) {
    base = vals + map[0] * step;

    /* wall */
    if (*num == 2) {
      num++;
      reduce++;
      PCTFS_rvec_add(base, vals + map[1] * step, step);
      PCTFS_rvec_copy(vals + map[1] * step, base, step);
    } else if (*num == 3) { /* corner shared by three elements */
      num++;
      reduce++;
      PCTFS_rvec_add(base, vals + map[1] * step, step);
      PCTFS_rvec_add(base, vals + map[2] * step, step);
      PCTFS_rvec_copy(vals + map[2] * step, base, step);
      PCTFS_rvec_copy(vals + map[1] * step, base, step);
    } else if (*num == 4) { /* corner shared by four elements */
      num++;
      reduce++;
      PCTFS_rvec_add(base, vals + map[1] * step, step);
      PCTFS_rvec_add(base, vals + map[2] * step, step);
      PCTFS_rvec_add(base, vals + map[3] *
step, step); 1034 PCTFS_rvec_copy(vals + map[3] * step, base, step); 1035 PCTFS_rvec_copy(vals + map[2] * step, base, step); 1036 PCTFS_rvec_copy(vals + map[1] * step, base, step); 1037 } else { /* general case ... odd geoms ... 3D */ 1038 num++; 1039 while (*++map >= 0) PCTFS_rvec_add(base, vals + *map * step, step); 1040 1041 map = *reduce; 1042 while (*++map >= 0) PCTFS_rvec_copy(vals + *map * step, base, step); 1043 1044 reduce++; 1045 } 1046 } 1047 PetscFunctionReturn(0); 1048 } 1049 1050 /******************************************************************************/ 1051 static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step) { 1052 PetscInt *num, *map, **reduce; 1053 PetscScalar *base; 1054 1055 PetscFunctionBegin; 1056 num = gs->num_gop_local_reduce; 1057 reduce = gs->gop_local_reduce; 1058 while ((map = *reduce++)) { 1059 base = vals + map[0] * step; 1060 1061 /* wall */ 1062 if (*num == 2) { 1063 num++; 1064 PCTFS_rvec_add(base, vals + map[1] * step, step); 1065 } else if (*num == 3) { /* corner shared by three elements */ 1066 num++; 1067 PCTFS_rvec_add(base, vals + map[1] * step, step); 1068 PCTFS_rvec_add(base, vals + map[2] * step, step); 1069 } else if (*num == 4) { /* corner shared by four elements */ 1070 num++; 1071 PCTFS_rvec_add(base, vals + map[1] * step, step); 1072 PCTFS_rvec_add(base, vals + map[2] * step, step); 1073 PCTFS_rvec_add(base, vals + map[3] * step, step); 1074 } else { /* general case ... odd geoms ... 
3D*/
      /* general case: walk the -1 terminated duplicate list, accumulating
         each sharer's segment into base (the list head's segment) */
      num++;
      while (*++map >= 0) PCTFS_rvec_add(base, vals + *map * step, step);
    }
  }
  PetscFunctionReturn(0);
}

/******************************************************************************/
/* Vector flavor of the local "scatter" half: after the global reduction has
   landed in each duplicate list's head segment, copy that segment back over
   every other sharer so all duplicates agree.  Mirrors the structure of
   PCTFS_gs_gop_vec_local_in_plus (unrolled 2/3/4 fast paths plus a general
   -1 terminated list walk). */
static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step) {
  PetscInt    *num, *map, **reduce;
  PetscScalar *base;

  PetscFunctionBegin;
  num    = gs->num_gop_local_reduce;
  reduce = gs->gop_local_reduce;
  while ((map = *reduce++)) {
    base = vals + map[0] * step;

    /* wall */
    if (*num == 2) {
      num++;
      PCTFS_rvec_copy(vals + map[1] * step, base, step);
    } else if (*num == 3) { /* corner shared by three elements */
      num++;
      PCTFS_rvec_copy(vals + map[1] * step, base, step);
      PCTFS_rvec_copy(vals + map[2] * step, base, step);
    } else if (*num == 4) { /* corner shared by four elements */
      num++;
      PCTFS_rvec_copy(vals + map[1] * step, base, step);
      PCTFS_rvec_copy(vals + map[2] * step, base, step);
      PCTFS_rvec_copy(vals + map[3] * step, base, step);
    } else { /* general case ... odd geoms ... 3D */
      num++;
      while (*++map >= 0) PCTFS_rvec_copy(vals + *map * step, base, step);
    }
  }
  PetscFunctionReturn(0);
}

/******************************************************************************/
/* Pairwise exchange for the vector plus reduction: post nonblocking receives,
   pack and send the shared-node segments to each partner, optionally run the
   tree stage while messages are in flight, then sum the received segments into
   the pairwise buffer and scatter the result back into in_vals.
   Each (msg_list, msg_size, node_list) triple describes one partner; node
   lists are -1 terminated.  Body continues below. */
static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step) {
  PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2;
  PetscInt    *iptr, *msg_list, *msg_size, **msg_nodes;
  PetscInt    *pw, *list, *size, **nodes;
  MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
  MPI_Status   status;
  PetscBLASInt i1 = 1, dstep;

  PetscFunctionBegin;
  /* strip and load s */
  msg_list = list = gs->pair_list;
  msg_size = size = gs->msg_sizes;
  msg_nodes = nodes = gs->node_list;
  iptr = pw = gs->pw_elm_list;
  dptr1 = dptr3 = gs->pw_vals;
  msg_ids_in = ids_in = gs->msg_ids_in;
  msg_ids_out = ids_out = gs->msg_ids_out;
  dptr2 = gs->out;
  in1 = in2 = gs->in;

  /* post the receives */
  /* msg_nodes=nodes; */
  do {
    /* Should MPI_ANY_SOURCE be replaced by *list ?
In that case do the 1139 second one *list and do list++ afterwards */ 1140 PetscCallMPI(MPI_Irecv(in1, *size * step, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in)); 1141 list++; 1142 msg_ids_in++; 1143 in1 += *size++ * step; 1144 } while (*++msg_nodes); 1145 msg_nodes = nodes; 1146 1147 /* load gs values into in out gs buffers */ 1148 while (*iptr >= 0) { 1149 PCTFS_rvec_copy(dptr3, in_vals + *iptr * step, step); 1150 dptr3 += step; 1151 iptr++; 1152 } 1153 1154 /* load out buffers and post the sends */ 1155 while ((iptr = *msg_nodes++)) { 1156 dptr3 = dptr2; 1157 while (*iptr >= 0) { 1158 PCTFS_rvec_copy(dptr2, dptr1 + *iptr * step, step); 1159 dptr2 += step; 1160 iptr++; 1161 } 1162 PetscCallMPI(MPI_Isend(dptr3, *msg_size * step, MPIU_SCALAR, *msg_list, MSGTAG1 + PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out)); 1163 msg_size++; 1164 msg_list++; 1165 msg_ids_out++; 1166 } 1167 1168 /* tree */ 1169 if (gs->max_left_over) PCTFS_gs_gop_vec_tree_plus(gs, in_vals, step); 1170 1171 /* process the received data */ 1172 msg_nodes = nodes; 1173 while ((iptr = *nodes++)) { 1174 PetscScalar d1 = 1.0; 1175 1176 /* Should I check the return value of MPI_Wait() or status? */ 1177 /* Can this loop be replaced by a call to MPI_Waitall()? */ 1178 PetscCallMPI(MPI_Wait(ids_in, &status)); 1179 ids_in++; 1180 while (*iptr >= 0) { 1181 PetscCall(PetscBLASIntCast(step, &dstep)); 1182 PetscCallBLAS("BLASaxpy", BLASaxpy_(&dstep, &d1, in2, &i1, dptr1 + *iptr * step, &i1)); 1183 in2 += step; 1184 iptr++; 1185 } 1186 } 1187 1188 /* replace vals */ 1189 while (*pw >= 0) { 1190 PCTFS_rvec_copy(in_vals + *pw * step, dptr1, step); 1191 dptr1 += step; 1192 pw++; 1193 } 1194 1195 /* clear isend message handles */ 1196 /* This changed for clarity though it could be the same */ 1197 1198 /* Should I check the return value of MPI_Wait() or status? */ 1199 /* Can this loop be replaced by a call to MPI_Waitall()? 
*/ 1200 while (*msg_nodes++) { 1201 PetscCallMPI(MPI_Wait(ids_out, &status)); 1202 ids_out++; 1203 } 1204 PetscFunctionReturn(0); 1205 } 1206 1207 /******************************************************************************/ 1208 static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step) { 1209 PetscInt size, *in, *out; 1210 PetscScalar *buf, *work; 1211 PetscInt op[] = {GL_ADD, 0}; 1212 PetscBLASInt i1 = 1; 1213 PetscBLASInt dstep; 1214 1215 PetscFunctionBegin; 1216 /* copy over to local variables */ 1217 in = gs->tree_map_in; 1218 out = gs->tree_map_out; 1219 buf = gs->tree_buf; 1220 work = gs->tree_work; 1221 size = gs->tree_nel * step; 1222 1223 /* zero out collection buffer */ 1224 PCTFS_rvec_zero(buf, size); 1225 1226 /* copy over my contributions */ 1227 while (*in >= 0) { 1228 PetscCall(PetscBLASIntCast(step, &dstep)); 1229 PetscCallBLAS("BLAScopy", BLAScopy_(&dstep, vals + *in++ * step, &i1, buf + *out++ * step, &i1)); 1230 } 1231 1232 /* perform fan in/out on full buffer */ 1233 /* must change PCTFS_grop to handle the blas */ 1234 PCTFS_grop(buf, work, size, op); 1235 1236 /* reset */ 1237 in = gs->tree_map_in; 1238 out = gs->tree_map_out; 1239 1240 /* get the portion of the results I need */ 1241 while (*in >= 0) { 1242 PetscCall(PetscBLASIntCast(step, &dstep)); 1243 PetscCallBLAS("BLAScopy", BLAScopy_(&dstep, buf + *out++ * step, &i1, vals + *in++ * step, &i1)); 1244 } 1245 PetscFunctionReturn(0); 1246 } 1247 1248 /******************************************************************************/ 1249 PetscErrorCode PCTFS_gs_gop_hc(PCTFS_gs_id *gs, PetscScalar *vals, const char *op, PetscInt dim) { 1250 PetscFunctionBegin; 1251 switch (*op) { 1252 case '+': PCTFS_gs_gop_plus_hc(gs, vals, dim); break; 1253 default: 1254 PetscCall(PetscInfo(0, "PCTFS_gs_gop_hc() :: %c is not a valid op\n", op[0])); 1255 PetscCall(PetscInfo(0, "PCTFS_gs_gop_hc() :: default :: plus\n")); 1256 PCTFS_gs_gop_plus_hc(gs, vals, dim); 1257 
break; 1258 } 1259 PetscFunctionReturn(0); 1260 } 1261 1262 /******************************************************************************/ 1263 static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim) { 1264 PetscFunctionBegin; 1265 /* if there's nothing to do return */ 1266 if (dim <= 0) PetscFunctionReturn(0); 1267 1268 /* can't do more dimensions then exist */ 1269 dim = PetscMin(dim, PCTFS_i_log2_num_nodes); 1270 1271 /* local only operations!!! */ 1272 if (gs->num_local) PCTFS_gs_gop_local_plus(gs, vals); 1273 1274 /* if intersection tree/pairwise and local isn't empty */ 1275 if (gs->num_local_gop) { 1276 PCTFS_gs_gop_local_in_plus(gs, vals); 1277 1278 /* pairwise will do tree inside ... */ 1279 if (gs->num_pairs) PCTFS_gs_gop_pairwise_plus_hc(gs, vals, dim); /* tree only */ 1280 else if (gs->max_left_over) PCTFS_gs_gop_tree_plus_hc(gs, vals, dim); 1281 1282 PCTFS_gs_gop_local_out(gs, vals); 1283 } else { /* if intersection tree/pairwise and local is empty */ 1284 /* pairwise will do tree inside */ 1285 if (gs->num_pairs) PCTFS_gs_gop_pairwise_plus_hc(gs, vals, dim); /* tree */ 1286 else if (gs->max_left_over) PCTFS_gs_gop_tree_plus_hc(gs, vals, dim); 1287 } 1288 PetscFunctionReturn(0); 1289 } 1290 1291 /******************************************************************************/ 1292 static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim) { 1293 PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2; 1294 PetscInt *iptr, *msg_list, *msg_size, **msg_nodes; 1295 PetscInt *pw, *list, *size, **nodes; 1296 MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out; 1297 MPI_Status status; 1298 PetscInt i, mask = 1; 1299 1300 PetscFunctionBegin; 1301 for (i = 1; i < dim; i++) { 1302 mask <<= 1; 1303 mask++; 1304 } 1305 1306 /* strip and load s */ 1307 msg_list = list = gs->pair_list; 1308 msg_size = size = gs->msg_sizes; 1309 msg_nodes = nodes = gs->node_list; 1310 iptr = pw = 
gs->pw_elm_list; 1311 dptr1 = dptr3 = gs->pw_vals; 1312 msg_ids_in = ids_in = gs->msg_ids_in; 1313 msg_ids_out = ids_out = gs->msg_ids_out; 1314 dptr2 = gs->out; 1315 in1 = in2 = gs->in; 1316 1317 /* post the receives */ 1318 /* msg_nodes=nodes; */ 1319 do { 1320 /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the 1321 second one *list and do list++ afterwards */ 1322 if ((PCTFS_my_id | mask) == (*list | mask)) { 1323 PetscCallMPI(MPI_Irecv(in1, *size, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in)); 1324 list++; 1325 msg_ids_in++; 1326 in1 += *size++; 1327 } else { 1328 list++; 1329 size++; 1330 } 1331 } while (*++msg_nodes); 1332 1333 /* load gs values into in out gs buffers */ 1334 while (*iptr >= 0) *dptr3++ = *(in_vals + *iptr++); 1335 1336 /* load out buffers and post the sends */ 1337 msg_nodes = nodes; 1338 list = msg_list; 1339 while ((iptr = *msg_nodes++)) { 1340 if ((PCTFS_my_id | mask) == (*list | mask)) { 1341 dptr3 = dptr2; 1342 while (*iptr >= 0) *dptr2++ = *(dptr1 + *iptr++); 1343 /* CHECK PERSISTENT COMMS MODE FOR ALL THIS STUFF */ 1344 /* is msg_ids_out++ correct? */ 1345 PetscCallMPI(MPI_Isend(dptr3, *msg_size, MPIU_SCALAR, *list, MSGTAG1 + PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out)); 1346 msg_size++; 1347 list++; 1348 msg_ids_out++; 1349 } else { 1350 list++; 1351 msg_size++; 1352 } 1353 } 1354 1355 /* do the tree while we're waiting */ 1356 if (gs->max_left_over) PCTFS_gs_gop_tree_plus_hc(gs, in_vals, dim); 1357 1358 /* process the received data */ 1359 msg_nodes = nodes; 1360 list = msg_list; 1361 while ((iptr = *nodes++)) { 1362 if ((PCTFS_my_id | mask) == (*list | mask)) { 1363 /* Should I check the return value of MPI_Wait() or status? */ 1364 /* Can this loop be replaced by a call to MPI_Waitall()? 
*/ 1365 PetscCallMPI(MPI_Wait(ids_in, &status)); 1366 ids_in++; 1367 while (*iptr >= 0) *(dptr1 + *iptr++) += *in2++; 1368 } 1369 list++; 1370 } 1371 1372 /* replace vals */ 1373 while (*pw >= 0) *(in_vals + *pw++) = *dptr1++; 1374 1375 /* clear isend message handles */ 1376 /* This changed for clarity though it could be the same */ 1377 while (*msg_nodes++) { 1378 if ((PCTFS_my_id | mask) == (*msg_list | mask)) { 1379 /* Should I check the return value of MPI_Wait() or status? */ 1380 /* Can this loop be replaced by a call to MPI_Waitall()? */ 1381 PetscCallMPI(MPI_Wait(ids_out, &status)); 1382 ids_out++; 1383 } 1384 msg_list++; 1385 } 1386 PetscFunctionReturn(0); 1387 } 1388 1389 /******************************************************************************/ 1390 static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim) { 1391 PetscInt size; 1392 PetscInt *in, *out; 1393 PetscScalar *buf, *work; 1394 PetscInt op[] = {GL_ADD, 0}; 1395 1396 PetscFunctionBegin; 1397 in = gs->tree_map_in; 1398 out = gs->tree_map_out; 1399 buf = gs->tree_buf; 1400 work = gs->tree_work; 1401 size = gs->tree_nel; 1402 1403 PCTFS_rvec_zero(buf, size); 1404 1405 while (*in >= 0) *(buf + *out++) = *(vals + *in++); 1406 1407 in = gs->tree_map_in; 1408 out = gs->tree_map_out; 1409 1410 PCTFS_grop_hc(buf, work, size, op, dim); 1411 1412 while (*in >= 0) *(vals + *in++) = *(buf + *out++); 1413 PetscFunctionReturn(0); 1414 } 1415