/*
   Author: Henry M. Tufo III

   e-mail: hmt@cs.brown.edu

   snail-mail:
   Division of Applied Mathematics
   Brown University
   Providence, RI 02912

   Last Modification:
   6.21.97

   File Description:
   -----------------
   Setup for the PCTFS gather/scatter ("direct stiffness summation")
   machinery: builds the local/pairwise/tree communication schedules
   used by the gs_gop family of routines.
*/

#include <../src/ksp/pc/impls/tfs/tfs.h>

/* default length of number of items via tree - doubles if exceeded */
#define TREE_BUF_SZ 2048
#define GS_VEC_SZ   1

/*
   Type: struct gather_scatter_id
   Per-handle state: global element statistics, bit-mask neighbor info,
   local-repeat reduction lists, pairwise exchange schedule, and
   tree (crystal-router) fallback buffers.
*/
typedef struct gather_scatter_id {
  PetscInt     id;      /* handle number (1-based, from num_gs_ids) */
  PetscInt     nel_min; /* global min/max/sum of element-list lengths */
  PetscInt     nel_max;
  PetscInt     nel_sum;
  PetscInt     negl;   /* global id range size: gl_max - gl_min + 1 */
  PetscInt     gl_max; /* global max element id */
  PetscInt     gl_min; /* global min element id */
  PetscInt     repeats;
  PetscInt     ordered;
  PetscInt     positive;
  PetscScalar *vals;

  /* bit mask info */
  PetscInt  *my_proc_mask;
  PetscInt   mask_sz;
  PetscInt  *ngh_buf;
  PetscInt   ngh_buf_sz;
  PetscInt  *nghs;     /* list of neighbor ranks (int list form) */
  PetscInt   num_nghs;
  PetscInt   max_nghs; /* global max of num_nghs */
  PetscInt  *pw_nghs;  /* bit mask of pairwise neighbors */
  PetscInt   num_pw_nghs;
  PetscInt  *tree_nghs;
  PetscInt   num_tree_nghs;

  PetscInt num_loads;

  /* repeats == true -> local info */
  PetscInt  nel;        /* number of unique elements */
  PetscInt *elms;       /* of size nel */
  PetscInt  nel_total;
  PetscInt *local_elms; /* of size nel_total */
  PetscInt *companion;  /* of size nel_total */

  /* local info */
  PetscInt   num_local_total;
  PetscInt   local_strength; /* NONE, FULL, or PARTIAL */
  PetscInt   num_local;
  PetscInt  *num_local_reduce;
  PetscInt **local_reduce;
  PetscInt   num_local_gop;
  PetscInt  *num_gop_local_reduce;
  PetscInt **gop_local_reduce;

  /* pairwise info */
  PetscInt     level;
  PetscInt     num_pairs;
  PetscInt     max_pairs;
  PetscInt     loc_node_pairs;
  PetscInt     max_node_pairs;
  PetscInt     min_node_pairs;
  PetscInt     avg_node_pairs;
  PetscInt    *pair_list; /* ranks we exchange with */
  PetscInt    *msg_sizes; /* per-neighbor message length */
  PetscInt   **node_list; /* per-neighbor element index lists, -1 terminated */
  PetscInt     len_pw_list;
  PetscInt    *pw_elm_list;
  PetscScalar *pw_vals;

  MPI_Request *msg_ids_in;
  MPI_Request *msg_ids_out;

  PetscScalar *out;
  PetscScalar *in;
  PetscInt     msg_total;

  /* tree - crystal accumulator info */
  PetscInt   max_left_over;
  PetscInt  *pre;
  PetscInt  *in_num;
  PetscInt  *out_num;
  PetscInt **in_list;
  PetscInt **out_list;

  /* new tree work*/
  PetscInt     tree_nel;
  PetscInt    *tree_elms;
  PetscScalar *tree_buf;
  PetscScalar *tree_work;

  PetscInt  tree_map_sz;
  PetscInt *tree_map_in;
  PetscInt *tree_map_out;

  /* current memory status */
  PetscInt gl_bss_min;
  PetscInt gl_perm_min;

  /* max segment size for PCTFS_gs_gop_vec() */
  PetscInt vec_sz;

  /* hack to make paul happy */
  MPI_Comm PCTFS_gs_comm;

} PCTFS_gs_id;

static PCTFS_gs_id   *gsi_check_args(PetscInt *elms, PetscInt nel, PetscInt level);
static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs);
static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs);
static PetscErrorCode set_pairwise(PCTFS_gs_id *gs);
static PCTFS_gs_id   *gsi_new(void);
static PetscErrorCode set_tree(PCTFS_gs_id *gs);

/* same for all but vector flavor */
static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs, PetscScalar *vals);
/* vector flavor */
static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);

static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step);
static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step);
static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);

static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs, PetscScalar *vals);
static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals);

static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim);
static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim);
static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim);

/* global vars */
/* from comm.c module */

static PetscInt num_gs_ids = 0;

/* should make this dynamic ... later */
static PetscInt  msg_buf     = MAX_MSG_BUF;
static PetscInt  vec_sz      = GS_VEC_SZ;
static PetscInt *tree_buf    = NULL;
static PetscInt  tree_buf_sz = 0;
static PetscInt  ntree       = 0;

/***************************************************************************/
/* Set the segment size used by PCTFS_gs_gop_vec() buffers. */
PetscErrorCode PCTFS_gs_init_vec_sz(PetscInt size)
{
  PetscFunctionBegin;
  vec_sz = size;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/******************************************************************************/
/* Set the communication buffer size (bytes) used in get_ngh_buf(). */
PetscErrorCode PCTFS_gs_init_msg_buf_sz(PetscInt buf_size)
{
  PetscFunctionBegin;
  msg_buf = buf_size;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/******************************************************************************/
/*
  Create a gather/scatter handle for the global element ids in elms[0..nel-1].
  Zero ids are ignored; 'level' bounds how many sharers are handled pairwise
  (the rest go through the tree).  Collective over MPI_COMM_WORLD; aborts on
  error.  The returned handle owns its memory and is freed by PCTFS_gs_free().
*/
PCTFS_gs_id *PCTFS_gs_init(PetscInt *elms, PetscInt nel, PetscInt level)
{
  PCTFS_gs_id *gs;
  MPI_Group    PCTFS_gs_group;
  MPI_Comm     PCTFS_gs_comm;

  /* ensure that communication package has been initialized */
  PetscCallAbort(PETSC_COMM_SELF, PCTFS_comm_init());

  /* determines if we have enough dynamic/semi-static memory */
  /* checks input, allocs and sets gd_id template */
  gs = gsi_check_args(elms, nel, level);

  /* only bit mask version up and working for the moment */
  /* LATER :: get int list version working for sparse pblms */
  PetscCallAbort(PETSC_COMM_WORLD, gsi_via_bit_mask(gs));

  /* duplicate MPI_COMM_WORLD into a private communicator for this handle */
  PetscCallAbort(PETSC_COMM_WORLD, MPI_Comm_group(MPI_COMM_WORLD, &PCTFS_gs_group) ? PETSC_ERR_MPI : PETSC_SUCCESS);
  PetscCallAbort(PETSC_COMM_WORLD, MPI_Comm_create(MPI_COMM_WORLD, PCTFS_gs_group, &PCTFS_gs_comm) ? PETSC_ERR_MPI : PETSC_SUCCESS);
  PetscCallAbort(PETSC_COMM_WORLD, MPI_Group_free(&PCTFS_gs_group) ? PETSC_ERR_MPI : PETSC_SUCCESS);

  gs->PCTFS_gs_comm = PCTFS_gs_comm;

  return gs;
}

/******************************************************************************/
/* Allocate and zero a fresh PCTFS_gs_id template. */
static PCTFS_gs_id *gsi_new(void)
{
  PCTFS_gs_id *gs;
  gs = (PCTFS_gs_id *)malloc(sizeof(PCTFS_gs_id));
  PetscCallAbort(PETSC_COMM_WORLD, PetscMemzero(gs, sizeof(PCTFS_gs_id)));
  return gs;
}

/******************************************************************************/
/*
  Validate the caller's element list, strip zero ids, sort, compress repeats
  into local reduction lists, and exchange global statistics (min/max/sum of
  nel, global id range, level).  Returns the partially-initialized handle.
*/
static PCTFS_gs_id *gsi_check_args(PetscInt *in_elms, PetscInt nel, PetscInt level)
{
  PetscInt     i, j, k, t2;
  PetscInt    *companion, *elms, *unique, *iptr;
  PetscInt     num_local = 0, *num_to_reduce, **local_reduce;
  PetscInt     oprs[] = {NON_UNIFORM, GL_MIN, GL_MAX, GL_ADD, GL_MIN, GL_MAX, GL_MIN, GL_B_AND};
  PetscInt     vals[PETSC_STATIC_ARRAY_LENGTH(oprs) - 1];
  PetscInt     work[PETSC_STATIC_ARRAY_LENGTH(oprs) - 1];
  PCTFS_gs_id *gs;

  if (!in_elms) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "elms point to nothing!!!\n");
  if (nel < 0) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "can't have fewer than 0 elms!!!\n");

  if (nel == 0) PetscCallAbort(PETSC_COMM_WORLD, PetscInfo(0, "I don't have any elements!!!\n"));

  /* get space for gs template */
  gs     = gsi_new();
  gs->id = ++num_gs_ids;

  /* hmt 6.4.99 */
  /* caller can set global ids that don't participate to 0 */
  /* PCTFS_gs_init ignores all zeros in elm list */
  /* negative global ids are still invalid */
  for (i = j = 0; i < nel; i++) {
    if (in_elms[i] != 0) j++;
  }

  /* k keeps the original length; nel becomes the nonzero count */
  k   = nel;
  nel = j;

  /* copy over in_elms list and create inverse map */
  /* one extra slot for the sentinel written below */
  elms = (PetscInt *)malloc((nel + 1) * sizeof(PetscInt));
  companion = (PetscInt *)malloc(nel * sizeof(PetscInt));

  /* companion[j] remembers the original position of each nonzero id */
  for (i = j = 0; i < k; i++) {
    if (in_elms[i] != 0) {
      elms[j]        = in_elms[i];
      companion[j++] = i;
    }
  }

  if (j != nel) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "nel j mismatch!\n");

  /* pre-pass ... check to see if sorted */
  elms[nel] = INT_MAX;
  iptr      = elms;
  unique    = elms + 1;
  j         = 0;
  while (*iptr != INT_MAX) {
    if (*iptr++ > *unique++) {
      j = 1;
      break;
    }
  }

  /* set up inverse map: sort (elms, companion) together only if needed */
  if (j) {
    PetscCallAbort(PETSC_COMM_WORLD, PetscInfo(0, "gsi_check_args() :: elm list *not* sorted!\n"));
    PetscCallAbort(PETSC_COMM_WORLD, PCTFS_SMI_sort((void *)elms, (void *)companion, nel, SORT_INTEGER));
  } else PetscCallAbort(PETSC_COMM_WORLD, PetscInfo(0, "gsi_check_args() :: elm list sorted!\n"));
  elms[nel] = INT_MIN; /* sentinel so the clumping loop below terminates */

  /* first pass */
  /* determine number of unique elements, check pd */
  for (i = k = 0; i < nel; i += j) {
    t2 = elms[i];
    j  = ++i;

    /* clump 'em for now */
    while (elms[j] == t2) j++;

    /* how many together and num local */
    if (j -= i) {
      num_local++;
      k += j;
    }
  }

  /* how many unique elements? (k counts the surplus repeats) */
  gs->repeats = k;
  gs->nel     = nel - k;

  /* number of repeats? */
  gs->num_local = num_local;
  num_local += 2; /* two extra slots for the partition-sort hack below */
  gs->local_reduce     = local_reduce = (PetscInt **)malloc(num_local * sizeof(PetscInt *));
  gs->num_local_reduce = num_to_reduce = (PetscInt *)malloc(num_local * sizeof(PetscInt));

  unique          = (PetscInt *)malloc((gs->nel + 1) * sizeof(PetscInt));
  gs->elms        = unique;
  gs->nel_total   = nel;
  gs->local_elms  = elms;
  gs->companion   = companion;

  /* compess map as well as keep track of local ops */
  /* companion is compressed in place: safe because i <= j always holds */
  for (num_local = i = j = 0; i < gs->nel; i++) {
    k            = j;
    t2           = unique[i] = elms[j];
    companion[i] = companion[j];

    while (elms[j] == t2) j++;

    if ((t2 = (j - k)) > 1) {
      /* number together */
      num_to_reduce[num_local] = t2++;

      iptr = local_reduce[num_local++] = (PetscInt *)malloc(t2 * sizeof(PetscInt));

      /* to use binary searching don't remap until we check intersection */
      *iptr++ = i;

      /* note that we're skipping the first one */
      while (++k < j) *(iptr++) = companion[k];
      *iptr = -1; /* -1 terminates each reduction list */
    }
  }

  /* sentinel for ngh_buf */
  unique[gs->nel] = INT_MAX;

  /* for two partition sort hack */
  num_to_reduce[num_local] = 0;
  local_reduce[num_local]  = NULL;
  num_to_reduce[++num_local] = 0;
  local_reduce[num_local]    = NULL;

  /* load 'em up */
  /* note one extra to hold NON_UNIFORM flag!!! */
  vals[2] = vals[1] = vals[0] = nel;
  if (gs->nel > 0) {
    vals[3] = unique[0];
    vals[4] = unique[gs->nel - 1];
  } else {
    vals[3] = INT_MAX;
    vals[4] = INT_MIN;
  }
  vals[5] = level;
  vals[6] = num_gs_ids;

  /* GLOBAL: send 'em out */
  PetscCallAbort(PETSC_COMM_WORLD, PCTFS_giop(vals, work, PETSC_STATIC_ARRAY_LENGTH(oprs) - 1, oprs));

  /* must be semi-pos def - only pairwise depends on this */
  /* LATER - remove this restriction */
  if (vals[3] < 0) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "gsi_check_args() :: system not semi-pos def \n");
  if (vals[4] == INT_MAX) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "gsi_check_args() :: system ub too large !\n");

  gs->nel_min = vals[0];
  gs->nel_max = vals[1];
  gs->nel_sum = vals[2];
  gs->gl_min  = vals[3];
  gs->gl_max  = vals[4];
  gs->negl    = vals[4] - vals[3] + 1;

  if (gs->negl <= 0) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "gsi_check_args() :: system empty or neg :: %" PetscInt_FMT "\n", gs->negl);

  /* LATER :: add level == -1 -> program selects level */
  if (vals[5] < 0) vals[5] = 0;
  else if (vals[5] > PCTFS_num_nodes) vals[5] = PCTFS_num_nodes;
  gs->level = vals[5];

  return gs;
}

/******************************************************************************/
/*
  Drive the bit-mask setup: build the neighbor buffer, then the pairwise and
  tree schedules, classify local reductions by whether they intersect the
  communicated sets, and finally remap element lists through the companion map.
*/
static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs)
{
  PetscInt   i, nel, *elms;
  PetscInt   t1;
  PetscInt **reduce;
  PetscInt  *map;

  PetscFunctionBegin;
  /* totally local removes ... PCTFS_ct_bits == 0 */
  PetscCall(get_ngh_buf(gs));

  if (gs->level) PetscCall(set_pairwise(gs));
  if (gs->max_left_over) PetscCall(set_tree(gs));

  /* intersection local and pairwise/tree? */
  gs->num_local_total      = gs->num_local;
  gs->gop_local_reduce     = gs->local_reduce;
  gs->num_gop_local_reduce = gs->num_local_reduce;

  map = gs->companion;

  /* is there any local compression */
  if (!gs->num_local) {
    gs->local_strength = NONE;
    gs->num_local_gop  = 0;
  } else {
    /* ok find intersection */
    map    = gs->companion;
    reduce = gs->local_reduce;
    for (i = 0, t1 = 0; i < gs->num_local; i++, reduce++) {
      /* negate the count to mark lists that also appear pairwise or in tree */
      if ((PCTFS_ivec_binary_search(**reduce, gs->pw_elm_list, gs->len_pw_list) >= 0) || PCTFS_ivec_binary_search(**reduce, gs->tree_map_in, gs->tree_map_sz) >= 0) {
        t1++;
        PetscCheck(gs->num_local_reduce[i] > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "nobody in list?");
        gs->num_local_reduce[i] *= -1;
      }
      **reduce = map[**reduce];
    }

    /* intersection is empty */
    if (!t1) {
      gs->local_strength = FULL;
      gs->num_local_gop  = 0;
    } else { /* intersection not empty */
      gs->local_strength = PARTIAL;

      /* sort so the negated (shared) lists come first */
      PetscCall(PCTFS_SMI_sort((void *)gs->num_local_reduce, (void *)gs->local_reduce, gs->num_local + 1, SORT_INT_PTR));

      gs->num_local_gop        = t1;
      gs->num_local_total      = gs->num_local;
      gs->num_local -= t1;
      gs->gop_local_reduce     = gs->local_reduce;
      gs->num_gop_local_reduce = gs->num_local_reduce;

      /* restore positive counts and advance past the gop section */
      for (i = 0; i < t1; i++) {
        PetscCheck(gs->num_gop_local_reduce[i] < 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "they aren't negative?");
        gs->num_gop_local_reduce[i] *= -1;
        gs->local_reduce++;
        gs->num_local_reduce++;
      }
      gs->local_reduce++;
      gs->num_local_reduce++;
    }
  }

  /* remap pairwise and tree element lists into local index space */
  elms = gs->pw_elm_list;
  nel  = gs->len_pw_list;
  for (i = 0; i < nel; i++) elms[i] = map[elms[i]];

  elms = gs->tree_map_in;
  nel  = gs->tree_map_sz;
  for (i = 0; i < nel; i++) elms[i] = map[elms[i]];

  /* clean up */
  free((void *)gs->local_elms);
  free((void *)gs->companion);
  free((void *)gs->elms);
  free((void *)gs->ngh_buf);
  /* null the freed pointers so PCTFS_gs_free() does not double-free */
  gs->local_elms = gs->companion = gs->elms = gs->ngh_buf = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/******************************************************************************/
/* Append elm to the module-level tree accumulator, doubling the buffer
   geometrically when it fills up. */
static PetscErrorCode place_in_tree(PetscInt elm)
{
  PetscInt *tp, n;

  PetscFunctionBegin;
  if (ntree == tree_buf_sz) {
    if (tree_buf_sz) {
      tp = tree_buf;
      n  = tree_buf_sz;
      tree_buf_sz <<= 1;
      tree_buf = (PetscInt *)malloc(tree_buf_sz * sizeof(PetscInt));
      PCTFS_ivec_copy(tree_buf, tp, n);
      free(tp);
    } else {
      tree_buf_sz = TREE_BUF_SZ;
      tree_buf    = (PetscInt *)malloc(tree_buf_sz * sizeof(PetscInt));
    }
  }

  tree_buf[ntree++] = elm;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/******************************************************************************/
/*
  Exchange per-element processor bit masks (in num_loads passes bounded by
  msg_buf) to discover, for each owned element, which ranks share it.  Elements
  shared by <= level+1 ranks are marked for pairwise exchange (TOP_BIT set in
  elms[]); the rest are recorded for the tree via place_in_tree().
  NOTE(review): sizes here start in bytes and are later divided by
  sizeof(PetscInt) to become int counts - keep the two regimes straight.
*/
static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs)
{
  PetscInt  i, j, npw = 0, ntree_map = 0;
  PetscInt  p_mask_size, ngh_buf_size, buf_size;
  PetscInt *p_mask, *sh_proc_mask, *pw_sh_proc_mask;
  PetscInt *ngh_buf, *buf1, *buf2;
  PetscInt  offset, per_load, num_loads, or_ct, start, end;
  PetscInt *ptr1, *ptr2, i_start, negl, nel, *elms;
  PetscInt  oper = GL_B_OR;
  PetscInt *ptr3, *t_mask, level, ct1, ct2;

  PetscFunctionBegin;
  /* to make life easier */
  nel   = gs->nel;
  elms  = gs->elms;
  level = gs->level;

  /* det #bytes needed for processor bit masks and init w/mask cor. to PCTFS_my_id */
  p_mask = (PetscInt *)malloc(p_mask_size = PCTFS_len_bit_mask(PCTFS_num_nodes));
  PetscCall(PCTFS_set_bit_mask(p_mask, p_mask_size, PCTFS_my_id));

  /* allocate space for masks and info bufs (sizes in bytes here) */
  gs->nghs       = sh_proc_mask = (PetscInt *)malloc(p_mask_size);
  gs->pw_nghs    = pw_sh_proc_mask = (PetscInt *)malloc(p_mask_size);
  gs->ngh_buf_sz = ngh_buf_size = p_mask_size * nel;
  t_mask         = (PetscInt *)malloc(p_mask_size);
  gs->ngh_buf    = ngh_buf = (PetscInt *)malloc(ngh_buf_size);

  /* comm buffer size ... memory usage bounded by ~2*msg_buf */
  /* had thought I could exploit rendezvous threshold */

  /* default is one pass */
  per_load      = negl = gs->negl;
  gs->num_loads = num_loads = 1;
  i             = p_mask_size * negl;

  /* possible overflow on buffer size */
  /* overflow hack */
  if (i < 0) i = INT_MAX;

  buf_size = PetscMin(msg_buf, i);

  /* can we do it? */
  PetscCheck(p_mask_size <= buf_size, PETSC_COMM_SELF, PETSC_ERR_PLIB, "get_ngh_buf() :: buf<pms :: %" PetscInt_FMT ">%" PetscInt_FMT, p_mask_size, buf_size);

  /* get PCTFS_giop buf space ... make *only* one malloc */
  buf1 = (PetscInt *)malloc(buf_size << 1);

  /* more than one gior exchange needed? */
  if (buf_size != i) {
    per_load      = buf_size / p_mask_size;
    buf_size      = per_load * p_mask_size;
    gs->num_loads = num_loads = negl / per_load + (negl % per_load > 0);
  }

  /* convert buf sizes from #bytes to #ints - 32-bit only! */
  p_mask_size /= sizeof(PetscInt);
  ngh_buf_size /= sizeof(PetscInt);
  buf_size /= sizeof(PetscInt);

  /* find PCTFS_giop work space */
  buf2 = buf1 + buf_size;

  /* hold #ints needed for processor masks */
  gs->mask_sz = p_mask_size;

  /* init buffers */
  PetscCall(PCTFS_ivec_zero(sh_proc_mask, p_mask_size));
  PetscCall(PCTFS_ivec_zero(pw_sh_proc_mask, p_mask_size));
  PetscCall(PCTFS_ivec_zero(ngh_buf, ngh_buf_size));

  /* HACK reset tree info */
  tree_buf    = NULL;
  tree_buf_sz = ntree = 0;

  /* ok do it */
  for (ptr1 = ngh_buf, ptr2 = elms, end = gs->gl_min, or_ct = i = 0; or_ct < num_loads; or_ct++) {
    /* identity for bitwise or is 000...000 */
    PetscCall(PCTFS_ivec_zero(buf1, buf_size));

    /* load msg buffer: stamp my mask at each owned id in [start,end) */
    for (start = end, end += per_load, i_start = i; (offset = *ptr2) < end; i++, ptr2++) {
      offset = (offset - start) * p_mask_size;
      PCTFS_ivec_copy(buf1 + offset, p_mask, p_mask_size);
    }

    /* GLOBAL: pass buffer */
    PetscCall(PCTFS_giop(buf1, buf2, buf_size, &oper));

    /* unload buffer into ngh_buf */
    ptr2 = (elms + i_start);
    for (ptr3 = buf1, j = start; j < end; ptr3 += p_mask_size, j++) {
      /* I own it ... may have to pairwise it */
      if (j == *ptr2) {
        /* do i share it w/anyone? */
        ct1 = PCTFS_ct_bits((char *)ptr3, p_mask_size * sizeof(PetscInt));
        /* guess not */
        if (ct1 < 2) {
          ptr2++;
          ptr1 += p_mask_size;
          continue;
        }

        /* i do ... so keep info and turn off my bit */
        PCTFS_ivec_copy(ptr1, ptr3, p_mask_size);
        PetscCall(PCTFS_ivec_xor(ptr1, p_mask, p_mask_size));
        PetscCall(PCTFS_ivec_or(sh_proc_mask, ptr1, p_mask_size));

        /* is it to be done pairwise? */
        if (--ct1 <= level) {
          npw++;

          /* turn on high bit to indicate pw need to process */
          *ptr2++ |= TOP_BIT;
          PetscCall(PCTFS_ivec_or(pw_sh_proc_mask, ptr1, p_mask_size));
          ptr1 += p_mask_size;
          continue;
        }

        /* get set for next and note that I have a tree contribution */
        /* could save exact elm index for tree here -> save a search */
        ptr2++;
        ptr1 += p_mask_size;
        ntree_map++;
      } else { /* i don't but still might be involved in tree */

        /* shared by how many? */
        ct1 = PCTFS_ct_bits((char *)ptr3, p_mask_size * sizeof(PetscInt));

        /* none! */
        if (ct1 < 2) continue;

        /* is it going to be done pairwise? but not by me of course!*/
        if (--ct1 <= level) continue;
      }
      /* LATER we're going to have to process it NOW */
      /* nope ... tree it */
      PetscCall(place_in_tree(j));
    }
  }

  free((void *)t_mask);
  free((void *)buf1);

  gs->len_pw_list = npw;
  gs->num_nghs    = PCTFS_ct_bits((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt));

  /* expand from bit mask list to int list and save ngh list */
  /* replaces the bit-mask allocation stashed in gs->nghs above */
  gs->nghs = (PetscInt *)malloc(gs->num_nghs * sizeof(PetscInt));
  PetscCall(PCTFS_bm_to_proc((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt), gs->nghs));

  gs->num_pw_nghs = PCTFS_ct_bits((char *)pw_sh_proc_mask, p_mask_size * sizeof(PetscInt));

  oper = GL_MAX;
  ct1  = gs->num_nghs;
  PetscCall(PCTFS_giop(&ct1, &ct2, 1, &oper));
  gs->max_nghs = ct1;

  gs->tree_map_sz   = ntree_map;
  gs->max_left_over = ntree;

  free((void *)p_mask);
  free((void *)sh_proc_mask);
  PetscFunctionReturn(PETSC_SUCCESS);
}

/******************************************************************************/
/*
  Build the pairwise exchange schedule from the bit masks produced by
  get_ngh_buf(): neighbor rank list, per-neighbor message sizes and element
  index lists, request arrays, and send/receive staging buffers.
*/
static PetscErrorCode set_pairwise(PCTFS_gs_id *gs)
{
  PetscInt  i, j;
  PetscInt  p_mask_size;
  PetscInt *p_mask, *sh_proc_mask, *tmp_proc_mask;
  PetscInt *ngh_buf, *buf2;
  PetscInt  offset;
  PetscInt
    *msg_list, *msg_size, **msg_nodes, nprs;
  PetscInt *pairwise_elm_list, len_pair_list = 0;
  PetscInt *iptr, t1, i_start, nel, *elms;
  PetscInt  ct;

  PetscFunctionBegin;
  /* to make life easier */
  nel          = gs->nel;
  elms         = gs->elms;
  ngh_buf      = gs->ngh_buf;
  sh_proc_mask = gs->pw_nghs;

  /* need a few temp masks (byte-sized allocations) */
  p_mask_size   = PCTFS_len_bit_mask(PCTFS_num_nodes);
  p_mask        = (PetscInt *)malloc(p_mask_size);
  tmp_proc_mask = (PetscInt *)malloc(p_mask_size);

  /* set mask to my PCTFS_my_id's bit mask */
  PetscCall(PCTFS_set_bit_mask(p_mask, p_mask_size, PCTFS_my_id));

  /* from here on p_mask_size counts ints, not bytes */
  p_mask_size /= sizeof(PetscInt);

  len_pair_list   = gs->len_pw_list;
  gs->pw_elm_list = pairwise_elm_list = (PetscInt *)malloc((len_pair_list + 1) * sizeof(PetscInt));

  /* how many processors (nghs) do we have to exchange with? */
  nprs = gs->num_pairs = PCTFS_ct_bits((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt));

  /* allocate space for PCTFS_gs_gop() info */
  gs->pair_list = msg_list = (PetscInt *)malloc(sizeof(PetscInt) * nprs);
  gs->msg_sizes = msg_size = (PetscInt *)malloc(sizeof(PetscInt) * nprs);
  gs->node_list = msg_nodes = (PetscInt **)malloc(sizeof(PetscInt *) * (nprs + 1));

  /* init msg_size list */
  PetscCall(PCTFS_ivec_zero(msg_size, nprs));

  /* expand from bit mask list to int list */
  PetscCall(PCTFS_bm_to_proc((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt), msg_list));

  /* keep list of elements being handled pairwise (marked via TOP_BIT) */
  for (i = j = 0; i < nel; i++) {
    if (elms[i] & TOP_BIT) {
      elms[i] ^= TOP_BIT; /* clear the marker bit */
      pairwise_elm_list[j++] = i;
    }
  }
  pairwise_elm_list[j] = -1;

  gs->msg_ids_out       = (MPI_Request *)malloc(sizeof(MPI_Request) * (nprs + 1));
  gs->msg_ids_out[nprs] = MPI_REQUEST_NULL;
  gs->msg_ids_in        = (MPI_Request *)malloc(sizeof(MPI_Request) * (nprs + 1));
  gs->msg_ids_in[nprs]  = MPI_REQUEST_NULL;
  gs->pw_vals           = (PetscScalar *)malloc(sizeof(PetscScalar) * len_pair_list * vec_sz);

  /* find who goes to each processor */
  for (i_start = i = 0; i < nprs; i++) {
    /* processor i's mask */
    PetscCall(PCTFS_set_bit_mask(p_mask, p_mask_size * sizeof(PetscInt), msg_list[i]));

    /* det # going to processor i */
    for (ct = j = 0; j < len_pair_list; j++) {
      buf2 = ngh_buf + (pairwise_elm_list[j] * p_mask_size);
      PetscCall(PCTFS_ivec_and3(tmp_proc_mask, p_mask, buf2, p_mask_size));
      if (PCTFS_ct_bits((char *)tmp_proc_mask, p_mask_size * sizeof(PetscInt))) ct++;
    }
    msg_size[i] = ct;
    i_start     = PetscMax(i_start, ct);

    /*space to hold nodes in message to first neighbor */
    msg_nodes[i] = iptr = (PetscInt *)malloc(sizeof(PetscInt) * (ct + 1));

    /* second sweep records which pairwise elements go to neighbor i */
    for (j = 0; j < len_pair_list; j++) {
      buf2 = ngh_buf + (pairwise_elm_list[j] * p_mask_size);
      PetscCall(PCTFS_ivec_and3(tmp_proc_mask, p_mask, buf2, p_mask_size));
      if (PCTFS_ct_bits((char *)tmp_proc_mask, p_mask_size * sizeof(PetscInt))) *iptr++ = j;
    }
    *iptr = -1;
  }
  msg_nodes[nprs] = NULL;

  /* collect global max/min/avg message statistics */
  j  = gs->loc_node_pairs = i_start;
  t1 = GL_MAX;
  PetscCall(PCTFS_giop(&i_start, &offset, 1, &t1));
  gs->max_node_pairs = i_start;

  i_start = j;
  t1      = GL_MIN;
  PetscCall(PCTFS_giop(&i_start, &offset, 1, &t1));
  gs->min_node_pairs = i_start;

  i_start = j;
  t1      = GL_ADD;
  PetscCall(PCTFS_giop(&i_start, &offset, 1, &t1));
  gs->avg_node_pairs = i_start / PCTFS_num_nodes + 1;

  i_start = nprs;
  t1      = GL_MAX;
  PetscCall(PCTFS_giop(&i_start, &offset, 1, &t1));
  gs->max_pairs = i_start;

  /* remap pairwise in tail of gsi_via_bit_mask() */
  gs->msg_total = PCTFS_ivec_sum(gs->msg_sizes, nprs);
  gs->out       = (PetscScalar *)malloc(sizeof(PetscScalar) * gs->msg_total * vec_sz);
  gs->in        = (PetscScalar *)malloc(sizeof(PetscScalar) * gs->msg_total * vec_sz);

  /* reset malloc pool */
  free((void *)p_mask);
  free((void *)tmp_proc_mask);
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* to do pruned tree just save ngh buf copy for each one and decode here!
 ******************************************************************************/
/*
  Build the tree (left-over) schedule: adopt the accumulated tree_buf as the
  tree element list and compute the in/out index maps between the local
  element list and the tree list, searching whichever list is shorter.
*/
static PetscErrorCode set_tree(PCTFS_gs_id *gs)
{
  PetscInt  i, j, n, nel;
  PetscInt *iptr_in, *iptr_out, *tree_elms, *elms;

  PetscFunctionBegin;
  /* local work ptrs */
  elms = gs->elms;
  nel  = gs->nel;

  /* how many via tree */
  gs->tree_nel  = n = ntree;
  gs->tree_elms = tree_elms = iptr_in = tree_buf; /* handle takes ownership of tree_buf */
  gs->tree_buf  = (PetscScalar *)malloc(sizeof(PetscScalar) * n * vec_sz);
  gs->tree_work = (PetscScalar *)malloc(sizeof(PetscScalar) * n * vec_sz);
  j             = gs->tree_map_sz;
  gs->tree_map_in  = iptr_in = (PetscInt *)malloc(sizeof(PetscInt) * (j + 1));
  gs->tree_map_out = iptr_out = (PetscInt *)malloc(sizeof(PetscInt) * (j + 1));

  /* search the longer of the two lists */
  /* note ... could save this info in get_ngh_buf and save searches */
  if (n <= nel) {
    /* bijective fct w/remap - search elm list */
    for (i = 0; i < n; i++) {
      if ((j = PCTFS_ivec_binary_search(*tree_elms++, elms, nel)) >= 0) {
        *iptr_in++  = j;
        *iptr_out++ = i;
      }
    }
  } else {
    for (i = 0; i < nel; i++) {
      if ((j = PCTFS_ivec_binary_search(*elms++, tree_elms, n)) >= 0) {
        *iptr_in++  = i;
        *iptr_out++ = j;
      }
    }
  }

  /* sentinel */
  *iptr_in = *iptr_out = -1;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/******************************************************************************/
/*
  Broadcast the (already reduced) value at the head of each gop-local list to
  the other local repeats of that element.  Lists are -1 terminated.
*/
static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs, PetscScalar *vals)
{
  PetscInt   *num, *map, **reduce;
  PetscScalar tmp;

  PetscFunctionBegin;
  num    = gs->num_gop_local_reduce;
  reduce = gs->gop_local_reduce;
  while ((map = *reduce++)) {
    /* wall */
    if (*num == 2) {
      num++;
      vals[map[1]] = vals[map[0]];
    } else if
      (*num == 3) { /* corner shared by three elements */
      num++;
      vals[map[2]] = vals[map[1]] = vals[map[0]];
    } else if (*num == 4) { /* corner shared by four elements */
      num++;
      vals[map[3]] = vals[map[2]] = vals[map[1]] = vals[map[0]];
    } else { /* general case ... odd geoms ... 3D*/
      num++;
      tmp = *(vals + *map++);
      while (*map >= 0) *(vals + *map++) = tmp;
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/******************************************************************************/
/*
  Sum each purely-local reduction list into its head entry and write the sum
  back to every member, so all local repeats hold the reduced value.
*/
static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs, PetscScalar *vals)
{
  PetscInt   *num, *map, **reduce;
  PetscScalar tmp;

  PetscFunctionBegin;
  num    = gs->num_local_reduce;
  reduce = gs->local_reduce;
  while ((map = *reduce)) {
    /* wall */
    if (*num == 2) {
      num++;
      reduce++;
      vals[map[1]] = vals[map[0]] += vals[map[1]];
    } else if (*num == 3) { /* corner shared by three elements */
      num++;
      reduce++;
      vals[map[2]] = vals[map[1]] = vals[map[0]] += (vals[map[1]] + vals[map[2]]);
    } else if (*num == 4) { /* corner shared by four elements */
      num++;
      reduce++;
      vals[map[1]] = vals[map[2]] = vals[map[3]] = vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]);
    } else { /* general case ... odd geoms ... 3D*/
      num++;
      tmp = 0.0;
      while (*map >= 0) tmp += *(vals + *map++);

      map = *reduce++;
      while (*map >= 0) *(vals + *map++) = tmp;
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/******************************************************************************/
/*
  Accumulate each gop-local reduction list into its head entry only (the
  broadcast back to the repeats happens later in PCTFS_gs_gop_local_out()).
*/
static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals)
{
  PetscInt    *num, *map, **reduce;
  PetscScalar *base;

  PetscFunctionBegin;
  num    = gs->num_gop_local_reduce;
  reduce = gs->gop_local_reduce;
  while ((map = *reduce++)) {
    /* wall */
    if (*num == 2) {
      num++;
      vals[map[0]] += vals[map[1]];
    } else if (*num == 3) { /* corner shared by three elements */
      num++;
      vals[map[0]] += (vals[map[1]] + vals[map[2]]);
    } else if (*num == 4) { /* corner shared by four elements */
      num++;
      vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]);
    } else { /* general case ... odd geoms ... 3D*/
      num++;
      base = vals + *map++;
      while (*map >= 0) *base += *(vals + *map++);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/******************************************************************************/
/* Release all memory owned by a handle created with PCTFS_gs_init(). */
PetscErrorCode PCTFS_gs_free(PCTFS_gs_id *gs)
{
  PetscInt i;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_free(&gs->PCTFS_gs_comm));
  if (gs->nghs) free((void *)gs->nghs);
  if (gs->pw_nghs) free((void *)gs->pw_nghs);

  /* tree */
  if (gs->max_left_over) {
    if (gs->tree_elms) free((void *)gs->tree_elms);
    if (gs->tree_buf) free((void *)gs->tree_buf);
    if (gs->tree_work) free((void *)gs->tree_work);
    if (gs->tree_map_in) free((void *)gs->tree_map_in);
    if (gs->tree_map_out) free((void *)gs->tree_map_out);
  }

  /* pairwise info */
  if (gs->num_pairs) {
    /* should be NULL already (nulled in gsi_via_bit_mask()) */
    if (gs->ngh_buf) free((void *)gs->ngh_buf);
    if (gs->elms) free((void *)gs->elms);
    if (gs->local_elms) free((void *)gs->local_elms);
    if (gs->companion) free((void *)gs->companion);

    /* only set if pairwise */
    if (gs->vals) free((void *)gs->vals);
    if (gs->in) free((void *)gs->in);
    if (gs->out) free((void *)gs->out);
    if (gs->msg_ids_in) free((void *)gs->msg_ids_in);
    if (gs->msg_ids_out) free((void *)gs->msg_ids_out);
    if (gs->pw_vals) free((void *)gs->pw_vals);
    if (gs->pw_elm_list) free((void *)gs->pw_elm_list);
    if (gs->node_list) {
      for (i = 0; i < gs->num_pairs; i++) {
        if (gs->node_list[i]) free((void *)gs->node_list[i]);
      }
      free((void *)gs->node_list);
    }
    if (gs->msg_sizes) free((void *)gs->msg_sizes);
    if (gs->pair_list) free((void *)gs->pair_list);
  }

  /* local info */
  if (gs->num_local_total >= 0) {
    for (i = 0; i < gs->num_local_total + 1; i++) {
      if (gs->num_gop_local_reduce[i]) free((void *)gs->gop_local_reduce[i]);
    }
  }

  /* if intersection tree/pairwise and local isn't empty */
  if (gs->gop_local_reduce) free((void *)gs->gop_local_reduce);
  if (gs->num_gop_local_reduce) free((void *)gs->num_gop_local_reduce);

  free((void *)gs);
  PetscFunctionReturn(PETSC_SUCCESS);
}

/******************************************************************************/
/*
  Public vector gather/scatter entry: dispatch on the op character.
  Only '+' is implemented; anything else logs a note and falls back to plus.
*/
PetscErrorCode PCTFS_gs_gop_vec(PCTFS_gs_id *gs, PetscScalar *vals, const char *op, PetscInt step)
{
  PetscFunctionBegin;
  switch (*op) {
  case '+':
    PetscCall(PCTFS_gs_gop_vec_plus(gs, vals, step));
    break;
  default:
    PetscCall(PetscInfo(0, "PCTFS_gs_gop_vec() :: %c is not a valid op\n", op[0]));
    PetscCall(PetscInfo(0, "PCTFS_gs_gop_vec() :: default :: plus\n"));
    PetscCall(PCTFS_gs_gop_vec_plus(gs, vals, step));
    break;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/******************************************************************************/
/*
  Vector '+' gather/scatter: local reduction, then pairwise or tree exchange
  (pairwise takes precedence), then local broadcast of the gop results.
*/
static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
{
990 PetscFunctionBegin; 991 PetscCheck(gs, PETSC_COMM_SELF, PETSC_ERR_PLIB, "PCTFS_gs_gop_vec() passed NULL gs handle!!!"); 992 993 /* local only operations!!! */ 994 if (gs->num_local) PetscCall(PCTFS_gs_gop_vec_local_plus(gs, vals, step)); 995 996 /* if intersection tree/pairwise and local isn't empty */ 997 if (gs->num_local_gop) { 998 PetscCall(PCTFS_gs_gop_vec_local_in_plus(gs, vals, step)); 999 1000 /* pairwise */ 1001 if (gs->num_pairs) PetscCall(PCTFS_gs_gop_vec_pairwise_plus(gs, vals, step)); 1002 1003 /* tree */ 1004 else if (gs->max_left_over) PetscCall(PCTFS_gs_gop_vec_tree_plus(gs, vals, step)); 1005 1006 PetscCall(PCTFS_gs_gop_vec_local_out(gs, vals, step)); 1007 } else { /* if intersection tree/pairwise and local is empty */ 1008 /* pairwise */ 1009 if (gs->num_pairs) PetscCall(PCTFS_gs_gop_vec_pairwise_plus(gs, vals, step)); 1010 1011 /* tree */ 1012 else if (gs->max_left_over) PetscCall(PCTFS_gs_gop_vec_tree_plus(gs, vals, step)); 1013 } 1014 PetscFunctionReturn(PETSC_SUCCESS); 1015 } 1016 1017 /******************************************************************************/ 1018 static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step) 1019 { 1020 PetscInt *num, *map, **reduce; 1021 PetscScalar *base; 1022 1023 PetscFunctionBegin; 1024 num = gs->num_local_reduce; 1025 reduce = gs->local_reduce; 1026 while ((map = *reduce)) { 1027 base = vals + map[0] * step; 1028 1029 /* wall */ 1030 if (*num == 2) { 1031 num++; 1032 reduce++; 1033 PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step)); 1034 PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step)); 1035 } else if (*num == 3) { /* corner shared by three elements */ 1036 num++; 1037 reduce++; 1038 PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step)); 1039 PetscCall(PCTFS_rvec_add(base, vals + map[2] * step, step)); 1040 PetscCall(PCTFS_rvec_copy(vals + map[2] * step, base, step)); 1041 PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, 
step)); 1042 } else if (*num == 4) { /* corner shared by four elements */ 1043 num++; 1044 reduce++; 1045 PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step)); 1046 PetscCall(PCTFS_rvec_add(base, vals + map[2] * step, step)); 1047 PetscCall(PCTFS_rvec_add(base, vals + map[3] * step, step)); 1048 PetscCall(PCTFS_rvec_copy(vals + map[3] * step, base, step)); 1049 PetscCall(PCTFS_rvec_copy(vals + map[2] * step, base, step)); 1050 PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step)); 1051 } else { /* general case ... odd geoms ... 3D */ 1052 num++; 1053 while (*++map >= 0) PetscCall(PCTFS_rvec_add(base, vals + *map * step, step)); 1054 1055 map = *reduce; 1056 while (*++map >= 0) PetscCall(PCTFS_rvec_copy(vals + *map * step, base, step)); 1057 1058 reduce++; 1059 } 1060 } 1061 PetscFunctionReturn(PETSC_SUCCESS); 1062 } 1063 1064 /******************************************************************************/ 1065 static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step) 1066 { 1067 PetscInt *num, *map, **reduce; 1068 PetscScalar *base; 1069 1070 PetscFunctionBegin; 1071 num = gs->num_gop_local_reduce; 1072 reduce = gs->gop_local_reduce; 1073 while ((map = *reduce++)) { 1074 base = vals + map[0] * step; 1075 1076 /* wall */ 1077 if (*num == 2) { 1078 num++; 1079 PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step)); 1080 } else if (*num == 3) { /* corner shared by three elements */ 1081 num++; 1082 PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step)); 1083 PetscCall(PCTFS_rvec_add(base, vals + map[2] * step, step)); 1084 } else if (*num == 4) { /* corner shared by four elements */ 1085 num++; 1086 PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step)); 1087 PetscCall(PCTFS_rvec_add(base, vals + map[2] * step, step)); 1088 PetscCall(PCTFS_rvec_add(base, vals + map[3] * step, step)); 1089 } else { /* general case ... odd geoms ... 
3D*/ 1090 num++; 1091 while (*++map >= 0) PetscCall(PCTFS_rvec_add(base, vals + *map * step, step)); 1092 } 1093 } 1094 PetscFunctionReturn(PETSC_SUCCESS); 1095 } 1096 1097 /******************************************************************************/ 1098 static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step) 1099 { 1100 PetscInt *num, *map, **reduce; 1101 PetscScalar *base; 1102 1103 PetscFunctionBegin; 1104 num = gs->num_gop_local_reduce; 1105 reduce = gs->gop_local_reduce; 1106 while ((map = *reduce++)) { 1107 base = vals + map[0] * step; 1108 1109 /* wall */ 1110 if (*num == 2) { 1111 num++; 1112 PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step)); 1113 } else if (*num == 3) { /* corner shared by three elements */ 1114 num++; 1115 PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step)); 1116 PetscCall(PCTFS_rvec_copy(vals + map[2] * step, base, step)); 1117 } else if (*num == 4) { /* corner shared by four elements */ 1118 num++; 1119 PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step)); 1120 PetscCall(PCTFS_rvec_copy(vals + map[2] * step, base, step)); 1121 PetscCall(PCTFS_rvec_copy(vals + map[3] * step, base, step)); 1122 } else { /* general case ... odd geoms ... 
3D*/ 1123 num++; 1124 while (*++map >= 0) PetscCall(PCTFS_rvec_copy(vals + *map * step, base, step)); 1125 } 1126 } 1127 PetscFunctionReturn(PETSC_SUCCESS); 1128 } 1129 1130 /******************************************************************************/ 1131 static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step) 1132 { 1133 PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2; 1134 PetscInt *iptr, *msg_list, *msg_size, **msg_nodes; 1135 PetscInt *pw, *list, *size, **nodes; 1136 MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out; 1137 MPI_Status status; 1138 PetscBLASInt i1 = 1, dstep; 1139 1140 PetscFunctionBegin; 1141 /* strip and load s */ 1142 msg_list = list = gs->pair_list; 1143 msg_size = size = gs->msg_sizes; 1144 msg_nodes = nodes = gs->node_list; 1145 iptr = pw = gs->pw_elm_list; 1146 dptr1 = dptr3 = gs->pw_vals; 1147 msg_ids_in = ids_in = gs->msg_ids_in; 1148 msg_ids_out = ids_out = gs->msg_ids_out; 1149 dptr2 = gs->out; 1150 in1 = in2 = gs->in; 1151 1152 /* post the receives */ 1153 /* msg_nodes=nodes; */ 1154 do { 1155 /* Should MPI_ANY_SOURCE be replaced by *list ? 
In that case do the 1156 second one *list and do list++ afterwards */ 1157 PetscCallMPI(MPIU_Irecv(in1, *size * step, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in)); 1158 list++; 1159 msg_ids_in++; 1160 in1 += *size++ * step; 1161 } while (*++msg_nodes); 1162 msg_nodes = nodes; 1163 1164 /* load gs values into in out gs buffers */ 1165 while (*iptr >= 0) { 1166 PetscCall(PCTFS_rvec_copy(dptr3, in_vals + *iptr * step, step)); 1167 dptr3 += step; 1168 iptr++; 1169 } 1170 1171 /* load out buffers and post the sends */ 1172 while ((iptr = *msg_nodes++)) { 1173 dptr3 = dptr2; 1174 while (*iptr >= 0) { 1175 PetscCall(PCTFS_rvec_copy(dptr2, dptr1 + *iptr * step, step)); 1176 dptr2 += step; 1177 iptr++; 1178 } 1179 PetscCallMPI(MPIU_Isend(dptr3, *msg_size * step, MPIU_SCALAR, *msg_list, MSGTAG1 + PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out)); 1180 msg_size++; 1181 msg_list++; 1182 msg_ids_out++; 1183 } 1184 1185 /* tree */ 1186 if (gs->max_left_over) PetscCall(PCTFS_gs_gop_vec_tree_plus(gs, in_vals, step)); 1187 1188 /* process the received data */ 1189 msg_nodes = nodes; 1190 while ((iptr = *nodes++)) { 1191 PetscScalar d1 = 1.0; 1192 1193 /* Should I check the return value of MPI_Wait() or status? */ 1194 /* Can this loop be replaced by a call to MPI_Waitall()? */ 1195 PetscCallMPI(MPI_Wait(ids_in, &status)); 1196 ids_in++; 1197 while (*iptr >= 0) { 1198 PetscCall(PetscBLASIntCast(step, &dstep)); 1199 PetscCallBLAS("BLASaxpy", BLASaxpy_(&dstep, &d1, in2, &i1, dptr1 + *iptr * step, &i1)); 1200 in2 += step; 1201 iptr++; 1202 } 1203 } 1204 1205 /* replace vals */ 1206 while (*pw >= 0) { 1207 PetscCall(PCTFS_rvec_copy(in_vals + *pw * step, dptr1, step)); 1208 dptr1 += step; 1209 pw++; 1210 } 1211 1212 /* clear isend message handles */ 1213 /* This changed for clarity though it could be the same */ 1214 1215 /* Should I check the return value of MPI_Wait() or status? */ 1216 /* Can this loop be replaced by a call to MPI_Waitall()? 
*/ 1217 while (*msg_nodes++) { 1218 PetscCallMPI(MPI_Wait(ids_out, &status)); 1219 ids_out++; 1220 } 1221 PetscFunctionReturn(PETSC_SUCCESS); 1222 } 1223 1224 /******************************************************************************/ 1225 static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step) 1226 { 1227 PetscInt size, *in, *out; 1228 PetscScalar *buf, *work; 1229 PetscInt op[] = {GL_ADD, 0}; 1230 PetscBLASInt i1 = 1; 1231 PetscBLASInt dstep; 1232 1233 PetscFunctionBegin; 1234 /* copy over to local variables */ 1235 in = gs->tree_map_in; 1236 out = gs->tree_map_out; 1237 buf = gs->tree_buf; 1238 work = gs->tree_work; 1239 size = gs->tree_nel * step; 1240 1241 /* zero out collection buffer */ 1242 PetscCall(PCTFS_rvec_zero(buf, size)); 1243 1244 /* copy over my contributions */ 1245 while (*in >= 0) { 1246 PetscCall(PetscBLASIntCast(step, &dstep)); 1247 PetscCallBLAS("BLAScopy", BLAScopy_(&dstep, vals + *in++ * step, &i1, buf + *out++ * step, &i1)); 1248 } 1249 1250 /* perform fan in/out on full buffer */ 1251 /* must change PCTFS_grop to handle the blas */ 1252 PetscCall(PCTFS_grop(buf, work, size, op)); 1253 1254 /* reset */ 1255 in = gs->tree_map_in; 1256 out = gs->tree_map_out; 1257 1258 /* get the portion of the results I need */ 1259 while (*in >= 0) { 1260 PetscCall(PetscBLASIntCast(step, &dstep)); 1261 PetscCallBLAS("BLAScopy", BLAScopy_(&dstep, buf + *out++ * step, &i1, vals + *in++ * step, &i1)); 1262 } 1263 PetscFunctionReturn(PETSC_SUCCESS); 1264 } 1265 1266 /******************************************************************************/ 1267 PetscErrorCode PCTFS_gs_gop_hc(PCTFS_gs_id *gs, PetscScalar *vals, const char *op, PetscInt dim) 1268 { 1269 PetscFunctionBegin; 1270 switch (*op) { 1271 case '+': 1272 PetscCall(PCTFS_gs_gop_plus_hc(gs, vals, dim)); 1273 break; 1274 default: 1275 PetscCall(PetscInfo(0, "PCTFS_gs_gop_hc() :: %c is not a valid op\n", op[0])); 1276 PetscCall(PetscInfo(0, 
"PCTFS_gs_gop_hc() :: default :: plus\n")); 1277 PetscCall(PCTFS_gs_gop_plus_hc(gs, vals, dim)); 1278 break; 1279 } 1280 PetscFunctionReturn(PETSC_SUCCESS); 1281 } 1282 1283 /******************************************************************************/ 1284 static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim) 1285 { 1286 PetscFunctionBegin; 1287 /* if there's nothing to do return */ 1288 if (dim <= 0) PetscFunctionReturn(PETSC_SUCCESS); 1289 1290 /* can't do more dimensions then exist */ 1291 dim = PetscMin(dim, PCTFS_i_log2_num_nodes); 1292 1293 /* local only operations!!! */ 1294 if (gs->num_local) PetscCall(PCTFS_gs_gop_local_plus(gs, vals)); 1295 1296 /* if intersection tree/pairwise and local isn't empty */ 1297 if (gs->num_local_gop) { 1298 PetscCall(PCTFS_gs_gop_local_in_plus(gs, vals)); 1299 1300 /* pairwise will do tree inside ... */ 1301 if (gs->num_pairs) PetscCall(PCTFS_gs_gop_pairwise_plus_hc(gs, vals, dim)); /* tree only */ 1302 else if (gs->max_left_over) PetscCall(PCTFS_gs_gop_tree_plus_hc(gs, vals, dim)); 1303 1304 PetscCall(PCTFS_gs_gop_local_out(gs, vals)); 1305 } else { /* if intersection tree/pairwise and local is empty */ 1306 /* pairwise will do tree inside */ 1307 if (gs->num_pairs) PetscCall(PCTFS_gs_gop_pairwise_plus_hc(gs, vals, dim)); /* tree */ 1308 else if (gs->max_left_over) PetscCall(PCTFS_gs_gop_tree_plus_hc(gs, vals, dim)); 1309 } 1310 PetscFunctionReturn(PETSC_SUCCESS); 1311 } 1312 1313 /******************************************************************************/ 1314 static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim) 1315 { 1316 PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2; 1317 PetscInt *iptr, *msg_list, *msg_size, **msg_nodes; 1318 PetscInt *pw, *list, *size, **nodes; 1319 MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out; 1320 MPI_Status status; 1321 PetscInt i, mask = 1; 1322 1323 PetscFunctionBegin; 1324 for (i = 1; i 
< dim; i++) { 1325 mask <<= 1; 1326 mask++; 1327 } 1328 1329 /* strip and load s */ 1330 msg_list = list = gs->pair_list; 1331 msg_size = size = gs->msg_sizes; 1332 msg_nodes = nodes = gs->node_list; 1333 iptr = pw = gs->pw_elm_list; 1334 dptr1 = dptr3 = gs->pw_vals; 1335 msg_ids_in = ids_in = gs->msg_ids_in; 1336 msg_ids_out = ids_out = gs->msg_ids_out; 1337 dptr2 = gs->out; 1338 in1 = in2 = gs->in; 1339 1340 /* post the receives */ 1341 /* msg_nodes=nodes; */ 1342 do { 1343 /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the 1344 second one *list and do list++ afterwards */ 1345 if ((PCTFS_my_id | mask) == (*list | mask)) { 1346 PetscCallMPI(MPIU_Irecv(in1, *size, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in)); 1347 list++; 1348 msg_ids_in++; 1349 in1 += *size++; 1350 } else { 1351 list++; 1352 size++; 1353 } 1354 } while (*++msg_nodes); 1355 1356 /* load gs values into in out gs buffers */ 1357 while (*iptr >= 0) *dptr3++ = *(in_vals + *iptr++); 1358 1359 /* load out buffers and post the sends */ 1360 msg_nodes = nodes; 1361 list = msg_list; 1362 while ((iptr = *msg_nodes++)) { 1363 if ((PCTFS_my_id | mask) == (*list | mask)) { 1364 dptr3 = dptr2; 1365 while (*iptr >= 0) *dptr2++ = *(dptr1 + *iptr++); 1366 /* CHECK PERSISTENT COMMS MODE FOR ALL THIS STUFF */ 1367 /* is msg_ids_out++ correct? */ 1368 PetscCallMPI(MPIU_Isend(dptr3, *msg_size, MPIU_SCALAR, *list, MSGTAG1 + PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out)); 1369 msg_size++; 1370 list++; 1371 msg_ids_out++; 1372 } else { 1373 list++; 1374 msg_size++; 1375 } 1376 } 1377 1378 /* do the tree while we're waiting */ 1379 if (gs->max_left_over) PetscCall(PCTFS_gs_gop_tree_plus_hc(gs, in_vals, dim)); 1380 1381 /* process the received data */ 1382 msg_nodes = nodes; 1383 list = msg_list; 1384 while ((iptr = *nodes++)) { 1385 if ((PCTFS_my_id | mask) == (*list | mask)) { 1386 /* Should I check the return value of MPI_Wait() or status? 
*/ 1387 /* Can this loop be replaced by a call to MPI_Waitall()? */ 1388 PetscCallMPI(MPI_Wait(ids_in, &status)); 1389 ids_in++; 1390 while (*iptr >= 0) *(dptr1 + *iptr++) += *in2++; 1391 } 1392 list++; 1393 } 1394 1395 /* replace vals */ 1396 while (*pw >= 0) *(in_vals + *pw++) = *dptr1++; 1397 1398 /* clear isend message handles */ 1399 /* This changed for clarity though it could be the same */ 1400 while (*msg_nodes++) { 1401 if ((PCTFS_my_id | mask) == (*msg_list | mask)) { 1402 /* Should I check the return value of MPI_Wait() or status? */ 1403 /* Can this loop be replaced by a call to MPI_Waitall()? */ 1404 PetscCallMPI(MPI_Wait(ids_out, &status)); 1405 ids_out++; 1406 } 1407 msg_list++; 1408 } 1409 PetscFunctionReturn(PETSC_SUCCESS); 1410 } 1411 1412 /******************************************************************************/ 1413 static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim) 1414 { 1415 PetscInt size; 1416 PetscInt *in, *out; 1417 PetscScalar *buf, *work; 1418 PetscInt op[] = {GL_ADD, 0}; 1419 1420 PetscFunctionBegin; 1421 in = gs->tree_map_in; 1422 out = gs->tree_map_out; 1423 buf = gs->tree_buf; 1424 work = gs->tree_work; 1425 size = gs->tree_nel; 1426 1427 PetscCall(PCTFS_rvec_zero(buf, size)); 1428 1429 while (*in >= 0) *(buf + *out++) = *(vals + *in++); 1430 1431 in = gs->tree_map_in; 1432 out = gs->tree_map_out; 1433 1434 PetscCall(PCTFS_grop_hc(buf, work, size, op, dim)); 1435 1436 while (*in >= 0) *(vals + *in++) = *(buf + *out++); 1437 PetscFunctionReturn(PETSC_SUCCESS); 1438 } 1439