/***********************************gs.c***************************************

Author: Henry M. Tufo III

e-mail: hmt@cs.brown.edu

snail-mail:
Division of Applied Mathematics
Brown University
Providence, RI 02912

Last Modification:
6.21.97
************************************gs.c**************************************/

/***********************************gs.c***************************************
File Description:
-----------------
Gather/scatter ("gs") setup and execution for the PCTFS (Tufo-Fischer)
coarse-grid solver.  A gs handle records which global element ids this
process shares with others and how shared contributions are combined:
purely locally (duplicate ids on this rank), pairwise (point-to-point
messages with a small set of neighbors), or through a tree/crystal-router
fallback for widely shared ids.
************************************gs.c**************************************/

#include <../src/ksp/pc/impls/tfs/tfs.h>

/* default length of number of items via tree - doubles if exceeded */
#define TREE_BUF_SZ 2048
#define GS_VEC_SZ   1

/***********************************gs.c***************************************
Type: struct gather_scatter_id
------------------------------
One gather/scatter context, created by PCTFS_gs_init() and destroyed by
PCTFS_gs_free().  Fields are filled in by gsi_check_args(), get_ngh_buf(),
set_pairwise() and set_tree().
************************************gs.c**************************************/
typedef struct gather_scatter_id {
  PetscInt     id;      /* unique handle id, assigned from num_gs_ids */
  PetscInt     nel_min; /* global min over ranks of nonzero-element count */
  PetscInt     nel_max; /* global max over ranks of nonzero-element count */
  PetscInt     nel_sum; /* global sum over ranks of nonzero-element count */
  PetscInt     negl;    /* gl_max - gl_min + 1: span of global id range */
  PetscInt     gl_max;  /* largest global id anywhere */
  PetscInt     gl_min;  /* smallest global id anywhere */
  PetscInt     repeats; /* number of repeated (duplicate) ids in local list */
  PetscInt     ordered;  /* NOTE(review): never referenced in this file - confirm use elsewhere */
  PetscInt     positive; /* NOTE(review): never referenced in this file - confirm use elsewhere */
  PetscScalar *vals;     /* scratch values buffer; freed in PCTFS_gs_free() if set */

  /* bit mask info */
  PetscInt *my_proc_mask; /* NOTE(review): not set in this file */
  PetscInt  mask_sz;      /* #PetscInts in one processor bit mask */
  PetscInt *ngh_buf;      /* per-element sharing bit masks (freed after setup) */
  PetscInt  ngh_buf_sz;   /* size of ngh_buf in bytes at allocation time */
  PetscInt *nghs;         /* list of neighbor ranks (any sharing) */
  PetscInt  num_nghs;     /* number of entries in nghs */
  PetscInt  max_nghs;     /* global max of num_nghs over all ranks */
  PetscInt *pw_nghs;      /* bit mask of ranks handled pairwise */
  PetscInt  num_pw_nghs;  /* popcount of pw_nghs */
  PetscInt *tree_nghs;    /* NOTE(review): not set in this file */
  PetscInt  num_tree_nghs;

  PetscInt num_loads; /* number of PCTFS_giop passes needed to cover the id range */

  /* repeats == true -> local info */
  PetscInt  nel;        /* number of unique elements */
  PetscInt *elms;       /* of size nel */
  PetscInt  nel_total;  /* number of nonzero entries before duplicate compression */
  PetscInt *local_elms; /* of size nel_total */
  PetscInt *companion;  /* of size nel_total; inverse map back to caller's indices */

  /* local info */
  PetscInt   num_local_total;      /* total # of local duplicate groups */
  PetscInt   local_strength;       /* NONE / PARTIAL / FULL (overlap with pw/tree) */
  PetscInt   num_local;            /* groups that are purely local */
  PetscInt  *num_local_reduce;     /* per-group member counts (sentinel-terminated lists) */
  PetscInt **local_reduce;         /* per-group index lists, -1 terminated */
  PetscInt   num_local_gop;        /* groups that also participate in pw/tree */
  PetscInt  *num_gop_local_reduce; /* counts for the gop (shared) groups */
  PetscInt **gop_local_reduce;     /* index lists for the gop (shared) groups */

  /* pairwise info */
  PetscInt     level;          /* sharing-count threshold: <= level -> pairwise */
  PetscInt     num_pairs;      /* number of pairwise neighbor ranks */
  PetscInt     max_pairs;      /* global max of num_pairs */
  PetscInt     loc_node_pairs; /* largest single-message element count on this rank */
  PetscInt     max_node_pairs; /* global max of loc_node_pairs */
  PetscInt     min_node_pairs; /* global min of loc_node_pairs */
  PetscInt     avg_node_pairs; /* global average (rounded up) */
  PetscInt    *pair_list;      /* ranks to exchange with */
  PetscInt    *msg_sizes;      /* #elements in the message to each rank */
  PetscInt   **node_list;      /* per-rank -1 terminated lists into pw_elm_list */
  PetscInt     len_pw_list;    /* #elements handled pairwise */
  PetscInt    *pw_elm_list;    /* indices of pairwise-handled elements */
  PetscScalar *pw_vals;        /* staging buffer, len_pw_list * vec_sz */

  MPI_Request *msg_ids_in;  /* receive requests, MPI_REQUEST_NULL terminated */
  MPI_Request *msg_ids_out; /* send requests, MPI_REQUEST_NULL terminated */

  PetscScalar *out;       /* outgoing message buffer, msg_total * vec_sz */
  PetscScalar *in;        /* incoming message buffer, msg_total * vec_sz */
  PetscInt     msg_total; /* sum of msg_sizes */

  /* tree - crystal accumulator info */
  PetscInt   max_left_over; /* #elements routed via the tree (ntree) */
  PetscInt  *pre;           /* NOTE(review): not set in this file */
  PetscInt  *in_num;
  PetscInt  *out_num;
  PetscInt **in_list;
  PetscInt **out_list;

  /* new tree work*/
  PetscInt     tree_nel;  /* number of tree elements */
  PetscInt    *tree_elms; /* global ids handled via tree (aliases tree_buf) */
  PetscScalar *tree_buf;  /* accumulation buffer, tree_nel * vec_sz */
  PetscScalar *tree_work; /* work buffer, tree_nel * vec_sz */

  PetscInt  tree_map_sz;  /* #local elements that map into the tree */
  PetscInt *tree_map_in;  /* local element index -> position, -1 terminated */
  PetscInt *tree_map_out; /* corresponding tree slot, -1 terminated */

  /* current memory status */
  PetscInt gl_bss_min;
  PetscInt gl_perm_min;

  /* max segment size for PCTFS_gs_gop_vec() */
  PetscInt vec_sz;

  /* hack to make paul happy */
  MPI_Comm PCTFS_gs_comm; /* private dup of MPI_COMM_WORLD, freed in PCTFS_gs_free() */

} PCTFS_gs_id;

/* setup helpers (definitions below) */
static PCTFS_gs_id   *gsi_check_args(PetscInt *elms, PetscInt nel, PetscInt level);
static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs);
static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs);
static PetscErrorCode set_pairwise(PCTFS_gs_id *gs);
static PCTFS_gs_id   *gsi_new(void);
static PetscErrorCode set_tree(PCTFS_gs_id *gs);

/* same for all but vector flavor */
static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs, PetscScalar *vals);
/* vector flavor */
static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);

static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step);
static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs, PetscScalar
*in_vals, PetscInt step);
static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);

static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs, PetscScalar *vals);
static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals);

static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim);
static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim);
static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim);

/* global vars */
/* from comm.c module */

/* running count of handles created; provides PCTFS_gs_id.id */
static PetscInt num_gs_ids = 0;

/* should make this dynamic ... later */
static PetscInt  msg_buf     = MAX_MSG_BUF; /* cap (bytes) on one PCTFS_giop exchange in get_ngh_buf() */
static PetscInt  vec_sz      = GS_VEC_SZ;   /* segment size used by the vector flavor */
static PetscInt *tree_buf    = NULL;        /* growing list of tree-routed global ids (see place_in_tree) */
static PetscInt  tree_buf_sz = 0;
static PetscInt  ntree       = 0; /* #entries currently in tree_buf */

/***************************************************************************/
/* Set the segment size used by PCTFS_gs_gop_vec(). */
PetscErrorCode PCTFS_gs_init_vec_sz(PetscInt size)
{
  PetscFunctionBegin;
  vec_sz = size;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/******************************************************************************/
/* Set the communication buffer cap (bytes) used during setup. */
PetscErrorCode PCTFS_gs_init_msg_buf_sz(PetscInt buf_size)
{
  PetscFunctionBegin;
  msg_buf = buf_size;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/******************************************************************************/
/*
   Create a gather/scatter handle for the global id list elms[0..nel-1].
   Zero ids are ignored; negative ids are invalid.  level controls how many
   sharers an id may have and still be exchanged pairwise.  Collective on
   MPI_COMM_WORLD; aborts on error (returns only a valid handle).
*/
PCTFS_gs_id *PCTFS_gs_init(PetscInt *elms, PetscInt nel, PetscInt level)
{
  PCTFS_gs_id *gs;
  MPI_Group    PCTFS_gs_group;
  MPI_Comm     PCTFS_gs_comm;

  /* ensure that communication package has been initialized */
  PetscCallAbort(PETSC_COMM_SELF, PCTFS_comm_init());

  /* determines if we have enough dynamic/semi-static memory */
  /* checks input, allocs and sets gd_id template            */
  gs = gsi_check_args(elms, nel, level);

  /* only bit mask version up and working for the moment    */
  /* LATER :: get int list version working for sparse pblms */
  PetscCallAbort(PETSC_COMM_WORLD, gsi_via_bit_mask(gs));

  /* give the handle its own communicator (dup of WORLD via group create) */
  PetscCallAbort(PETSC_COMM_WORLD, MPI_Comm_group(MPI_COMM_WORLD, &PCTFS_gs_group) ? PETSC_ERR_MPI : PETSC_SUCCESS);
  PetscCallAbort(PETSC_COMM_WORLD, MPI_Comm_create(MPI_COMM_WORLD, PCTFS_gs_group, &PCTFS_gs_comm) ? PETSC_ERR_MPI : PETSC_SUCCESS);
  PetscCallAbort(PETSC_COMM_WORLD, MPI_Group_free(&PCTFS_gs_group) ? PETSC_ERR_MPI : PETSC_SUCCESS);

  gs->PCTFS_gs_comm = PCTFS_gs_comm;

  return gs;
}

/******************************************************************************/
/* Allocate and zero a fresh PCTFS_gs_id. */
static PCTFS_gs_id *gsi_new(void)
{
  PCTFS_gs_id *gs;
  gs = (PCTFS_gs_id *)malloc(sizeof(PCTFS_gs_id));
  PetscCallAbort(PETSC_COMM_WORLD, PetscMemzero(gs, sizeof(PCTFS_gs_id)));
  return gs;
}

/******************************************************************************/
/*
   Validate the caller's id list, strip zero entries, sort, compress
   duplicates into local-reduction groups, and compute global statistics
   (min/max/sum of nel, global id range, level) via one PCTFS_giop().
   Returns a partially-filled handle; completed later by gsi_via_bit_mask().
*/
static PCTFS_gs_id *gsi_check_args(PetscInt *in_elms, PetscInt nel, PetscInt level)
{
  PetscInt     i, j, k, t2;
  PetscInt    *companion, *elms, *unique, *iptr;
  PetscInt     num_local = 0, *num_to_reduce, **local_reduce;
  /* oprs[0] is the NON_UNIFORM flag; oprs[i+1] is the reduction applied to vals[i] */
  PetscInt     oprs[] = {NON_UNIFORM, GL_MIN, GL_MAX, GL_ADD, GL_MIN, GL_MAX, GL_MIN, GL_B_AND};
  PetscInt     vals[PETSC_STATIC_ARRAY_LENGTH(oprs) - 1];
  PetscInt     work[PETSC_STATIC_ARRAY_LENGTH(oprs) - 1];
  PCTFS_gs_id *gs;

  if (!in_elms) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "elms point to nothing!!!\n");
  if (nel < 0) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "can't have fewer than 0 elms!!!\n");

  if (nel == 0) PetscCallAbort(PETSC_COMM_WORLD, PetscInfo(0, "I don't have any elements!!!\n"));
  /* get space for gs template */
  gs     = gsi_new();
  gs->id = ++num_gs_ids;

  /* hmt 6.4.99 */
  /* caller can set global ids that don't participate to 0 */
  /* PCTFS_gs_init ignores all zeros in elm list */
  /* negative global ids are still invalid */
  for (i = j = 0; i < nel; i++) {
    if (in_elms[i] != 0) j++;
  }

  /* k = original length, nel = #nonzero entries from here on */
  k   = nel;
  nel = j;

  /* copy over in_elms list and create inverse map */
  /* elms gets one extra slot for a sentinel */
  elms      = (PetscInt *)malloc((nel + 1) * sizeof(PetscInt));
  companion = (PetscInt *)malloc(nel * sizeof(PetscInt));

  for (i = j = 0; i < k; i++) {
    if (in_elms[i] != 0) {
      elms[j]        = in_elms[i];
      companion[j++] = i; /* companion[j] = caller's index of elms[j] */
    }
  }

  if (j != nel) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "nel j mismatch!\n");

  /* pre-pass ... check to see if sorted */
  elms[nel] = INT_MAX; /* sentinel terminates the scan */
  iptr      = elms;
  unique    = elms + 1;
  j         = 0;
  while (*iptr != INT_MAX) {
    if (*iptr++ > *unique++) {
      j = 1; /* found an out-of-order pair */
      break;
    }
  }

  /* set up inverse map */
  /* sort elms (with companion carried along) only if needed */
  if (j) {
    PetscCallAbort(PETSC_COMM_WORLD, PetscInfo(0, "gsi_check_args() :: elm list *not* sorted!\n"));
    PetscCallAbort(PETSC_COMM_WORLD, PCTFS_SMI_sort((void *)elms, (void *)companion, nel, SORT_INTEGER));
  } else PetscCallAbort(PETSC_COMM_WORLD, PetscInfo(0, "gsi_check_args() :: elm list sorted!\n"));
  elms[nel] = INT_MIN; /* new sentinel: ids are >= 0, so INT_MIN ends any duplicate run */

  /* first pass */
  /* determine number of unique elements, check pd */
  for (i = k = 0; i < nel; i += j) {
    t2 = elms[i];
    j  = ++i;

    /* clump 'em for now */
    while (elms[j] == t2) j++;

    /* how many together and num local */
    if (j -= i) {
      num_local++; /* one more duplicate group */
      k += j;      /* k accumulates the total repeat count */
    }
  }

  /* how many unique elements? */
  gs->repeats = k;
  gs->nel     = nel - k;

  /* number of repeats? */
  gs->num_local        = num_local;
  num_local += 2; /* two extra slots for the sort hack sentinels below */
  gs->local_reduce     = local_reduce = (PetscInt **)malloc(num_local * sizeof(PetscInt *));
  gs->num_local_reduce = num_to_reduce = (PetscInt *)malloc(num_local * sizeof(PetscInt));

  unique         = (PetscInt *)malloc((gs->nel + 1) * sizeof(PetscInt));
  gs->elms       = unique;
  gs->nel_total  = nel;
  gs->local_elms = elms;
  gs->companion  = companion;

  /* compress map as well as keep track of local ops */
  for (num_local = i = j = 0; i < gs->nel; i++) {
    k            = j;
    t2 = unique[i] = elms[j]; /* i-th unique id */
    companion[i] = companion[j];

    while (elms[j] == t2) j++;

    if ((t2 = (j - k)) > 1) {
      /* number together */
      num_to_reduce[num_local] = t2++;

      iptr = local_reduce[num_local++] = (PetscInt *)malloc(t2 * sizeof(PetscInt));

      /* to use binary searching don't remap until we check intersection */
      *iptr++ = i; /* first entry: compressed index; remapped later in gsi_via_bit_mask() */

      /* note that we're skipping the first one */
      while (++k < j) *(iptr++) = companion[k];
      *iptr = -1; /* -1 terminates each reduction list */
    }
  }

  /* sentinel for ngh_buf */
  unique[gs->nel] = INT_MAX;

  /* for two partition sort hack */
  num_to_reduce[num_local]   = 0;
  local_reduce[num_local]    = NULL;
  num_to_reduce[++num_local] = 0;
  local_reduce[num_local]    = NULL;

  /* load 'em up */
  /* note one extra to hold NON_UNIFORM flag!!! */
  vals[2] = vals[1] = vals[0] = nel;
  if (gs->nel > 0) {
    vals[3] = unique[0];           /* local min gid */
    vals[4] = unique[gs->nel - 1]; /* local max gid */
  } else {
    /* identities so an empty rank doesn't perturb the min/max reductions */
    vals[3] = INT_MAX;
    vals[4] = INT_MIN;
  }
  vals[5] = level;
  vals[6] = num_gs_ids;

  /* GLOBAL: send 'em out */
  PetscCallAbort(PETSC_COMM_WORLD, PCTFS_giop(vals, work, PETSC_STATIC_ARRAY_LENGTH(oprs) - 1, oprs));

  /* must be semi-pos def - only pairwise depends on this */
  /* LATER - remove this restriction */
  if (vals[3] < 0) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "gsi_check_args() :: system not semi-pos def \n");
  if (vals[4] == INT_MAX) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "gsi_check_args() :: system ub too large !\n");

  gs->nel_min = vals[0];
  gs->nel_max = vals[1];
  gs->nel_sum = vals[2];
  gs->gl_min  = vals[3];
  gs->gl_max  = vals[4];
  gs->negl    = vals[4] - vals[3] + 1;

  if (gs->negl <= 0) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "gsi_check_args() :: system empty or neg :: %" PetscInt_FMT "\n", gs->negl);

  /* LATER :: add level == -1 -> program selects level */
  if (vals[5] < 0) vals[5] = 0;
  else if (vals[5] > PCTFS_num_nodes) vals[5] = PCTFS_num_nodes;
  gs->level = vals[5];

  return gs;
}

/******************************************************************************/
/*
   Complete setup: build sharing masks (get_ngh_buf), then pairwise and/or
   tree structures, then split the local duplicate groups into purely-local
   vs. shared-with-remote ("gop") groups and remap compressed indices back
   to the caller's indices through the companion map.
*/
static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs)
{
  PetscInt   i, nel, *elms;
  PetscInt   t1;
  PetscInt **reduce;
  PetscInt  *map;

  PetscFunctionBegin;
  /* totally local removes ... PCTFS_ct_bits == 0 */
  PetscCall(get_ngh_buf(gs));

  if (gs->level) PetscCall(set_pairwise(gs));
  if (gs->max_left_over) PetscCall(set_tree(gs));

  /* intersection local and pairwise/tree?
  */
  gs->num_local_total      = gs->num_local;
  gs->gop_local_reduce     = gs->local_reduce;
  gs->num_gop_local_reduce = gs->num_local_reduce;

  map = gs->companion;

  /* is there any local compression */
  if (!gs->num_local) {
    gs->local_strength = NONE;
    gs->num_local_gop  = 0;
  } else {
    /* ok find intersection */
    map    = gs->companion;
    reduce = gs->local_reduce;
    for (i = 0, t1 = 0; i < gs->num_local; i++, reduce++) {
      /* does this duplicate group's element also appear in the pairwise or tree lists? */
      if ((PCTFS_ivec_binary_search(**reduce, gs->pw_elm_list, gs->len_pw_list) >= 0) || PCTFS_ivec_binary_search(**reduce, gs->tree_map_in, gs->tree_map_sz) >= 0) {
        t1++;
        PetscCheck(gs->num_local_reduce[i] > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "nobody in list?");
        gs->num_local_reduce[i] *= -1; /* negate count to tag the group for the sort below */
      }
      **reduce = map[**reduce]; /* now safe to remap to the caller's index */
    }

    /* intersection is empty */
    if (!t1) {
      gs->local_strength = FULL;
      gs->num_local_gop  = 0;
    } else { /* intersection not empty */
      gs->local_strength = PARTIAL;

      /* sort on (negated) counts so the t1 tagged groups come first */
      PetscCall(PCTFS_SMI_sort((void *)gs->num_local_reduce, (void *)gs->local_reduce, gs->num_local + 1, SORT_INT_PTR));

      gs->num_local_gop        = t1;
      gs->num_local_total      = gs->num_local;
      gs->num_local -= t1;
      gs->gop_local_reduce     = gs->local_reduce;
      gs->num_gop_local_reduce = gs->num_local_reduce;

      /* restore counts and advance the purely-local views past the gop groups */
      for (i = 0; i < t1; i++) {
        PetscCheck(gs->num_gop_local_reduce[i] < 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "they aren't negative?");
        gs->num_gop_local_reduce[i] *= -1;
        gs->local_reduce++;
        gs->num_local_reduce++;
      }
      gs->local_reduce++; /* skip the separating NULL/0 slot as well */
      gs->num_local_reduce++;
    }
  }

  /* remap pairwise element indices to the caller's indexing */
  elms = gs->pw_elm_list;
  nel  = gs->len_pw_list;
  for (i = 0; i < nel; i++) elms[i] = map[elms[i]];

  /* remap tree element indices to the caller's indexing */
  elms = gs->tree_map_in;
  nel  = gs->tree_map_sz;
  for (i = 0; i < nel; i++) elms[i] = map[elms[i]];

  /* clean up */
  free((void *)gs->local_elms);
  free((void *)gs->companion);
  free((void *)gs->elms);
  free((void *)gs->ngh_buf);
  gs->local_elms = gs->companion = gs->elms = gs->ngh_buf = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/******************************************************************************/
/* Append elm to the module-level tree_buf, doubling the buffer when full. */
static PetscErrorCode place_in_tree(PetscInt elm)
{
  PetscInt *tp, n;

  PetscFunctionBegin;
  if (ntree == tree_buf_sz) {
    if (tree_buf_sz) {
      /* grow by doubling; copy old contents over */
      tp = tree_buf;
      n  = tree_buf_sz;
      tree_buf_sz <<= 1;
      tree_buf = (PetscInt *)malloc(tree_buf_sz * sizeof(PetscInt));
      PCTFS_ivec_copy(tree_buf, tp, n);
      free(tp);
    } else {
      /* first use: start at the default size */
      tree_buf_sz = TREE_BUF_SZ;
      tree_buf    = (PetscInt *)malloc(tree_buf_sz * sizeof(PetscInt));
    }
  }

  tree_buf[ntree++] = elm;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/******************************************************************************/
/*
   Determine, for every owned global id, which other ranks share it.
   The global id range is swept in one or more PCTFS_giop (bitwise-or)
   exchanges of per-id processor bit masks; each shared id is then
   classified: <= level sharers -> pairwise (TOP_BIT set in elms entry),
   otherwise routed through the tree (place_in_tree).
*/
static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs)
{
  PetscInt  i, j, npw = 0, ntree_map = 0;
  PetscInt  p_mask_size, ngh_buf_size, buf_size;
  PetscInt *p_mask, *sh_proc_mask, *pw_sh_proc_mask;
  PetscInt *ngh_buf, *buf1, *buf2;
  PetscInt  offset, per_load, num_loads, or_ct, start, end;
  PetscInt *ptr1, *ptr2, i_start, negl, nel, *elms;
  PetscInt  oper = GL_B_OR;
  PetscInt *ptr3, *t_mask, level, ct1, ct2;

  PetscFunctionBegin;
  /* to make life easier */
  nel   = gs->nel;
  elms  = gs->elms;
  level = gs->level;

  /* det #bytes needed for processor bit masks and init w/mask cor.
     to PCTFS_my_id */
  /* PCTFS_len_bit_mask returns a size in BYTES */
  p_mask = (PetscInt *)malloc(p_mask_size = PCTFS_len_bit_mask(PCTFS_num_nodes));
  PetscCall(PCTFS_set_bit_mask(p_mask, p_mask_size, PCTFS_my_id));

  /* allocate space for masks and info bufs */
  gs->nghs       = sh_proc_mask = (PetscInt *)malloc(p_mask_size);
  gs->pw_nghs    = pw_sh_proc_mask = (PetscInt *)malloc(p_mask_size);
  gs->ngh_buf_sz = ngh_buf_size = p_mask_size * nel;
  t_mask         = (PetscInt *)malloc(p_mask_size);
  gs->ngh_buf    = ngh_buf = (PetscInt *)malloc(ngh_buf_size);

  /* comm buffer size ... memory usage bounded by ~2*msg_buf */
  /* had thought I could exploit rendezvous threshold */

  /* default is one pass */
  per_load = negl = gs->negl;
  gs->num_loads = num_loads = 1;
  i             = p_mask_size * negl;

  /* possible overflow on buffer size */
  /* overflow hack */
  if (i < 0) i = INT_MAX;

  buf_size = PetscMin(msg_buf, i);

  /* can we do it? */
  PetscCheck(p_mask_size <= buf_size, PETSC_COMM_SELF, PETSC_ERR_PLIB, "get_ngh_buf() :: buf<pms :: %" PetscInt_FMT ">%" PetscInt_FMT, p_mask_size, buf_size);

  /* get PCTFS_giop buf space ... make *only* one malloc */
  buf1 = (PetscInt *)malloc(buf_size << 1);

  /* more than one gior exchange needed? */
  if (buf_size != i) {
    per_load = buf_size / p_mask_size;        /* #global ids coverable per pass */
    buf_size = per_load * p_mask_size;        /* round down to a whole number of masks */
    gs->num_loads = num_loads = negl / per_load + (negl % per_load > 0);
  }

  /* convert buf sizes from #bytes to #ints - 32-bit only! */
  p_mask_size /= sizeof(PetscInt);
  ngh_buf_size /= sizeof(PetscInt);
  buf_size /= sizeof(PetscInt);

  /* find PCTFS_giop work space */
  buf2 = buf1 + buf_size; /* second half of the single malloc above */

  /* hold #ints needed for processor masks */
  gs->mask_sz = p_mask_size;

  /* init buffers */
  PetscCall(PCTFS_ivec_zero(sh_proc_mask, p_mask_size));
  PetscCall(PCTFS_ivec_zero(pw_sh_proc_mask, p_mask_size));
  PetscCall(PCTFS_ivec_zero(ngh_buf, ngh_buf_size));

  /* HACK reset tree info */
  tree_buf    = NULL;
  tree_buf_sz = ntree = 0;

  /* ok do it */
  for (ptr1 = ngh_buf, ptr2 = elms, end = gs->gl_min, or_ct = i = 0; or_ct < num_loads; or_ct++) {
    /* identity for bitwise or is 000...000 */
    PetscCall(PCTFS_ivec_zero(buf1, buf_size));

    /* load msg buffer: drop my bit mask into the slot of every id I own in [start,end) */
    for (start = end, end += per_load, i_start = i; (offset = *ptr2) < end; i++, ptr2++) {
      offset = (offset - start) * p_mask_size;
      PCTFS_ivec_copy(buf1 + offset, p_mask, p_mask_size);
    }

    /* GLOBAL: pass buffer */
    PetscCall(PCTFS_giop(buf1, buf2, buf_size, &oper));

    /* unload buffer into ngh_buf */
    ptr2 = (elms + i_start);
    for (ptr3 = buf1, j = start; j < end; ptr3 += p_mask_size, j++) {
      /* I own it ... may have to pairwise it */
      if (j == *ptr2) {
        /* do i share it w/anyone? */
        ct1 = PCTFS_ct_bits((char *)ptr3, p_mask_size * sizeof(PetscInt));
        /* guess not */
        if (ct1 < 2) {
          ptr2++;
          ptr1 += p_mask_size;
          continue;
        }

        /* i do ... so keep info and turn off my bit */
        PCTFS_ivec_copy(ptr1, ptr3, p_mask_size);
        PetscCall(PCTFS_ivec_xor(ptr1, p_mask, p_mask_size));
        PetscCall(PCTFS_ivec_or(sh_proc_mask, ptr1, p_mask_size));

        /* is it to be done pairwise? */
        if (--ct1 <= level) {
          npw++;

          /* turn on high bit to indicate pw need to process */
          *ptr2++ |= TOP_BIT;
          PetscCall(PCTFS_ivec_or(pw_sh_proc_mask, ptr1, p_mask_size));
          ptr1 += p_mask_size;
          continue;
        }

        /* get set for next and note that I have a tree contribution */
        /* could save exact elm index for tree here -> save a search */
        ptr2++;
        ptr1 += p_mask_size;
        ntree_map++;
      } else { /* i don't but still might be involved in tree */

        /* shared by how many? */
        ct1 = PCTFS_ct_bits((char *)ptr3, p_mask_size * sizeof(PetscInt));

        /* none! */
        if (ct1 < 2) continue;

        /* is it going to be done pairwise? but not by me of course!*/
        if (--ct1 <= level) continue;
      }
      /* LATER we're going to have to process it NOW */
      /* nope ... tree it */
      PetscCall(place_in_tree(j));
    }
  }

  free((void *)t_mask);
  free((void *)buf1);

  gs->len_pw_list = npw;
  gs->num_nghs    = PCTFS_ct_bits((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt));

  /* expand from bit mask list to int list and save ngh list */
  /* note: gs->nghs is re-pointed at a fresh list; the old mask is freed below */
  gs->nghs = (PetscInt *)malloc(gs->num_nghs * sizeof(PetscInt));
  PetscCall(PCTFS_bm_to_proc((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt), gs->nghs));

  gs->num_pw_nghs = PCTFS_ct_bits((char *)pw_sh_proc_mask, p_mask_size * sizeof(PetscInt));

  oper = GL_MAX;
  ct1  = gs->num_nghs;
  PetscCall(PCTFS_giop(&ct1, &ct2, 1, &oper));
  gs->max_nghs = ct1;

  gs->tree_map_sz   = ntree_map;
  gs->max_left_over = ntree;

  free((void *)p_mask);
  free((void *)sh_proc_mask);
  PetscFunctionReturn(PETSC_SUCCESS);
}

/******************************************************************************/
/*
   Build the pairwise exchange plan from the masks left by get_ngh_buf():
   the neighbor rank list, per-neighbor element lists and message sizes,
   persistent request arrays, and staging buffers.  Also gathers global
   min/max/avg message statistics via PCTFS_giop().
*/
static PetscErrorCode set_pairwise(PCTFS_gs_id *gs)
{
  PetscInt  i, j;
  PetscInt  p_mask_size;
  PetscInt *p_mask, *sh_proc_mask, *tmp_proc_mask;
  PetscInt *ngh_buf, *buf2;
  PetscInt  offset;
  PetscInt
           *msg_list, *msg_size, **msg_nodes, nprs;
  PetscInt *pairwise_elm_list, len_pair_list = 0;
  PetscInt *iptr, t1, i_start, nel, *elms;
  PetscInt  ct;

  PetscFunctionBegin;
  /* to make life easier */
  nel          = gs->nel;
  elms         = gs->elms;
  ngh_buf      = gs->ngh_buf;
  sh_proc_mask = gs->pw_nghs;

  /* need a few temp masks */
  p_mask_size   = PCTFS_len_bit_mask(PCTFS_num_nodes); /* size in BYTES here */
  p_mask        = (PetscInt *)malloc(p_mask_size);
  tmp_proc_mask = (PetscInt *)malloc(p_mask_size);

  /* set mask to my PCTFS_my_id's bit mask */
  PetscCall(PCTFS_set_bit_mask(p_mask, p_mask_size, PCTFS_my_id));

  p_mask_size /= sizeof(PetscInt); /* now in #ints */

  len_pair_list   = gs->len_pw_list;
  gs->pw_elm_list = pairwise_elm_list = (PetscInt *)malloc((len_pair_list + 1) * sizeof(PetscInt));

  /* how many processors (nghs) do we have to exchange with? */
  nprs = gs->num_pairs = PCTFS_ct_bits((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt));

  /* allocate space for PCTFS_gs_gop() info */
  gs->pair_list = msg_list = (PetscInt *)malloc(sizeof(PetscInt) * nprs);
  gs->msg_sizes = msg_size = (PetscInt *)malloc(sizeof(PetscInt) * nprs);
  gs->node_list = msg_nodes = (PetscInt **)malloc(sizeof(PetscInt *) * (nprs + 1));

  /* init msg_size list */
  PetscCall(PCTFS_ivec_zero(msg_size, nprs));

  /* expand from bit mask list to int list */
  PetscCall(PCTFS_bm_to_proc((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt), msg_list));

  /* keep list of elements being handled pairwise */
  /* get_ngh_buf() tagged them with TOP_BIT; clear the tag as we collect them */
  for (i = j = 0; i < nel; i++) {
    if (elms[i] & TOP_BIT) {
      elms[i] ^= TOP_BIT;
      pairwise_elm_list[j++] = i;
    }
  }
  pairwise_elm_list[j] = -1; /* -1 sentinel */

  /* request arrays are MPI_REQUEST_NULL-terminated */
  gs->msg_ids_out       = (MPI_Request *)malloc(sizeof(MPI_Request) * (nprs + 1));
  gs->msg_ids_out[nprs] = MPI_REQUEST_NULL;
  gs->msg_ids_in        = (MPI_Request *)malloc(sizeof(MPI_Request) * (nprs + 1));
  gs->msg_ids_in[nprs]  = MPI_REQUEST_NULL;
  gs->pw_vals           = (PetscScalar *)malloc(sizeof(PetscScalar) * len_pair_list * vec_sz);

  /* find who goes to each processor */
  for (i_start = i = 0; i < nprs; i++) {
    /* processor i's mask */
    PetscCall(PCTFS_set_bit_mask(p_mask, p_mask_size * sizeof(PetscInt), msg_list[i]));

    /* det # going to processor i */
    for (ct = j = 0; j < len_pair_list; j++) {
      buf2 = ngh_buf + (pairwise_elm_list[j] * p_mask_size);
      PetscCall(PCTFS_ivec_and3(tmp_proc_mask, p_mask, buf2, p_mask_size));
      if (PCTFS_ct_bits((char *)tmp_proc_mask, p_mask_size * sizeof(PetscInt))) ct++;
    }
    msg_size[i] = ct;
    i_start     = PetscMax(i_start, ct); /* track the largest single message */

    /*space to hold nodes in message to first neighbor */
    msg_nodes[i] = iptr = (PetscInt *)malloc(sizeof(PetscInt) * (ct + 1));

    /* second sweep: record which pairwise elements go to processor i */
    for (j = 0; j < len_pair_list; j++) {
      buf2 = ngh_buf + (pairwise_elm_list[j] * p_mask_size);
      PetscCall(PCTFS_ivec_and3(tmp_proc_mask, p_mask, buf2, p_mask_size));
      if (PCTFS_ct_bits((char *)tmp_proc_mask, p_mask_size * sizeof(PetscInt))) *iptr++ = j;
    }
    *iptr = -1; /* -1 terminates each node list */
  }
  msg_nodes[nprs] = NULL; /* NULL terminates the list of lists */

  /* global statistics on the largest per-message element count */
  j  = gs->loc_node_pairs = i_start;
  t1 = GL_MAX;
  PetscCall(PCTFS_giop(&i_start, &offset, 1, &t1));
  gs->max_node_pairs = i_start;

  i_start = j;
  t1      = GL_MIN;
  PetscCall(PCTFS_giop(&i_start, &offset, 1, &t1));
  gs->min_node_pairs = i_start;

  i_start = j;
  t1      = GL_ADD;
  PetscCall(PCTFS_giop(&i_start, &offset, 1, &t1));
  gs->avg_node_pairs = i_start / PCTFS_num_nodes + 1;

  i_start = nprs;
  t1      = GL_MAX;
  PetscCall(PCTFS_giop(&i_start, &offset, 1, &t1));
  gs->max_pairs = i_start;

  /* remap pairwise in tail of gsi_via_bit_mask() */
  gs->msg_total = PCTFS_ivec_sum(gs->msg_sizes, nprs);
  gs->out       = (PetscScalar *)malloc(sizeof(PetscScalar) * gs->msg_total * vec_sz);
  gs->in        = (PetscScalar *)malloc(sizeof(PetscScalar) * gs->msg_total * vec_sz);

  /* reset malloc pool */
  free((void *)p_mask);
  free((void *)tmp_proc_mask);
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* to do pruned tree just save ngh buf copy for each one and decode here!
******************************************************************************/
/*
   Build the tree (crystal accumulator) maps: for each locally owned element
   that is also routed through the tree, record its local index
   (tree_map_in) and its slot in the tree buffer (tree_map_out).
   Binary-searches the shorter of the two lists against the longer.
*/
static PetscErrorCode set_tree(PCTFS_gs_id *gs)
{
  PetscInt  i, j, n, nel;
  PetscInt *iptr_in, *iptr_out, *tree_elms, *elms;

  PetscFunctionBegin;
  /* local work ptrs */
  elms = gs->elms;
  nel  = gs->nel;

  /* how many via tree */
  gs->tree_nel  = n = ntree;
  gs->tree_elms = tree_elms = iptr_in = tree_buf; /* handle takes ownership of tree_buf */
  gs->tree_buf  = (PetscScalar *)malloc(sizeof(PetscScalar) * n * vec_sz);
  gs->tree_work = (PetscScalar *)malloc(sizeof(PetscScalar) * n * vec_sz);
  j             = gs->tree_map_sz;
  gs->tree_map_in  = iptr_in = (PetscInt *)malloc(sizeof(PetscInt) * (j + 1));
  gs->tree_map_out = iptr_out = (PetscInt *)malloc(sizeof(PetscInt) * (j + 1));

  /* search the longer of the two lists */
  /* note ... could save this info in get_ngh_buf and save searches */
  if (n <= nel) {
    /* bijective fct w/remap - search elm list */
    for (i = 0; i < n; i++) {
      if ((j = PCTFS_ivec_binary_search(*tree_elms++, elms, nel)) >= 0) {
        *iptr_in++  = j;
        *iptr_out++ = i;
      }
    }
  } else {
    for (i = 0; i < nel; i++) {
      if ((j = PCTFS_ivec_binary_search(*elms++, tree_elms, n)) >= 0) {
        *iptr_in++  = i;
        *iptr_out++ = j;
      }
    }
  }

  /* sentinel */
  *iptr_in = *iptr_out = -1;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/******************************************************************************/
/*
   Scatter phase for the scalar flavor: copy the (already reduced) value of
   each shared group's first slot back into every duplicate slot.  The
   cases *num == 2/3/4 are unrolled; the general case walks the -1
   terminated index list.
*/
static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs, PetscScalar *vals)
{
  PetscInt   *num, *map, **reduce;
  PetscScalar tmp;

  PetscFunctionBegin;
  num    = gs->num_gop_local_reduce;
  reduce = gs->gop_local_reduce;
  while ((map = *reduce++)) {
    /* wall */
    if (*num == 2) {
      num++;
      vals[map[1]] = vals[map[0]];
    } else if
               (*num == 3) { /* corner shared by three elements */
      num++;
      vals[map[2]] = vals[map[1]] = vals[map[0]];
    } else if (*num == 4) { /* corner shared by four elements */
      num++;
      vals[map[3]] = vals[map[2]] = vals[map[1]] = vals[map[0]];
    } else { /* general case ... odd geoms ... 3D*/
      num++;
      tmp = *(vals + *map++);
      while (*map >= 0) *(vals + *map++) = tmp; /* broadcast first slot to the rest */
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/******************************************************************************/
/*
   Purely-local gather+scatter: for each duplicate group, sum all member
   slots and write the sum back into every member.  Unrolled for group
   sizes 2/3/4; the general case does a gather pass then a scatter pass
   over the same -1 terminated list.
*/
static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs, PetscScalar *vals)
{
  PetscInt   *num, *map, **reduce;
  PetscScalar tmp;

  PetscFunctionBegin;
  num    = gs->num_local_reduce;
  reduce = gs->local_reduce;
  while ((map = *reduce)) {
    /* wall */
    if (*num == 2) {
      num++;
      reduce++;
      vals[map[1]] = vals[map[0]] += vals[map[1]];
    } else if (*num == 3) { /* corner shared by three elements */
      num++;
      reduce++;
      vals[map[2]] = vals[map[1]] = vals[map[0]] += (vals[map[1]] + vals[map[2]]);
    } else if (*num == 4) { /* corner shared by four elements */
      num++;
      reduce++;
      vals[map[1]] = vals[map[2]] = vals[map[3]] = vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]);
    } else { /* general case ... odd geoms ... 3D*/
      num++;
      tmp = 0.0;
      while (*map >= 0) tmp += *(vals + *map++); /* gather */

      map = *reduce++;
      while (*map >= 0) *(vals + *map++) = tmp; /* scatter */
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/******************************************************************************/
/*
   Gather phase only: accumulate each shared group's duplicate slots into
   its first slot (map[0]); the remote exchange then works on that slot and
   PCTFS_gs_gop_local_out() broadcasts the final value back.
*/
static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals)
{
  PetscInt    *num, *map, **reduce;
  PetscScalar *base;

  PetscFunctionBegin;
  num    = gs->num_gop_local_reduce;
  reduce = gs->gop_local_reduce;
  while ((map = *reduce++)) {
    /* wall */
    if (*num == 2) {
      num++;
      vals[map[0]] += vals[map[1]];
    } else if (*num == 3) { /* corner shared by three elements */
      num++;
      vals[map[0]] += (vals[map[1]] + vals[map[2]]);
    } else if (*num == 4) { /* corner shared by four elements */
      num++;
      vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]);
    } else { /* general case ... odd geoms ... 3D*/
      num++;
      base = vals + *map++;
      while (*map >= 0) *base += *(vals + *map++);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/******************************************************************************/
/*
   Destroy a handle created by PCTFS_gs_init(): free the private
   communicator and every buffer the setup phase allocated.  The setup
   buffers freed in gsi_via_bit_mask() (ngh_buf, elms, local_elms,
   companion) were NULLed there, hence the "should be NULL already" guards.
*/
PetscErrorCode PCTFS_gs_free(PCTFS_gs_id *gs)
{
  PetscInt i;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_free(&gs->PCTFS_gs_comm));
  if (gs->nghs) free((void *)gs->nghs);
  if (gs->pw_nghs) free((void *)gs->pw_nghs);

  /* tree */
  if (gs->max_left_over) {
    if (gs->tree_elms) free((void *)gs->tree_elms);
    if (gs->tree_buf) free((void *)gs->tree_buf);
    if (gs->tree_work) free((void *)gs->tree_work);
    if (gs->tree_map_in) free((void *)gs->tree_map_in);
    if (gs->tree_map_out) free((void *)gs->tree_map_out);
  }

  /* pairwise info */
  if (gs->num_pairs) {
    /* should be NULL already */
    if (gs->ngh_buf) free((void *)gs->ngh_buf);
    if (gs->elms) free((void *)gs->elms);
    if (gs->local_elms) free((void
*)gs->local_elms); 939 if (gs->companion) free((void *)gs->companion); 940 941 /* only set if pairwise */ 942 if (gs->vals) free((void *)gs->vals); 943 if (gs->in) free((void *)gs->in); 944 if (gs->out) free((void *)gs->out); 945 if (gs->msg_ids_in) free((void *)gs->msg_ids_in); 946 if (gs->msg_ids_out) free((void *)gs->msg_ids_out); 947 if (gs->pw_vals) free((void *)gs->pw_vals); 948 if (gs->pw_elm_list) free((void *)gs->pw_elm_list); 949 if (gs->node_list) { 950 for (i = 0; i < gs->num_pairs; i++) { 951 if (gs->node_list[i]) free((void *)gs->node_list[i]); 952 } 953 free((void *)gs->node_list); 954 } 955 if (gs->msg_sizes) free((void *)gs->msg_sizes); 956 if (gs->pair_list) free((void *)gs->pair_list); 957 } 958 959 /* local info */ 960 if (gs->num_local_total >= 0) { 961 for (i = 0; i < gs->num_local_total + 1; i++) { 962 if (gs->num_gop_local_reduce[i]) free((void *)gs->gop_local_reduce[i]); 963 } 964 } 965 966 /* if intersection tree/pairwise and local isn't empty */ 967 if (gs->gop_local_reduce) free((void *)gs->gop_local_reduce); 968 if (gs->num_gop_local_reduce) free((void *)gs->num_gop_local_reduce); 969 970 free((void *)gs); 971 PetscFunctionReturn(PETSC_SUCCESS); 972 } 973 974 /******************************************************************************/ 975 PetscErrorCode PCTFS_gs_gop_vec(PCTFS_gs_id *gs, PetscScalar *vals, const char *op, PetscInt step) 976 { 977 PetscFunctionBegin; 978 switch (*op) { 979 case '+': 980 PetscCall(PCTFS_gs_gop_vec_plus(gs, vals, step)); 981 break; 982 default: 983 PetscCall(PetscInfo(0, "PCTFS_gs_gop_vec() :: %c is not a valid op\n", op[0])); 984 PetscCall(PetscInfo(0, "PCTFS_gs_gop_vec() :: default :: plus\n")); 985 PetscCall(PCTFS_gs_gop_vec_plus(gs, vals, step)); 986 break; 987 } 988 PetscFunctionReturn(PETSC_SUCCESS); 989 } 990 991 /******************************************************************************/ 992 static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step) 993 { 
  PetscFunctionBegin;
  PetscCheck(gs, PETSC_COMM_SELF, PETSC_ERR_PLIB, "PCTFS_gs_gop_vec() passed NULL gs handle!!!");

  /* local only operations!!! (elements shared only within this process) */
  if (gs->num_local) PetscCall(PCTFS_gs_gop_vec_local_plus(gs, vals, step));

  /* if intersection tree/pairwise and local isn't empty */
  if (gs->num_local_gop) {
    /* fold local duplicates into their representative entries first ... */
    PetscCall(PCTFS_gs_gop_vec_local_in_plus(gs, vals, step));

    /* pairwise (does the tree internally when both are needed) */
    if (gs->num_pairs) PetscCall(PCTFS_gs_gop_vec_pairwise_plus(gs, vals, step));

    /* tree */
    else if (gs->max_left_over) PetscCall(PCTFS_gs_gop_vec_tree_plus(gs, vals, step));

    /* ... then broadcast the reduced values back out to the duplicates */
    PetscCall(PCTFS_gs_gop_vec_local_out(gs, vals, step));
  } else { /* if intersection tree/pairwise and local is empty */
    /* pairwise */
    if (gs->num_pairs) PetscCall(PCTFS_gs_gop_vec_pairwise_plus(gs, vals, step));

    /* tree */
    else if (gs->max_left_over) PetscCall(PCTFS_gs_gop_vec_tree_plus(gs, vals, step));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/******************************************************************************/
/*
   PCTFS_gs_gop_vec_local_plus - on-process reduction AND scatter for elements
   shared only locally.  For each reduction list (a negative-terminated index
   list; walked in lockstep with its count in num_local_reduce): sum all
   members' step-blocks into the first member, then copy the sum back to every
   other member.  Counts 2/3/4 are unrolled; the general case handles anything
   longer.
*/
static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
{
  PetscInt    *num, *map, **reduce;
  PetscScalar *base;

  PetscFunctionBegin;
  num    = gs->num_local_reduce; /* count for each list; advanced once per list */
  reduce = gs->local_reduce;     /* NULL-terminated array of index lists */
  while ((map = *reduce)) {
    /* base = step-block of the first (representative) element in this list */
    base = vals + map[0] * step;

    /* wall (two elements share the value) */
    if (*num == 2) {
      num++;
      reduce++;
      PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step));
      PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step));
    } else if (*num == 3) { /* corner shared by three elements */
      num++;
      reduce++;
      PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step));
      PetscCall(PCTFS_rvec_add(base, vals + map[2] * step, step));
      PetscCall(PCTFS_rvec_copy(vals + map[2] * step, base, step));
      PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step));
    } else if (*num == 4) { /* corner shared by four elements */
      num++;
      reduce++;
      PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step));
      PetscCall(PCTFS_rvec_add(base, vals + map[2] * step, step));
      PetscCall(PCTFS_rvec_add(base, vals + map[3] * step, step));
      PetscCall(PCTFS_rvec_copy(vals + map[3] * step, base, step));
      PetscCall(PCTFS_rvec_copy(vals + map[2] * step, base, step));
      PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step));
    } else { /* general case ... odd geoms ... 3D */
      num++;
      /* accumulate every remaining member (list is negative-terminated) */
      while (*++map >= 0) PetscCall(PCTFS_rvec_add(base, vals + *map * step, step));

      /* rewind and broadcast the sum back to each member */
      map = *reduce;
      while (*++map >= 0) PetscCall(PCTFS_rvec_copy(vals + *map * step, base, step));

      reduce++;
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/******************************************************************************/
/*
   PCTFS_gs_gop_vec_local_in_plus - "gather half" for elements that are ALSO
   shared off-process: fold local duplicates into the representative entry
   (map[0]) only; no copy back here — PCTFS_gs_gop_vec_local_out() does the
   scatter after the global reduction completes.
*/
static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
{
  PetscInt    *num, *map, **reduce;
  PetscScalar *base;

  PetscFunctionBegin;
  num    = gs->num_gop_local_reduce;
  reduce = gs->gop_local_reduce;
  while ((map = *reduce++)) {
    base = vals + map[0] * step;

    /* wall */
    if (*num == 2) {
      num++;
      PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step));
    } else if (*num == 3) { /* corner shared by three elements */
      num++;
      PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step));
      PetscCall(PCTFS_rvec_add(base, vals + map[2] * step, step));
    } else if (*num == 4) { /* corner shared by four elements */
      num++;
      PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step));
      PetscCall(PCTFS_rvec_add(base, vals + map[2] * step, step));
      PetscCall(PCTFS_rvec_add(base, vals + map[3] * step, step));
    } else { /* general case ... odd geoms ... 3D */
      num++;
      while (*++map >= 0) PetscCall(PCTFS_rvec_add(base, vals + *map * step, step));
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/******************************************************************************/
/*
   PCTFS_gs_gop_vec_local_out - "scatter half" paired with
   PCTFS_gs_gop_vec_local_in_plus(): copy the fully-reduced representative
   value (map[0]) back out to every local duplicate in each list.
*/
static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
{
  PetscInt    *num, *map, **reduce;
  PetscScalar *base;

  PetscFunctionBegin;
  num    = gs->num_gop_local_reduce;
  reduce = gs->gop_local_reduce;
  while ((map = *reduce++)) {
    base = vals + map[0] * step;

    /* wall */
    if (*num == 2) {
      num++;
      PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step));
    } else if (*num == 3) { /* corner shared by three elements */
      num++;
      PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step));
      PetscCall(PCTFS_rvec_copy(vals + map[2] * step, base, step));
    } else if (*num == 4) { /* corner shared by four elements */
      num++;
      PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step));
      PetscCall(PCTFS_rvec_copy(vals + map[2] * step, base, step));
      PetscCall(PCTFS_rvec_copy(vals + map[3] * step, base, step));
    } else { /* general case ... odd geoms ...
               3D */
      num++;
      while (*++map >= 0) PetscCall(PCTFS_rvec_copy(vals + *map * step, base, step));
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/******************************************************************************/
/*
   PCTFS_gs_gop_vec_pairwise_plus - off-process '+' reduction by direct
   pairwise message exchange, with the tree fan-in overlapped while the
   receives are in flight.

   Sequence: post all receives; pack pw_vals from in_vals; pack and send one
   buffer per partner; run the tree reduction (if any); as each receive
   completes, axpy the incoming blocks into pw_vals; copy pw_vals back into
   in_vals; finally drain the send requests.

   All element-index lists (pw_elm_list, node_list[i]) are negative-terminated.
*/
static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step)
{
  PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2;
  PetscInt    *iptr, *msg_list, *msg_size, **msg_nodes;
  PetscInt    *pw, *list, *size, **nodes;
  MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
  MPI_Status   status;
  PetscBLASInt i1 = 1, dstep;

  PetscFunctionBegin;
  /* strip and load s: keep a working copy and a pristine copy of each cursor */
  msg_list = list = gs->pair_list;      /* partner ranks */
  msg_size = size = gs->msg_sizes;      /* elements per partner message */
  msg_nodes = nodes = gs->node_list;    /* per-partner element lists */
  iptr = pw = gs->pw_elm_list;          /* elements involved in pairwise exchange */
  dptr1 = dptr3 = gs->pw_vals;          /* staging buffer for those elements */
  msg_ids_in = ids_in = gs->msg_ids_in;
  msg_ids_out = ids_out = gs->msg_ids_out;
  dptr2 = gs->out;
  in1 = in2 = gs->in;

  /* post the receives */
  /* msg_nodes=nodes; */
  do {
    /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
       second one *list and do list++ afterwards */
    PetscCallMPI(MPIU_Irecv(in1, *size * step, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in));
    list++;
    msg_ids_in++;
    in1 += *size++ * step; /* each message carries size*step scalars */
  } while (*++msg_nodes);
  msg_nodes = nodes;

  /* load gs values into in out gs buffers */
  while (*iptr >= 0) {
    PetscCall(PCTFS_rvec_copy(dptr3, in_vals + *iptr * step, step));
    dptr3 += step;
    iptr++;
  }

  /* load out buffers and post the sends */
  while ((iptr = *msg_nodes++)) {
    dptr3 = dptr2; /* remember start of this partner's packed buffer */
    while (*iptr >= 0) {
      PetscCall(PCTFS_rvec_copy(dptr2, dptr1 + *iptr * step, step));
      dptr2 += step;
      iptr++;
    }
    PetscCallMPI(MPIU_Isend(dptr3, *msg_size * step, MPIU_SCALAR, *msg_list, MSGTAG1 + PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out));
    msg_size++;
    msg_list++;
    msg_ids_out++;
  }

  /* tree: overlap the fan-in with the outstanding pairwise traffic */
  if (gs->max_left_over) PetscCall(PCTFS_gs_gop_vec_tree_plus(gs, in_vals, step));

  /* process the received data */
  msg_nodes = nodes;
  while ((iptr = *nodes++)) {
    PetscScalar d1 = 1.0;

    /* Should I check the return value of MPI_Wait() or status? */
    /* Can this loop be replaced by a call to MPI_Waitall()? */
    PetscCallMPI(MPI_Wait(ids_in, &status));
    ids_in++;
    while (*iptr >= 0) {
      /* pw_vals[*iptr] += received block (daxpy with alpha = 1) */
      PetscCall(PetscBLASIntCast(step, &dstep));
      PetscCallBLAS("BLASaxpy", BLASaxpy_(&dstep, &d1, in2, &i1, dptr1 + *iptr * step, &i1));
      in2 += step;
      iptr++;
    }
  }

  /* replace vals: copy reduced staging buffer back into the caller's array */
  while (*pw >= 0) {
    PetscCall(PCTFS_rvec_copy(in_vals + *pw * step, dptr1, step));
    dptr1 += step;
    pw++;
  }

  /* clear isend message handles */
  /* This changed for clarity though it could be the same */

  /* Should I check the return value of MPI_Wait() or status? */
  /* Can this loop be replaced by a call to MPI_Waitall()?
   */
  while (*msg_nodes++) {
    /* drain the isend requests so the buffers may be reused */
    PetscCallMPI(MPI_Wait(ids_out, &status));
    ids_out++;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/******************************************************************************/
/*
   PCTFS_gs_gop_vec_tree_plus - '+' reduction for the "left over" elements via
   the tree: scatter my contributions into a zeroed global buffer, run the
   full fan-in/fan-out (PCTFS_grop), then read back the slots I own.
   tree_map_in/tree_map_out are negative-terminated parallel index lists.
*/
static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
{
  PetscInt     size, *in, *out;
  PetscScalar *buf, *work;
  PetscInt     op[] = {GL_ADD, 0};
  PetscBLASInt i1 = 1;
  PetscBLASInt dstep;

  PetscFunctionBegin;
  /* copy over to local variables */
  in   = gs->tree_map_in;
  out  = gs->tree_map_out;
  buf  = gs->tree_buf;
  work = gs->tree_work;
  size = gs->tree_nel * step;

  /* zero out collection buffer */
  PetscCall(PCTFS_rvec_zero(buf, size));

  /* copy over my contributions */
  while (*in >= 0) {
    PetscCall(PetscBLASIntCast(step, &dstep));
    PetscCallBLAS("BLAScopy", BLAScopy_(&dstep, vals + *in++ * step, &i1, buf + *out++ * step, &i1));
  }

  /* perform fan in/out on full buffer */
  /* must change PCTFS_grop to handle the blas */
  PetscCall(PCTFS_grop(buf, work, size, op));

  /* reset */
  in  = gs->tree_map_in;
  out = gs->tree_map_out;

  /* get the portion of the results I need */
  while (*in >= 0) {
    PetscCall(PetscBLASIntCast(step, &dstep));
    PetscCallBLAS("BLAScopy", BLAScopy_(&dstep, buf + *out++ * step, &i1, vals + *in++ * step, &i1));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/******************************************************************************/
/*
   PCTFS_gs_gop_hc - public entry point for a scalar gather-scatter restricted
   to a 'dim'-dimensional hypercube of processes.  Only '+' is implemented;
   other op characters are logged and treated as '+'.
*/
PetscErrorCode PCTFS_gs_gop_hc(PCTFS_gs_id *gs, PetscScalar *vals, const char *op, PetscInt dim)
{
  PetscFunctionBegin;
  switch (*op) {
  case '+':
    PetscCall(PCTFS_gs_gop_plus_hc(gs, vals, dim));
    break;
  default:
    /* unknown op: log it and fall back to the plus reduction */
    PetscCall(PetscInfo(0, "PCTFS_gs_gop_hc() :: %c is not a valid op\n", op[0]));
    PetscCall(PetscInfo(0, "PCTFS_gs_gop_hc() :: default :: plus\n"));
    PetscCall(PCTFS_gs_gop_plus_hc(gs, vals, dim));
    break;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/******************************************************************************/
/*
   PCTFS_gs_gop_plus_hc - driver for the hypercube '+' gather-scatter: local
   reduction, then pairwise and/or tree restricted to the first 'dim'
   hypercube dimensions, then local scatter.  No-op for dim <= 0.
*/
static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim)
{
  PetscFunctionBegin;
  /* if there's nothing to do return */
  if (dim <= 0) PetscFunctionReturn(PETSC_SUCCESS);

  /* can't do more dimensions then exist */
  dim = PetscMin(dim, PCTFS_i_log2_num_nodes);

  /* local only operations!!! */
  if (gs->num_local) PetscCall(PCTFS_gs_gop_local_plus(gs, vals));

  /* if intersection tree/pairwise and local isn't empty */
  if (gs->num_local_gop) {
    PetscCall(PCTFS_gs_gop_local_in_plus(gs, vals));

    /* pairwise will do tree inside ... */
    if (gs->num_pairs) PetscCall(PCTFS_gs_gop_pairwise_plus_hc(gs, vals, dim)); /* tree only */
    else if (gs->max_left_over) PetscCall(PCTFS_gs_gop_tree_plus_hc(gs, vals, dim));

    PetscCall(PCTFS_gs_gop_local_out(gs, vals));
  } else { /* if intersection tree/pairwise and local is empty */
    /* pairwise will do tree inside */
    if (gs->num_pairs) PetscCall(PCTFS_gs_gop_pairwise_plus_hc(gs, vals, dim)); /* tree */
    else if (gs->max_left_over) PetscCall(PCTFS_gs_gop_tree_plus_hc(gs, vals, dim));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/******************************************************************************/
/*
   PCTFS_gs_gop_pairwise_plus_hc - pairwise '+' exchange restricted to a
   hypercube: a partner participates only when its rank matches mine outside
   the low 'dim' bits, i.e. (PCTFS_my_id | mask) == (*list | mask) with mask
   having the low dim bits set.  Structure mirrors the unrestricted pairwise
   routine (scalar, step == 1) with each partner test guarding the recv, send,
   and wait phases; list/size cursors still advance for skipped partners so
   the parallel arrays stay in step.
*/
static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim)
{
  PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2;
  PetscInt    *iptr, *msg_list, *msg_size, **msg_nodes;
  PetscInt    *pw, *list, *size, **nodes;
  MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
  MPI_Status   status;
  PetscInt     i, mask = 1;

  PetscFunctionBegin;
  /* build mask = 2^dim - 1: the low dim bits are ignored when comparing ranks */
  for (i = 1; i < dim; i++) {
    mask <<= 1;
    mask++;
  }

  /* strip and load s */
  msg_list = list = gs->pair_list;
  msg_size = size = gs->msg_sizes;
  msg_nodes = nodes = gs->node_list;
  iptr = pw = gs->pw_elm_list;
  dptr1 = dptr3 = gs->pw_vals;
  msg_ids_in = ids_in = gs->msg_ids_in;
  msg_ids_out = ids_out = gs->msg_ids_out;
  dptr2 = gs->out;
  in1 = in2 = gs->in;

  /* post the receives */
  /* msg_nodes=nodes; */
  do {
    /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
       second one *list and do list++ afterwards */
    if ((PCTFS_my_id | mask) == (*list | mask)) {
      PetscCallMPI(MPIU_Irecv(in1, *size, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in));
      list++;
      msg_ids_in++;
      in1 += *size++;
    } else { /* partner outside this subcube: skip, but keep cursors aligned */
      list++;
      size++;
    }
  } while (*++msg_nodes);

  /* load gs values into in out gs buffers */
  while (*iptr >= 0) *dptr3++ = *(in_vals + *iptr++);

  /* load out buffers and post the sends */
  msg_nodes = nodes;
  list      = msg_list;
  while ((iptr = *msg_nodes++)) {
    if ((PCTFS_my_id | mask) == (*list | mask)) {
      dptr3 = dptr2;
      while (*iptr >= 0) *dptr2++ = *(dptr1 + *iptr++);
      /* CHECK PERSISTENT COMMS MODE FOR ALL THIS STUFF */
      /* is msg_ids_out++ correct? */
      PetscCallMPI(MPIU_Isend(dptr3, *msg_size, MPIU_SCALAR, *list, MSGTAG1 + PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out));
      msg_size++;
      list++;
      msg_ids_out++;
    } else {
      list++;
      msg_size++;
    }
  }

  /* do the tree while we're waiting */
  if (gs->max_left_over) PetscCall(PCTFS_gs_gop_tree_plus_hc(gs, in_vals, dim));

  /* process the received data */
  msg_nodes = nodes;
  list      = msg_list;
  while ((iptr = *nodes++)) {
    if ((PCTFS_my_id | mask) == (*list | mask)) {
      /* Should I check the return value of MPI_Wait() or status? */
      /* Can this loop be replaced by a call to MPI_Waitall()? */
      PetscCallMPI(MPI_Wait(ids_in, &status));
      ids_in++;
      while (*iptr >= 0) *(dptr1 + *iptr++) += *in2++;
    }
    list++;
  }

  /* replace vals */
  while (*pw >= 0) *(in_vals + *pw++) = *dptr1++;

  /* clear isend message handles */
  /* This changed for clarity though it could be the same */
  while (*msg_nodes++) {
    if ((PCTFS_my_id | mask) == (*msg_list | mask)) {
      /* Should I check the return value of MPI_Wait() or status? */
      /* Can this loop be replaced by a call to MPI_Waitall()? */
      PetscCallMPI(MPI_Wait(ids_out, &status));
      ids_out++;
    }
    msg_list++;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/******************************************************************************/
/*
   PCTFS_gs_gop_tree_plus_hc - scalar tree '+' reduction restricted to a
   'dim'-dimensional hypercube: scatter contributions into a zeroed buffer,
   fan in/out over dim dimensions via PCTFS_grop_hc, then read back my slots.
*/
static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim)
{
  PetscInt     size;
  PetscInt    *in, *out;
  PetscScalar *buf, *work;
  PetscInt     op[] = {GL_ADD, 0};

  PetscFunctionBegin;
  in   = gs->tree_map_in;
  out  = gs->tree_map_out;
  buf  = gs->tree_buf;
  work = gs->tree_work;
  size = gs->tree_nel;

  PetscCall(PCTFS_rvec_zero(buf, size));

  /* scatter my contributions into the collection buffer */
  while (*in >= 0) *(buf + *out++) = *(vals + *in++);

  in  = gs->tree_map_in;
  out = gs->tree_map_out;

  PetscCall(PCTFS_grop_hc(buf, work, size, op, dim));

  /* pull my reduced values back out */
  while (*in >= 0) *(vals + *in++) = *(buf + *out++);
  PetscFunctionReturn(PETSC_SUCCESS);
}