1 2 /***********************************gs.c*************************************** 3 4 Author: Henry M. Tufo III 5 6 e-mail: hmt@cs.brown.edu 7 8 snail-mail: 9 Division of Applied Mathematics 10 Brown University 11 Providence, RI 02912 12 13 Last Modification: 14 6.21.97 15 ************************************gs.c**************************************/ 16 17 /***********************************gs.c*************************************** 18 File Description: 19 ----------------- 20 21 ************************************gs.c**************************************/ 22 23 #include <../src/ksp/pc/impls/tfs/tfs.h> 24 25 /* default length of number of items via tree - doubles if exceeded */ 26 #define TREE_BUF_SZ 2048; 27 #define GS_VEC_SZ 1 28 29 /***********************************gs.c*************************************** 30 Type: struct gather_scatter_id 31 ------------------------------ 32 33 ************************************gs.c**************************************/ 34 typedef struct gather_scatter_id { 35 PetscInt id; 36 PetscInt nel_min; 37 PetscInt nel_max; 38 PetscInt nel_sum; 39 PetscInt negl; 40 PetscInt gl_max; 41 PetscInt gl_min; 42 PetscInt repeats; 43 PetscInt ordered; 44 PetscInt positive; 45 PetscScalar *vals; 46 47 /* bit mask info */ 48 PetscInt *my_proc_mask; 49 PetscInt mask_sz; 50 PetscInt *ngh_buf; 51 PetscInt ngh_buf_sz; 52 PetscInt *nghs; 53 PetscInt num_nghs; 54 PetscInt max_nghs; 55 PetscInt *pw_nghs; 56 PetscInt num_pw_nghs; 57 PetscInt *tree_nghs; 58 PetscInt num_tree_nghs; 59 60 PetscInt num_loads; 61 62 /* repeats == true -> local info */ 63 PetscInt nel; /* number of unique elememts */ 64 PetscInt *elms; /* of size nel */ 65 PetscInt nel_total; 66 PetscInt *local_elms; /* of size nel_total */ 67 PetscInt *companion; /* of size nel_total */ 68 69 /* local info */ 70 PetscInt num_local_total; 71 PetscInt local_strength; 72 PetscInt num_local; 73 PetscInt *num_local_reduce; 74 PetscInt **local_reduce; 75 PetscInt num_local_gop; 76 PetscInt *num_gop_local_reduce; 77 PetscInt **gop_local_reduce; 78 79 /* pairwise info */ 80 PetscInt level; 81 PetscInt num_pairs; 82 PetscInt max_pairs; 83 PetscInt loc_node_pairs; 84 PetscInt max_node_pairs; 85 PetscInt min_node_pairs; 86 PetscInt avg_node_pairs; 87 PetscInt *pair_list; 88 PetscInt *msg_sizes; 89 PetscInt **node_list; 90 PetscInt len_pw_list; 91 PetscInt *pw_elm_list; 92 PetscScalar *pw_vals; 93 94 MPI_Request *msg_ids_in; 95 MPI_Request *msg_ids_out; 96 97 PetscScalar *out; 98 PetscScalar *in; 99 PetscInt msg_total; 100 101 /* tree - crystal accumulator info */ 102 PetscInt max_left_over; 103 PetscInt *pre; 104 PetscInt *in_num; 105 PetscInt *out_num; 106 PetscInt **in_list; 107 PetscInt **out_list; 108 109 /* new tree work*/ 110 PetscInt tree_nel; 111 PetscInt *tree_elms; 112 PetscScalar *tree_buf; 113 PetscScalar *tree_work; 114 115 PetscInt tree_map_sz; 116 PetscInt *tree_map_in; 117 PetscInt *tree_map_out; 118 119 /* current memory status */ 120 PetscInt gl_bss_min; 121 PetscInt gl_perm_min; 122 123 /* max segment size for PCTFS_gs_gop_vec() */ 124 PetscInt vec_sz; 125 126 /* hack to make paul happy */ 127 MPI_Comm PCTFS_gs_comm; 128 129 } PCTFS_gs_id; 130 131 static PCTFS_gs_id *gsi_check_args(PetscInt *elms, PetscInt nel, PetscInt level); 132 static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs); 133 static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs); 134 static PetscErrorCode set_pairwise(PCTFS_gs_id *gs); 135 static PCTFS_gs_id *gsi_new(void); 136 static PetscErrorCode set_tree(PCTFS_gs_id *gs); 137 138 /* same for all but vector flavor */ 139 static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs, PetscScalar *vals); 140 /* vector flavor */ 141 static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step); 142 143 static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step); 144 static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step); 145 static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step); 146 static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step); 147 static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step); 148 149 static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs, PetscScalar *vals); 150 static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals); 151 152 static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim); 153 static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim); 154 static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim); 155 156 /* global vars */ 157 /* from comm.c module */ 158 159 static PetscInt num_gs_ids = 0; 160 161 /* should make this dynamic ... later */ 162 static PetscInt msg_buf = MAX_MSG_BUF; 163 static PetscInt vec_sz = GS_VEC_SZ; 164 static PetscInt *tree_buf = NULL; 165 static PetscInt tree_buf_sz = 0; 166 static PetscInt ntree = 0; 167 168 /***************************************************************************/ 169 PetscErrorCode PCTFS_gs_init_vec_sz(PetscInt size) 170 { 171 PetscFunctionBegin; 172 vec_sz = size; 173 PetscFunctionReturn(PETSC_SUCCESS); 174 } 175 176 /******************************************************************************/ 177 PetscErrorCode PCTFS_gs_init_msg_buf_sz(PetscInt buf_size) 178 { 179 PetscFunctionBegin; 180 msg_buf = buf_size; 181 PetscFunctionReturn(PETSC_SUCCESS); 182 } 183 184 /******************************************************************************/ 185 PCTFS_gs_id *PCTFS_gs_init(PetscInt *elms, PetscInt nel, PetscInt level) 186 { 187 PCTFS_gs_id *gs; 188 MPI_Group PCTFS_gs_group; 189 MPI_Comm PCTFS_gs_comm; 190 191 /* ensure that communication package has been initialized */ 192 PetscCallAbort(PETSC_COMM_SELF, PCTFS_comm_init()); 193 194 /* determines if we have enough dynamic/semi-static memory */ 195 /* checks input, allocs and sets gd_id template */ 196 gs = gsi_check_args(elms, nel, level); 197 198 /* only bit mask version up and working for the moment */ 199 /* LATER :: get int list version working for sparse pblms */ 200 PetscCallAbort(PETSC_COMM_WORLD, gsi_via_bit_mask(gs)); 201 202 PetscCallAbort(PETSC_COMM_WORLD, MPI_Comm_group(MPI_COMM_WORLD, &PCTFS_gs_group) ? PETSC_ERR_MPI : PETSC_SUCCESS); 203 PetscCallAbort(PETSC_COMM_WORLD, MPI_Comm_create(MPI_COMM_WORLD, PCTFS_gs_group, &PCTFS_gs_comm) ? PETSC_ERR_MPI : PETSC_SUCCESS); 204 PetscCallAbort(PETSC_COMM_WORLD, MPI_Group_free(&PCTFS_gs_group) ? PETSC_ERR_MPI : PETSC_SUCCESS); 205 206 gs->PCTFS_gs_comm = PCTFS_gs_comm; 207 208 return (gs); 209 } 210 211 /******************************************************************************/ 212 static PCTFS_gs_id *gsi_new(void) 213 { 214 PCTFS_gs_id *gs; 215 gs = (PCTFS_gs_id *)malloc(sizeof(PCTFS_gs_id)); 216 PetscCallAbort(PETSC_COMM_WORLD, PetscMemzero(gs, sizeof(PCTFS_gs_id))); 217 return (gs); 218 } 219 220 /******************************************************************************/ 221 static PCTFS_gs_id *gsi_check_args(PetscInt *in_elms, PetscInt nel, PetscInt level) 222 { 223 PetscInt i, j, k, t2; 224 PetscInt *companion, *elms, *unique, *iptr; 225 PetscInt num_local = 0, *num_to_reduce, **local_reduce; 226 PetscInt oprs[] = {NON_UNIFORM, GL_MIN, GL_MAX, GL_ADD, GL_MIN, GL_MAX, GL_MIN, GL_B_AND}; 227 PetscInt vals[PETSC_STATIC_ARRAY_LENGTH(oprs) - 1]; 228 PetscInt work[PETSC_STATIC_ARRAY_LENGTH(oprs) - 1]; 229 PCTFS_gs_id *gs; 230 231 if (!in_elms) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "elms point to nothing!!!\n"); 232 if (nel < 0) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "can't have fewer than 0 elms!!!\n"); 233 234 if (nel == 0) PetscCallAbort(PETSC_COMM_WORLD, PetscInfo(0, "I don't have any elements!!!\n")); 235 236 /* get space for gs template */ 237 gs = gsi_new(); 238 gs->id = ++num_gs_ids; 239 240 /* hmt 6.4.99 */ 241 /* caller can set global ids that don't participate to 0 */ 242 /* PCTFS_gs_init ignores all zeros in elm list */ 243 /* negative global ids are still invalid */ 244 for (i = j = 0; i < nel; i++) { 245 if (in_elms[i] != 0) j++; 246 } 247 248 k = nel; 249 nel = j; 250 251 /* copy over in_elms list and create inverse map */ 252 elms = (PetscInt *)malloc((nel + 1) * sizeof(PetscInt)); 253 companion = (PetscInt *)malloc(nel * sizeof(PetscInt)); 254 255 for (i = j = 0; i < k; i++) { 256 if (in_elms[i] != 0) { 257 elms[j] = in_elms[i]; 258 companion[j++] = i; 259 } 260 } 261 262 if (j != nel) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "nel j mismatch!\n"); 263 264 /* pre-pass ... check to see if sorted */ 265 elms[nel] = INT_MAX; 266 iptr = elms; 267 unique = elms + 1; 268 j = 0; 269 while (*iptr != INT_MAX) { 270 if (*iptr++ > *unique++) { 271 j = 1; 272 break; 273 } 274 } 275 276 /* set up inverse map */ 277 if (j) { 278 PetscCallAbort(PETSC_COMM_WORLD, PetscInfo(0, "gsi_check_args() :: elm list *not* sorted!\n")); 279 PetscCallAbort(PETSC_COMM_WORLD, PCTFS_SMI_sort((void *)elms, (void *)companion, nel, SORT_INTEGER)); 280 } else PetscCallAbort(PETSC_COMM_WORLD, PetscInfo(0, "gsi_check_args() :: elm list sorted!\n")); 281 elms[nel] = INT_MIN; 282 283 /* first pass */ 284 /* determine number of unique elements, check pd */ 285 for (i = k = 0; i < nel; i += j) { 286 t2 = elms[i]; 287 j = ++i; 288 289 /* clump 'em for now */ 290 while (elms[j] == t2) j++; 291 292 /* how many together and num local */ 293 if (j -= i) { 294 num_local++; 295 k += j; 296 } 297 } 298 299 /* how many unique elements? */ 300 gs->repeats = k; 301 gs->nel = nel - k; 302 303 /* number of repeats? */ 304 gs->num_local = num_local; 305 num_local += 2; 306 gs->local_reduce = local_reduce = (PetscInt **)malloc(num_local * sizeof(PetscInt *)); 307 gs->num_local_reduce = num_to_reduce = (PetscInt *)malloc(num_local * sizeof(PetscInt)); 308 309 unique = (PetscInt *)malloc((gs->nel + 1) * sizeof(PetscInt)); 310 gs->elms = unique; 311 gs->nel_total = nel; 312 gs->local_elms = elms; 313 gs->companion = companion; 314 315 /* compess map as well as keep track of local ops */ 316 for (num_local = i = j = 0; i < gs->nel; i++) { 317 k = j; 318 t2 = unique[i] = elms[j]; 319 companion[i] = companion[j]; 320 321 while (elms[j] == t2) j++; 322 323 if ((t2 = (j - k)) > 1) { 324 /* number together */ 325 num_to_reduce[num_local] = t2++; 326 327 iptr = local_reduce[num_local++] = (PetscInt *)malloc(t2 * sizeof(PetscInt)); 328 329 /* to use binary searching don't remap until we check intersection */ 330 *iptr++ = i; 331 332 /* note that we're skipping the first one */ 333 while (++k < j) *(iptr++) = companion[k]; 334 *iptr = -1; 335 } 336 } 337 338 /* sentinel for ngh_buf */ 339 unique[gs->nel] = INT_MAX; 340 341 /* for two partition sort hack */ 342 num_to_reduce[num_local] = 0; 343 local_reduce[num_local] = NULL; 344 num_to_reduce[++num_local] = 0; 345 local_reduce[num_local] = NULL; 346 347 /* load 'em up */ 348 /* note one extra to hold NON_UNIFORM flag!!! */ 349 vals[2] = vals[1] = vals[0] = nel; 350 if (gs->nel > 0) { 351 vals[3] = unique[0]; 352 vals[4] = unique[gs->nel - 1]; 353 } else { 354 vals[3] = INT_MAX; 355 vals[4] = INT_MIN; 356 } 357 vals[5] = level; 358 vals[6] = num_gs_ids; 359 360 /* GLOBAL: send 'em out */ 361 PetscCallAbort(PETSC_COMM_WORLD, PCTFS_giop(vals, work, PETSC_STATIC_ARRAY_LENGTH(oprs) - 1, oprs)); 362 363 /* must be semi-pos def - only pairwise depends on this */ 364 /* LATER - remove this restriction */ 365 if (vals[3] < 0) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "gsi_check_args() :: system not semi-pos def \n"); 366 if (vals[4] == INT_MAX) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "gsi_check_args() :: system ub too large !\n"); 367 368 gs->nel_min = vals[0]; 369 gs->nel_max = vals[1]; 370 gs->nel_sum = vals[2]; 371 gs->gl_min = vals[3]; 372 gs->gl_max = vals[4]; 373 gs->negl = vals[4] - vals[3] + 1; 374 375 if (gs->negl <= 0) SETERRABORT(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "gsi_check_args() :: system empty or neg :: %" PetscInt_FMT "\n", gs->negl); 376 377 /* LATER :: add level == -1 -> program selects level */ 378 if (vals[5] < 0) vals[5] = 0; 379 else if (vals[5] > PCTFS_num_nodes) vals[5] = PCTFS_num_nodes; 380 gs->level = vals[5]; 381 382 return (gs); 383 } 384 385 /******************************************************************************/ 386 static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs) 387 { 388 PetscInt i, nel, *elms; 389 PetscInt t1; 390 PetscInt **reduce; 391 PetscInt *map; 392 393 PetscFunctionBegin; 394 /* totally local removes ... PCTFS_ct_bits == 0 */ 395 PetscCall(get_ngh_buf(gs)); 396 397 if (gs->level) PetscCall(set_pairwise(gs)); 398 if (gs->max_left_over) PetscCall(set_tree(gs)); 399 400 /* intersection local and pairwise/tree? */ 401 gs->num_local_total = gs->num_local; 402 gs->gop_local_reduce = gs->local_reduce; 403 gs->num_gop_local_reduce = gs->num_local_reduce; 404 405 map = gs->companion; 406 407 /* is there any local compression */ 408 if (!gs->num_local) { 409 gs->local_strength = NONE; 410 gs->num_local_gop = 0; 411 } else { 412 /* ok find intersection */ 413 map = gs->companion; 414 reduce = gs->local_reduce; 415 for (i = 0, t1 = 0; i < gs->num_local; i++, reduce++) { 416 if ((PCTFS_ivec_binary_search(**reduce, gs->pw_elm_list, gs->len_pw_list) >= 0) || PCTFS_ivec_binary_search(**reduce, gs->tree_map_in, gs->tree_map_sz) >= 0) { 417 t1++; 418 PetscCheck(gs->num_local_reduce[i] > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "nobody in list?"); 419 gs->num_local_reduce[i] *= -1; 420 } 421 **reduce = map[**reduce]; 422 } 423 424 /* intersection is empty */ 425 if (!t1) { 426 gs->local_strength = FULL; 427 gs->num_local_gop = 0; 428 } else { /* intersection not empty */ 429 gs->local_strength = PARTIAL; 430 431 PetscCall(PCTFS_SMI_sort((void *)gs->num_local_reduce, (void *)gs->local_reduce, gs->num_local + 1, SORT_INT_PTR)); 432 433 gs->num_local_gop = t1; 434 gs->num_local_total = gs->num_local; 435 gs->num_local -= t1; 436 gs->gop_local_reduce = gs->local_reduce; 437 gs->num_gop_local_reduce = gs->num_local_reduce; 438 439 for (i = 0; i < t1; i++) { 440 PetscCheck(gs->num_gop_local_reduce[i] < 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "they aren't negative?"); 441 gs->num_gop_local_reduce[i] *= -1; 442 gs->local_reduce++; 443 gs->num_local_reduce++; 444 } 445 gs->local_reduce++; 446 gs->num_local_reduce++; 447 } 448 } 449 450 elms = gs->pw_elm_list; 451 nel = gs->len_pw_list; 452 for (i = 0; i < nel; i++) elms[i] = map[elms[i]]; 453 454 elms = gs->tree_map_in; 455 nel = gs->tree_map_sz; 456 for (i = 0; i < nel; i++) elms[i] = map[elms[i]]; 457 458 /* clean up */ 459 free((void *)gs->local_elms); 460 free((void *)gs->companion); 461 free((void *)gs->elms); 462 free((void *)gs->ngh_buf); 463 gs->local_elms = gs->companion = gs->elms = gs->ngh_buf = NULL; 464 PetscFunctionReturn(PETSC_SUCCESS); 465 } 466 467 /******************************************************************************/ 468 static PetscErrorCode place_in_tree(PetscInt elm) 469 { 470 PetscInt *tp, n; 471 472 PetscFunctionBegin; 473 if (ntree == tree_buf_sz) { 474 if (tree_buf_sz) { 475 tp = tree_buf; 476 n = tree_buf_sz; 477 tree_buf_sz <<= 1; 478 tree_buf = (PetscInt *)malloc(tree_buf_sz * sizeof(PetscInt)); 479 PCTFS_ivec_copy(tree_buf, tp, n); 480 free(tp); 481 } else { 482 tree_buf_sz = TREE_BUF_SZ; 483 tree_buf = (PetscInt *)malloc(tree_buf_sz * sizeof(PetscInt)); 484 } 485 } 486 487 tree_buf[ntree++] = elm; 488 PetscFunctionReturn(PETSC_SUCCESS); 489 } 490 491 /******************************************************************************/ 492 static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs) 493 { 494 PetscInt i, j, npw = 0, ntree_map = 0; 495 PetscInt p_mask_size, ngh_buf_size, buf_size; 496 PetscInt *p_mask, *sh_proc_mask, *pw_sh_proc_mask; 497 PetscInt *ngh_buf, *buf1, *buf2; 498 PetscInt offset, per_load, num_loads, or_ct, start, end; 499 PetscInt *ptr1, *ptr2, i_start, negl, nel, *elms; 500 PetscInt oper = GL_B_OR; 501 PetscInt *ptr3, *t_mask, level, ct1, ct2; 502 503 PetscFunctionBegin; 504 /* to make life easier */ 505 nel = gs->nel; 506 elms = gs->elms; 507 level = gs->level; 508 509 /* det #bytes needed for processor bit masks and init w/mask cor. to PCTFS_my_id */ 510 p_mask = (PetscInt *)malloc(p_mask_size = PCTFS_len_bit_mask(PCTFS_num_nodes)); 511 PetscCall(PCTFS_set_bit_mask(p_mask, p_mask_size, PCTFS_my_id)); 512 513 /* allocate space for masks and info bufs */ 514 gs->nghs = sh_proc_mask = (PetscInt *)malloc(p_mask_size); 515 gs->pw_nghs = pw_sh_proc_mask = (PetscInt *)malloc(p_mask_size); 516 gs->ngh_buf_sz = ngh_buf_size = p_mask_size * nel; 517 t_mask = (PetscInt *)malloc(p_mask_size); 518 gs->ngh_buf = ngh_buf = (PetscInt *)malloc(ngh_buf_size); 519 520 /* comm buffer size ... memory usage bounded by ~2*msg_buf */ 521 /* had thought I could exploit rendezvous threshold */ 522 523 /* default is one pass */ 524 per_load = negl = gs->negl; 525 gs->num_loads = num_loads = 1; 526 i = p_mask_size * negl; 527 528 /* possible overflow on buffer size */ 529 /* overflow hack */ 530 if (i < 0) i = INT_MAX; 531 532 buf_size = PetscMin(msg_buf, i); 533 534 /* can we do it? */ 535 PetscCheck(p_mask_size <= buf_size, PETSC_COMM_SELF, PETSC_ERR_PLIB, "get_ngh_buf() :: buf<pms :: %" PetscInt_FMT ">%" PetscInt_FMT, p_mask_size, buf_size); 536 537 /* get PCTFS_giop buf space ... make *only* one malloc */ 538 buf1 = (PetscInt *)malloc(buf_size << 1); 539 540 /* more than one gior exchange needed? */ 541 if (buf_size != i) { 542 per_load = buf_size / p_mask_size; 543 buf_size = per_load * p_mask_size; 544 gs->num_loads = num_loads = negl / per_load + (negl % per_load > 0); 545 } 546 547 /* convert buf sizes from #bytes to #ints - 32-bit only! */ 548 p_mask_size /= sizeof(PetscInt); 549 ngh_buf_size /= sizeof(PetscInt); 550 buf_size /= sizeof(PetscInt); 551 552 /* find PCTFS_giop work space */ 553 buf2 = buf1 + buf_size; 554 555 /* hold #ints needed for processor masks */ 556 gs->mask_sz = p_mask_size; 557 558 /* init buffers */ 559 PetscCall(PCTFS_ivec_zero(sh_proc_mask, p_mask_size)); 560 PetscCall(PCTFS_ivec_zero(pw_sh_proc_mask, p_mask_size)); 561 PetscCall(PCTFS_ivec_zero(ngh_buf, ngh_buf_size)); 562 563 /* HACK reset tree info */ 564 tree_buf = NULL; 565 tree_buf_sz = ntree = 0; 566 567 /* ok do it */ 568 for (ptr1 = ngh_buf, ptr2 = elms, end = gs->gl_min, or_ct = i = 0; or_ct < num_loads; or_ct++) { 569 /* identity for bitwise or is 000...000 */ 570 PetscCall(PCTFS_ivec_zero(buf1, buf_size)); 571 572 /* load msg buffer */ 573 for (start = end, end += per_load, i_start = i; (offset = *ptr2) < end; i++, ptr2++) { 574 offset = (offset - start) * p_mask_size; 575 PCTFS_ivec_copy(buf1 + offset, p_mask, p_mask_size); 576 } 577 578 /* GLOBAL: pass buffer */ 579 PetscCall(PCTFS_giop(buf1, buf2, buf_size, &oper)); 580 581 /* unload buffer into ngh_buf */ 582 ptr2 = (elms + i_start); 583 for (ptr3 = buf1, j = start; j < end; ptr3 += p_mask_size, j++) { 584 /* I own it ... may have to pairwise it */ 585 if (j == *ptr2) { 586 /* do i share it w/anyone? */ 587 ct1 = PCTFS_ct_bits((char *)ptr3, p_mask_size * sizeof(PetscInt)); 588 /* guess not */ 589 if (ct1 < 2) { 590 ptr2++; 591 ptr1 += p_mask_size; 592 continue; 593 } 594 595 /* i do ... so keep info and turn off my bit */ 596 PCTFS_ivec_copy(ptr1, ptr3, p_mask_size); 597 PetscCall(PCTFS_ivec_xor(ptr1, p_mask, p_mask_size)); 598 PetscCall(PCTFS_ivec_or(sh_proc_mask, ptr1, p_mask_size)); 599 600 /* is it to be done pairwise? */ 601 if (--ct1 <= level) { 602 npw++; 603 604 /* turn on high bit to indicate pw need to process */ 605 *ptr2++ |= TOP_BIT; 606 PetscCall(PCTFS_ivec_or(pw_sh_proc_mask, ptr1, p_mask_size)); 607 ptr1 += p_mask_size; 608 continue; 609 } 610 611 /* get set for next and note that I have a tree contribution */ 612 /* could save exact elm index for tree here -> save a search */ 613 ptr2++; 614 ptr1 += p_mask_size; 615 ntree_map++; 616 } else { /* i don't but still might be involved in tree */ 617 618 /* shared by how many? */ 619 ct1 = PCTFS_ct_bits((char *)ptr3, p_mask_size * sizeof(PetscInt)); 620 621 /* none! */ 622 if (ct1 < 2) continue; 623 624 /* is it going to be done pairwise? but not by me of course!*/ 625 if (--ct1 <= level) continue; 626 } 627 /* LATER we're going to have to process it NOW */ 628 /* nope ... tree it */ 629 PetscCall(place_in_tree(j)); 630 } 631 } 632 633 free((void *)t_mask); 634 free((void *)buf1); 635 636 gs->len_pw_list = npw; 637 gs->num_nghs = PCTFS_ct_bits((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt)); 638 639 /* expand from bit mask list to int list and save ngh list */ 640 gs->nghs = (PetscInt *)malloc(gs->num_nghs * sizeof(PetscInt)); 641 PetscCall(PCTFS_bm_to_proc((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt), gs->nghs)); 642 643 gs->num_pw_nghs = PCTFS_ct_bits((char *)pw_sh_proc_mask, p_mask_size * sizeof(PetscInt)); 644 645 oper = GL_MAX; 646 ct1 = gs->num_nghs; 647 PetscCall(PCTFS_giop(&ct1, &ct2, 1, &oper)); 648 gs->max_nghs = ct1; 649 650 gs->tree_map_sz = ntree_map; 651 gs->max_left_over = ntree; 652 653 free((void *)p_mask); 654 free((void *)sh_proc_mask); 655 PetscFunctionReturn(PETSC_SUCCESS); 656 } 657 658 /******************************************************************************/ 659 static PetscErrorCode set_pairwise(PCTFS_gs_id *gs) 660 { 661 PetscInt i, j; 662 PetscInt p_mask_size; 663 PetscInt *p_mask, *sh_proc_mask, *tmp_proc_mask; 664 PetscInt *ngh_buf, *buf2; 665 PetscInt offset; 666 PetscInt *msg_list, *msg_size, **msg_nodes, nprs; 667 PetscInt *pairwise_elm_list, len_pair_list = 0; 668 PetscInt *iptr, t1, i_start, nel, *elms; 669 PetscInt ct; 670 671 PetscFunctionBegin; 672 /* to make life easier */ 673 nel = gs->nel; 674 elms = gs->elms; 675 ngh_buf = gs->ngh_buf; 676 sh_proc_mask = gs->pw_nghs; 677 678 /* need a few temp masks */ 679 p_mask_size = PCTFS_len_bit_mask(PCTFS_num_nodes); 680 p_mask = (PetscInt *)malloc(p_mask_size); 681 tmp_proc_mask = (PetscInt *)malloc(p_mask_size); 682 683 /* set mask to my PCTFS_my_id's bit mask */ 684 PetscCall(PCTFS_set_bit_mask(p_mask, p_mask_size, PCTFS_my_id)); 685 686 p_mask_size /= sizeof(PetscInt); 687 688 len_pair_list = gs->len_pw_list; 689 gs->pw_elm_list = pairwise_elm_list = (PetscInt *)malloc((len_pair_list + 1) * sizeof(PetscInt)); 690 691 /* how many processors (nghs) do we have to exchange with? */ 692 nprs = gs->num_pairs = PCTFS_ct_bits((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt)); 693 694 /* allocate space for PCTFS_gs_gop() info */ 695 gs->pair_list = msg_list = (PetscInt *)malloc(sizeof(PetscInt) * nprs); 696 gs->msg_sizes = msg_size = (PetscInt *)malloc(sizeof(PetscInt) * nprs); 697 gs->node_list = msg_nodes = (PetscInt **)malloc(sizeof(PetscInt *) * (nprs + 1)); 698 699 /* init msg_size list */ 700 PetscCall(PCTFS_ivec_zero(msg_size, nprs)); 701 702 /* expand from bit mask list to int list */ 703 PetscCall(PCTFS_bm_to_proc((char *)sh_proc_mask, p_mask_size * sizeof(PetscInt), msg_list)); 704 705 /* keep list of elements being handled pairwise */ 706 for (i = j = 0; i < nel; i++) { 707 if (elms[i] & TOP_BIT) { 708 elms[i] ^= TOP_BIT; 709 pairwise_elm_list[j++] = i; 710 } 711 } 712 pairwise_elm_list[j] = -1; 713 714 gs->msg_ids_out = (MPI_Request *)malloc(sizeof(MPI_Request) * (nprs + 1)); 715 gs->msg_ids_out[nprs] = MPI_REQUEST_NULL; 716 gs->msg_ids_in = (MPI_Request *)malloc(sizeof(MPI_Request) * (nprs + 1)); 717 gs->msg_ids_in[nprs] = MPI_REQUEST_NULL; 718 gs->pw_vals = (PetscScalar *)malloc(sizeof(PetscScalar) * len_pair_list * vec_sz); 719 720 /* find who goes to each processor */ 721 for (i_start = i = 0; i < nprs; i++) { 722 /* processor i's mask */ 723 PetscCall(PCTFS_set_bit_mask(p_mask, p_mask_size * sizeof(PetscInt), msg_list[i])); 724 725 /* det # going to processor i */ 726 for (ct = j = 0; j < len_pair_list; j++) { 727 buf2 = ngh_buf + (pairwise_elm_list[j] * p_mask_size); 728 PetscCall(PCTFS_ivec_and3(tmp_proc_mask, p_mask, buf2, p_mask_size)); 729 if (PCTFS_ct_bits((char *)tmp_proc_mask, p_mask_size * sizeof(PetscInt))) ct++; 730 } 731 msg_size[i] = ct; 732 i_start = PetscMax(i_start, ct); 733 734 /*space to hold nodes in message to first neighbor */ 735 msg_nodes[i] = iptr = (PetscInt *)malloc(sizeof(PetscInt) * (ct + 1)); 736 737 for (j = 0; j < len_pair_list; j++) { 738 buf2 = ngh_buf + (pairwise_elm_list[j] * p_mask_size); 739 PetscCall(PCTFS_ivec_and3(tmp_proc_mask, p_mask, buf2, p_mask_size)); 740 if (PCTFS_ct_bits((char *)tmp_proc_mask, p_mask_size * sizeof(PetscInt))) *iptr++ = j; 741 } 742 *iptr = -1; 743 } 744 msg_nodes[nprs] = NULL; 745 746 j = gs->loc_node_pairs = i_start; 747 t1 = GL_MAX; 748 PetscCall(PCTFS_giop(&i_start, &offset, 1, &t1)); 749 gs->max_node_pairs = i_start; 750 751 i_start = j; 752 t1 = GL_MIN; 753 PetscCall(PCTFS_giop(&i_start, &offset, 1, &t1)); 754 gs->min_node_pairs = i_start; 755 756 i_start = j; 757 t1 = GL_ADD; 758 PetscCall(PCTFS_giop(&i_start, &offset, 1, &t1)); 759 gs->avg_node_pairs = i_start / PCTFS_num_nodes + 1; 760 761 i_start = nprs; 762 t1 = GL_MAX; 763 PetscCall(PCTFS_giop(&i_start, &offset, 1, &t1)); 764 gs->max_pairs = i_start; 765 766 /* remap pairwise in tail of gsi_via_bit_mask() */ 767 gs->msg_total = PCTFS_ivec_sum(gs->msg_sizes, nprs); 768 gs->out = (PetscScalar *)malloc(sizeof(PetscScalar) * gs->msg_total * vec_sz); 769 gs->in = (PetscScalar *)malloc(sizeof(PetscScalar) * gs->msg_total * vec_sz); 770 771 /* reset malloc pool */ 772 free((void *)p_mask); 773 free((void *)tmp_proc_mask); 774 PetscFunctionReturn(PETSC_SUCCESS); 775 } 776 777 /* to do pruned tree just save ngh buf copy for each one and decode here! 778 ******************************************************************************/ 779 static PetscErrorCode set_tree(PCTFS_gs_id *gs) 780 { 781 PetscInt i, j, n, nel; 782 PetscInt *iptr_in, *iptr_out, *tree_elms, *elms; 783 784 PetscFunctionBegin; 785 /* local work ptrs */ 786 elms = gs->elms; 787 nel = gs->nel; 788 789 /* how many via tree */ 790 gs->tree_nel = n = ntree; 791 gs->tree_elms = tree_elms = iptr_in = tree_buf; 792 gs->tree_buf = (PetscScalar *)malloc(sizeof(PetscScalar) * n * vec_sz); 793 gs->tree_work = (PetscScalar *)malloc(sizeof(PetscScalar) * n * vec_sz); 794 j = gs->tree_map_sz; 795 gs->tree_map_in = iptr_in = (PetscInt *)malloc(sizeof(PetscInt) * (j + 1)); 796 gs->tree_map_out = iptr_out = (PetscInt *)malloc(sizeof(PetscInt) * (j + 1)); 797 798 /* search the longer of the two lists */ 799 /* note ... could save this info in get_ngh_buf and save searches */ 800 if (n <= nel) { 801 /* bijective fct w/remap - search elm list */ 802 for (i = 0; i < n; i++) { 803 if ((j = PCTFS_ivec_binary_search(*tree_elms++, elms, nel)) >= 0) { 804 *iptr_in++ = j; 805 *iptr_out++ = i; 806 } 807 } 808 } else { 809 for (i = 0; i < nel; i++) { 810 if ((j = PCTFS_ivec_binary_search(*elms++, tree_elms, n)) >= 0) { 811 *iptr_in++ = i; 812 *iptr_out++ = j; 813 } 814 } 815 } 816 817 /* sentinel */ 818 *iptr_in = *iptr_out = -1; 819 PetscFunctionReturn(PETSC_SUCCESS); 820 } 821 822 /******************************************************************************/ 823 static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs, PetscScalar *vals) 824 { 825 PetscInt *num, *map, **reduce; 826 PetscScalar tmp; 827 828 PetscFunctionBegin; 829 num = gs->num_gop_local_reduce; 830 reduce = gs->gop_local_reduce; 831 while ((map = *reduce++)) { 832 /* wall */ 833 if (*num == 2) { 834 num++; 835 vals[map[1]] = vals[map[0]]; 836 } else if (*num == 3) { /* corner shared by three elements */ 837 num++; 838 vals[map[2]] = vals[map[1]] = vals[map[0]]; 839 } else if (*num == 4) { /* corner shared by four elements */ 840 num++; 841 vals[map[3]] = vals[map[2]] = vals[map[1]] = vals[map[0]]; 842 } else { /* general case ... odd geoms ... 3D*/ 843 num++; 844 tmp = *(vals + *map++); 845 while (*map >= 0) *(vals + *map++) = tmp; 846 } 847 } 848 PetscFunctionReturn(PETSC_SUCCESS); 849 } 850 851 /******************************************************************************/ 852 static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs, PetscScalar *vals) 853 { 854 PetscInt *num, *map, **reduce; 855 PetscScalar tmp; 856 857 PetscFunctionBegin; 858 num = gs->num_local_reduce; 859 reduce = gs->local_reduce; 860 while ((map = *reduce)) { 861 /* wall */ 862 if (*num == 2) { 863 num++; 864 reduce++; 865 vals[map[1]] = vals[map[0]] += vals[map[1]]; 866 } else if (*num == 3) { /* corner shared by three elements */ 867 num++; 868 reduce++; 869 vals[map[2]] = vals[map[1]] = vals[map[0]] += (vals[map[1]] + vals[map[2]]); 870 } else if (*num == 4) { /* corner shared by four elements */ 871 num++; 872 reduce++; 873 vals[map[1]] = vals[map[2]] = vals[map[3]] = vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]); 874 } else { /* general case ... odd geoms ... 3D*/ 875 num++; 876 tmp = 0.0; 877 while (*map >= 0) tmp += *(vals + *map++); 878 879 map = *reduce++; 880 while (*map >= 0) *(vals + *map++) = tmp; 881 } 882 } 883 PetscFunctionReturn(PETSC_SUCCESS); 884 } 885 886 /******************************************************************************/ 887 static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals) 888 { 889 PetscInt *num, *map, **reduce; 890 PetscScalar *base; 891 892 PetscFunctionBegin; 893 num = gs->num_gop_local_reduce; 894 reduce = gs->gop_local_reduce; 895 while ((map = *reduce++)) { 896 /* wall */ 897 if (*num == 2) { 898 num++; 899 vals[map[0]] += vals[map[1]]; 900 } else if (*num == 3) { /* corner shared by three elements */ 901 num++; 902 vals[map[0]] += (vals[map[1]] + vals[map[2]]); 903 } else if (*num == 4) { /* corner shared by four elements */ 904 num++; 905 vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]); 906 } else { /* general case ... odd geoms ... 3D*/ 907 num++; 908 base = vals + *map++; 909 while (*map >= 0) *base += *(vals + *map++); 910 } 911 } 912 PetscFunctionReturn(PETSC_SUCCESS); 913 } 914 915 /******************************************************************************/ 916 PetscErrorCode PCTFS_gs_free(PCTFS_gs_id *gs) 917 { 918 PetscInt i; 919 920 PetscFunctionBegin; 921 PetscCallMPI(MPI_Comm_free(&gs->PCTFS_gs_comm)); 922 if (gs->nghs) free((void *)gs->nghs); 923 if (gs->pw_nghs) free((void *)gs->pw_nghs); 924 925 /* tree */ 926 if (gs->max_left_over) { 927 if (gs->tree_elms) free((void *)gs->tree_elms); 928 if (gs->tree_buf) free((void *)gs->tree_buf); 929 if (gs->tree_work) free((void *)gs->tree_work); 930 if (gs->tree_map_in) free((void *)gs->tree_map_in); 931 if (gs->tree_map_out) free((void *)gs->tree_map_out); 932 } 933 934 /* pairwise info */ 935 if (gs->num_pairs) { 936 /* should be NULL already */ 937 if (gs->ngh_buf) free((void *)gs->ngh_buf); 938 if (gs->elms) free((void *)gs->elms); 939 if (gs->local_elms) free((void *)gs->local_elms); 940 if (gs->companion) free((void *)gs->companion); 941 942 /* only set if pairwise */ 943 if (gs->vals) free((void *)gs->vals); 944 if (gs->in) free((void *)gs->in); 945 if (gs->out) free((void *)gs->out); 946 if (gs->msg_ids_in) free((void *)gs->msg_ids_in); 947 if (gs->msg_ids_out) free((void *)gs->msg_ids_out); 948 if (gs->pw_vals) free((void *)gs->pw_vals); 949 if (gs->pw_elm_list) free((void *)gs->pw_elm_list); 950 if (gs->node_list) { 951 for (i = 0; i < gs->num_pairs; i++) { 952 if (gs->node_list[i]) free((void *)gs->node_list[i]); 953 } 954 free((void *)gs->node_list); 955 } 956 if (gs->msg_sizes) free((void *)gs->msg_sizes); 957 if (gs->pair_list) free((void *)gs->pair_list); 958 } 959 960 /* local info */ 961 if (gs->num_local_total >= 0) { 962 for (i = 0; i < gs->num_local_total + 1; i++) { 963 if (gs->num_gop_local_reduce[i]) free((void *)gs->gop_local_reduce[i]); 964 } 965 } 966 967 /* if intersection tree/pairwise and local isn't empty */ 968 if (gs->gop_local_reduce) free((void *)gs->gop_local_reduce); 969 if (gs->num_gop_local_reduce) free((void *)gs->num_gop_local_reduce); 970 971 free((void *)gs); 972 PetscFunctionReturn(PETSC_SUCCESS); 973 } 974 975 /******************************************************************************/ 976 PetscErrorCode PCTFS_gs_gop_vec(PCTFS_gs_id *gs, PetscScalar *vals, const char *op, PetscInt step) 977 { 978 PetscFunctionBegin; 979 switch (*op) { 980 case '+': 981 PetscCall(PCTFS_gs_gop_vec_plus(gs, vals, step)); 982 break; 983 default: 984 PetscCall(PetscInfo(0, "PCTFS_gs_gop_vec() :: %c is not a valid op\n", op[0])); 985 PetscCall(PetscInfo(0, "PCTFS_gs_gop_vec() :: default :: plus\n")); 986 PetscCall(PCTFS_gs_gop_vec_plus(gs, vals, step)); 987 break; 988 } 989 PetscFunctionReturn(PETSC_SUCCESS); 990 } 991 992 /******************************************************************************/ 993 static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step) 994 { 995 PetscFunctionBegin; 996 PetscCheck(gs, PETSC_COMM_SELF, PETSC_ERR_PLIB, "PCTFS_gs_gop_vec() passed NULL gs handle!!!"); 997 998 /* local only operations!!! */ 999 if (gs->num_local) PetscCall(PCTFS_gs_gop_vec_local_plus(gs, vals, step)); 1000 1001 /* if intersection tree/pairwise and local isn't empty */ 1002 if (gs->num_local_gop) { 1003 PetscCall(PCTFS_gs_gop_vec_local_in_plus(gs, vals, step)); 1004 1005 /* pairwise */ 1006 if (gs->num_pairs) PetscCall(PCTFS_gs_gop_vec_pairwise_plus(gs, vals, step)); 1007 1008 /* tree */ 1009 else if (gs->max_left_over) PetscCall(PCTFS_gs_gop_vec_tree_plus(gs, vals, step)); 1010 1011 PetscCall(PCTFS_gs_gop_vec_local_out(gs, vals, step)); 1012 } else { /* if intersection tree/pairwise and local is empty */ 1013 /* pairwise */ 1014 if (gs->num_pairs) PetscCall(PCTFS_gs_gop_vec_pairwise_plus(gs, vals, step)); 1015 1016 /* tree */ 1017 else if (gs->max_left_over) PetscCall(PCTFS_gs_gop_vec_tree_plus(gs, vals, step)); 1018 } 1019 PetscFunctionReturn(PETSC_SUCCESS); 1020 } 1021 1022 /******************************************************************************/ 1023 static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step) 1024 { 1025 PetscInt *num, *map, **reduce; 1026 PetscScalar *base; 1027 1028 PetscFunctionBegin; 1029 num = gs->num_local_reduce; 1030 reduce = gs->local_reduce; 1031 while ((map = *reduce)) { 1032 base = vals + map[0] * step; 1033 1034 /* wall */ 1035 if (*num == 2) { 1036 num++; 1037 reduce++; 1038 PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step)); 1039 PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step)); 1040 } else if (*num == 3) { /* corner shared by three elements */ 1041 num++; 1042 reduce++; 1043 PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step)); 1044 PetscCall(PCTFS_rvec_add(base, vals + map[2] * step, step)); 1045 PetscCall(PCTFS_rvec_copy(vals + map[2] * step, base, step)); 1046 PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step)); 1047 } else if (*num == 4) { /* corner shared by four elements */ 1048 num++; 1049 reduce++; 1050 PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step)); 1051 PetscCall(PCTFS_rvec_add(base, vals + map[2] * step, step)); 1052 PetscCall(PCTFS_rvec_add(base, vals + map[3] * step, step)); 1053 PetscCall(PCTFS_rvec_copy(vals + map[3] * step, base, step)); 1054 PetscCall(PCTFS_rvec_copy(vals + map[2] * step, base, step)); 1055 PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step)); 1056 } else { /* general case ... odd geoms ... 3D */ 1057 num++; 1058 while (*++map >= 0) PetscCall(PCTFS_rvec_add(base, vals + *map * step, step)); 1059 1060 map = *reduce; 1061 while (*++map >= 0) PetscCall(PCTFS_rvec_copy(vals + *map * step, base, step)); 1062 1063 reduce++; 1064 } 1065 } 1066 PetscFunctionReturn(PETSC_SUCCESS); 1067 } 1068 1069 /******************************************************************************/ 1070 static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step) 1071 { 1072 PetscInt *num, *map, **reduce; 1073 PetscScalar *base; 1074 1075 PetscFunctionBegin; 1076 num = gs->num_gop_local_reduce; 1077 reduce = gs->gop_local_reduce; 1078 while ((map = *reduce++)) { 1079 base = vals + map[0] * step; 1080 1081 /* wall */ 1082 if (*num == 2) { 1083 num++; 1084 PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step)); 1085 } else if (*num == 3) { /* corner shared by three elements */ 1086 num++; 1087 PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step)); 1088 PetscCall(PCTFS_rvec_add(base, vals + map[2] * step, step)); 1089 } else if (*num == 4) { /* corner shared by four elements */ 1090 num++; 1091 PetscCall(PCTFS_rvec_add(base, vals + map[1] * step, step)); 1092 PetscCall(PCTFS_rvec_add(base, vals + map[2] * step, step)); 1093 PetscCall(PCTFS_rvec_add(base, vals + map[3] * step, step)); 1094 } else { /* general case ... odd geoms ... 3D*/ 1095 num++; 1096 while (*++map >= 0) PetscCall(PCTFS_rvec_add(base, vals + *map * step, step)); 1097 } 1098 } 1099 PetscFunctionReturn(PETSC_SUCCESS); 1100 } 1101 1102 /******************************************************************************/ 1103 static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step) 1104 { 1105 PetscInt *num, *map, **reduce; 1106 PetscScalar *base; 1107 1108 PetscFunctionBegin; 1109 num = gs->num_gop_local_reduce; 1110 reduce = gs->gop_local_reduce; 1111 while ((map = *reduce++)) { 1112 base = vals + map[0] * step; 1113 1114 /* wall */ 1115 if (*num == 2) { 1116 num++; 1117 PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step)); 1118 } else if (*num == 3) { /* corner shared by three elements */ 1119 num++; 1120 PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step)); 1121 PetscCall(PCTFS_rvec_copy(vals + map[2] * step, base, step)); 1122 } else if (*num == 4) { /* corner shared by four elements */ 1123 num++; 1124 PetscCall(PCTFS_rvec_copy(vals + map[1] * step, base, step)); 1125 PetscCall(PCTFS_rvec_copy(vals + map[2] * step, base, step)); 1126 PetscCall(PCTFS_rvec_copy(vals + map[3] * step, base, step)); 1127 } else { /* general case ... odd geoms ... 3D*/ 1128 num++; 1129 while (*++map >= 0) PetscCall(PCTFS_rvec_copy(vals + *map * step, base, step)); 1130 } 1131 } 1132 PetscFunctionReturn(PETSC_SUCCESS); 1133 } 1134 1135 /******************************************************************************/ 1136 static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step) 1137 { 1138 PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2; 1139 PetscInt *iptr, *msg_list, *msg_size, **msg_nodes; 1140 PetscInt *pw, *list, *size, **nodes; 1141 MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out; 1142 MPI_Status status; 1143 PetscBLASInt i1 = 1, dstep; 1144 1145 PetscFunctionBegin; 1146 /* strip and load s */ 1147 msg_list = list = gs->pair_list; 1148 msg_size = size = gs->msg_sizes; 1149 msg_nodes = nodes = gs->node_list; 1150 iptr = pw = gs->pw_elm_list; 1151 dptr1 = dptr3 = gs->pw_vals; 1152 msg_ids_in = ids_in = gs->msg_ids_in; 1153 msg_ids_out = ids_out = gs->msg_ids_out; 1154 dptr2 = gs->out; 1155 in1 = in2 = gs->in; 1156 1157 /* post the receives */ 1158 /* msg_nodes=nodes; */ 1159 do { 1160 /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the 1161 second one *list and do list++ afterwards */ 1162 PetscCallMPI(MPI_Irecv(in1, *size * step, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in)); 1163 list++; 1164 msg_ids_in++; 1165 in1 += *size++ * step; 1166 } while (*++msg_nodes); 1167 msg_nodes = nodes; 1168 1169 /* load gs values into in out gs buffers */ 1170 while (*iptr >= 0) { 1171 PetscCall(PCTFS_rvec_copy(dptr3, in_vals + *iptr * step, step)); 1172 dptr3 += step; 1173 iptr++; 1174 } 1175 1176 /* load out buffers and post the sends */ 1177 while ((iptr = *msg_nodes++)) { 1178 dptr3 = dptr2; 1179 while (*iptr >= 0) { 1180 PetscCall(PCTFS_rvec_copy(dptr2, dptr1 + *iptr * step, step)); 1181 dptr2 += step; 1182 iptr++; 1183 } 1184 PetscCallMPI(MPI_Isend(dptr3, *msg_size * step, MPIU_SCALAR, *msg_list, MSGTAG1 + PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out)); 1185 msg_size++; 1186 msg_list++; 1187 msg_ids_out++; 1188 } 1189 1190 /* tree */ 1191 if (gs->max_left_over) PetscCall(PCTFS_gs_gop_vec_tree_plus(gs, in_vals, step)); 1192 1193 /* process the received data */ 1194 msg_nodes = nodes; 1195 while ((iptr = *nodes++)) { 1196 PetscScalar d1 = 1.0; 1197 1198 /* Should I check the return value of MPI_Wait() or status? */ 1199 /* Can this loop be replaced by a call to MPI_Waitall()? */ 1200 PetscCallMPI(MPI_Wait(ids_in, &status)); 1201 ids_in++; 1202 while (*iptr >= 0) { 1203 PetscCall(PetscBLASIntCast(step, &dstep)); 1204 PetscCallBLAS("BLASaxpy", BLASaxpy_(&dstep, &d1, in2, &i1, dptr1 + *iptr * step, &i1)); 1205 in2 += step; 1206 iptr++; 1207 } 1208 } 1209 1210 /* replace vals */ 1211 while (*pw >= 0) { 1212 PetscCall(PCTFS_rvec_copy(in_vals + *pw * step, dptr1, step)); 1213 dptr1 += step; 1214 pw++; 1215 } 1216 1217 /* clear isend message handles */ 1218 /* This changed for clarity though it could be the same */ 1219 1220 /* Should I check the return value of MPI_Wait() or status? */ 1221 /* Can this loop be replaced by a call to MPI_Waitall()? */ 1222 while (*msg_nodes++) { 1223 PetscCallMPI(MPI_Wait(ids_out, &status)); 1224 ids_out++; 1225 } 1226 PetscFunctionReturn(PETSC_SUCCESS); 1227 } 1228 1229 /******************************************************************************/ 1230 static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step) 1231 { 1232 PetscInt size, *in, *out; 1233 PetscScalar *buf, *work; 1234 PetscInt op[] = {GL_ADD, 0}; 1235 PetscBLASInt i1 = 1; 1236 PetscBLASInt dstep; 1237 1238 PetscFunctionBegin; 1239 /* copy over to local variables */ 1240 in = gs->tree_map_in; 1241 out = gs->tree_map_out; 1242 buf = gs->tree_buf; 1243 work = gs->tree_work; 1244 size = gs->tree_nel * step; 1245 1246 /* zero out collection buffer */ 1247 PetscCall(PCTFS_rvec_zero(buf, size)); 1248 1249 /* copy over my contributions */ 1250 while (*in >= 0) { 1251 PetscCall(PetscBLASIntCast(step, &dstep)); 1252 PetscCallBLAS("BLAScopy", BLAScopy_(&dstep, vals + *in++ * step, &i1, buf + *out++ * step, &i1)); 1253 } 1254 1255 /* perform fan in/out on full buffer */ 1256 /* must change PCTFS_grop to handle the blas */ 1257 PetscCall(PCTFS_grop(buf, work, size, op)); 1258 1259 /* reset */ 1260 in = gs->tree_map_in; 1261 out = gs->tree_map_out; 1262 1263 /* get the portion of the results I need */ 1264 while (*in >= 0) { 1265 PetscCall(PetscBLASIntCast(step, &dstep)); 1266 PetscCallBLAS("BLAScopy", BLAScopy_(&dstep, buf + *out++ * step, &i1, vals + *in++ * step, &i1)); 1267 } 1268 PetscFunctionReturn(PETSC_SUCCESS); 1269 } 1270 1271 /******************************************************************************/ 1272 PetscErrorCode PCTFS_gs_gop_hc(PCTFS_gs_id *gs, PetscScalar *vals, const char *op, PetscInt dim) 1273 { 1274 PetscFunctionBegin; 1275 switch (*op) { 1276 case '+': 1277 PetscCall(PCTFS_gs_gop_plus_hc(gs, vals, dim)); 1278 break; 1279 default: 1280 PetscCall(PetscInfo(0, "PCTFS_gs_gop_hc() :: %c is not a valid op\n", op[0])); 1281 PetscCall(PetscInfo(0, "PCTFS_gs_gop_hc() :: default :: plus\n")); 1282 PetscCall(PCTFS_gs_gop_plus_hc(gs, vals, dim)); 1283 break; 1284 } 1285 PetscFunctionReturn(PETSC_SUCCESS); 1286 } 1287 1288 /******************************************************************************/ 1289 static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim) 1290 { 1291 PetscFunctionBegin; 1292 /* if there's nothing to do return */ 1293 if (dim <= 0) PetscFunctionReturn(PETSC_SUCCESS); 1294 1295 /* can't do more dimensions then exist */ 1296 dim = PetscMin(dim, PCTFS_i_log2_num_nodes); 1297 1298 /* local only operations!!! */ 1299 if (gs->num_local) PetscCall(PCTFS_gs_gop_local_plus(gs, vals)); 1300 1301 /* if intersection tree/pairwise and local isn't empty */ 1302 if (gs->num_local_gop) { 1303 PetscCall(PCTFS_gs_gop_local_in_plus(gs, vals)); 1304 1305 /* pairwise will do tree inside ... */ 1306 if (gs->num_pairs) PetscCall(PCTFS_gs_gop_pairwise_plus_hc(gs, vals, dim)); /* tree only */ 1307 else if (gs->max_left_over) PetscCall(PCTFS_gs_gop_tree_plus_hc(gs, vals, dim)); 1308 1309 PetscCall(PCTFS_gs_gop_local_out(gs, vals)); 1310 } else { /* if intersection tree/pairwise and local is empty */ 1311 /* pairwise will do tree inside */ 1312 if (gs->num_pairs) PetscCall(PCTFS_gs_gop_pairwise_plus_hc(gs, vals, dim)); /* tree */ 1313 else if (gs->max_left_over) PetscCall(PCTFS_gs_gop_tree_plus_hc(gs, vals, dim)); 1314 } 1315 PetscFunctionReturn(PETSC_SUCCESS); 1316 } 1317 1318 /******************************************************************************/ 1319 static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim) 1320 { 1321 PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2; 1322 PetscInt *iptr, *msg_list, *msg_size, **msg_nodes; 1323 PetscInt *pw, *list, *size, **nodes; 1324 MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out; 1325 MPI_Status status; 1326 PetscInt i, mask = 1; 1327 1328 PetscFunctionBegin; 1329 for (i = 1; i < dim; i++) { 1330 mask <<= 1; 1331 mask++; 1332 } 1333 1334 /* strip and load s */ 1335 msg_list = list = gs->pair_list; 1336 msg_size = size = gs->msg_sizes; 1337 msg_nodes = nodes = gs->node_list; 1338 iptr = pw = gs->pw_elm_list; 1339 dptr1 = dptr3 = gs->pw_vals; 1340 msg_ids_in = ids_in = gs->msg_ids_in; 1341 msg_ids_out = ids_out = gs->msg_ids_out; 1342 dptr2 = gs->out; 1343 in1 = in2 = gs->in; 1344 1345 /* post the receives */ 1346 /* msg_nodes=nodes; */ 1347 do { 1348 /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the 1349 second one *list and do list++ afterwards */ 1350 if ((PCTFS_my_id | mask) == (*list | mask)) { 1351 PetscCallMPI(MPI_Irecv(in1, *size, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in)); 1352 list++; 1353 msg_ids_in++; 1354 in1 += *size++; 1355 } else { 1356 list++; 1357 size++; 1358 } 1359 } while (*++msg_nodes); 1360 1361 /* load gs values into in out gs buffers */ 1362 while (*iptr >= 0) *dptr3++ = *(in_vals + *iptr++); 1363 1364 /* load out buffers and post the sends */ 1365 msg_nodes = nodes; 1366 list = msg_list; 1367 while ((iptr = *msg_nodes++)) { 1368 if ((PCTFS_my_id | mask) == (*list | mask)) { 1369 dptr3 = dptr2; 1370 while (*iptr >= 0) *dptr2++ = *(dptr1 + *iptr++); 1371 /* CHECK PERSISTENT COMMS MODE FOR ALL THIS STUFF */ 1372 /* is msg_ids_out++ correct? */ 1373 PetscCallMPI(MPI_Isend(dptr3, *msg_size, MPIU_SCALAR, *list, MSGTAG1 + PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out)); 1374 msg_size++; 1375 list++; 1376 msg_ids_out++; 1377 } else { 1378 list++; 1379 msg_size++; 1380 } 1381 } 1382 1383 /* do the tree while we're waiting */ 1384 if (gs->max_left_over) PetscCall(PCTFS_gs_gop_tree_plus_hc(gs, in_vals, dim)); 1385 1386 /* process the received data */ 1387 msg_nodes = nodes; 1388 list = msg_list; 1389 while ((iptr = *nodes++)) { 1390 if ((PCTFS_my_id | mask) == (*list | mask)) { 1391 /* Should I check the return value of MPI_Wait() or status? */ 1392 /* Can this loop be replaced by a call to MPI_Waitall()? */ 1393 PetscCallMPI(MPI_Wait(ids_in, &status)); 1394 ids_in++; 1395 while (*iptr >= 0) *(dptr1 + *iptr++) += *in2++; 1396 } 1397 list++; 1398 } 1399 1400 /* replace vals */ 1401 while (*pw >= 0) *(in_vals + *pw++) = *dptr1++; 1402 1403 /* clear isend message handles */ 1404 /* This changed for clarity though it could be the same */ 1405 while (*msg_nodes++) { 1406 if ((PCTFS_my_id | mask) == (*msg_list | mask)) { 1407 /* Should I check the return value of MPI_Wait() or status? */ 1408 /* Can this loop be replaced by a call to MPI_Waitall()? */ 1409 PetscCallMPI(MPI_Wait(ids_out, &status)); 1410 ids_out++; 1411 } 1412 msg_list++; 1413 } 1414 PetscFunctionReturn(PETSC_SUCCESS); 1415 } 1416 1417 /******************************************************************************/ 1418 static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim) 1419 { 1420 PetscInt size; 1421 PetscInt *in, *out; 1422 PetscScalar *buf, *work; 1423 PetscInt op[] = {GL_ADD, 0}; 1424 1425 PetscFunctionBegin; 1426 in = gs->tree_map_in; 1427 out = gs->tree_map_out; 1428 buf = gs->tree_buf; 1429 work = gs->tree_work; 1430 size = gs->tree_nel; 1431 1432 PetscCall(PCTFS_rvec_zero(buf, size)); 1433 1434 while (*in >= 0) *(buf + *out++) = *(vals + *in++); 1435 1436 in = gs->tree_map_in; 1437 out = gs->tree_map_out; 1438 1439 PetscCall(PCTFS_grop_hc(buf, work, size, op, dim)); 1440 1441 while (*in >= 0) *(vals + *in++) = *(buf + *out++); 1442 PetscFunctionReturn(PETSC_SUCCESS); 1443 } 1444