1 2 /***********************************gs.c*************************************** 3 4 Author: Henry M. Tufo III 5 6 e-mail: hmt@cs.brown.edu 7 8 snail-mail: 9 Division of Applied Mathematics 10 Brown University 11 Providence, RI 02912 12 13 Last Modification: 14 6.21.97 15 ************************************gs.c**************************************/ 16 17 /***********************************gs.c*************************************** 18 File Description: 19 ----------------- 20 21 ************************************gs.c**************************************/ 22 23 #include <../src/ksp/pc/impls/tfs/tfs.h> 24 25 /* default length of number of items via tree - doubles if exceeded */ 26 #define TREE_BUF_SZ 2048; 27 #define GS_VEC_SZ 1 28 29 30 31 /***********************************gs.c*************************************** 32 Type: struct gather_scatter_id 33 ------------------------------ 34 35 ************************************gs.c**************************************/ 36 typedef struct gather_scatter_id { 37 PetscInt id; 38 PetscInt nel_min; 39 PetscInt nel_max; 40 PetscInt nel_sum; 41 PetscInt negl; 42 PetscInt gl_max; 43 PetscInt gl_min; 44 PetscInt repeats; 45 PetscInt ordered; 46 PetscInt positive; 47 PetscScalar *vals; 48 49 /* bit mask info */ 50 PetscInt *my_proc_mask; 51 PetscInt mask_sz; 52 PetscInt *ngh_buf; 53 PetscInt ngh_buf_sz; 54 PetscInt *nghs; 55 PetscInt num_nghs; 56 PetscInt max_nghs; 57 PetscInt *pw_nghs; 58 PetscInt num_pw_nghs; 59 PetscInt *tree_nghs; 60 PetscInt num_tree_nghs; 61 62 PetscInt num_loads; 63 64 /* repeats == true -> local info */ 65 PetscInt nel; /* number of unique elememts */ 66 PetscInt *elms; /* of size nel */ 67 PetscInt nel_total; 68 PetscInt *local_elms; /* of size nel_total */ 69 PetscInt *companion; /* of size nel_total */ 70 71 /* local info */ 72 PetscInt num_local_total; 73 PetscInt local_strength; 74 PetscInt num_local; 75 PetscInt *num_local_reduce; 76 PetscInt **local_reduce; 77 PetscInt num_local_gop; 78 PetscInt *num_gop_local_reduce; 79 PetscInt **gop_local_reduce; 80 81 /* pairwise info */ 82 PetscInt level; 83 PetscInt num_pairs; 84 PetscInt max_pairs; 85 PetscInt loc_node_pairs; 86 PetscInt max_node_pairs; 87 PetscInt min_node_pairs; 88 PetscInt avg_node_pairs; 89 PetscInt *pair_list; 90 PetscInt *msg_sizes; 91 PetscInt **node_list; 92 PetscInt len_pw_list; 93 PetscInt *pw_elm_list; 94 PetscScalar *pw_vals; 95 96 MPI_Request *msg_ids_in; 97 MPI_Request *msg_ids_out; 98 99 PetscScalar *out; 100 PetscScalar *in; 101 PetscInt msg_total; 102 103 /* tree - crystal accumulator info */ 104 PetscInt max_left_over; 105 PetscInt *pre; 106 PetscInt *in_num; 107 PetscInt *out_num; 108 PetscInt **in_list; 109 PetscInt **out_list; 110 111 /* new tree work*/ 112 PetscInt tree_nel; 113 PetscInt *tree_elms; 114 PetscScalar *tree_buf; 115 PetscScalar *tree_work; 116 117 PetscInt tree_map_sz; 118 PetscInt *tree_map_in; 119 PetscInt *tree_map_out; 120 121 /* current memory status */ 122 PetscInt gl_bss_min; 123 PetscInt gl_perm_min; 124 125 /* max segment size for PCTFS_gs_gop_vec() */ 126 PetscInt vec_sz; 127 128 /* hack to make paul happy */ 129 MPI_Comm PCTFS_gs_comm; 130 131 } PCTFS_gs_id; 132 133 static PCTFS_gs_id *gsi_check_args(PetscInt *elms, PetscInt nel, PetscInt level); 134 static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs); 135 static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs); 136 static PetscErrorCode set_pairwise(PCTFS_gs_id *gs); 137 static PCTFS_gs_id * gsi_new(void); 138 static PetscErrorCode set_tree(PCTFS_gs_id *gs); 139 140 /* same for all but vector flavor */ 141 static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs, PetscScalar *vals); 142 /* vector flavor */ 143 static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step); 144 145 static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step); 146 static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step); 147 static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step); 148 static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step); 149 static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step); 150 151 152 static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs, PetscScalar *vals); 153 static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals); 154 155 static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim); 156 static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim); 157 static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim); 158 159 /* global vars */ 160 /* from comm.c module */ 161 162 static PetscInt num_gs_ids = 0; 163 164 /* should make this dynamic ... later */ 165 static PetscInt msg_buf=MAX_MSG_BUF; 166 static PetscInt vec_sz=GS_VEC_SZ; 167 static PetscInt *tree_buf=NULL; 168 static PetscInt tree_buf_sz=0; 169 static PetscInt ntree=0; 170 171 /***************************************************************************/ 172 PetscErrorCode PCTFS_gs_init_vec_sz(PetscInt size) 173 { 174 PetscFunctionBegin; 175 vec_sz = size; 176 PetscFunctionReturn(0); 177 } 178 179 /******************************************************************************/ 180 PetscErrorCode PCTFS_gs_init_msg_buf_sz(PetscInt buf_size) 181 { 182 PetscFunctionBegin; 183 msg_buf = buf_size; 184 PetscFunctionReturn(0); 185 } 186 187 /******************************************************************************/ 188 PCTFS_gs_id *PCTFS_gs_init( PetscInt *elms, PetscInt nel, PetscInt level) 189 { 190 PCTFS_gs_id *gs; 191 MPI_Group PCTFS_gs_group; 192 MPI_Comm PCTFS_gs_comm; 193 PetscErrorCode ierr; 194 195 PetscFunctionBegin; 196 /* ensure that communication package has been initialized */ 197 PCTFS_comm_init(); 198 199 200 /* determines if we have enough dynamic/semi-static memory */ 201 /* checks input, allocs and sets gd_id template */ 202 gs = gsi_check_args(elms,nel,level); 203 204 /* only bit mask version up and working for the moment */ 205 /* LATER :: get int list version working for sparse pblms */ 206 ierr = gsi_via_bit_mask(gs);CHKERRABORT(PETSC_COMM_WORLD,ierr); 207 208 209 ierr = MPI_Comm_group(MPI_COMM_WORLD,&PCTFS_gs_group);CHKERRABORT(PETSC_COMM_WORLD,ierr); 210 ierr = MPI_Comm_create(MPI_COMM_WORLD,PCTFS_gs_group,&PCTFS_gs_comm);CHKERRABORT(PETSC_COMM_WORLD,ierr); 211 gs->PCTFS_gs_comm=PCTFS_gs_comm; 212 213 return(gs); 214 } 215 216 /******************************************************************************/ 217 static PCTFS_gs_id *gsi_new(void) 218 { 219 PetscErrorCode ierr; 220 PCTFS_gs_id *gs; 221 gs = (PCTFS_gs_id *) malloc(sizeof(PCTFS_gs_id)); 222 ierr = PetscMemzero(gs,sizeof(PCTFS_gs_id));CHKERRABORT(PETSC_COMM_WORLD,ierr); 223 return(gs); 224 } 225 226 /******************************************************************************/ 227 static PCTFS_gs_id * gsi_check_args(PetscInt *in_elms, PetscInt nel, PetscInt level) 228 { 229 PetscInt i, j, k, t2; 230 PetscInt *companion, *elms, *unique, *iptr; 231 PetscInt num_local=0, *num_to_reduce, **local_reduce; 232 PetscInt oprs[] = {NON_UNIFORM,GL_MIN,GL_MAX,GL_ADD,GL_MIN,GL_MAX,GL_MIN,GL_B_AND}; 233 PetscInt vals[sizeof(oprs)/sizeof(oprs[0])-1]; 234 PetscInt work[sizeof(oprs)/sizeof(oprs[0])-1]; 235 PCTFS_gs_id *gs; 236 PetscErrorCode ierr; 237 238 239 if (!in_elms) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"elms point to nothing!!!\n"); 240 if (nel<0) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"can't have fewer than 0 elms!!!\n"); 241 242 if (nel==0) 243 {ierr = PetscInfo(0,"I don't have any elements!!!\n");CHKERRABORT(PETSC_COMM_WORLD,ierr);} 244 245 /* get space for gs template */ 246 gs = gsi_new(); 247 gs->id = ++num_gs_ids; 248 249 /* hmt 6.4.99 */ 250 /* caller can set global ids that don't participate to 0 */ 251 /* PCTFS_gs_init ignores all zeros in elm list */ 252 /* negative global ids are still invalid */ 253 for (i=j=0;i<nel;i++) 254 {if (in_elms[i]!=0) {j++;}} 255 256 k=nel; nel=j; 257 258 /* copy over in_elms list and create inverse map */ 259 elms = (PetscInt*) malloc((nel+1)*sizeof(PetscInt)); 260 companion = (PetscInt*) malloc(nel*sizeof(PetscInt)); 261 262 for (i=j=0;i<k;i++) 263 { 264 if (in_elms[i]!=0) 265 {elms[j] = in_elms[i]; companion[j++] = i;} 266 } 267 268 if (j!=nel) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"nel j mismatch!\n"); 269 270 /* pre-pass ... check to see if sorted */ 271 elms[nel] = INT_MAX; 272 iptr = elms; 273 unique = elms+1; 274 j=0; 275 while (*iptr!=INT_MAX) 276 { 277 if (*iptr++>*unique++) 278 {j=1; break;} 279 } 280 281 /* set up inverse map */ 282 if (j) 283 { 284 ierr = PetscInfo(0,"gsi_check_args() :: elm list *not* sorted!\n");CHKERRABORT(PETSC_COMM_WORLD,ierr); 285 ierr = PCTFS_SMI_sort((void*)elms, (void*)companion, nel, SORT_INTEGER);CHKERRABORT(PETSC_COMM_WORLD,ierr); 286 } 287 else 288 {ierr = PetscInfo(0,"gsi_check_args() :: elm list sorted!\n");CHKERRABORT(PETSC_COMM_WORLD,ierr);} 289 elms[nel] = INT_MIN; 290 291 /* first pass */ 292 /* determine number of unique elements, check pd */ 293 for (i=k=0;i<nel;i+=j) 294 { 295 t2 = elms[i]; 296 j=++i; 297 298 /* clump 'em for now */ 299 while (elms[j]==t2) {j++;} 300 301 /* how many together and num local */ 302 if (j-=i) 303 {num_local++; k+=j;} 304 } 305 306 /* how many unique elements? */ 307 gs->repeats=k; 308 gs->nel = nel-k; 309 310 311 /* number of repeats? */ 312 gs->num_local = num_local; 313 num_local+=2; 314 gs->local_reduce=local_reduce=(PetscInt **)malloc(num_local*sizeof(PetscInt*)); 315 gs->num_local_reduce=num_to_reduce=(PetscInt*) malloc(num_local*sizeof(PetscInt)); 316 317 unique = (PetscInt*) malloc((gs->nel+1)*sizeof(PetscInt)); 318 gs->elms = unique; 319 gs->nel_total = nel; 320 gs->local_elms = elms; 321 gs->companion = companion; 322 323 /* compess map as well as keep track of local ops */ 324 for (num_local=i=j=0;i<gs->nel;i++) 325 { 326 k=j; 327 t2 = unique[i] = elms[j]; 328 companion[i] = companion[j]; 329 330 while (elms[j]==t2) {j++;} 331 332 if ((t2=(j-k))>1) 333 { 334 /* number together */ 335 num_to_reduce[num_local] = t2++; 336 iptr = local_reduce[num_local++] = (PetscInt*)malloc(t2*sizeof(PetscInt)); 337 338 /* to use binary searching don't remap until we check intersection */ 339 *iptr++ = i; 340 341 /* note that we're skipping the first one */ 342 while (++k<j) 343 {*(iptr++) = companion[k];} 344 *iptr = -1; 345 } 346 } 347 348 /* sentinel for ngh_buf */ 349 unique[gs->nel]=INT_MAX; 350 351 /* for two partition sort hack */ 352 num_to_reduce[num_local] = 0; 353 local_reduce[num_local] = NULL; 354 num_to_reduce[++num_local] = 0; 355 local_reduce[num_local] = NULL; 356 357 /* load 'em up */ 358 /* note one extra to hold NON_UNIFORM flag!!! */ 359 vals[2] = vals[1] = vals[0] = nel; 360 if (gs->nel>0) 361 { 362 vals[3] = unique[0]; 363 vals[4] = unique[gs->nel-1]; 364 } 365 else 366 { 367 vals[3] = INT_MAX; 368 vals[4] = INT_MIN; 369 } 370 vals[5] = level; 371 vals[6] = num_gs_ids; 372 373 /* GLOBAL: send 'em out */ 374 ierr = PCTFS_giop(vals,work,sizeof(oprs)/sizeof(oprs[0])-1,oprs);CHKERRABORT(PETSC_COMM_WORLD,ierr); 375 376 /* must be semi-pos def - only pairwise depends on this */ 377 /* LATER - remove this restriction */ 378 if (vals[3]<0) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system not semi-pos def \n"); 379 if (vals[4]==INT_MAX) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system ub too large !\n"); 380 381 gs->nel_min = vals[0]; 382 gs->nel_max = vals[1]; 383 gs->nel_sum = vals[2]; 384 gs->gl_min = vals[3]; 385 gs->gl_max = vals[4]; 386 gs->negl = vals[4]-vals[3]+1; 387 388 if (gs->negl<=0) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system empty or neg :: %d\n"); 389 390 /* LATER :: add level == -1 -> program selects level */ 391 if (vals[5]<0) 392 {vals[5]=0;} 393 else if (vals[5]>PCTFS_num_nodes) 394 {vals[5]=PCTFS_num_nodes;} 395 gs->level = vals[5]; 396 397 return(gs); 398 } 399 400 /******************************************************************************/ 401 static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs) 402 { 403 PetscInt i, nel, *elms; 404 PetscInt t1; 405 PetscInt **reduce; 406 PetscInt *map; 407 PetscErrorCode ierr; 408 409 PetscFunctionBegin; 410 /* totally local removes ... PCTFS_ct_bits == 0 */ 411 get_ngh_buf(gs); 412 413 if (gs->level) set_pairwise(gs); 414 if (gs->max_left_over) set_tree(gs); 415 416 /* intersection local and pairwise/tree? */ 417 gs->num_local_total = gs->num_local; 418 gs->gop_local_reduce = gs->local_reduce; 419 gs->num_gop_local_reduce = gs->num_local_reduce; 420 421 map = gs->companion; 422 423 /* is there any local compression */ 424 if (!gs->num_local) { 425 gs->local_strength = NONE; 426 gs->num_local_gop = 0; 427 } else { 428 /* ok find intersection */ 429 map = gs->companion; 430 reduce = gs->local_reduce; 431 for (i=0, t1=0; i<gs->num_local; i++, reduce++) 432 { 433 if ((PCTFS_ivec_binary_search(**reduce,gs->pw_elm_list,gs->len_pw_list)>=0) 434 || 435 PCTFS_ivec_binary_search(**reduce,gs->tree_map_in,gs->tree_map_sz)>=0) 436 { 437 t1++; 438 if (gs->num_local_reduce[i]<=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"nobody in list?"); 439 gs->num_local_reduce[i] *= -1; 440 } 441 **reduce=map[**reduce]; 442 } 443 444 /* intersection is empty */ 445 if (!t1) 446 { 447 gs->local_strength = FULL; 448 gs->num_local_gop = 0; 449 } 450 /* intersection not empty */ 451 else 452 { 453 gs->local_strength = PARTIAL; 454 ierr = PCTFS_SMI_sort((void*)gs->num_local_reduce, (void*)gs->local_reduce, gs->num_local + 1, SORT_INT_PTR);CHKERRQ(ierr); 455 456 gs->num_local_gop = t1; 457 gs->num_local_total = gs->num_local; 458 gs->num_local -= t1; 459 gs->gop_local_reduce = gs->local_reduce; 460 gs->num_gop_local_reduce = gs->num_local_reduce; 461 462 for (i=0; i<t1; i++) 463 { 464 if (gs->num_gop_local_reduce[i]>=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"they aren't negative?"); 465 gs->num_gop_local_reduce[i] *= -1; 466 gs->local_reduce++; 467 gs->num_local_reduce++; 468 } 469 gs->local_reduce++; 470 gs->num_local_reduce++; 471 } 472 } 473 474 elms = gs->pw_elm_list; 475 nel = gs->len_pw_list; 476 for (i=0; i<nel; i++) 477 {elms[i] = map[elms[i]];} 478 479 elms = gs->tree_map_in; 480 nel = gs->tree_map_sz; 481 for (i=0; i<nel; i++) 482 {elms[i] = map[elms[i]];} 483 484 /* clean up */ 485 free((void*) gs->local_elms); 486 free((void*) gs->companion); 487 free((void*) gs->elms); 488 free((void*) gs->ngh_buf); 489 gs->local_elms = gs->companion = gs->elms = gs->ngh_buf = NULL; 490 PetscFunctionReturn(0); 491 } 492 493 /******************************************************************************/ 494 static PetscErrorCode place_in_tree( PetscInt elm) 495 { 496 PetscInt *tp, n; 497 498 PetscFunctionBegin; 499 if (ntree==tree_buf_sz) 500 { 501 if (tree_buf_sz) 502 { 503 tp = tree_buf; 504 n = tree_buf_sz; 505 tree_buf_sz<<=1; 506 tree_buf = (PetscInt*)malloc(tree_buf_sz*sizeof(PetscInt)); 507 PCTFS_ivec_copy(tree_buf,tp,n); 508 free(tp); 509 } 510 else 511 { 512 tree_buf_sz = TREE_BUF_SZ; 513 tree_buf = (PetscInt*)malloc(tree_buf_sz*sizeof(PetscInt)); 514 } 515 } 516 517 tree_buf[ntree++] = elm; 518 PetscFunctionReturn(0); 519 } 520 521 /******************************************************************************/ 522 static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs) 523 { 524 PetscInt i, j, npw=0, ntree_map=0; 525 PetscInt p_mask_size, ngh_buf_size, buf_size; 526 PetscInt *p_mask, *sh_proc_mask, *pw_sh_proc_mask; 527 PetscInt *ngh_buf, *buf1, *buf2; 528 PetscInt offset, per_load, num_loads, or_ct, start, end; 529 PetscInt *ptr1, *ptr2, i_start, negl, nel, *elms; 530 PetscInt oper=GL_B_OR; 531 PetscInt *ptr3, *t_mask, level, ct1, ct2; 532 PetscErrorCode ierr; 533 534 PetscFunctionBegin; 535 /* to make life easier */ 536 nel = gs->nel; 537 elms = gs->elms; 538 level = gs->level; 539 540 /* det #bytes needed for processor bit masks and init w/mask cor. to PCTFS_my_id */ 541 p_mask = (PetscInt*) malloc(p_mask_size=PCTFS_len_bit_mask(PCTFS_num_nodes)); 542 ierr = PCTFS_set_bit_mask(p_mask,p_mask_size,PCTFS_my_id);CHKERRQ(ierr); 543 544 /* allocate space for masks and info bufs */ 545 gs->nghs = sh_proc_mask = (PetscInt*) malloc(p_mask_size); 546 gs->pw_nghs = pw_sh_proc_mask = (PetscInt*) malloc(p_mask_size); 547 gs->ngh_buf_sz = ngh_buf_size = p_mask_size*nel; 548 t_mask = (PetscInt*) malloc(p_mask_size); 549 gs->ngh_buf = ngh_buf = (PetscInt*) malloc(ngh_buf_size); 550 551 /* comm buffer size ... memory usage bounded by ~2*msg_buf */ 552 /* had thought I could exploit rendezvous threshold */ 553 554 /* default is one pass */ 555 per_load = negl = gs->negl; 556 gs->num_loads = num_loads = 1; 557 i=p_mask_size*negl; 558 559 /* possible overflow on buffer size */ 560 /* overflow hack */ 561 if (i<0) {i=INT_MAX;} 562 563 buf_size = PetscMin(msg_buf,i); 564 565 /* can we do it? */ 566 if (p_mask_size>buf_size) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"get_ngh_buf() :: buf<pms :: %d>%d\n",p_mask_size,buf_size); 567 568 /* get PCTFS_giop buf space ... make *only* one malloc */ 569 buf1 = (PetscInt*) malloc(buf_size<<1); 570 571 /* more than one gior exchange needed? */ 572 if (buf_size!=i) 573 { 574 per_load = buf_size/p_mask_size; 575 buf_size = per_load*p_mask_size; 576 gs->num_loads = num_loads = negl/per_load + (negl%per_load>0); 577 } 578 579 580 /* convert buf sizes from #bytes to #ints - 32 bit only! */ 581 p_mask_size/=sizeof(PetscInt); ngh_buf_size/=sizeof(PetscInt); buf_size/=sizeof(PetscInt); 582 583 /* find PCTFS_giop work space */ 584 buf2 = buf1+buf_size; 585 586 /* hold #ints needed for processor masks */ 587 gs->mask_sz=p_mask_size; 588 589 /* init buffers */ 590 ierr = PCTFS_ivec_zero(sh_proc_mask,p_mask_size);CHKERRQ(ierr); 591 ierr = PCTFS_ivec_zero(pw_sh_proc_mask,p_mask_size);CHKERRQ(ierr); 592 ierr = PCTFS_ivec_zero(ngh_buf,ngh_buf_size);CHKERRQ(ierr); 593 594 /* HACK reset tree info */ 595 tree_buf=NULL; 596 tree_buf_sz=ntree=0; 597 598 /* ok do it */ 599 for (ptr1=ngh_buf,ptr2=elms,end=gs->gl_min,or_ct=i=0; or_ct<num_loads; or_ct++) 600 { 601 /* identity for bitwise or is 000...000 */ 602 PCTFS_ivec_zero(buf1,buf_size); 603 604 /* load msg buffer */ 605 for (start=end,end+=per_load,i_start=i; (offset=*ptr2)<end; i++, ptr2++) 606 { 607 offset = (offset-start)*p_mask_size; 608 PCTFS_ivec_copy(buf1+offset,p_mask,p_mask_size); 609 } 610 611 /* GLOBAL: pass buffer */ 612 ierr = PCTFS_giop(buf1,buf2,buf_size,&oper);CHKERRQ(ierr); 613 614 615 /* unload buffer into ngh_buf */ 616 ptr2=(elms+i_start); 617 for(ptr3=buf1,j=start; j<end; ptr3+=p_mask_size,j++) 618 { 619 /* I own it ... may have to pairwise it */ 620 if (j==*ptr2) 621 { 622 /* do i share it w/anyone? */ 623 ct1 = PCTFS_ct_bits((char *)ptr3,p_mask_size*sizeof(PetscInt)); 624 /* guess not */ 625 if (ct1<2) 626 {ptr2++; ptr1+=p_mask_size; continue;} 627 628 /* i do ... so keep info and turn off my bit */ 629 PCTFS_ivec_copy(ptr1,ptr3,p_mask_size); 630 ierr = PCTFS_ivec_xor(ptr1,p_mask,p_mask_size);CHKERRQ(ierr); 631 ierr = PCTFS_ivec_or(sh_proc_mask,ptr1,p_mask_size);CHKERRQ(ierr); 632 633 /* is it to be done pairwise? */ 634 if (--ct1<=level) 635 { 636 npw++; 637 638 /* turn on high bit to indicate pw need to process */ 639 *ptr2++ |= TOP_BIT; 640 ierr = PCTFS_ivec_or(pw_sh_proc_mask,ptr1,p_mask_size);CHKERRQ(ierr); 641 ptr1+=p_mask_size; 642 continue; 643 } 644 645 /* get set for next and note that I have a tree contribution */ 646 /* could save exact elm index for tree here -> save a search */ 647 ptr2++; ptr1+=p_mask_size; ntree_map++; 648 } 649 /* i don't but still might be involved in tree */ 650 else 651 { 652 653 /* shared by how many? */ 654 ct1 = PCTFS_ct_bits((char *)ptr3,p_mask_size*sizeof(PetscInt)); 655 656 /* none! */ 657 if (ct1<2) continue; 658 659 /* is it going to be done pairwise? but not by me of course!*/ 660 if (--ct1<=level) continue; 661 } 662 /* LATER we're going to have to process it NOW */ 663 /* nope ... tree it */ 664 ierr = place_in_tree(j);CHKERRQ(ierr); 665 } 666 } 667 668 free((void*)t_mask); 669 free((void*)buf1); 670 671 gs->len_pw_list=npw; 672 gs->num_nghs = PCTFS_ct_bits((char *)sh_proc_mask,p_mask_size*sizeof(PetscInt)); 673 674 /* expand from bit mask list to int list and save ngh list */ 675 gs->nghs = (PetscInt*) malloc(gs->num_nghs * sizeof(PetscInt)); 676 PCTFS_bm_to_proc((char *)sh_proc_mask,p_mask_size*sizeof(PetscInt),gs->nghs); 677 678 gs->num_pw_nghs = PCTFS_ct_bits((char *)pw_sh_proc_mask,p_mask_size*sizeof(PetscInt)); 679 680 oper = GL_MAX; 681 ct1 = gs->num_nghs; 682 ierr = PCTFS_giop(&ct1,&ct2,1,&oper);CHKERRQ(ierr); 683 gs->max_nghs = ct1; 684 685 gs->tree_map_sz = ntree_map; 686 gs->max_left_over=ntree; 687 688 free((void*)p_mask); 689 free((void*)sh_proc_mask); 690 PetscFunctionReturn(0); 691 } 692 693 /******************************************************************************/ 694 static PetscErrorCode set_pairwise(PCTFS_gs_id *gs) 695 { 696 PetscInt i, j; 697 PetscInt p_mask_size; 698 PetscInt *p_mask, *sh_proc_mask, *tmp_proc_mask; 699 PetscInt *ngh_buf, *buf2; 700 PetscInt offset; 701 PetscInt *msg_list, *msg_size, **msg_nodes, nprs; 702 PetscInt *pairwise_elm_list, len_pair_list=0; 703 PetscInt *iptr, t1, i_start, nel, *elms; 704 PetscInt ct; 705 PetscErrorCode ierr; 706 707 PetscFunctionBegin; 708 /* to make life easier */ 709 nel = gs->nel; 710 elms = gs->elms; 711 ngh_buf = gs->ngh_buf; 712 sh_proc_mask = gs->pw_nghs; 713 714 /* need a few temp masks */ 715 p_mask_size = PCTFS_len_bit_mask(PCTFS_num_nodes); 716 p_mask = (PetscInt*) malloc(p_mask_size); 717 tmp_proc_mask = (PetscInt*) malloc(p_mask_size); 718 719 /* set mask to my PCTFS_my_id's bit mask */ 720 ierr = PCTFS_set_bit_mask(p_mask,p_mask_size,PCTFS_my_id);CHKERRQ(ierr); 721 722 p_mask_size /= sizeof(PetscInt); 723 724 len_pair_list=gs->len_pw_list; 725 gs->pw_elm_list=pairwise_elm_list=(PetscInt*)malloc((len_pair_list+1)*sizeof(PetscInt)); 726 727 /* how many processors (nghs) do we have to exchange with? */ 728 nprs=gs->num_pairs=PCTFS_ct_bits((char *)sh_proc_mask,p_mask_size*sizeof(PetscInt)); 729 730 731 /* allocate space for PCTFS_gs_gop() info */ 732 gs->pair_list = msg_list = (PetscInt *) malloc(sizeof(PetscInt)*nprs); 733 gs->msg_sizes = msg_size = (PetscInt *) malloc(sizeof(PetscInt)*nprs); 734 gs->node_list = msg_nodes = (PetscInt **) malloc(sizeof(PetscInt*)*(nprs+1)); 735 736 /* init msg_size list */ 737 ierr = PCTFS_ivec_zero(msg_size,nprs);CHKERRQ(ierr); 738 739 /* expand from bit mask list to int list */ 740 ierr = PCTFS_bm_to_proc((char *)sh_proc_mask,p_mask_size*sizeof(PetscInt),msg_list);CHKERRQ(ierr); 741 742 /* keep list of elements being handled pairwise */ 743 for (i=j=0;i<nel;i++) 744 { 745 if (elms[i] & TOP_BIT) 746 {elms[i] ^= TOP_BIT; pairwise_elm_list[j++] = i;} 747 } 748 pairwise_elm_list[j] = -1; 749 750 gs->msg_ids_out = (MPI_Request *) malloc(sizeof(MPI_Request)*(nprs+1)); 751 gs->msg_ids_out[nprs] = MPI_REQUEST_NULL; 752 gs->msg_ids_in = (MPI_Request *) malloc(sizeof(MPI_Request)*(nprs+1)); 753 gs->msg_ids_in[nprs] = MPI_REQUEST_NULL; 754 gs->pw_vals = (PetscScalar *) malloc(sizeof(PetscScalar)*len_pair_list*vec_sz); 755 756 /* find who goes to each processor */ 757 for (i_start=i=0;i<nprs;i++) 758 { 759 /* processor i's mask */ 760 ierr = PCTFS_set_bit_mask(p_mask,p_mask_size*sizeof(PetscInt),msg_list[i]);CHKERRQ(ierr); 761 762 /* det # going to processor i */ 763 for (ct=j=0;j<len_pair_list;j++) 764 { 765 buf2 = ngh_buf+(pairwise_elm_list[j]*p_mask_size); 766 ierr = PCTFS_ivec_and3(tmp_proc_mask,p_mask,buf2,p_mask_size);CHKERRQ(ierr); 767 if (PCTFS_ct_bits((char *)tmp_proc_mask,p_mask_size*sizeof(PetscInt))) 768 {ct++;} 769 } 770 msg_size[i] = ct; 771 i_start = PetscMax(i_start,ct); 772 773 /*space to hold nodes in message to first neighbor */ 774 msg_nodes[i] = iptr = (PetscInt*) malloc(sizeof(PetscInt)*(ct+1)); 775 776 for (j=0;j<len_pair_list;j++) 777 { 778 buf2 = ngh_buf+(pairwise_elm_list[j]*p_mask_size); 779 ierr = PCTFS_ivec_and3(tmp_proc_mask,p_mask,buf2,p_mask_size);CHKERRQ(ierr); 780 if (PCTFS_ct_bits((char *)tmp_proc_mask,p_mask_size*sizeof(PetscInt))) 781 {*iptr++ = j;} 782 } 783 *iptr = -1; 784 } 785 msg_nodes[nprs] = NULL; 786 787 j=gs->loc_node_pairs=i_start; 788 t1 = GL_MAX; 789 ierr = PCTFS_giop(&i_start,&offset,1,&t1);CHKERRQ(ierr); 790 gs->max_node_pairs = i_start; 791 792 i_start=j; 793 t1 = GL_MIN; 794 ierr = PCTFS_giop(&i_start,&offset,1,&t1);CHKERRQ(ierr); 795 gs->min_node_pairs = i_start; 796 797 i_start=j; 798 t1 = GL_ADD; 799 ierr = PCTFS_giop(&i_start,&offset,1,&t1);CHKERRQ(ierr); 800 gs->avg_node_pairs = i_start/PCTFS_num_nodes + 1; 801 802 i_start=nprs; 803 t1 = GL_MAX; 804 PCTFS_giop(&i_start,&offset,1,&t1); 805 gs->max_pairs = i_start; 806 807 808 /* remap pairwise in tail of gsi_via_bit_mask() */ 809 gs->msg_total = PCTFS_ivec_sum(gs->msg_sizes,nprs); 810 gs->out = (PetscScalar *) malloc(sizeof(PetscScalar)*gs->msg_total*vec_sz); 811 gs->in = (PetscScalar *) malloc(sizeof(PetscScalar)*gs->msg_total*vec_sz); 812 813 /* reset malloc pool */ 814 free((void*)p_mask); 815 free((void*)tmp_proc_mask); 816 PetscFunctionReturn(0); 817 } 818 819 /* to do pruned tree just save ngh buf copy for each one and decode here! 820 ******************************************************************************/ 821 static PetscErrorCode set_tree(PCTFS_gs_id *gs) 822 { 823 PetscInt i, j, n, nel; 824 PetscInt *iptr_in, *iptr_out, *tree_elms, *elms; 825 826 PetscFunctionBegin; 827 /* local work ptrs */ 828 elms = gs->elms; 829 nel = gs->nel; 830 831 /* how many via tree */ 832 gs->tree_nel = n = ntree; 833 gs->tree_elms = tree_elms = iptr_in = tree_buf; 834 gs->tree_buf = (PetscScalar *) malloc(sizeof(PetscScalar)*n*vec_sz); 835 gs->tree_work = (PetscScalar *) malloc(sizeof(PetscScalar)*n*vec_sz); 836 j=gs->tree_map_sz; 837 gs->tree_map_in = iptr_in = (PetscInt*) malloc(sizeof(PetscInt)*(j+1)); 838 gs->tree_map_out = iptr_out = (PetscInt*) malloc(sizeof(PetscInt)*(j+1)); 839 840 /* search the longer of the two lists */ 841 /* note ... could save this info in get_ngh_buf and save searches */ 842 if (n<=nel) 843 { 844 /* bijective fct w/remap - search elm list */ 845 for (i=0; i<n; i++) 846 { 847 if ((j=PCTFS_ivec_binary_search(*tree_elms++,elms,nel))>=0) 848 {*iptr_in++ = j; *iptr_out++ = i;} 849 } 850 } 851 else 852 { 853 for (i=0; i<nel; i++) 854 { 855 if ((j=PCTFS_ivec_binary_search(*elms++,tree_elms,n))>=0) 856 {*iptr_in++ = i; *iptr_out++ = j;} 857 } 858 } 859 860 /* sentinel */ 861 *iptr_in = *iptr_out = -1; 862 PetscFunctionReturn(0); 863 } 864 865 /******************************************************************************/ 866 static PetscErrorCode PCTFS_gs_gop_local_out( PCTFS_gs_id *gs, PetscScalar *vals) 867 { 868 PetscInt *num, *map, **reduce; 869 PetscScalar tmp; 870 871 PetscFunctionBegin; 872 num = gs->num_gop_local_reduce; 873 reduce = gs->gop_local_reduce; 874 while ((map = *reduce++)) 875 { 876 /* wall */ 877 if (*num == 2) 878 { 879 num ++; 880 vals[map[1]] = vals[map[0]]; 881 } 882 /* corner shared by three elements */ 883 else if (*num == 3) 884 { 885 num ++; 886 vals[map[2]] = vals[map[1]] = vals[map[0]]; 887 } 888 /* corner shared by four elements */ 889 else if (*num == 4) 890 { 891 num ++; 892 vals[map[3]] = vals[map[2]] = vals[map[1]] = vals[map[0]]; 893 } 894 /* general case ... odd geoms ... 3D*/ 895 else 896 { 897 num++; 898 tmp = *(vals + *map++); 899 while (*map >= 0) 900 {*(vals + *map++) = tmp;} 901 } 902 } 903 PetscFunctionReturn(0); 904 } 905 906 /******************************************************************************/ 907 static PetscErrorCode PCTFS_gs_gop_local_plus( PCTFS_gs_id *gs, PetscScalar *vals) 908 { 909 PetscInt *num, *map, **reduce; 910 PetscScalar tmp; 911 912 PetscFunctionBegin; 913 num = gs->num_local_reduce; 914 reduce = gs->local_reduce; 915 while ((map = *reduce)) 916 { 917 /* wall */ 918 if (*num == 2) 919 { 920 num ++; reduce++; 921 vals[map[1]] = vals[map[0]] += vals[map[1]]; 922 } 923 /* corner shared by three elements */ 924 else if (*num == 3) 925 { 926 num ++; reduce++; 927 vals[map[2]]=vals[map[1]]=vals[map[0]]+=(vals[map[1]]+vals[map[2]]); 928 } 929 /* corner shared by four elements */ 930 else if (*num == 4) 931 { 932 num ++; reduce++; 933 vals[map[1]]=vals[map[2]]=vals[map[3]]=vals[map[0]] += 934 (vals[map[1]] + vals[map[2]] + vals[map[3]]); 935 } 936 /* general case ... odd geoms ... 3D*/ 937 else 938 { 939 num ++; 940 tmp = 0.0; 941 while (*map >= 0) 942 {tmp += *(vals + *map++);} 943 944 map = *reduce++; 945 while (*map >= 0) 946 {*(vals + *map++) = tmp;} 947 } 948 } 949 PetscFunctionReturn(0); 950 } 951 952 /******************************************************************************/ 953 static PetscErrorCode PCTFS_gs_gop_local_in_plus( PCTFS_gs_id *gs, PetscScalar *vals) 954 { 955 PetscInt *num, *map, **reduce; 956 PetscScalar *base; 957 958 PetscFunctionBegin; 959 num = gs->num_gop_local_reduce; 960 reduce = gs->gop_local_reduce; 961 while ((map = *reduce++)) 962 { 963 /* wall */ 964 if (*num == 2) 965 { 966 num ++; 967 vals[map[0]] += vals[map[1]]; 968 } 969 /* corner shared by three elements */ 970 else if (*num == 3) 971 { 972 num ++; 973 vals[map[0]] += (vals[map[1]] + vals[map[2]]); 974 } 975 /* corner shared by four elements */ 976 else if (*num == 4) 977 { 978 num ++; 979 vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]); 980 } 981 /* general case ... odd geoms ... 3D*/ 982 else 983 { 984 num++; 985 base = vals + *map++; 986 while (*map >= 0) 987 {*base += *(vals + *map++);} 988 } 989 } 990 PetscFunctionReturn(0); 991 } 992 993 /******************************************************************************/ 994 PetscErrorCode PCTFS_gs_free( PCTFS_gs_id *gs) 995 { 996 PetscInt i; 997 998 PetscFunctionBegin; 999 if (gs->nghs) {free((void*) gs->nghs);} 1000 if (gs->pw_nghs) {free((void*) gs->pw_nghs);} 1001 1002 /* tree */ 1003 if (gs->max_left_over) 1004 { 1005 if (gs->tree_elms) {free((void*) gs->tree_elms);} 1006 if (gs->tree_buf) {free((void*) gs->tree_buf);} 1007 if (gs->tree_work) {free((void*) gs->tree_work);} 1008 if (gs->tree_map_in) {free((void*) gs->tree_map_in);} 1009 if (gs->tree_map_out) {free((void*) gs->tree_map_out);} 1010 } 1011 1012 /* pairwise info */ 1013 if (gs->num_pairs) 1014 { 1015 /* should be NULL already */ 1016 if (gs->ngh_buf) {free((void*) gs->ngh_buf);} 1017 if (gs->elms) {free((void*) gs->elms);} 1018 if (gs->local_elms) {free((void*) gs->local_elms);} 1019 if (gs->companion) {free((void*) gs->companion);} 1020 1021 /* only set if pairwise */ 1022 if (gs->vals) {free((void*) gs->vals);} 1023 if (gs->in) {free((void*) gs->in);} 1024 if (gs->out) {free((void*) gs->out);} 1025 if (gs->msg_ids_in) {free((void*) gs->msg_ids_in);} 1026 if (gs->msg_ids_out) {free((void*) gs->msg_ids_out);} 1027 if (gs->pw_vals) {free((void*) gs->pw_vals);} 1028 if (gs->pw_elm_list) {free((void*) gs->pw_elm_list);} 1029 if (gs->node_list) 1030 { 1031 for (i=0;i<gs->num_pairs;i++) 1032 {if (gs->node_list[i]) {free((void*) gs->node_list[i]);}} 1033 free((void*) gs->node_list); 1034 } 1035 if (gs->msg_sizes) {free((void*) gs->msg_sizes);} 1036 if (gs->pair_list) {free((void*) gs->pair_list);} 1037 } 1038 1039 /* local info */ 1040 if (gs->num_local_total>=0) 1041 { 1042 for (i=0;i<gs->num_local_total+1;i++) 1043 /* for (i=0;i<gs->num_local_total;i++) */ 1044 { 1045 if (gs->num_gop_local_reduce[i]) 1046 {free((void*) gs->gop_local_reduce[i]);} 1047 } 1048 } 1049 1050 /* if intersection tree/pairwise and local isn't empty */ 1051 if (gs->gop_local_reduce) {free((void*) gs->gop_local_reduce);} 1052 if (gs->num_gop_local_reduce) {free((void*) gs->num_gop_local_reduce);} 1053 1054 free((void*) gs); 1055 PetscFunctionReturn(0); 1056 } 1057 1058 /******************************************************************************/ 1059 PetscErrorCode PCTFS_gs_gop_vec( PCTFS_gs_id *gs, PetscScalar *vals, const char *op, PetscInt step) 1060 { 1061 PetscErrorCode ierr; 1062 1063 PetscFunctionBegin; 1064 switch (*op) { 1065 case '+': 1066 PCTFS_gs_gop_vec_plus(gs,vals,step); 1067 break; 1068 default: 1069 ierr = PetscInfo1(0,"PCTFS_gs_gop_vec() :: %c is not a valid op",op[0]);CHKERRQ(ierr); 1070 ierr = PetscInfo(0,"PCTFS_gs_gop_vec() :: default :: plus");CHKERRQ(ierr); 1071 PCTFS_gs_gop_vec_plus(gs,vals,step); 1072 break; 1073 } 1074 PetscFunctionReturn(0); 1075 } 1076 1077 /******************************************************************************/ 1078 static PetscErrorCode PCTFS_gs_gop_vec_plus( PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step) 1079 { 1080 PetscFunctionBegin; 1081 if (!gs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"PCTFS_gs_gop_vec() passed NULL gs handle!!!"); 1082 1083 /* local only operations!!! */ 1084 if (gs->num_local) 1085 {PCTFS_gs_gop_vec_local_plus(gs,vals,step);} 1086 1087 /* if intersection tree/pairwise and local isn't empty */ 1088 if (gs->num_local_gop) 1089 { 1090 PCTFS_gs_gop_vec_local_in_plus(gs,vals,step); 1091 1092 /* pairwise */ 1093 if (gs->num_pairs) 1094 {PCTFS_gs_gop_vec_pairwise_plus(gs,vals,step);} 1095 1096 /* tree */ 1097 else if (gs->max_left_over) 1098 {PCTFS_gs_gop_vec_tree_plus(gs,vals,step);} 1099 1100 PCTFS_gs_gop_vec_local_out(gs,vals,step); 1101 } 1102 /* if intersection tree/pairwise and local is empty */ 1103 else 1104 { 1105 /* pairwise */ 1106 if (gs->num_pairs) 1107 {PCTFS_gs_gop_vec_pairwise_plus(gs,vals,step);} 1108 1109 /* tree */ 1110 else if (gs->max_left_over) 1111 {PCTFS_gs_gop_vec_tree_plus(gs,vals,step);} 1112 } 1113 PetscFunctionReturn(0); 1114 } 1115 1116 /******************************************************************************/ 1117 static PetscErrorCode PCTFS_gs_gop_vec_local_plus( PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step) 1118 { 1119 PetscInt *num, *map, **reduce; 1120 PetscScalar *base; 1121 1122 PetscFunctionBegin; 1123 num = gs->num_local_reduce; 1124 reduce = gs->local_reduce; 1125 while ((map = *reduce)) 1126 { 1127 base = vals + map[0] * step; 1128 1129 /* wall */ 1130 if (*num == 2) 1131 { 1132 num++; reduce++; 1133 PCTFS_rvec_add (base,vals+map[1]*step,step); 1134 PCTFS_rvec_copy(vals+map[1]*step,base,step); 1135 } 1136 /* corner shared by three elements */ 1137 else if (*num == 3) 1138 { 1139 num++; reduce++; 1140 PCTFS_rvec_add (base,vals+map[1]*step,step); 1141 PCTFS_rvec_add (base,vals+map[2]*step,step); 1142 PCTFS_rvec_copy(vals+map[2]*step,base,step); 1143 PCTFS_rvec_copy(vals+map[1]*step,base,step); 1144 } 1145 /* corner shared by four elements */ 1146 else if (*num == 4) 1147 { 1148 num++; reduce++; 1149 PCTFS_rvec_add (base,vals+map[1]*step,step); 1150 PCTFS_rvec_add (base,vals+map[2]*step,step); 1151 PCTFS_rvec_add (base,vals+map[3]*step,step); 1152 PCTFS_rvec_copy(vals+map[3]*step,base,step); 1153 PCTFS_rvec_copy(vals+map[2]*step,base,step); 1154 PCTFS_rvec_copy(vals+map[1]*step,base,step); 1155 } 1156 /* general case ... odd geoms ... 3D */ 1157 else 1158 { 1159 num++; 1160 while (*++map >= 0) 1161 {PCTFS_rvec_add (base,vals+*map*step,step);} 1162 1163 map = *reduce; 1164 while (*++map >= 0) 1165 {PCTFS_rvec_copy(vals+*map*step,base,step);} 1166 1167 reduce++; 1168 } 1169 } 1170 PetscFunctionReturn(0); 1171 } 1172 1173 /******************************************************************************/ 1174 static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus( PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step) 1175 { 1176 PetscInt *num, *map, **reduce; 1177 PetscScalar *base; 1178 PetscFunctionBegin; 1179 num = gs->num_gop_local_reduce; 1180 reduce = gs->gop_local_reduce; 1181 while ((map = *reduce++)) 1182 { 1183 base = vals + map[0] * step; 1184 1185 /* wall */ 1186 if (*num == 2) 1187 { 1188 num ++; 1189 PCTFS_rvec_add(base,vals+map[1]*step,step); 1190 } 1191 /* corner shared by three elements */ 1192 else if (*num == 3) 1193 { 1194 num ++; 1195 PCTFS_rvec_add(base,vals+map[1]*step,step); 1196 PCTFS_rvec_add(base,vals+map[2]*step,step); 1197 } 1198 /* corner shared by four elements */ 1199 else if (*num == 4) 1200 { 1201 num ++; 1202 PCTFS_rvec_add(base,vals+map[1]*step,step); 1203 PCTFS_rvec_add(base,vals+map[2]*step,step); 1204 PCTFS_rvec_add(base,vals+map[3]*step,step); 1205 } 1206 /* general case ... odd geoms ... 3D*/ 1207 else 1208 { 1209 num++; 1210 while (*++map >= 0) 1211 {PCTFS_rvec_add(base,vals+*map*step,step);} 1212 } 1213 } 1214 PetscFunctionReturn(0); 1215 } 1216 1217 /******************************************************************************/ 1218 static PetscErrorCode PCTFS_gs_gop_vec_local_out( PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step) 1219 { 1220 PetscInt *num, *map, **reduce; 1221 PetscScalar *base; 1222 1223 PetscFunctionBegin; 1224 num = gs->num_gop_local_reduce; 1225 reduce = gs->gop_local_reduce; 1226 while ((map = *reduce++)) 1227 { 1228 base = vals + map[0] * step; 1229 1230 /* wall */ 1231 if (*num == 2) 1232 { 1233 num ++; 1234 PCTFS_rvec_copy(vals+map[1]*step,base,step); 1235 } 1236 /* corner shared by three elements */ 1237 else if (*num == 3) 1238 { 1239 num ++; 1240 PCTFS_rvec_copy(vals+map[1]*step,base,step); 1241 PCTFS_rvec_copy(vals+map[2]*step,base,step); 1242 } 1243 /* corner shared by four elements */ 1244 else if (*num == 4) 1245 { 1246 num ++; 1247 PCTFS_rvec_copy(vals+map[1]*step,base,step); 1248 PCTFS_rvec_copy(vals+map[2]*step,base,step); 1249 PCTFS_rvec_copy(vals+map[3]*step,base,step); 1250 } 1251 /* general case ... odd geoms ... 3D*/ 1252 else 1253 { 1254 num++; 1255 while (*++map >= 0) 1256 {PCTFS_rvec_copy(vals+*map*step,base,step);} 1257 } 1258 } 1259 PetscFunctionReturn(0); 1260 } 1261 1262 /******************************************************************************/ 1263 static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus( PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step) 1264 { 1265 PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2; 1266 PetscInt *iptr, *msg_list, *msg_size, **msg_nodes; 1267 PetscInt *pw, *list, *size, **nodes; 1268 MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out; 1269 MPI_Status status; 1270 PetscBLASInt i1 = 1,dstep; 1271 PetscErrorCode ierr; 1272 1273 PetscFunctionBegin; 1274 /* strip and load s */ 1275 msg_list =list = gs->pair_list; 1276 msg_size =size = gs->msg_sizes; 1277 msg_nodes=nodes = gs->node_list; 1278 iptr=pw = gs->pw_elm_list; 1279 dptr1=dptr3 = gs->pw_vals; 1280 msg_ids_in = ids_in = gs->msg_ids_in; 1281 msg_ids_out = ids_out = gs->msg_ids_out; 1282 dptr2 = gs->out; 1283 in1=in2 = gs->in; 1284 1285 /* post the receives */ 1286 /* msg_nodes=nodes; */ 1287 do 1288 { 1289 /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the 1290 second one *list and do list++ afterwards */ 1291 ierr = MPI_Irecv(in1, *size *step, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in);CHKERRQ(ierr); 1292 list++;msg_ids_in++; 1293 in1 += *size++ *step; 1294 } 1295 while (*++msg_nodes); 1296 msg_nodes=nodes; 1297 1298 /* load gs values into in out gs buffers */ 1299 while (*iptr >= 0) 1300 { 1301 PCTFS_rvec_copy(dptr3,in_vals + *iptr*step,step); 1302 dptr3+=step; 1303 iptr++; 1304 } 1305 1306 /* load out buffers and post the sends */ 1307 while ((iptr = *msg_nodes++)) 1308 { 1309 dptr3 = dptr2; 1310 while (*iptr >= 0) 1311 { 1312 PCTFS_rvec_copy(dptr2,dptr1 + *iptr*step,step); 1313 dptr2+=step; 1314 iptr++; 1315 } 1316 ierr = MPI_Isend(dptr3, *msg_size *step, MPIU_SCALAR, *msg_list, MSGTAG1+PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out);CHKERRQ(ierr); 1317 msg_size++; msg_list++;msg_ids_out++; 1318 } 1319 1320 /* tree */ 1321 if (gs->max_left_over) 1322 {PCTFS_gs_gop_vec_tree_plus(gs,in_vals,step);} 1323 1324 /* process the received data */ 1325 msg_nodes=nodes; 1326 while ((iptr = *nodes++)){ 1327 PetscScalar d1 = 1.0; 1328 /* Should I check the return value of MPI_Wait() or status? */ 1329 /* Can this loop be replaced by a call to MPI_Waitall()? */ 1330 ierr = MPI_Wait(ids_in, &status);CHKERRQ(ierr); 1331 ids_in++; 1332 while (*iptr >= 0) { 1333 dstep = PetscBLASIntCast(step); 1334 BLASaxpy_(&dstep,&d1,in2,&i1,dptr1 + *iptr*step,&i1); 1335 in2+=step; 1336 iptr++; 1337 } 1338 } 1339 1340 /* replace vals */ 1341 while (*pw >= 0) 1342 { 1343 PCTFS_rvec_copy(in_vals + *pw*step,dptr1,step); 1344 dptr1+=step; 1345 pw++; 1346 } 1347 1348 /* clear isend message handles */ 1349 /* This changed for clarity though it could be the same */ 1350 while (*msg_nodes++) 1351 /* Should I check the return value of MPI_Wait() or status? */ 1352 /* Can this loop be replaced by a call to MPI_Waitall()? */ 1353 {ierr = MPI_Wait(ids_out, &status);CHKERRQ(ierr);ids_out++;} 1354 1355 1356 PetscFunctionReturn(0); 1357 } 1358 1359 /******************************************************************************/ 1360 static PetscErrorCode PCTFS_gs_gop_vec_tree_plus( PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step) 1361 { 1362 PetscInt size, *in, *out; 1363 PetscScalar *buf, *work; 1364 PetscInt op[] = {GL_ADD,0}; 1365 PetscBLASInt i1 = 1; 1366 1367 PetscFunctionBegin; 1368 /* copy over to local variables */ 1369 in = gs->tree_map_in; 1370 out = gs->tree_map_out; 1371 buf = gs->tree_buf; 1372 work = gs->tree_work; 1373 size = gs->tree_nel*step; 1374 1375 /* zero out collection buffer */ 1376 PCTFS_rvec_zero(buf,size); 1377 1378 1379 /* copy over my contributions */ 1380 while (*in >= 0) 1381 { 1382 PetscBLASInt dstep = PetscBLASIntCast(step); 1383 BLAScopy_(&dstep,vals + *in++*step,&i1,buf + *out++*step,&i1); 1384 } 1385 1386 /* perform fan in/out on full buffer */ 1387 /* must change PCTFS_grop to handle the blas */ 1388 PCTFS_grop(buf,work,size,op); 1389 1390 /* reset */ 1391 in = gs->tree_map_in; 1392 out = gs->tree_map_out; 1393 1394 /* get the portion of the results I need */ 1395 while (*in >= 0) 1396 { 1397 PetscBLASInt dstep = PetscBLASIntCast(step); 1398 BLAScopy_(&dstep,buf + *out++*step,&i1,vals + *in++*step,&i1); 1399 } 1400 PetscFunctionReturn(0); 1401 } 1402 1403 /******************************************************************************/ 1404 PetscErrorCode PCTFS_gs_gop_hc( PCTFS_gs_id *gs, PetscScalar *vals, const char *op, PetscInt dim) 1405 { 1406 PetscErrorCode ierr; 1407 1408 PetscFunctionBegin; 1409 switch (*op) { 1410 case '+': 1411 PCTFS_gs_gop_plus_hc(gs,vals,dim); 1412 break; 1413 default: 1414 ierr = PetscInfo1(0,"PCTFS_gs_gop_hc() :: %c is not a valid op",op[0]);CHKERRQ(ierr); 1415 ierr = PetscInfo(0,"PCTFS_gs_gop_hc() :: default :: plus\n");CHKERRQ(ierr); 1416 PCTFS_gs_gop_plus_hc(gs,vals,dim); 1417 break; 1418 } 1419 PetscFunctionReturn(0); 1420 } 1421 1422 /******************************************************************************/ 1423 static PetscErrorCode PCTFS_gs_gop_plus_hc( PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim) 1424 { 1425 PetscFunctionBegin; 1426 /* if there's nothing to do return */ 1427 if (dim<=0) 1428 { PetscFunctionReturn(0);} 1429 1430 /* can't do more dimensions then exist */ 1431 dim = PetscMin(dim,PCTFS_i_log2_num_nodes); 1432 1433 /* local only operations!!! */ 1434 if (gs->num_local) 1435 {PCTFS_gs_gop_local_plus(gs,vals);} 1436 1437 /* if intersection tree/pairwise and local isn't empty */ 1438 if (gs->num_local_gop) 1439 { 1440 PCTFS_gs_gop_local_in_plus(gs,vals); 1441 1442 /* pairwise will do tree inside ... */ 1443 if (gs->num_pairs) 1444 {PCTFS_gs_gop_pairwise_plus_hc(gs,vals,dim);} 1445 1446 /* tree only */ 1447 else if (gs->max_left_over) 1448 {PCTFS_gs_gop_tree_plus_hc(gs,vals,dim);} 1449 1450 PCTFS_gs_gop_local_out(gs,vals); 1451 } 1452 /* if intersection tree/pairwise and local is empty */ 1453 else 1454 { 1455 /* pairwise will do tree inside */ 1456 if (gs->num_pairs) 1457 {PCTFS_gs_gop_pairwise_plus_hc(gs,vals,dim);} 1458 1459 /* tree */ 1460 else if (gs->max_left_over) 1461 {PCTFS_gs_gop_tree_plus_hc(gs,vals,dim);} 1462 } 1463 PetscFunctionReturn(0); 1464 } 1465 1466 /******************************************************************************/ 1467 static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc( PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim) 1468 { 1469 PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2; 1470 PetscInt *iptr, *msg_list, *msg_size, **msg_nodes; 1471 PetscInt *pw, *list, *size, **nodes; 1472 MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out; 1473 MPI_Status status; 1474 PetscInt i, mask=1; 1475 PetscErrorCode ierr; 1476 1477 PetscFunctionBegin; 1478 for (i=1; i<dim; i++) 1479 {mask<<=1; mask++;} 1480 1481 1482 /* strip and load s */ 1483 msg_list =list = gs->pair_list; 1484 msg_size =size = gs->msg_sizes; 1485 msg_nodes=nodes = gs->node_list; 1486 iptr=pw = gs->pw_elm_list; 1487 dptr1=dptr3 = gs->pw_vals; 1488 msg_ids_in = ids_in = gs->msg_ids_in; 1489 msg_ids_out = ids_out = gs->msg_ids_out; 1490 dptr2 = gs->out; 1491 in1=in2 = gs->in; 1492 1493 /* post the receives */ 1494 /* msg_nodes=nodes; */ 1495 do 1496 { 1497 /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the 1498 second one *list and do list++ afterwards */ 1499 if ((PCTFS_my_id|mask)==(*list|mask)) 1500 { 1501 ierr = MPI_Irecv(in1, *size, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in);CHKERRQ(ierr); 1502 list++; msg_ids_in++;in1 += *size++; 1503 } 1504 else 1505 {list++; size++;} 1506 } 1507 while (*++msg_nodes); 1508 1509 /* load gs values into in out gs buffers */ 1510 while (*iptr >= 0) 1511 {*dptr3++ = *(in_vals + *iptr++);} 1512 1513 /* load out buffers and post the sends */ 1514 msg_nodes=nodes; 1515 list = msg_list; 1516 while ((iptr = *msg_nodes++)) 1517 { 1518 if ((PCTFS_my_id|mask)==(*list|mask)) 1519 { 1520 dptr3 = dptr2; 1521 while (*iptr >= 0) 1522 {*dptr2++ = *(dptr1 + *iptr++);} 1523 /* CHECK PERSISTENT COMMS MODE FOR ALL THIS STUFF */ 1524 /* is msg_ids_out++ correct? */ 1525 ierr = MPI_Isend(dptr3, *msg_size, MPIU_SCALAR, *list, MSGTAG1+PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out);CHKERRQ(ierr); 1526 msg_size++;list++;msg_ids_out++; 1527 } 1528 else 1529 {list++; msg_size++;} 1530 } 1531 1532 /* do the tree while we're waiting */ 1533 if (gs->max_left_over) 1534 {PCTFS_gs_gop_tree_plus_hc(gs,in_vals,dim);} 1535 1536 /* process the received data */ 1537 msg_nodes=nodes; 1538 list = msg_list; 1539 while ((iptr = *nodes++)) 1540 { 1541 if ((PCTFS_my_id|mask)==(*list|mask)) 1542 { 1543 /* Should I check the return value of MPI_Wait() or status? */ 1544 /* Can this loop be replaced by a call to MPI_Waitall()? */ 1545 ierr = MPI_Wait(ids_in, &status);CHKERRQ(ierr); 1546 ids_in++; 1547 while (*iptr >= 0) 1548 {*(dptr1 + *iptr++) += *in2++;} 1549 } 1550 list++; 1551 } 1552 1553 /* replace vals */ 1554 while (*pw >= 0) 1555 {*(in_vals + *pw++) = *dptr1++;} 1556 1557 /* clear isend message handles */ 1558 /* This changed for clarity though it could be the same */ 1559 while (*msg_nodes++) 1560 { 1561 if ((PCTFS_my_id|mask)==(*msg_list|mask)) 1562 { 1563 /* Should I check the return value of MPI_Wait() or status? */ 1564 /* Can this loop be replaced by a call to MPI_Waitall()? */ 1565 ierr = MPI_Wait(ids_out, &status);CHKERRQ(ierr); 1566 ids_out++; 1567 } 1568 msg_list++; 1569 } 1570 1571 PetscFunctionReturn(0); 1572 } 1573 1574 /******************************************************************************/ 1575 static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim) 1576 { 1577 PetscInt size; 1578 PetscInt *in, *out; 1579 PetscScalar *buf, *work; 1580 PetscInt op[] = {GL_ADD,0}; 1581 1582 PetscFunctionBegin; 1583 in = gs->tree_map_in; 1584 out = gs->tree_map_out; 1585 buf = gs->tree_buf; 1586 work = gs->tree_work; 1587 size = gs->tree_nel; 1588 1589 PCTFS_rvec_zero(buf,size); 1590 1591 while (*in >= 0) 1592 {*(buf + *out++) = *(vals + *in++);} 1593 1594 in = gs->tree_map_in; 1595 out = gs->tree_map_out; 1596 1597 PCTFS_grop_hc(buf,work,size,op,dim); 1598 1599 while (*in >= 0) 1600 {*(vals + *in++) = *(buf + *out++);} 1601 PetscFunctionReturn(0); 1602 } 1603 1604 1605 1606