1 2 /***********************************gs.c*************************************** 3 4 Author: Henry M. Tufo III 5 6 e-mail: hmt@cs.brown.edu 7 8 snail-mail: 9 Division of Applied Mathematics 10 Brown University 11 Providence, RI 02912 12 13 Last Modification: 14 6.21.97 15 ************************************gs.c**************************************/ 16 17 /***********************************gs.c*************************************** 18 File Description: 19 ----------------- 20 21 ************************************gs.c**************************************/ 22 23 #include <../src/ksp/pc/impls/tfs/tfs.h> 24 25 /* default length of number of items via tree - doubles if exceeded */ 26 #define TREE_BUF_SZ 2048; 27 #define GS_VEC_SZ 1 28 29 30 31 /***********************************gs.c*************************************** 32 Type: struct gather_scatter_id 33 ------------------------------ 34 35 ************************************gs.c**************************************/ 36 typedef struct gather_scatter_id { 37 PetscInt id; 38 PetscInt nel_min; 39 PetscInt nel_max; 40 PetscInt nel_sum; 41 PetscInt negl; 42 PetscInt gl_max; 43 PetscInt gl_min; 44 PetscInt repeats; 45 PetscInt ordered; 46 PetscInt positive; 47 PetscScalar *vals; 48 49 /* bit mask info */ 50 PetscInt *my_proc_mask; 51 PetscInt mask_sz; 52 PetscInt *ngh_buf; 53 PetscInt ngh_buf_sz; 54 PetscInt *nghs; 55 PetscInt num_nghs; 56 PetscInt max_nghs; 57 PetscInt *pw_nghs; 58 PetscInt num_pw_nghs; 59 PetscInt *tree_nghs; 60 PetscInt num_tree_nghs; 61 62 PetscInt num_loads; 63 64 /* repeats == true -> local info */ 65 PetscInt nel; /* number of unique elememts */ 66 PetscInt *elms; /* of size nel */ 67 PetscInt nel_total; 68 PetscInt *local_elms; /* of size nel_total */ 69 PetscInt *companion; /* of size nel_total */ 70 71 /* local info */ 72 PetscInt num_local_total; 73 PetscInt local_strength; 74 PetscInt num_local; 75 PetscInt *num_local_reduce; 76 PetscInt **local_reduce; 77 PetscInt num_local_gop; 78 PetscInt *num_gop_local_reduce; 79 PetscInt **gop_local_reduce; 80 81 /* pairwise info */ 82 PetscInt level; 83 PetscInt num_pairs; 84 PetscInt max_pairs; 85 PetscInt loc_node_pairs; 86 PetscInt max_node_pairs; 87 PetscInt min_node_pairs; 88 PetscInt avg_node_pairs; 89 PetscInt *pair_list; 90 PetscInt *msg_sizes; 91 PetscInt **node_list; 92 PetscInt len_pw_list; 93 PetscInt *pw_elm_list; 94 PetscScalar *pw_vals; 95 96 MPI_Request *msg_ids_in; 97 MPI_Request *msg_ids_out; 98 99 PetscScalar *out; 100 PetscScalar *in; 101 PetscInt msg_total; 102 103 /* tree - crystal accumulator info */ 104 PetscInt max_left_over; 105 PetscInt *pre; 106 PetscInt *in_num; 107 PetscInt *out_num; 108 PetscInt **in_list; 109 PetscInt **out_list; 110 111 /* new tree work*/ 112 PetscInt tree_nel; 113 PetscInt *tree_elms; 114 PetscScalar *tree_buf; 115 PetscScalar *tree_work; 116 117 PetscInt tree_map_sz; 118 PetscInt *tree_map_in; 119 PetscInt *tree_map_out; 120 121 /* current memory status */ 122 PetscInt gl_bss_min; 123 PetscInt gl_perm_min; 124 125 /* max segment size for PCTFS_gs_gop_vec() */ 126 PetscInt vec_sz; 127 128 /* hack to make paul happy */ 129 MPI_Comm PCTFS_gs_comm; 130 131 } PCTFS_gs_id; 132 133 static PCTFS_gs_id *gsi_check_args(PetscInt *elms, PetscInt nel, PetscInt level); 134 static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs); 135 static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs); 136 static PetscErrorCode set_pairwise(PCTFS_gs_id *gs); 137 static PCTFS_gs_id *gsi_new(void); 138 static PetscErrorCode set_tree(PCTFS_gs_id *gs); 139 140 /* same for all but vector flavor */ 141 static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs, PetscScalar *vals); 142 /* vector flavor */ 143 static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step); 144 145 static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step); 146 static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step); 147 static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step); 148 static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step); 149 static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step); 150 151 152 static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs, PetscScalar *vals); 153 static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals); 154 155 static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim); 156 static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim); 157 static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim); 158 159 /* global vars */ 160 /* from comm.c module */ 161 162 static PetscInt num_gs_ids = 0; 163 164 /* should make this dynamic ... later */ 165 static PetscInt msg_buf =MAX_MSG_BUF; 166 static PetscInt vec_sz =GS_VEC_SZ; 167 static PetscInt *tree_buf =NULL; 168 static PetscInt tree_buf_sz=0; 169 static PetscInt ntree =0; 170 171 /***************************************************************************/ 172 PetscErrorCode PCTFS_gs_init_vec_sz(PetscInt size) 173 { 174 PetscFunctionBegin; 175 vec_sz = size; 176 PetscFunctionReturn(0); 177 } 178 179 /******************************************************************************/ 180 PetscErrorCode PCTFS_gs_init_msg_buf_sz(PetscInt buf_size) 181 { 182 PetscFunctionBegin; 183 msg_buf = buf_size; 184 PetscFunctionReturn(0); 185 } 186 187 /******************************************************************************/ 188 PCTFS_gs_id *PCTFS_gs_init(PetscInt *elms, PetscInt nel, PetscInt level) 189 { 190 PCTFS_gs_id *gs; 191 MPI_Group PCTFS_gs_group; 192 MPI_Comm PCTFS_gs_comm; 193 PetscErrorCode ierr; 194 195 PetscFunctionBegin; 196 /* ensure that communication package has been initialized */ 197 PCTFS_comm_init(); 198 199 200 /* determines if we have enough dynamic/semi-static memory */ 201 /* checks input, allocs and sets gd_id template */ 202 gs = gsi_check_args(elms,nel,level); 203 204 /* only bit mask version up and working for the moment */ 205 /* LATER :: get int list version working for sparse pblms */ 206 ierr = gsi_via_bit_mask(gs);CHKERRABORT(PETSC_COMM_WORLD,ierr); 207 208 209 ierr = MPI_Comm_group(MPI_COMM_WORLD,&PCTFS_gs_group);CHKERRABORT(PETSC_COMM_WORLD,ierr); 210 ierr = MPI_Comm_create(MPI_COMM_WORLD,PCTFS_gs_group,&PCTFS_gs_comm);CHKERRABORT(PETSC_COMM_WORLD,ierr); 211 212 gs->PCTFS_gs_comm=PCTFS_gs_comm; 213 214 return(gs); 215 } 216 217 /******************************************************************************/ 218 static PCTFS_gs_id *gsi_new(void) 219 { 220 PetscErrorCode ierr; 221 PCTFS_gs_id *gs; 222 gs = (PCTFS_gs_id*) malloc(sizeof(PCTFS_gs_id)); 223 ierr = PetscMemzero(gs,sizeof(PCTFS_gs_id));CHKERRABORT(PETSC_COMM_WORLD,ierr); 224 return(gs); 225 } 226 227 /******************************************************************************/ 228 static PCTFS_gs_id *gsi_check_args(PetscInt *in_elms, PetscInt nel, PetscInt level) 229 { 230 PetscInt i, j, k, t2; 231 PetscInt *companion, *elms, *unique, *iptr; 232 PetscInt num_local=0, *num_to_reduce, **local_reduce; 233 PetscInt oprs[] = {NON_UNIFORM,GL_MIN,GL_MAX,GL_ADD,GL_MIN,GL_MAX,GL_MIN,GL_B_AND}; 234 PetscInt vals[sizeof(oprs)/sizeof(oprs[0])-1]; 235 PetscInt work[sizeof(oprs)/sizeof(oprs[0])-1]; 236 PCTFS_gs_id *gs; 237 PetscErrorCode ierr; 238 239 240 if (!in_elms) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"elms point to nothing!!!\n"); 241 if (nel<0) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"can't have fewer than 0 elms!!!\n"); 242 243 if (nel==0) { ierr = PetscInfo(0,"I don't have any elements!!!\n");CHKERRABORT(PETSC_COMM_WORLD,ierr); } 244 245 /* get space for gs template */ 246 gs = gsi_new(); 247 gs->id = ++num_gs_ids; 248 249 /* hmt 6.4.99 */ 250 /* caller can set global ids that don't participate to 0 */ 251 /* PCTFS_gs_init ignores all zeros in elm list */ 252 /* negative global ids are still invalid */ 253 for (i=j=0; i<nel; i++) { 254 if (in_elms[i]!=0) j++; 255 } 256 257 k=nel; nel=j; 258 259 /* copy over in_elms list and create inverse map */ 260 elms = (PetscInt*) malloc((nel+1)*sizeof(PetscInt)); 261 companion = (PetscInt*) malloc(nel*sizeof(PetscInt)); 262 263 for (i=j=0; i<k; i++) { 264 if (in_elms[i]!=0) { elms[j] = in_elms[i]; companion[j++] = i; } 265 } 266 267 if (j!=nel) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"nel j mismatch!\n"); 268 269 /* pre-pass ... check to see if sorted */ 270 elms[nel] = INT_MAX; 271 iptr = elms; 272 unique = elms+1; 273 j =0; 274 while (*iptr!=INT_MAX) { 275 if (*iptr++>*unique++) { j=1; break; } 276 } 277 278 /* set up inverse map */ 279 if (j) { 280 ierr = PetscInfo(0,"gsi_check_args() :: elm list *not* sorted!\n");CHKERRABORT(PETSC_COMM_WORLD,ierr); 281 ierr = PCTFS_SMI_sort((void*)elms, (void*)companion, nel, SORT_INTEGER);CHKERRABORT(PETSC_COMM_WORLD,ierr); 282 } else { ierr = PetscInfo(0,"gsi_check_args() :: elm list sorted!\n");CHKERRABORT(PETSC_COMM_WORLD,ierr); } 283 elms[nel] = INT_MIN; 284 285 /* first pass */ 286 /* determine number of unique elements, check pd */ 287 for (i=k=0; i<nel; i+=j) { 288 t2 = elms[i]; 289 j = ++i; 290 291 /* clump 'em for now */ 292 while (elms[j]==t2) j++; 293 294 /* how many together and num local */ 295 if (j-=i) { num_local++; k+=j; } 296 } 297 298 /* how many unique elements? */ 299 gs->repeats = k; 300 gs->nel = nel-k; 301 302 303 /* number of repeats? */ 304 gs->num_local = num_local; 305 num_local += 2; 306 gs->local_reduce = local_reduce=(PetscInt**)malloc(num_local*sizeof(PetscInt*)); 307 gs->num_local_reduce = num_to_reduce=(PetscInt*) malloc(num_local*sizeof(PetscInt)); 308 309 unique = (PetscInt*) malloc((gs->nel+1)*sizeof(PetscInt)); 310 gs->elms = unique; 311 gs->nel_total = nel; 312 gs->local_elms = elms; 313 gs->companion = companion; 314 315 /* compess map as well as keep track of local ops */ 316 for (num_local=i=j=0; i<gs->nel; i++) { 317 k = j; 318 t2 = unique[i] = elms[j]; 319 companion[i] = companion[j]; 320 321 while (elms[j]==t2) j++; 322 323 if ((t2=(j-k))>1) { 324 /* number together */ 325 num_to_reduce[num_local] = t2++; 326 327 iptr = local_reduce[num_local++] = (PetscInt*)malloc(t2*sizeof(PetscInt)); 328 329 /* to use binary searching don't remap until we check intersection */ 330 *iptr++ = i; 331 332 /* note that we're skipping the first one */ 333 while (++k<j) *(iptr++) = companion[k]; 334 *iptr = -1; 335 } 336 } 337 338 /* sentinel for ngh_buf */ 339 unique[gs->nel]=INT_MAX; 340 341 /* for two partition sort hack */ 342 num_to_reduce[num_local] = 0; 343 local_reduce[num_local] = NULL; 344 num_to_reduce[++num_local] = 0; 345 local_reduce[num_local] = NULL; 346 347 /* load 'em up */ 348 /* note one extra to hold NON_UNIFORM flag!!! */ 349 vals[2] = vals[1] = vals[0] = nel; 350 if (gs->nel>0) { 351 vals[3] = unique[0]; 352 vals[4] = unique[gs->nel-1]; 353 } else { 354 vals[3] = INT_MAX; 355 vals[4] = INT_MIN; 356 } 357 vals[5] = level; 358 vals[6] = num_gs_ids; 359 360 /* GLOBAL: send 'em out */ 361 ierr = PCTFS_giop(vals,work,sizeof(oprs)/sizeof(oprs[0])-1,oprs);CHKERRABORT(PETSC_COMM_WORLD,ierr); 362 363 /* must be semi-pos def - only pairwise depends on this */ 364 /* LATER - remove this restriction */ 365 if (vals[3]<0) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system not semi-pos def \n"); 366 if (vals[4]==INT_MAX) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system ub too large !\n"); 367 368 gs->nel_min = vals[0]; 369 gs->nel_max = vals[1]; 370 gs->nel_sum = vals[2]; 371 gs->gl_min = vals[3]; 372 gs->gl_max = vals[4]; 373 gs->negl = vals[4]-vals[3]+1; 374 375 if (gs->negl<=0) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system empty or neg :: %d\n"); 376 377 /* LATER :: add level == -1 -> program selects level */ 378 if (vals[5]<0) vals[5]=0; 379 else if (vals[5]>PCTFS_num_nodes) vals[5]=PCTFS_num_nodes; 380 gs->level = vals[5]; 381 382 return(gs); 383 } 384 385 /******************************************************************************/ 386 static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs) 387 { 388 PetscInt i, nel, *elms; 389 PetscInt t1; 390 PetscInt **reduce; 391 PetscInt *map; 392 PetscErrorCode ierr; 393 394 PetscFunctionBegin; 395 /* totally local removes ... PCTFS_ct_bits == 0 */ 396 get_ngh_buf(gs); 397 398 if (gs->level) set_pairwise(gs); 399 if (gs->max_left_over) set_tree(gs); 400 401 /* intersection local and pairwise/tree? */ 402 gs->num_local_total = gs->num_local; 403 gs->gop_local_reduce = gs->local_reduce; 404 gs->num_gop_local_reduce = gs->num_local_reduce; 405 406 map = gs->companion; 407 408 /* is there any local compression */ 409 if (!gs->num_local) { 410 gs->local_strength = NONE; 411 gs->num_local_gop = 0; 412 } else { 413 /* ok find intersection */ 414 map = gs->companion; 415 reduce = gs->local_reduce; 416 for (i=0, t1=0; i<gs->num_local; i++, reduce++) 417 { 418 if ((PCTFS_ivec_binary_search(**reduce,gs->pw_elm_list,gs->len_pw_list)>=0) 419 || 420 PCTFS_ivec_binary_search(**reduce,gs->tree_map_in,gs->tree_map_sz)>=0) { 421 t1++; 422 if (gs->num_local_reduce[i]<=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"nobody in list?"); 423 gs->num_local_reduce[i] *= -1; 424 } 425 **reduce=map[**reduce]; 426 } 427 428 /* intersection is empty */ 429 if (!t1) { 430 gs->local_strength = FULL; 431 gs->num_local_gop = 0; 432 } else { /* intersection not empty */ 433 gs->local_strength = PARTIAL; 434 435 ierr = PCTFS_SMI_sort((void*)gs->num_local_reduce, (void*)gs->local_reduce, gs->num_local + 1, SORT_INT_PTR);CHKERRQ(ierr); 436 437 gs->num_local_gop = t1; 438 gs->num_local_total = gs->num_local; 439 gs->num_local -= t1; 440 gs->gop_local_reduce = gs->local_reduce; 441 gs->num_gop_local_reduce = gs->num_local_reduce; 442 443 for (i=0; i<t1; i++) { 444 if (gs->num_gop_local_reduce[i]>=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"they aren't negative?"); 445 gs->num_gop_local_reduce[i] *= -1; 446 gs->local_reduce++; 447 gs->num_local_reduce++; 448 } 449 gs->local_reduce++; 450 gs->num_local_reduce++; 451 } 452 } 453 454 elms = gs->pw_elm_list; 455 nel = gs->len_pw_list; 456 for (i=0; i<nel; i++) elms[i] = map[elms[i]]; 457 458 elms = gs->tree_map_in; 459 nel = gs->tree_map_sz; 460 for (i=0; i<nel; i++) elms[i] = map[elms[i]]; 461 462 /* clean up */ 463 free((void*) gs->local_elms); 464 free((void*) gs->companion); 465 free((void*) gs->elms); 466 free((void*) gs->ngh_buf); 467 gs->local_elms = gs->companion = gs->elms = gs->ngh_buf = NULL; 468 PetscFunctionReturn(0); 469 } 470 471 /******************************************************************************/ 472 static PetscErrorCode place_in_tree(PetscInt elm) 473 { 474 PetscInt *tp, n; 475 476 PetscFunctionBegin; 477 if (ntree==tree_buf_sz) { 478 if (tree_buf_sz) { 479 tp = tree_buf; 480 n = tree_buf_sz; 481 tree_buf_sz<<=1; 482 tree_buf = (PetscInt*)malloc(tree_buf_sz*sizeof(PetscInt)); 483 PCTFS_ivec_copy(tree_buf,tp,n); 484 free(tp); 485 } else { 486 tree_buf_sz = TREE_BUF_SZ; 487 tree_buf = (PetscInt*)malloc(tree_buf_sz*sizeof(PetscInt)); 488 } 489 } 490 491 tree_buf[ntree++] = elm; 492 PetscFunctionReturn(0); 493 } 494 495 /******************************************************************************/ 496 static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs) 497 { 498 PetscInt i, j, npw=0, ntree_map=0; 499 PetscInt p_mask_size, ngh_buf_size, buf_size; 500 PetscInt *p_mask, *sh_proc_mask, *pw_sh_proc_mask; 501 PetscInt *ngh_buf, *buf1, *buf2; 502 PetscInt offset, per_load, num_loads, or_ct, start, end; 503 PetscInt *ptr1, *ptr2, i_start, negl, nel, *elms; 504 PetscInt oper=GL_B_OR; 505 PetscInt *ptr3, *t_mask, level, ct1, ct2; 506 PetscErrorCode ierr; 507 508 PetscFunctionBegin; 509 /* to make life easier */ 510 nel = gs->nel; 511 elms = gs->elms; 512 level = gs->level; 513 514 /* det #bytes needed for processor bit masks and init w/mask cor. to PCTFS_my_id */ 515 p_mask = (PetscInt*) malloc(p_mask_size=PCTFS_len_bit_mask(PCTFS_num_nodes)); 516 ierr = PCTFS_set_bit_mask(p_mask,p_mask_size,PCTFS_my_id);CHKERRQ(ierr); 517 518 /* allocate space for masks and info bufs */ 519 gs->nghs = sh_proc_mask = (PetscInt*) malloc(p_mask_size); 520 gs->pw_nghs = pw_sh_proc_mask = (PetscInt*) malloc(p_mask_size); 521 gs->ngh_buf_sz = ngh_buf_size = p_mask_size*nel; 522 t_mask = (PetscInt*) malloc(p_mask_size); 523 gs->ngh_buf = ngh_buf = (PetscInt*) malloc(ngh_buf_size); 524 525 /* comm buffer size ... memory usage bounded by ~2*msg_buf */ 526 /* had thought I could exploit rendezvous threshold */ 527 528 /* default is one pass */ 529 per_load = negl = gs->negl; 530 gs->num_loads = num_loads = 1; 531 i = p_mask_size*negl; 532 533 /* possible overflow on buffer size */ 534 /* overflow hack */ 535 if (i<0) i=INT_MAX; 536 537 buf_size = PetscMin(msg_buf,i); 538 539 /* can we do it? */ 540 if (p_mask_size>buf_size) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"get_ngh_buf() :: buf<pms :: %d>%d\n",p_mask_size,buf_size); 541 542 /* get PCTFS_giop buf space ... make *only* one malloc */ 543 buf1 = (PetscInt*) malloc(buf_size<<1); 544 545 /* more than one gior exchange needed? */ 546 if (buf_size!=i) { 547 per_load = buf_size/p_mask_size; 548 buf_size = per_load*p_mask_size; 549 gs->num_loads = num_loads = negl/per_load + (negl%per_load>0); 550 } 551 552 553 /* convert buf sizes from #bytes to #ints - 32 bit only! */ 554 p_mask_size/=sizeof(PetscInt); ngh_buf_size/=sizeof(PetscInt); buf_size/=sizeof(PetscInt); 555 556 /* find PCTFS_giop work space */ 557 buf2 = buf1+buf_size; 558 559 /* hold #ints needed for processor masks */ 560 gs->mask_sz=p_mask_size; 561 562 /* init buffers */ 563 ierr = PCTFS_ivec_zero(sh_proc_mask,p_mask_size);CHKERRQ(ierr); 564 ierr = PCTFS_ivec_zero(pw_sh_proc_mask,p_mask_size);CHKERRQ(ierr); 565 ierr = PCTFS_ivec_zero(ngh_buf,ngh_buf_size);CHKERRQ(ierr); 566 567 /* HACK reset tree info */ 568 tree_buf = NULL; 569 tree_buf_sz = ntree = 0; 570 571 /* ok do it */ 572 for (ptr1=ngh_buf,ptr2=elms,end=gs->gl_min,or_ct=i=0; or_ct<num_loads; or_ct++) { 573 /* identity for bitwise or is 000...000 */ 574 PCTFS_ivec_zero(buf1,buf_size); 575 576 /* load msg buffer */ 577 for (start=end,end+=per_load,i_start=i; (offset=*ptr2)<end; i++, ptr2++) { 578 offset = (offset-start)*p_mask_size; 579 PCTFS_ivec_copy(buf1+offset,p_mask,p_mask_size); 580 } 581 582 /* GLOBAL: pass buffer */ 583 ierr = PCTFS_giop(buf1,buf2,buf_size,&oper);CHKERRQ(ierr); 584 585 586 /* unload buffer into ngh_buf */ 587 ptr2=(elms+i_start); 588 for (ptr3=buf1,j=start; j<end; ptr3+=p_mask_size,j++) { 589 /* I own it ... may have to pairwise it */ 590 if (j==*ptr2) { 591 /* do i share it w/anyone? */ 592 ct1 = PCTFS_ct_bits((char*)ptr3,p_mask_size*sizeof(PetscInt)); 593 /* guess not */ 594 if (ct1<2) { ptr2++; ptr1+=p_mask_size; continue; } 595 596 /* i do ... so keep info and turn off my bit */ 597 PCTFS_ivec_copy(ptr1,ptr3,p_mask_size); 598 ierr = PCTFS_ivec_xor(ptr1,p_mask,p_mask_size);CHKERRQ(ierr); 599 ierr = PCTFS_ivec_or(sh_proc_mask,ptr1,p_mask_size);CHKERRQ(ierr); 600 601 /* is it to be done pairwise? */ 602 if (--ct1<=level) { 603 npw++; 604 605 /* turn on high bit to indicate pw need to process */ 606 *ptr2++ |= TOP_BIT; 607 ierr = PCTFS_ivec_or(pw_sh_proc_mask,ptr1,p_mask_size);CHKERRQ(ierr); 608 ptr1 += p_mask_size; 609 continue; 610 } 611 612 /* get set for next and note that I have a tree contribution */ 613 /* could save exact elm index for tree here -> save a search */ 614 ptr2++; ptr1+=p_mask_size; ntree_map++; 615 } else { /* i don't but still might be involved in tree */ 616 617 /* shared by how many? */ 618 ct1 = PCTFS_ct_bits((char*)ptr3,p_mask_size*sizeof(PetscInt)); 619 620 /* none! */ 621 if (ct1<2) continue; 622 623 /* is it going to be done pairwise? but not by me of course!*/ 624 if (--ct1<=level) continue; 625 } 626 /* LATER we're going to have to process it NOW */ 627 /* nope ... tree it */ 628 ierr = place_in_tree(j);CHKERRQ(ierr); 629 } 630 } 631 632 free((void*)t_mask); 633 free((void*)buf1); 634 635 gs->len_pw_list = npw; 636 gs->num_nghs = PCTFS_ct_bits((char*)sh_proc_mask,p_mask_size*sizeof(PetscInt)); 637 638 /* expand from bit mask list to int list and save ngh list */ 639 gs->nghs = (PetscInt*) malloc(gs->num_nghs * sizeof(PetscInt)); 640 PCTFS_bm_to_proc((char*)sh_proc_mask,p_mask_size*sizeof(PetscInt),gs->nghs); 641 642 gs->num_pw_nghs = PCTFS_ct_bits((char*)pw_sh_proc_mask,p_mask_size*sizeof(PetscInt)); 643 644 oper = GL_MAX; 645 ct1 = gs->num_nghs; 646 ierr = PCTFS_giop(&ct1,&ct2,1,&oper);CHKERRQ(ierr); 647 gs->max_nghs = ct1; 648 649 gs->tree_map_sz = ntree_map; 650 gs->max_left_over=ntree; 651 652 free((void*)p_mask); 653 free((void*)sh_proc_mask); 654 PetscFunctionReturn(0); 655 } 656 657 /******************************************************************************/ 658 static PetscErrorCode set_pairwise(PCTFS_gs_id *gs) 659 { 660 PetscInt i, j; 661 PetscInt p_mask_size; 662 PetscInt *p_mask, *sh_proc_mask, *tmp_proc_mask; 663 PetscInt *ngh_buf, *buf2; 664 PetscInt offset; 665 PetscInt *msg_list, *msg_size, **msg_nodes, nprs; 666 PetscInt *pairwise_elm_list, len_pair_list=0; 667 PetscInt *iptr, t1, i_start, nel, *elms; 668 PetscInt ct; 669 PetscErrorCode ierr; 670 671 PetscFunctionBegin; 672 /* to make life easier */ 673 nel = gs->nel; 674 elms = gs->elms; 675 ngh_buf = gs->ngh_buf; 676 sh_proc_mask = gs->pw_nghs; 677 678 /* need a few temp masks */ 679 p_mask_size = PCTFS_len_bit_mask(PCTFS_num_nodes); 680 p_mask = (PetscInt*) malloc(p_mask_size); 681 tmp_proc_mask = (PetscInt*) malloc(p_mask_size); 682 683 /* set mask to my PCTFS_my_id's bit mask */ 684 ierr = PCTFS_set_bit_mask(p_mask,p_mask_size,PCTFS_my_id);CHKERRQ(ierr); 685 686 p_mask_size /= sizeof(PetscInt); 687 688 len_pair_list = gs->len_pw_list; 689 gs->pw_elm_list = pairwise_elm_list=(PetscInt*)malloc((len_pair_list+1)*sizeof(PetscInt)); 690 691 /* how many processors (nghs) do we have to exchange with? */ 692 nprs = gs->num_pairs = PCTFS_ct_bits((char*)sh_proc_mask,p_mask_size*sizeof(PetscInt)); 693 694 695 /* allocate space for PCTFS_gs_gop() info */ 696 gs->pair_list = msg_list = (PetscInt*) malloc(sizeof(PetscInt)*nprs); 697 gs->msg_sizes = msg_size = (PetscInt*) malloc(sizeof(PetscInt)*nprs); 698 gs->node_list = msg_nodes = (PetscInt**) malloc(sizeof(PetscInt*)*(nprs+1)); 699 700 /* init msg_size list */ 701 ierr = PCTFS_ivec_zero(msg_size,nprs);CHKERRQ(ierr); 702 703 /* expand from bit mask list to int list */ 704 ierr = PCTFS_bm_to_proc((char*)sh_proc_mask,p_mask_size*sizeof(PetscInt),msg_list);CHKERRQ(ierr); 705 706 /* keep list of elements being handled pairwise */ 707 for (i=j=0; i<nel; i++) { 708 if (elms[i] & TOP_BIT) { elms[i] ^= TOP_BIT; pairwise_elm_list[j++] = i; } 709 } 710 pairwise_elm_list[j] = -1; 711 712 gs->msg_ids_out = (MPI_Request*) malloc(sizeof(MPI_Request)*(nprs+1)); 713 gs->msg_ids_out[nprs] = MPI_REQUEST_NULL; 714 gs->msg_ids_in = (MPI_Request*) malloc(sizeof(MPI_Request)*(nprs+1)); 715 gs->msg_ids_in[nprs] = MPI_REQUEST_NULL; 716 gs->pw_vals = (PetscScalar*) malloc(sizeof(PetscScalar)*len_pair_list*vec_sz); 717 718 /* find who goes to each processor */ 719 for (i_start=i=0; i<nprs; i++) { 720 /* processor i's mask */ 721 ierr = PCTFS_set_bit_mask(p_mask,p_mask_size*sizeof(PetscInt),msg_list[i]);CHKERRQ(ierr); 722 723 /* det # going to processor i */ 724 for (ct=j=0; j<len_pair_list; j++) { 725 buf2 = ngh_buf+(pairwise_elm_list[j]*p_mask_size); 726 ierr = PCTFS_ivec_and3(tmp_proc_mask,p_mask,buf2,p_mask_size);CHKERRQ(ierr); 727 if (PCTFS_ct_bits((char*)tmp_proc_mask,p_mask_size*sizeof(PetscInt))) ct++; 728 } 729 msg_size[i] = ct; 730 i_start = PetscMax(i_start,ct); 731 732 /*space to hold nodes in message to first neighbor */ 733 msg_nodes[i] = iptr = (PetscInt*) malloc(sizeof(PetscInt)*(ct+1)); 734 735 for (j=0;j<len_pair_list;j++) { 736 buf2 = ngh_buf+(pairwise_elm_list[j]*p_mask_size); 737 ierr = PCTFS_ivec_and3(tmp_proc_mask,p_mask,buf2,p_mask_size);CHKERRQ(ierr); 738 if (PCTFS_ct_bits((char*)tmp_proc_mask,p_mask_size*sizeof(PetscInt))) *iptr++ = j; 739 } 740 *iptr = -1; 741 } 742 msg_nodes[nprs] = NULL; 743 744 j = gs->loc_node_pairs=i_start; 745 t1 = GL_MAX; 746 ierr = PCTFS_giop(&i_start,&offset,1,&t1);CHKERRQ(ierr); 747 gs->max_node_pairs = i_start; 748 749 i_start = j; 750 t1 = GL_MIN; 751 ierr = PCTFS_giop(&i_start,&offset,1,&t1);CHKERRQ(ierr); 752 gs->min_node_pairs = i_start; 753 754 i_start = j; 755 t1 = GL_ADD; 756 ierr = PCTFS_giop(&i_start,&offset,1,&t1);CHKERRQ(ierr); 757 gs->avg_node_pairs = i_start/PCTFS_num_nodes + 1; 758 759 i_start = nprs; 760 t1 = GL_MAX; 761 PCTFS_giop(&i_start,&offset,1,&t1); 762 gs->max_pairs = i_start; 763 764 765 /* remap pairwise in tail of gsi_via_bit_mask() */ 766 gs->msg_total = PCTFS_ivec_sum(gs->msg_sizes,nprs); 767 gs->out = (PetscScalar*) malloc(sizeof(PetscScalar)*gs->msg_total*vec_sz); 768 gs->in = (PetscScalar*) malloc(sizeof(PetscScalar)*gs->msg_total*vec_sz); 769 770 /* reset malloc pool */ 771 free((void*)p_mask); 772 free((void*)tmp_proc_mask); 773 PetscFunctionReturn(0); 774 } 775 776 /* to do pruned tree just save ngh buf copy for each one and decode here! 777 ******************************************************************************/ 778 static PetscErrorCode set_tree(PCTFS_gs_id *gs) 779 { 780 PetscInt i, j, n, nel; 781 PetscInt *iptr_in, *iptr_out, *tree_elms, *elms; 782 783 PetscFunctionBegin; 784 /* local work ptrs */ 785 elms = gs->elms; 786 nel = gs->nel; 787 788 /* how many via tree */ 789 gs->tree_nel = n = ntree; 790 gs->tree_elms = tree_elms = iptr_in = tree_buf; 791 gs->tree_buf = (PetscScalar*) malloc(sizeof(PetscScalar)*n*vec_sz); 792 gs->tree_work = (PetscScalar*) malloc(sizeof(PetscScalar)*n*vec_sz); 793 j = gs->tree_map_sz; 794 gs->tree_map_in = iptr_in = (PetscInt*) malloc(sizeof(PetscInt)*(j+1)); 795 gs->tree_map_out = iptr_out = (PetscInt*) malloc(sizeof(PetscInt)*(j+1)); 796 797 /* search the longer of the two lists */ 798 /* note ... could save this info in get_ngh_buf and save searches */ 799 if (n<=nel) { 800 /* bijective fct w/remap - search elm list */ 801 for (i=0; i<n; i++) { 802 if ((j=PCTFS_ivec_binary_search(*tree_elms++,elms,nel))>=0) {*iptr_in++ = j; *iptr_out++ = i;} 803 } 804 } else { 805 for (i=0; i<nel; i++) { 806 if ((j=PCTFS_ivec_binary_search(*elms++,tree_elms,n))>=0) {*iptr_in++ = i; *iptr_out++ = j;} 807 } 808 } 809 810 /* sentinel */ 811 *iptr_in = *iptr_out = -1; 812 PetscFunctionReturn(0); 813 } 814 815 /******************************************************************************/ 816 static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs, PetscScalar *vals) 817 { 818 PetscInt *num, *map, **reduce; 819 PetscScalar tmp; 820 821 PetscFunctionBegin; 822 num = gs->num_gop_local_reduce; 823 reduce = gs->gop_local_reduce; 824 while ((map = *reduce++)) { 825 /* wall */ 826 if (*num == 2) { 827 num++; 828 vals[map[1]] = vals[map[0]]; 829 } else if (*num == 3) { /* corner shared by three elements */ 830 num++; 831 vals[map[2]] = vals[map[1]] = vals[map[0]]; 832 } else if (*num == 4) { /* corner shared by four elements */ 833 num++; 834 vals[map[3]] = vals[map[2]] = vals[map[1]] = vals[map[0]]; 835 } else { /* general case ... odd geoms ... 3D*/ 836 num++; 837 tmp = *(vals + *map++); 838 while (*map >= 0) *(vals + *map++) = tmp; 839 } 840 } 841 PetscFunctionReturn(0); 842 } 843 844 /******************************************************************************/ 845 static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs, PetscScalar *vals) 846 { 847 PetscInt *num, *map, **reduce; 848 PetscScalar tmp; 849 850 PetscFunctionBegin; 851 num = gs->num_local_reduce; 852 reduce = gs->local_reduce; 853 while ((map = *reduce)) { 854 /* wall */ 855 if (*num == 2) { 856 num++; reduce++; 857 vals[map[1]] = vals[map[0]] += vals[map[1]]; 858 } else if (*num == 3) { /* corner shared by three elements */ 859 num++; reduce++; 860 vals[map[2]]=vals[map[1]]=vals[map[0]]+=(vals[map[1]]+vals[map[2]]); 861 } else if (*num == 4) { /* corner shared by four elements */ 862 num++; reduce++; 863 vals[map[1]]=vals[map[2]]=vals[map[3]]=vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]); 864 } else { /* general case ... odd geoms ... 3D*/ 865 num++; 866 tmp = 0.0; 867 while (*map >= 0) tmp += *(vals + *map++); 868 869 map = *reduce++; 870 while (*map >= 0) *(vals + *map++) = tmp; 871 } 872 } 873 PetscFunctionReturn(0); 874 } 875 876 /******************************************************************************/ 877 static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals) 878 { 879 PetscInt *num, *map, **reduce; 880 PetscScalar *base; 881 882 PetscFunctionBegin; 883 num = gs->num_gop_local_reduce; 884 reduce = gs->gop_local_reduce; 885 while ((map = *reduce++)) { 886 /* wall */ 887 if (*num == 2) { 888 num++; 889 vals[map[0]] += vals[map[1]]; 890 } else if (*num == 3) { /* corner shared by three elements */ 891 num++; 892 vals[map[0]] += (vals[map[1]] + vals[map[2]]); 893 } else if (*num == 4) { /* corner shared by four elements */ 894 num++; 895 vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]); 896 } else { /* general case ... odd geoms ... 3D*/ 897 num++; 898 base = vals + *map++; 899 while (*map >= 0) *base += *(vals + *map++); 900 } 901 } 902 PetscFunctionReturn(0); 903 } 904 905 /******************************************************************************/ 906 PetscErrorCode PCTFS_gs_free(PCTFS_gs_id *gs) 907 { 908 PetscInt i; 909 910 PetscFunctionBegin; 911 if (gs->nghs) free((void*) gs->nghs); 912 if (gs->pw_nghs) free((void*) gs->pw_nghs); 913 914 /* tree */ 915 if (gs->max_left_over) { 916 if (gs->tree_elms) free((void*) gs->tree_elms); 917 if (gs->tree_buf) free((void*) gs->tree_buf); 918 if (gs->tree_work) free((void*) gs->tree_work); 919 if (gs->tree_map_in) free((void*) gs->tree_map_in); 920 if (gs->tree_map_out) free((void*) gs->tree_map_out); 921 } 922 923 /* pairwise info */ 924 if (gs->num_pairs) { 925 /* should be NULL already */ 926 if (gs->ngh_buf) free((void*) gs->ngh_buf); 927 if (gs->elms) free((void*) gs->elms); 928 if (gs->local_elms) free((void*) gs->local_elms); 929 if (gs->companion) free((void*) gs->companion); 930 931 /* only set if pairwise */ 932 if (gs->vals) free((void*) gs->vals); 933 if (gs->in) free((void*) gs->in); 934 if (gs->out) free((void*) gs->out); 935 if (gs->msg_ids_in) free((void*) gs->msg_ids_in); 936 if (gs->msg_ids_out) free((void*) gs->msg_ids_out); 937 if (gs->pw_vals) free((void*) gs->pw_vals); 938 if (gs->pw_elm_list) free((void*) gs->pw_elm_list); 939 if (gs->node_list) { 940 for (i=0;i<gs->num_pairs;i++) { 941 if (gs->node_list[i]) { 942 free((void*) gs->node_list[i]); 943 } 944 } 945 free((void*) gs->node_list); 946 } 947 if (gs->msg_sizes) free((void*) gs->msg_sizes); 948 if (gs->pair_list) free((void*) gs->pair_list); 949 } 950 951 /* local info */ 952 if (gs->num_local_total>=0) { 953 /* for (i=0;i<gs->num_local_total;i++) */ 954 for (i=0;i<gs->num_local_total+1;i++) { 955 if (gs->num_gop_local_reduce[i]) free((void*) gs->gop_local_reduce[i]); 956 } 957 } 958 959 /* if intersection tree/pairwise and local isn't empty */ 960 if (gs->gop_local_reduce) free((void*) gs->gop_local_reduce); 961 if (gs->num_gop_local_reduce) free((void*) gs->num_gop_local_reduce); 962 963 free((void*) gs); 964 PetscFunctionReturn(0); 965 } 966 967 /******************************************************************************/ 968 PetscErrorCode PCTFS_gs_gop_vec(PCTFS_gs_id *gs, PetscScalar *vals, const char *op, PetscInt step) 969 { 970 PetscErrorCode ierr; 971 972 PetscFunctionBegin; 973 switch (*op) { 974 case '+': 975 PCTFS_gs_gop_vec_plus(gs,vals,step); 976 break; 977 default: 978 ierr = PetscInfo1(0,"PCTFS_gs_gop_vec() :: %c is not a valid op\n",op[0]);CHKERRQ(ierr); 979 ierr = PetscInfo(0,"PCTFS_gs_gop_vec() :: default :: plus\n");CHKERRQ(ierr); 980 PCTFS_gs_gop_vec_plus(gs,vals,step); 981 break; 982 } 983 PetscFunctionReturn(0); 984 } 985 986 /******************************************************************************/ 987 static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step) 988 { 989 PetscFunctionBegin; 990 if (!gs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"PCTFS_gs_gop_vec() passed NULL gs handle!!!"); 991 992 /* local only operations!!! */ 993 if (gs->num_local) PCTFS_gs_gop_vec_local_plus(gs,vals,step); 994 995 /* if intersection tree/pairwise and local isn't empty */ 996 if (gs->num_local_gop) { 997 PCTFS_gs_gop_vec_local_in_plus(gs,vals,step); 998 999 /* pairwise */ 1000 if (gs->num_pairs) PCTFS_gs_gop_vec_pairwise_plus(gs,vals,step); 1001 1002 /* tree */ 1003 else if (gs->max_left_over) PCTFS_gs_gop_vec_tree_plus(gs,vals,step); 1004 1005 PCTFS_gs_gop_vec_local_out(gs,vals,step); 1006 } else { /* if intersection tree/pairwise and local is empty */ 1007 /* pairwise */ 1008 if (gs->num_pairs) PCTFS_gs_gop_vec_pairwise_plus(gs,vals,step); 1009 1010 /* tree */ 1011 else if (gs->max_left_over) PCTFS_gs_gop_vec_tree_plus(gs,vals,step); 1012 } 1013 PetscFunctionReturn(0); 1014 } 1015 1016 /******************************************************************************/ 1017 static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step) 1018 { 1019 PetscInt *num, *map, **reduce; 1020 PetscScalar *base; 1021 1022 PetscFunctionBegin; 1023 num = gs->num_local_reduce; 1024 reduce = gs->local_reduce; 1025 while ((map = *reduce)) { 1026 base = vals + map[0] * step; 1027 1028 /* wall */ 1029 if (*num == 2) { 1030 num++; reduce++; 1031 PCTFS_rvec_add (base,vals+map[1]*step,step); 1032 PCTFS_rvec_copy(vals+map[1]*step,base,step); 1033 } else if (*num == 3) { /* corner shared by three elements */ 1034 num++; reduce++; 1035 PCTFS_rvec_add (base,vals+map[1]*step,step); 1036 PCTFS_rvec_add (base,vals+map[2]*step,step); 1037 PCTFS_rvec_copy(vals+map[2]*step,base,step); 1038 PCTFS_rvec_copy(vals+map[1]*step,base,step); 1039 } else if (*num == 4) { /* corner shared by four elements */ 1040 num++; reduce++; 1041 PCTFS_rvec_add (base,vals+map[1]*step,step); 1042 PCTFS_rvec_add (base,vals+map[2]*step,step); 1043 PCTFS_rvec_add (base,vals+map[3]*step,step); 1044 PCTFS_rvec_copy(vals+map[3]*step,base,step); 1045 PCTFS_rvec_copy(vals+map[2]*step,base,step); 1046 PCTFS_rvec_copy(vals+map[1]*step,base,step); 1047 } else { /* general case ... odd geoms ... 3D */ 1048 num++; 1049 while (*++map >= 0) PCTFS_rvec_add (base,vals+*map*step,step); 1050 1051 map = *reduce; 1052 while (*++map >= 0) PCTFS_rvec_copy(vals+*map*step,base,step); 1053 1054 reduce++; 1055 } 1056 } 1057 PetscFunctionReturn(0); 1058 } 1059 1060 /******************************************************************************/ 1061 static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step) 1062 { 1063 PetscInt *num, *map, **reduce; 1064 PetscScalar *base; 1065 1066 PetscFunctionBegin; 1067 num = gs->num_gop_local_reduce; 1068 reduce = gs->gop_local_reduce; 1069 while ((map = *reduce++)) { 1070 base = vals + map[0] * step; 1071 1072 /* wall */ 1073 if (*num == 2) { 1074 num++; 1075 PCTFS_rvec_add(base,vals+map[1]*step,step); 1076 } else if (*num == 3) { /* corner shared by three elements */ 1077 num++; 1078 PCTFS_rvec_add(base,vals+map[1]*step,step); 1079 PCTFS_rvec_add(base,vals+map[2]*step,step); 1080 } else if (*num == 4) { /* corner shared by four elements */ 1081 num++; 1082 PCTFS_rvec_add(base,vals+map[1]*step,step); 1083 PCTFS_rvec_add(base,vals+map[2]*step,step); 1084 PCTFS_rvec_add(base,vals+map[3]*step,step); 1085 } else { /* general case ... odd geoms ... 3D*/ 1086 num++; 1087 while (*++map >= 0) PCTFS_rvec_add(base,vals+*map*step,step); 1088 } 1089 } 1090 PetscFunctionReturn(0); 1091 } 1092 1093 /******************************************************************************/ 1094 static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step) 1095 { 1096 PetscInt *num, *map, **reduce; 1097 PetscScalar *base; 1098 1099 PetscFunctionBegin; 1100 num = gs->num_gop_local_reduce; 1101 reduce = gs->gop_local_reduce; 1102 while ((map = *reduce++)) { 1103 base = vals + map[0] * step; 1104 1105 /* wall */ 1106 if (*num == 2) { 1107 num++; 1108 PCTFS_rvec_copy(vals+map[1]*step,base,step); 1109 } else if (*num == 3) { /* corner shared by three elements */ 1110 num++; 1111 PCTFS_rvec_copy(vals+map[1]*step,base,step); 1112 PCTFS_rvec_copy(vals+map[2]*step,base,step); 1113 } else if (*num == 4) { /* corner shared by four elements */ 1114 num++; 1115 PCTFS_rvec_copy(vals+map[1]*step,base,step); 1116 PCTFS_rvec_copy(vals+map[2]*step,base,step); 1117 PCTFS_rvec_copy(vals+map[3]*step,base,step); 1118 } else { /* general case ... odd geoms ... 3D*/ 1119 num++; 1120 while (*++map >= 0) PCTFS_rvec_copy(vals+*map*step,base,step); 1121 } 1122 } 1123 PetscFunctionReturn(0); 1124 } 1125 1126 /******************************************************************************/ 1127 static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step) 1128 { 1129 PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2; 1130 PetscInt *iptr, *msg_list, *msg_size, **msg_nodes; 1131 PetscInt *pw, *list, *size, **nodes; 1132 MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out; 1133 MPI_Status status; 1134 PetscBLASInt i1 = 1,dstep; 1135 PetscErrorCode ierr; 1136 1137 PetscFunctionBegin; 1138 /* strip and load s */ 1139 msg_list = list = gs->pair_list; 1140 msg_size = size = gs->msg_sizes; 1141 msg_nodes = nodes = gs->node_list; 1142 iptr = pw = gs->pw_elm_list; 1143 dptr1 = dptr3 = gs->pw_vals; 1144 msg_ids_in = ids_in = gs->msg_ids_in; 1145 msg_ids_out = ids_out = gs->msg_ids_out; 1146 dptr2 = gs->out; 1147 in1=in2 = gs->in; 1148 1149 /* post the receives */ 1150 /* msg_nodes=nodes; */ 1151 do { 1152 /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the 1153 second one *list and do list++ afterwards */ 1154 ierr = MPI_Irecv(in1, *size *step, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in);CHKERRQ(ierr); 1155 list++;msg_ids_in++; 1156 in1 += *size++ *step; 1157 } while (*++msg_nodes); 1158 msg_nodes=nodes; 1159 1160 /* load gs values into in out gs buffers */ 1161 while (*iptr >= 0) { 1162 PCTFS_rvec_copy(dptr3,in_vals + *iptr*step,step); 1163 dptr3+=step; 1164 iptr++; 1165 } 1166 1167 /* load out buffers and post the sends */ 1168 while ((iptr = *msg_nodes++)) { 1169 dptr3 = dptr2; 1170 while (*iptr >= 0) { 1171 PCTFS_rvec_copy(dptr2,dptr1 + *iptr*step,step); 1172 dptr2+=step; 1173 iptr++; 1174 } 1175 ierr = MPI_Isend(dptr3, *msg_size *step, MPIU_SCALAR, *msg_list, MSGTAG1+PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out);CHKERRQ(ierr); 1176 msg_size++; msg_list++;msg_ids_out++; 1177 } 1178 1179 /* tree */ 1180 if (gs->max_left_over) PCTFS_gs_gop_vec_tree_plus(gs,in_vals,step); 1181 1182 /* process the received data */ 1183 msg_nodes=nodes; 1184 while ((iptr = *nodes++)) { 1185 PetscScalar d1 = 1.0; 1186 1187 /* Should I check the return value of MPI_Wait() or status? */ 1188 /* Can this loop be replaced by a call to MPI_Waitall()? */ 1189 ierr = MPI_Wait(ids_in, &status);CHKERRQ(ierr); 1190 ids_in++; 1191 while (*iptr >= 0) { 1192 ierr = PetscBLASIntCast(step,&dstep);CHKERRQ(ierr); 1193 PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&dstep,&d1,in2,&i1,dptr1 + *iptr*step,&i1)); 1194 in2+=step; 1195 iptr++; 1196 } 1197 } 1198 1199 /* replace vals */ 1200 while (*pw >= 0) { 1201 PCTFS_rvec_copy(in_vals + *pw*step,dptr1,step); 1202 dptr1+=step; 1203 pw++; 1204 } 1205 1206 /* clear isend message handles */ 1207 /* This changed for clarity though it could be the same */ 1208 1209 /* Should I check the return value of MPI_Wait() or status? */ 1210 /* Can this loop be replaced by a call to MPI_Waitall()? */ 1211 while (*msg_nodes++) { 1212 ierr = MPI_Wait(ids_out, &status);CHKERRQ(ierr); 1213 ids_out++; 1214 } 1215 PetscFunctionReturn(0); 1216 } 1217 1218 /******************************************************************************/ 1219 static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step) 1220 { 1221 PetscInt size, *in, *out; 1222 PetscScalar *buf, *work; 1223 PetscInt op[] = {GL_ADD,0}; 1224 PetscBLASInt i1 = 1; 1225 PetscErrorCode ierr; 1226 PetscBLASInt dstep; 1227 1228 PetscFunctionBegin; 1229 /* copy over to local variables */ 1230 in = gs->tree_map_in; 1231 out = gs->tree_map_out; 1232 buf = gs->tree_buf; 1233 work = gs->tree_work; 1234 size = gs->tree_nel*step; 1235 1236 /* zero out collection buffer */ 1237 PCTFS_rvec_zero(buf,size); 1238 1239 1240 /* copy over my contributions */ 1241 while (*in >= 0) { 1242 ierr = PetscBLASIntCast(step,&dstep);CHKERRQ(ierr); 1243 PetscStackCallBLAS("BLAScopy",BLAScopy_(&dstep,vals + *in++ * step,&i1,buf + *out++ * step,&i1)); 1244 } 1245 1246 /* perform fan in/out on full buffer */ 1247 /* must change PCTFS_grop to handle the blas */ 1248 PCTFS_grop(buf,work,size,op); 1249 1250 /* reset */ 1251 in = gs->tree_map_in; 1252 out = gs->tree_map_out; 1253 1254 /* get the portion of the results I need */ 1255 while (*in >= 0) { 1256 ierr = PetscBLASIntCast(step,&dstep);CHKERRQ(ierr); 1257 PetscStackCallBLAS("BLAScopy",BLAScopy_(&dstep,buf + *out++ * step,&i1,vals + *in++ * step,&i1)); 1258 } 1259 PetscFunctionReturn(0); 1260 } 1261 1262 /******************************************************************************/ 1263 PetscErrorCode PCTFS_gs_gop_hc(PCTFS_gs_id *gs, PetscScalar *vals, const char *op, PetscInt dim) 1264 { 1265 PetscErrorCode ierr; 1266 1267 PetscFunctionBegin; 1268 switch (*op) { 1269 case '+': 1270 PCTFS_gs_gop_plus_hc(gs,vals,dim); 1271 break; 1272 default: 1273 ierr = PetscInfo1(0,"PCTFS_gs_gop_hc() :: %c is not a valid op\n",op[0]);CHKERRQ(ierr); 1274 ierr = PetscInfo(0,"PCTFS_gs_gop_hc() :: default :: plus\n");CHKERRQ(ierr); 1275 PCTFS_gs_gop_plus_hc(gs,vals,dim); 1276 break; 1277 } 1278 PetscFunctionReturn(0); 1279 } 1280 1281 /******************************************************************************/ 1282 static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim) 1283 { 1284 PetscFunctionBegin; 1285 /* if there's nothing to do return */ 1286 if (dim<=0) PetscFunctionReturn(0); 1287 1288 /* can't do more dimensions then exist */ 1289 dim = PetscMin(dim,PCTFS_i_log2_num_nodes); 1290 1291 /* local only operations!!! */ 1292 if (gs->num_local) PCTFS_gs_gop_local_plus(gs,vals); 1293 1294 /* if intersection tree/pairwise and local isn't empty */ 1295 if (gs->num_local_gop) { 1296 PCTFS_gs_gop_local_in_plus(gs,vals); 1297 1298 /* pairwise will do tree inside ... */ 1299 if (gs->num_pairs) PCTFS_gs_gop_pairwise_plus_hc(gs,vals,dim); /* tree only */ 1300 else if (gs->max_left_over) PCTFS_gs_gop_tree_plus_hc(gs,vals,dim); 1301 1302 PCTFS_gs_gop_local_out(gs,vals); 1303 } else { /* if intersection tree/pairwise and local is empty */ 1304 /* pairwise will do tree inside */ 1305 if (gs->num_pairs) PCTFS_gs_gop_pairwise_plus_hc(gs,vals,dim); /* tree */ 1306 else if (gs->max_left_over) PCTFS_gs_gop_tree_plus_hc(gs,vals,dim); 1307 } 1308 PetscFunctionReturn(0); 1309 } 1310 1311 /******************************************************************************/ 1312 static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim) 1313 { 1314 PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2; 1315 PetscInt *iptr, *msg_list, *msg_size, **msg_nodes; 1316 PetscInt *pw, *list, *size, **nodes; 1317 MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out; 1318 MPI_Status status; 1319 PetscInt i, mask=1; 1320 PetscErrorCode ierr; 1321 1322 PetscFunctionBegin; 1323 for (i=1; i<dim; i++) { mask<<=1; mask++; } 1324 1325 /* strip and load s */ 1326 msg_list = list = gs->pair_list; 1327 msg_size = size = gs->msg_sizes; 1328 msg_nodes = nodes = gs->node_list; 1329 iptr = pw = gs->pw_elm_list; 1330 dptr1 = dptr3 = gs->pw_vals; 1331 msg_ids_in = ids_in = gs->msg_ids_in; 1332 msg_ids_out = ids_out = gs->msg_ids_out; 1333 dptr2 = gs->out; 1334 in1 = in2 = gs->in; 1335 1336 /* post the receives */ 1337 /* msg_nodes=nodes; */ 1338 do { 1339 /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the 1340 second one *list and do list++ afterwards */ 1341 if ((PCTFS_my_id|mask)==(*list|mask)) { 1342 ierr = MPI_Irecv(in1, *size, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in);CHKERRQ(ierr); 1343 list++; msg_ids_in++;in1 += *size++; 1344 } else { list++; size++; } 1345 } while (*++msg_nodes); 1346 1347 /* load gs values into in out gs buffers */ 1348 while (*iptr >= 0) *dptr3++ = *(in_vals + *iptr++); 1349 1350 /* load out buffers and post the sends */ 1351 msg_nodes=nodes; 1352 list = msg_list; 1353 while ((iptr = *msg_nodes++)) { 1354 if ((PCTFS_my_id|mask)==(*list|mask)) { 1355 dptr3 = dptr2; 1356 while (*iptr >= 0) *dptr2++ = *(dptr1 + *iptr++); 1357 /* CHECK PERSISTENT COMMS MODE FOR ALL THIS STUFF */ 1358 /* is msg_ids_out++ correct? */ 1359 ierr = MPI_Isend(dptr3, *msg_size, MPIU_SCALAR, *list, MSGTAG1+PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out);CHKERRQ(ierr); 1360 msg_size++;list++;msg_ids_out++; 1361 } else {list++; msg_size++;} 1362 } 1363 1364 /* do the tree while we're waiting */ 1365 if (gs->max_left_over) PCTFS_gs_gop_tree_plus_hc(gs,in_vals,dim); 1366 1367 /* process the received data */ 1368 msg_nodes=nodes; 1369 list = msg_list; 1370 while ((iptr = *nodes++)) { 1371 if ((PCTFS_my_id|mask)==(*list|mask)) { 1372 /* Should I check the return value of MPI_Wait() or status? */ 1373 /* Can this loop be replaced by a call to MPI_Waitall()? */ 1374 ierr = MPI_Wait(ids_in, &status);CHKERRQ(ierr); 1375 ids_in++; 1376 while (*iptr >= 0) *(dptr1 + *iptr++) += *in2++; 1377 } 1378 list++; 1379 } 1380 1381 /* replace vals */ 1382 while (*pw >= 0) *(in_vals + *pw++) = *dptr1++; 1383 1384 /* clear isend message handles */ 1385 /* This changed for clarity though it could be the same */ 1386 while (*msg_nodes++) { 1387 if ((PCTFS_my_id|mask)==(*msg_list|mask)) { 1388 /* Should I check the return value of MPI_Wait() or status? */ 1389 /* Can this loop be replaced by a call to MPI_Waitall()? */ 1390 ierr = MPI_Wait(ids_out, &status);CHKERRQ(ierr); 1391 ids_out++; 1392 } 1393 msg_list++; 1394 } 1395 PetscFunctionReturn(0); 1396 } 1397 1398 /******************************************************************************/ 1399 static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim) 1400 { 1401 PetscInt size; 1402 PetscInt *in, *out; 1403 PetscScalar *buf, *work; 1404 PetscInt op[] = {GL_ADD,0}; 1405 1406 PetscFunctionBegin; 1407 in = gs->tree_map_in; 1408 out = gs->tree_map_out; 1409 buf = gs->tree_buf; 1410 work = gs->tree_work; 1411 size = gs->tree_nel; 1412 1413 PCTFS_rvec_zero(buf,size); 1414 1415 while (*in >= 0) *(buf + *out++) = *(vals + *in++); 1416 1417 in = gs->tree_map_in; 1418 out = gs->tree_map_out; 1419 1420 PCTFS_grop_hc(buf,work,size,op,dim); 1421 1422 while (*in >= 0) *(vals + *in++) = *(buf + *out++); 1423 PetscFunctionReturn(0); 1424 } 1425 1426 1427 1428