1 2 /***********************************gs.c*************************************** 3 4 Author: Henry M. Tufo III 5 6 e-mail: hmt@cs.brown.edu 7 8 snail-mail: 9 Division of Applied Mathematics 10 Brown University 11 Providence, RI 02912 12 13 Last Modification: 14 6.21.97 15 ************************************gs.c**************************************/ 16 17 /***********************************gs.c*************************************** 18 File Description: 19 ----------------- 20 21 ************************************gs.c**************************************/ 22 23 #include <../src/ksp/pc/impls/tfs/tfs.h> 24 25 /* default length of number of items via tree - doubles if exceeded */ 26 #define TREE_BUF_SZ 2048; 27 #define GS_VEC_SZ 1 28 29 /***********************************gs.c*************************************** 30 Type: struct gather_scatter_id 31 ------------------------------ 32 33 ************************************gs.c**************************************/ 34 typedef struct gather_scatter_id { 35 PetscInt id; 36 PetscInt nel_min; 37 PetscInt nel_max; 38 PetscInt nel_sum; 39 PetscInt negl; 40 PetscInt gl_max; 41 PetscInt gl_min; 42 PetscInt repeats; 43 PetscInt ordered; 44 PetscInt positive; 45 PetscScalar *vals; 46 47 /* bit mask info */ 48 PetscInt *my_proc_mask; 49 PetscInt mask_sz; 50 PetscInt *ngh_buf; 51 PetscInt ngh_buf_sz; 52 PetscInt *nghs; 53 PetscInt num_nghs; 54 PetscInt max_nghs; 55 PetscInt *pw_nghs; 56 PetscInt num_pw_nghs; 57 PetscInt *tree_nghs; 58 PetscInt num_tree_nghs; 59 60 PetscInt num_loads; 61 62 /* repeats == true -> local info */ 63 PetscInt nel; /* number of unique elememts */ 64 PetscInt *elms; /* of size nel */ 65 PetscInt nel_total; 66 PetscInt *local_elms; /* of size nel_total */ 67 PetscInt *companion; /* of size nel_total */ 68 69 /* local info */ 70 PetscInt num_local_total; 71 PetscInt local_strength; 72 PetscInt num_local; 73 PetscInt *num_local_reduce; 74 PetscInt **local_reduce; 75 PetscInt num_local_gop; 76 PetscInt *num_gop_local_reduce; 77 PetscInt **gop_local_reduce; 78 79 /* pairwise info */ 80 PetscInt level; 81 PetscInt num_pairs; 82 PetscInt max_pairs; 83 PetscInt loc_node_pairs; 84 PetscInt max_node_pairs; 85 PetscInt min_node_pairs; 86 PetscInt avg_node_pairs; 87 PetscInt *pair_list; 88 PetscInt *msg_sizes; 89 PetscInt **node_list; 90 PetscInt len_pw_list; 91 PetscInt *pw_elm_list; 92 PetscScalar *pw_vals; 93 94 MPI_Request *msg_ids_in; 95 MPI_Request *msg_ids_out; 96 97 PetscScalar *out; 98 PetscScalar *in; 99 PetscInt msg_total; 100 101 /* tree - crystal accumulator info */ 102 PetscInt max_left_over; 103 PetscInt *pre; 104 PetscInt *in_num; 105 PetscInt *out_num; 106 PetscInt **in_list; 107 PetscInt **out_list; 108 109 /* new tree work*/ 110 PetscInt tree_nel; 111 PetscInt *tree_elms; 112 PetscScalar *tree_buf; 113 PetscScalar *tree_work; 114 115 PetscInt tree_map_sz; 116 PetscInt *tree_map_in; 117 PetscInt *tree_map_out; 118 119 /* current memory status */ 120 PetscInt gl_bss_min; 121 PetscInt gl_perm_min; 122 123 /* max segment size for PCTFS_gs_gop_vec() */ 124 PetscInt vec_sz; 125 126 /* hack to make paul happy */ 127 MPI_Comm PCTFS_gs_comm; 128 129 } PCTFS_gs_id; 130 131 static PCTFS_gs_id *gsi_check_args(PetscInt *elms, PetscInt nel, PetscInt level); 132 static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs); 133 static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs); 134 static PetscErrorCode set_pairwise(PCTFS_gs_id *gs); 135 static PCTFS_gs_id *gsi_new(void); 136 static PetscErrorCode set_tree(PCTFS_gs_id *gs); 137 138 /* same for all but vector flavor */ 139 static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs, PetscScalar *vals); 140 /* vector flavor */ 141 static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step); 142 143 static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step); 144 static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step); 145 static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step); 146 static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step); 147 static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step); 148 149 static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs, PetscScalar *vals); 150 static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals); 151 152 static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim); 153 static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim); 154 static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim); 155 156 /* global vars */ 157 /* from comm.c module */ 158 159 static PetscInt num_gs_ids = 0; 160 161 /* should make this dynamic ... later */ 162 static PetscInt msg_buf =MAX_MSG_BUF; 163 static PetscInt vec_sz =GS_VEC_SZ; 164 static PetscInt *tree_buf =NULL; 165 static PetscInt tree_buf_sz=0; 166 static PetscInt ntree =0; 167 168 /***************************************************************************/ 169 PetscErrorCode PCTFS_gs_init_vec_sz(PetscInt size) 170 { 171 PetscFunctionBegin; 172 vec_sz = size; 173 PetscFunctionReturn(0); 174 } 175 176 /******************************************************************************/ 177 PetscErrorCode PCTFS_gs_init_msg_buf_sz(PetscInt buf_size) 178 { 179 PetscFunctionBegin; 180 msg_buf = buf_size; 181 PetscFunctionReturn(0); 182 } 183 184 /******************************************************************************/ 185 PCTFS_gs_id *PCTFS_gs_init(PetscInt *elms, PetscInt nel, PetscInt level) 186 { 187 PCTFS_gs_id *gs; 188 MPI_Group PCTFS_gs_group; 189 MPI_Comm PCTFS_gs_comm; 190 PetscErrorCode ierr; 191 192 PetscFunctionBegin; 193 /* ensure that communication package has been initialized */ 194 PCTFS_comm_init(); 195 196 /* determines if we have enough dynamic/semi-static memory */ 197 /* checks input, allocs and sets gd_id template */ 198 gs = gsi_check_args(elms,nel,level); 199 200 /* only bit mask version up and working for the moment */ 201 /* LATER :: get int list version working for sparse pblms */ 202 ierr = gsi_via_bit_mask(gs);CHKERRABORT(PETSC_COMM_WORLD,ierr); 203 204 ierr = MPI_Comm_group(MPI_COMM_WORLD,&PCTFS_gs_group);CHKERRABORT(PETSC_COMM_WORLD,ierr); 205 ierr = MPI_Comm_create(MPI_COMM_WORLD,PCTFS_gs_group,&PCTFS_gs_comm);CHKERRABORT(PETSC_COMM_WORLD,ierr); 206 ierr = MPI_Group_free(&PCTFS_gs_group);CHKERRABORT(PETSC_COMM_WORLD,ierr); 207 208 gs->PCTFS_gs_comm=PCTFS_gs_comm; 209 210 return(gs); 211 } 212 213 /******************************************************************************/ 214 static PCTFS_gs_id *gsi_new(void) 215 { 216 PetscErrorCode ierr; 217 PCTFS_gs_id *gs; 218 gs = (PCTFS_gs_id*) malloc(sizeof(PCTFS_gs_id)); 219 ierr = PetscMemzero(gs,sizeof(PCTFS_gs_id));CHKERRABORT(PETSC_COMM_WORLD,ierr); 220 return(gs); 221 } 222 223 /******************************************************************************/ 224 static PCTFS_gs_id *gsi_check_args(PetscInt *in_elms, PetscInt nel, PetscInt level) 225 { 226 PetscInt i, j, k, t2; 227 PetscInt *companion, *elms, *unique, *iptr; 228 PetscInt num_local=0, *num_to_reduce, **local_reduce; 229 PetscInt oprs[] = {NON_UNIFORM,GL_MIN,GL_MAX,GL_ADD,GL_MIN,GL_MAX,GL_MIN,GL_B_AND}; 230 PetscInt vals[sizeof(oprs)/sizeof(oprs[0])-1]; 231 PetscInt work[sizeof(oprs)/sizeof(oprs[0])-1]; 232 PCTFS_gs_id *gs; 233 PetscErrorCode ierr; 234 235 if (!in_elms) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"elms point to nothing!!!\n"); 236 if (nel<0) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"can't have fewer than 0 elms!!!\n"); 237 238 if (nel==0) { ierr = PetscInfo(0,"I don't have any elements!!!\n");CHKERRABORT(PETSC_COMM_WORLD,ierr); } 239 240 /* get space for gs template */ 241 gs = gsi_new(); 242 gs->id = ++num_gs_ids; 243 244 /* hmt 6.4.99 */ 245 /* caller can set global ids that don't participate to 0 */ 246 /* PCTFS_gs_init ignores all zeros in elm list */ 247 /* negative global ids are still invalid */ 248 for (i=j=0; i<nel; i++) { 249 if (in_elms[i]!=0) j++; 250 } 251 252 k=nel; nel=j; 253 254 /* copy over in_elms list and create inverse map */ 255 elms = (PetscInt*) malloc((nel+1)*sizeof(PetscInt)); 256 companion = (PetscInt*) malloc(nel*sizeof(PetscInt)); 257 258 for (i=j=0; i<k; i++) { 259 if (in_elms[i]!=0) { elms[j] = in_elms[i]; companion[j++] = i; } 260 } 261 262 if (j!=nel) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"nel j mismatch!\n"); 263 264 /* pre-pass ... check to see if sorted */ 265 elms[nel] = INT_MAX; 266 iptr = elms; 267 unique = elms+1; 268 j =0; 269 while (*iptr!=INT_MAX) { 270 if (*iptr++>*unique++) { j=1; break; } 271 } 272 273 /* set up inverse map */ 274 if (j) { 275 ierr = PetscInfo(0,"gsi_check_args() :: elm list *not* sorted!\n");CHKERRABORT(PETSC_COMM_WORLD,ierr); 276 ierr = PCTFS_SMI_sort((void*)elms, (void*)companion, nel, SORT_INTEGER);CHKERRABORT(PETSC_COMM_WORLD,ierr); 277 } else { ierr = PetscInfo(0,"gsi_check_args() :: elm list sorted!\n");CHKERRABORT(PETSC_COMM_WORLD,ierr); } 278 elms[nel] = INT_MIN; 279 280 /* first pass */ 281 /* determine number of unique elements, check pd */ 282 for (i=k=0; i<nel; i+=j) { 283 t2 = elms[i]; 284 j = ++i; 285 286 /* clump 'em for now */ 287 while (elms[j]==t2) j++; 288 289 /* how many together and num local */ 290 if (j-=i) { num_local++; k+=j; } 291 } 292 293 /* how many unique elements? */ 294 gs->repeats = k; 295 gs->nel = nel-k; 296 297 /* number of repeats? */ 298 gs->num_local = num_local; 299 num_local += 2; 300 gs->local_reduce = local_reduce=(PetscInt**)malloc(num_local*sizeof(PetscInt*)); 301 gs->num_local_reduce = num_to_reduce=(PetscInt*) malloc(num_local*sizeof(PetscInt)); 302 303 unique = (PetscInt*) malloc((gs->nel+1)*sizeof(PetscInt)); 304 gs->elms = unique; 305 gs->nel_total = nel; 306 gs->local_elms = elms; 307 gs->companion = companion; 308 309 /* compess map as well as keep track of local ops */ 310 for (num_local=i=j=0; i<gs->nel; i++) { 311 k = j; 312 t2 = unique[i] = elms[j]; 313 companion[i] = companion[j]; 314 315 while (elms[j]==t2) j++; 316 317 if ((t2=(j-k))>1) { 318 /* number together */ 319 num_to_reduce[num_local] = t2++; 320 321 iptr = local_reduce[num_local++] = (PetscInt*)malloc(t2*sizeof(PetscInt)); 322 323 /* to use binary searching don't remap until we check intersection */ 324 *iptr++ = i; 325 326 /* note that we're skipping the first one */ 327 while (++k<j) *(iptr++) = companion[k]; 328 *iptr = -1; 329 } 330 } 331 332 /* sentinel for ngh_buf */ 333 unique[gs->nel]=INT_MAX; 334 335 /* for two partition sort hack */ 336 num_to_reduce[num_local] = 0; 337 local_reduce[num_local] = NULL; 338 num_to_reduce[++num_local] = 0; 339 local_reduce[num_local] = NULL; 340 341 /* load 'em up */ 342 /* note one extra to hold NON_UNIFORM flag!!! */ 343 vals[2] = vals[1] = vals[0] = nel; 344 if (gs->nel>0) { 345 vals[3] = unique[0]; 346 vals[4] = unique[gs->nel-1]; 347 } else { 348 vals[3] = INT_MAX; 349 vals[4] = INT_MIN; 350 } 351 vals[5] = level; 352 vals[6] = num_gs_ids; 353 354 /* GLOBAL: send 'em out */ 355 ierr = PCTFS_giop(vals,work,sizeof(oprs)/sizeof(oprs[0])-1,oprs);CHKERRABORT(PETSC_COMM_WORLD,ierr); 356 357 /* must be semi-pos def - only pairwise depends on this */ 358 /* LATER - remove this restriction */ 359 if (vals[3]<0) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system not semi-pos def \n"); 360 if (vals[4]==INT_MAX) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system ub too large !\n"); 361 362 gs->nel_min = vals[0]; 363 gs->nel_max = vals[1]; 364 gs->nel_sum = vals[2]; 365 gs->gl_min = vals[3]; 366 gs->gl_max = vals[4]; 367 gs->negl = vals[4]-vals[3]+1; 368 369 if (gs->negl<=0) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system empty or neg :: %d\n"); 370 371 /* LATER :: add level == -1 -> program selects level */ 372 if (vals[5]<0) vals[5]=0; 373 else if (vals[5]>PCTFS_num_nodes) vals[5]=PCTFS_num_nodes; 374 gs->level = vals[5]; 375 376 return(gs); 377 } 378 379 /******************************************************************************/ 380 static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs) 381 { 382 PetscInt i, nel, *elms; 383 PetscInt t1; 384 PetscInt **reduce; 385 PetscInt *map; 386 PetscErrorCode ierr; 387 388 PetscFunctionBegin; 389 /* totally local removes ... PCTFS_ct_bits == 0 */ 390 get_ngh_buf(gs); 391 392 if (gs->level) set_pairwise(gs); 393 if (gs->max_left_over) set_tree(gs); 394 395 /* intersection local and pairwise/tree? */ 396 gs->num_local_total = gs->num_local; 397 gs->gop_local_reduce = gs->local_reduce; 398 gs->num_gop_local_reduce = gs->num_local_reduce; 399 400 map = gs->companion; 401 402 /* is there any local compression */ 403 if (!gs->num_local) { 404 gs->local_strength = NONE; 405 gs->num_local_gop = 0; 406 } else { 407 /* ok find intersection */ 408 map = gs->companion; 409 reduce = gs->local_reduce; 410 for (i=0, t1=0; i<gs->num_local; i++, reduce++) { 411 if ((PCTFS_ivec_binary_search(**reduce,gs->pw_elm_list,gs->len_pw_list)>=0) || PCTFS_ivec_binary_search(**reduce,gs->tree_map_in,gs->tree_map_sz)>=0) { 412 t1++; 413 if (gs->num_local_reduce[i]<=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"nobody in list?"); 414 gs->num_local_reduce[i] *= -1; 415 } 416 **reduce=map[**reduce]; 417 } 418 419 /* intersection is empty */ 420 if (!t1) { 421 gs->local_strength = FULL; 422 gs->num_local_gop = 0; 423 } else { /* intersection not empty */ 424 gs->local_strength = PARTIAL; 425 426 ierr = PCTFS_SMI_sort((void*)gs->num_local_reduce, (void*)gs->local_reduce, gs->num_local + 1, SORT_INT_PTR);CHKERRQ(ierr); 427 428 gs->num_local_gop = t1; 429 gs->num_local_total = gs->num_local; 430 gs->num_local -= t1; 431 gs->gop_local_reduce = gs->local_reduce; 432 gs->num_gop_local_reduce = gs->num_local_reduce; 433 434 for (i=0; i<t1; i++) { 435 if (gs->num_gop_local_reduce[i]>=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"they aren't negative?"); 436 gs->num_gop_local_reduce[i] *= -1; 437 gs->local_reduce++; 438 gs->num_local_reduce++; 439 } 440 gs->local_reduce++; 441 gs->num_local_reduce++; 442 } 443 } 444 445 elms = gs->pw_elm_list; 446 nel = gs->len_pw_list; 447 for (i=0; i<nel; i++) elms[i] = map[elms[i]]; 448 449 elms = gs->tree_map_in; 450 nel = gs->tree_map_sz; 451 for (i=0; i<nel; i++) elms[i] = map[elms[i]]; 452 453 /* clean up */ 454 free((void*) gs->local_elms); 455 free((void*) gs->companion); 456 free((void*) gs->elms); 457 free((void*) gs->ngh_buf); 458 gs->local_elms = gs->companion = gs->elms = gs->ngh_buf = NULL; 459 PetscFunctionReturn(0); 460 } 461 462 /******************************************************************************/ 463 static PetscErrorCode place_in_tree(PetscInt elm) 464 { 465 PetscInt *tp, n; 466 467 PetscFunctionBegin; 468 if (ntree==tree_buf_sz) { 469 if (tree_buf_sz) { 470 tp = tree_buf; 471 n = tree_buf_sz; 472 tree_buf_sz<<=1; 473 tree_buf = (PetscInt*)malloc(tree_buf_sz*sizeof(PetscInt)); 474 PCTFS_ivec_copy(tree_buf,tp,n); 475 free(tp); 476 } else { 477 tree_buf_sz = TREE_BUF_SZ; 478 tree_buf = (PetscInt*)malloc(tree_buf_sz*sizeof(PetscInt)); 479 } 480 } 481 482 tree_buf[ntree++] = elm; 483 PetscFunctionReturn(0); 484 } 485 486 /******************************************************************************/ 487 static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs) 488 { 489 PetscInt i, j, npw=0, ntree_map=0; 490 PetscInt p_mask_size, ngh_buf_size, buf_size; 491 PetscInt *p_mask, *sh_proc_mask, *pw_sh_proc_mask; 492 PetscInt *ngh_buf, *buf1, *buf2; 493 PetscInt offset, per_load, num_loads, or_ct, start, end; 494 PetscInt *ptr1, *ptr2, i_start, negl, nel, *elms; 495 PetscInt oper=GL_B_OR; 496 PetscInt *ptr3, *t_mask, level, ct1, ct2; 497 PetscErrorCode ierr; 498 499 PetscFunctionBegin; 500 /* to make life easier */ 501 nel = gs->nel; 502 elms = gs->elms; 503 level = gs->level; 504 505 /* det #bytes needed for processor bit masks and init w/mask cor. to PCTFS_my_id */ 506 p_mask = (PetscInt*) malloc(p_mask_size=PCTFS_len_bit_mask(PCTFS_num_nodes)); 507 ierr = PCTFS_set_bit_mask(p_mask,p_mask_size,PCTFS_my_id);CHKERRQ(ierr); 508 509 /* allocate space for masks and info bufs */ 510 gs->nghs = sh_proc_mask = (PetscInt*) malloc(p_mask_size); 511 gs->pw_nghs = pw_sh_proc_mask = (PetscInt*) malloc(p_mask_size); 512 gs->ngh_buf_sz = ngh_buf_size = p_mask_size*nel; 513 t_mask = (PetscInt*) malloc(p_mask_size); 514 gs->ngh_buf = ngh_buf = (PetscInt*) malloc(ngh_buf_size); 515 516 /* comm buffer size ... memory usage bounded by ~2*msg_buf */ 517 /* had thought I could exploit rendezvous threshold */ 518 519 /* default is one pass */ 520 per_load = negl = gs->negl; 521 gs->num_loads = num_loads = 1; 522 i = p_mask_size*negl; 523 524 /* possible overflow on buffer size */ 525 /* overflow hack */ 526 if (i<0) i=INT_MAX; 527 528 buf_size = PetscMin(msg_buf,i); 529 530 /* can we do it? */ 531 if (p_mask_size>buf_size) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"get_ngh_buf() :: buf<pms :: %d>%d\n",p_mask_size,buf_size); 532 533 /* get PCTFS_giop buf space ... make *only* one malloc */ 534 buf1 = (PetscInt*) malloc(buf_size<<1); 535 536 /* more than one gior exchange needed? */ 537 if (buf_size!=i) { 538 per_load = buf_size/p_mask_size; 539 buf_size = per_load*p_mask_size; 540 gs->num_loads = num_loads = negl/per_load + (negl%per_load>0); 541 } 542 543 /* convert buf sizes from #bytes to #ints - 32 bit only! */ 544 p_mask_size/=sizeof(PetscInt); ngh_buf_size/=sizeof(PetscInt); buf_size/=sizeof(PetscInt); 545 546 /* find PCTFS_giop work space */ 547 buf2 = buf1+buf_size; 548 549 /* hold #ints needed for processor masks */ 550 gs->mask_sz=p_mask_size; 551 552 /* init buffers */ 553 ierr = PCTFS_ivec_zero(sh_proc_mask,p_mask_size);CHKERRQ(ierr); 554 ierr = PCTFS_ivec_zero(pw_sh_proc_mask,p_mask_size);CHKERRQ(ierr); 555 ierr = PCTFS_ivec_zero(ngh_buf,ngh_buf_size);CHKERRQ(ierr); 556 557 /* HACK reset tree info */ 558 tree_buf = NULL; 559 tree_buf_sz = ntree = 0; 560 561 /* ok do it */ 562 for (ptr1=ngh_buf,ptr2=elms,end=gs->gl_min,or_ct=i=0; or_ct<num_loads; or_ct++) { 563 /* identity for bitwise or is 000...000 */ 564 PCTFS_ivec_zero(buf1,buf_size); 565 566 /* load msg buffer */ 567 for (start=end,end+=per_load,i_start=i; (offset=*ptr2)<end; i++, ptr2++) { 568 offset = (offset-start)*p_mask_size; 569 PCTFS_ivec_copy(buf1+offset,p_mask,p_mask_size); 570 } 571 572 /* GLOBAL: pass buffer */ 573 ierr = PCTFS_giop(buf1,buf2,buf_size,&oper);CHKERRQ(ierr); 574 575 /* unload buffer into ngh_buf */ 576 ptr2=(elms+i_start); 577 for (ptr3=buf1,j=start; j<end; ptr3+=p_mask_size,j++) { 578 /* I own it ... may have to pairwise it */ 579 if (j==*ptr2) { 580 /* do i share it w/anyone? */ 581 ct1 = PCTFS_ct_bits((char*)ptr3,p_mask_size*sizeof(PetscInt)); 582 /* guess not */ 583 if (ct1<2) { ptr2++; ptr1+=p_mask_size; continue; } 584 585 /* i do ... so keep info and turn off my bit */ 586 PCTFS_ivec_copy(ptr1,ptr3,p_mask_size); 587 ierr = PCTFS_ivec_xor(ptr1,p_mask,p_mask_size);CHKERRQ(ierr); 588 ierr = PCTFS_ivec_or(sh_proc_mask,ptr1,p_mask_size);CHKERRQ(ierr); 589 590 /* is it to be done pairwise? */ 591 if (--ct1<=level) { 592 npw++; 593 594 /* turn on high bit to indicate pw need to process */ 595 *ptr2++ |= TOP_BIT; 596 ierr = PCTFS_ivec_or(pw_sh_proc_mask,ptr1,p_mask_size);CHKERRQ(ierr); 597 ptr1 += p_mask_size; 598 continue; 599 } 600 601 /* get set for next and note that I have a tree contribution */ 602 /* could save exact elm index for tree here -> save a search */ 603 ptr2++; ptr1+=p_mask_size; ntree_map++; 604 } else { /* i don't but still might be involved in tree */ 605 606 /* shared by how many? */ 607 ct1 = PCTFS_ct_bits((char*)ptr3,p_mask_size*sizeof(PetscInt)); 608 609 /* none! */ 610 if (ct1<2) continue; 611 612 /* is it going to be done pairwise? but not by me of course!*/ 613 if (--ct1<=level) continue; 614 } 615 /* LATER we're going to have to process it NOW */ 616 /* nope ... tree it */ 617 ierr = place_in_tree(j);CHKERRQ(ierr); 618 } 619 } 620 621 free((void*)t_mask); 622 free((void*)buf1); 623 624 gs->len_pw_list = npw; 625 gs->num_nghs = PCTFS_ct_bits((char*)sh_proc_mask,p_mask_size*sizeof(PetscInt)); 626 627 /* expand from bit mask list to int list and save ngh list */ 628 gs->nghs = (PetscInt*) malloc(gs->num_nghs * sizeof(PetscInt)); 629 PCTFS_bm_to_proc((char*)sh_proc_mask,p_mask_size*sizeof(PetscInt),gs->nghs); 630 631 gs->num_pw_nghs = PCTFS_ct_bits((char*)pw_sh_proc_mask,p_mask_size*sizeof(PetscInt)); 632 633 oper = GL_MAX; 634 ct1 = gs->num_nghs; 635 ierr = PCTFS_giop(&ct1,&ct2,1,&oper);CHKERRQ(ierr); 636 gs->max_nghs = ct1; 637 638 gs->tree_map_sz = ntree_map; 639 gs->max_left_over=ntree; 640 641 free((void*)p_mask); 642 free((void*)sh_proc_mask); 643 PetscFunctionReturn(0); 644 } 645 646 /******************************************************************************/ 647 static PetscErrorCode set_pairwise(PCTFS_gs_id *gs) 648 { 649 PetscInt i, j; 650 PetscInt p_mask_size; 651 PetscInt *p_mask, *sh_proc_mask, *tmp_proc_mask; 652 PetscInt *ngh_buf, *buf2; 653 PetscInt offset; 654 PetscInt *msg_list, *msg_size, **msg_nodes, nprs; 655 PetscInt *pairwise_elm_list, len_pair_list=0; 656 PetscInt *iptr, t1, i_start, nel, *elms; 657 PetscInt ct; 658 PetscErrorCode ierr; 659 660 PetscFunctionBegin; 661 /* to make life easier */ 662 nel = gs->nel; 663 elms = gs->elms; 664 ngh_buf = gs->ngh_buf; 665 sh_proc_mask = gs->pw_nghs; 666 667 /* need a few temp masks */ 668 p_mask_size = PCTFS_len_bit_mask(PCTFS_num_nodes); 669 p_mask = (PetscInt*) malloc(p_mask_size); 670 tmp_proc_mask = (PetscInt*) malloc(p_mask_size); 671 672 /* set mask to my PCTFS_my_id's bit mask */ 673 ierr = PCTFS_set_bit_mask(p_mask,p_mask_size,PCTFS_my_id);CHKERRQ(ierr); 674 675 p_mask_size /= sizeof(PetscInt); 676 677 len_pair_list = gs->len_pw_list; 678 gs->pw_elm_list = pairwise_elm_list=(PetscInt*)malloc((len_pair_list+1)*sizeof(PetscInt)); 679 680 /* how many processors (nghs) do we have to exchange with? */ 681 nprs = gs->num_pairs = PCTFS_ct_bits((char*)sh_proc_mask,p_mask_size*sizeof(PetscInt)); 682 683 /* allocate space for PCTFS_gs_gop() info */ 684 gs->pair_list = msg_list = (PetscInt*) malloc(sizeof(PetscInt)*nprs); 685 gs->msg_sizes = msg_size = (PetscInt*) malloc(sizeof(PetscInt)*nprs); 686 gs->node_list = msg_nodes = (PetscInt**) malloc(sizeof(PetscInt*)*(nprs+1)); 687 688 /* init msg_size list */ 689 ierr = PCTFS_ivec_zero(msg_size,nprs);CHKERRQ(ierr); 690 691 /* expand from bit mask list to int list */ 692 ierr = PCTFS_bm_to_proc((char*)sh_proc_mask,p_mask_size*sizeof(PetscInt),msg_list);CHKERRQ(ierr); 693 694 /* keep list of elements being handled pairwise */ 695 for (i=j=0; i<nel; i++) { 696 if (elms[i] & TOP_BIT) { elms[i] ^= TOP_BIT; pairwise_elm_list[j++] = i; } 697 } 698 pairwise_elm_list[j] = -1; 699 700 gs->msg_ids_out = (MPI_Request*) malloc(sizeof(MPI_Request)*(nprs+1)); 701 gs->msg_ids_out[nprs] = MPI_REQUEST_NULL; 702 gs->msg_ids_in = (MPI_Request*) malloc(sizeof(MPI_Request)*(nprs+1)); 703 gs->msg_ids_in[nprs] = MPI_REQUEST_NULL; 704 gs->pw_vals = (PetscScalar*) malloc(sizeof(PetscScalar)*len_pair_list*vec_sz); 705 706 /* find who goes to each processor */ 707 for (i_start=i=0; i<nprs; i++) { 708 /* processor i's mask */ 709 ierr = PCTFS_set_bit_mask(p_mask,p_mask_size*sizeof(PetscInt),msg_list[i]);CHKERRQ(ierr); 710 711 /* det # going to processor i */ 712 for (ct=j=0; j<len_pair_list; j++) { 713 buf2 = ngh_buf+(pairwise_elm_list[j]*p_mask_size); 714 ierr = PCTFS_ivec_and3(tmp_proc_mask,p_mask,buf2,p_mask_size);CHKERRQ(ierr); 715 if (PCTFS_ct_bits((char*)tmp_proc_mask,p_mask_size*sizeof(PetscInt))) ct++; 716 } 717 msg_size[i] = ct; 718 i_start = PetscMax(i_start,ct); 719 720 /*space to hold nodes in message to first neighbor */ 721 msg_nodes[i] = iptr = (PetscInt*) malloc(sizeof(PetscInt)*(ct+1)); 722 723 for (j=0;j<len_pair_list;j++) { 724 buf2 = ngh_buf+(pairwise_elm_list[j]*p_mask_size); 725 ierr = PCTFS_ivec_and3(tmp_proc_mask,p_mask,buf2,p_mask_size);CHKERRQ(ierr); 726 if (PCTFS_ct_bits((char*)tmp_proc_mask,p_mask_size*sizeof(PetscInt))) *iptr++ = j; 727 } 728 *iptr = -1; 729 } 730 msg_nodes[nprs] = NULL; 731 732 j = gs->loc_node_pairs=i_start; 733 t1 = GL_MAX; 734 ierr = PCTFS_giop(&i_start,&offset,1,&t1);CHKERRQ(ierr); 735 gs->max_node_pairs = i_start; 736 737 i_start = j; 738 t1 = GL_MIN; 739 ierr = PCTFS_giop(&i_start,&offset,1,&t1);CHKERRQ(ierr); 740 gs->min_node_pairs = i_start; 741 742 i_start = j; 743 t1 = GL_ADD; 744 ierr = PCTFS_giop(&i_start,&offset,1,&t1);CHKERRQ(ierr); 745 gs->avg_node_pairs = i_start/PCTFS_num_nodes + 1; 746 747 i_start = nprs; 748 t1 = GL_MAX; 749 PCTFS_giop(&i_start,&offset,1,&t1); 750 gs->max_pairs = i_start; 751 752 /* remap pairwise in tail of gsi_via_bit_mask() */ 753 gs->msg_total = PCTFS_ivec_sum(gs->msg_sizes,nprs); 754 gs->out = (PetscScalar*) malloc(sizeof(PetscScalar)*gs->msg_total*vec_sz); 755 gs->in = (PetscScalar*) malloc(sizeof(PetscScalar)*gs->msg_total*vec_sz); 756 757 /* reset malloc pool */ 758 free((void*)p_mask); 759 free((void*)tmp_proc_mask); 760 PetscFunctionReturn(0); 761 } 762 763 /* to do pruned tree just save ngh buf copy for each one and decode here! 764 ******************************************************************************/ 765 static PetscErrorCode set_tree(PCTFS_gs_id *gs) 766 { 767 PetscInt i, j, n, nel; 768 PetscInt *iptr_in, *iptr_out, *tree_elms, *elms; 769 770 PetscFunctionBegin; 771 /* local work ptrs */ 772 elms = gs->elms; 773 nel = gs->nel; 774 775 /* how many via tree */ 776 gs->tree_nel = n = ntree; 777 gs->tree_elms = tree_elms = iptr_in = tree_buf; 778 gs->tree_buf = (PetscScalar*) malloc(sizeof(PetscScalar)*n*vec_sz); 779 gs->tree_work = (PetscScalar*) malloc(sizeof(PetscScalar)*n*vec_sz); 780 j = gs->tree_map_sz; 781 gs->tree_map_in = iptr_in = (PetscInt*) malloc(sizeof(PetscInt)*(j+1)); 782 gs->tree_map_out = iptr_out = (PetscInt*) malloc(sizeof(PetscInt)*(j+1)); 783 784 /* search the longer of the two lists */ 785 /* note ... could save this info in get_ngh_buf and save searches */ 786 if (n<=nel) { 787 /* bijective fct w/remap - search elm list */ 788 for (i=0; i<n; i++) { 789 if ((j=PCTFS_ivec_binary_search(*tree_elms++,elms,nel))>=0) {*iptr_in++ = j; *iptr_out++ = i;} 790 } 791 } else { 792 for (i=0; i<nel; i++) { 793 if ((j=PCTFS_ivec_binary_search(*elms++,tree_elms,n))>=0) {*iptr_in++ = i; *iptr_out++ = j;} 794 } 795 } 796 797 /* sentinel */ 798 *iptr_in = *iptr_out = -1; 799 PetscFunctionReturn(0); 800 } 801 802 /******************************************************************************/ 803 static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs, PetscScalar *vals) 804 { 805 PetscInt *num, *map, **reduce; 806 PetscScalar tmp; 807 808 PetscFunctionBegin; 809 num = gs->num_gop_local_reduce; 810 reduce = gs->gop_local_reduce; 811 while ((map = *reduce++)) { 812 /* wall */ 813 if (*num == 2) { 814 num++; 815 vals[map[1]] = vals[map[0]]; 816 } else if (*num == 3) { /* corner shared by three elements */ 817 num++; 818 vals[map[2]] = vals[map[1]] = vals[map[0]]; 819 } else if (*num == 4) { /* corner shared by four elements */ 820 num++; 821 vals[map[3]] = vals[map[2]] = vals[map[1]] = vals[map[0]]; 822 } else { /* general case ... odd geoms ... 3D*/ 823 num++; 824 tmp = *(vals + *map++); 825 while (*map >= 0) *(vals + *map++) = tmp; 826 } 827 } 828 PetscFunctionReturn(0); 829 } 830 831 /******************************************************************************/ 832 static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs, PetscScalar *vals) 833 { 834 PetscInt *num, *map, **reduce; 835 PetscScalar tmp; 836 837 PetscFunctionBegin; 838 num = gs->num_local_reduce; 839 reduce = gs->local_reduce; 840 while ((map = *reduce)) { 841 /* wall */ 842 if (*num == 2) { 843 num++; reduce++; 844 vals[map[1]] = vals[map[0]] += vals[map[1]]; 845 } else if (*num == 3) { /* corner shared by three elements */ 846 num++; reduce++; 847 vals[map[2]]=vals[map[1]]=vals[map[0]]+=(vals[map[1]]+vals[map[2]]); 848 } else if (*num == 4) { /* corner shared by four elements */ 849 num++; reduce++; 850 vals[map[1]]=vals[map[2]]=vals[map[3]]=vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]); 851 } else { /* general case ... odd geoms ... 3D*/ 852 num++; 853 tmp = 0.0; 854 while (*map >= 0) tmp += *(vals + *map++); 855 856 map = *reduce++; 857 while (*map >= 0) *(vals + *map++) = tmp; 858 } 859 } 860 PetscFunctionReturn(0); 861 } 862 863 /******************************************************************************/ 864 static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals) 865 { 866 PetscInt *num, *map, **reduce; 867 PetscScalar *base; 868 869 PetscFunctionBegin; 870 num = gs->num_gop_local_reduce; 871 reduce = gs->gop_local_reduce; 872 while ((map = *reduce++)) { 873 /* wall */ 874 if (*num == 2) { 875 num++; 876 vals[map[0]] += vals[map[1]]; 877 } else if (*num == 3) { /* corner shared by three elements */ 878 num++; 879 vals[map[0]] += (vals[map[1]] + vals[map[2]]); 880 } else if (*num == 4) { /* corner shared by four elements */ 881 num++; 882 vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]); 883 } else { /* general case ... odd geoms ... 3D*/ 884 num++; 885 base = vals + *map++; 886 while (*map >= 0) *base += *(vals + *map++); 887 } 888 } 889 PetscFunctionReturn(0); 890 } 891 892 /******************************************************************************/ 893 PetscErrorCode PCTFS_gs_free(PCTFS_gs_id *gs) 894 { 895 PetscInt i; 896 PetscErrorCode ierr; 897 898 PetscFunctionBegin; 899 ierr = MPI_Comm_free(&gs->PCTFS_gs_comm);CHKERRMPI(ierr); 900 if (gs->nghs) free((void*) gs->nghs); 901 if (gs->pw_nghs) free((void*) gs->pw_nghs); 902 903 /* tree */ 904 if (gs->max_left_over) { 905 if (gs->tree_elms) free((void*) gs->tree_elms); 906 if (gs->tree_buf) free((void*) gs->tree_buf); 907 if (gs->tree_work) free((void*) gs->tree_work); 908 if (gs->tree_map_in) free((void*) gs->tree_map_in); 909 if (gs->tree_map_out) free((void*) gs->tree_map_out); 910 } 911 912 /* pairwise info */ 913 if (gs->num_pairs) { 914 /* should be NULL already */ 915 if (gs->ngh_buf) free((void*) gs->ngh_buf); 916 if (gs->elms) free((void*) gs->elms); 917 if (gs->local_elms) free((void*) gs->local_elms); 918 if (gs->companion) free((void*) gs->companion); 919 920 /* only set if pairwise */ 921 if (gs->vals) free((void*) gs->vals); 922 if (gs->in) free((void*) gs->in); 923 if (gs->out) free((void*) gs->out); 924 if (gs->msg_ids_in) free((void*) gs->msg_ids_in); 925 if (gs->msg_ids_out) free((void*) gs->msg_ids_out); 926 if (gs->pw_vals) free((void*) gs->pw_vals); 927 if (gs->pw_elm_list) free((void*) gs->pw_elm_list); 928 if (gs->node_list) { 929 for (i=0;i<gs->num_pairs;i++) { 930 if (gs->node_list[i]) { 931 free((void*) gs->node_list[i]); 932 } 933 } 934 free((void*) gs->node_list); 935 } 936 if (gs->msg_sizes) free((void*) gs->msg_sizes); 937 if (gs->pair_list) free((void*) gs->pair_list); 938 } 939 940 /* local info */ 941 if (gs->num_local_total>=0) { 942 for (i=0;i<gs->num_local_total+1;i++) { 943 if (gs->num_gop_local_reduce[i]) free((void*) gs->gop_local_reduce[i]); 944 } 945 } 946 947 /* if intersection tree/pairwise and local isn't empty */ 948 if (gs->gop_local_reduce) free((void*) gs->gop_local_reduce); 949 if (gs->num_gop_local_reduce) free((void*) gs->num_gop_local_reduce); 950 951 free((void*) gs); 952 PetscFunctionReturn(0); 953 } 954 955 /******************************************************************************/ 956 PetscErrorCode PCTFS_gs_gop_vec(PCTFS_gs_id *gs, PetscScalar *vals, const char *op, PetscInt step) 957 { 958 PetscErrorCode ierr; 959 960 PetscFunctionBegin; 961 switch (*op) { 962 case '+': 963 PCTFS_gs_gop_vec_plus(gs,vals,step); 964 break; 965 default: 966 ierr = PetscInfo1(0,"PCTFS_gs_gop_vec() :: %c is not a valid op\n",op[0]);CHKERRQ(ierr); 967 ierr = PetscInfo(0,"PCTFS_gs_gop_vec() :: default :: plus\n");CHKERRQ(ierr); 968 PCTFS_gs_gop_vec_plus(gs,vals,step); 969 break; 970 } 971 PetscFunctionReturn(0); 972 } 973 974 /******************************************************************************/ 975 static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step) 976 { 977 PetscFunctionBegin; 978 if (!gs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"PCTFS_gs_gop_vec() passed NULL gs handle!!!"); 979 980 /* local only operations!!! */ 981 if (gs->num_local) PCTFS_gs_gop_vec_local_plus(gs,vals,step); 982 983 /* if intersection tree/pairwise and local isn't empty */ 984 if (gs->num_local_gop) { 985 PCTFS_gs_gop_vec_local_in_plus(gs,vals,step); 986 987 /* pairwise */ 988 if (gs->num_pairs) PCTFS_gs_gop_vec_pairwise_plus(gs,vals,step); 989 990 /* tree */ 991 else if (gs->max_left_over) PCTFS_gs_gop_vec_tree_plus(gs,vals,step); 992 993 PCTFS_gs_gop_vec_local_out(gs,vals,step); 994 } else { /* if intersection tree/pairwise and local is empty */ 995 /* pairwise */ 996 if (gs->num_pairs) PCTFS_gs_gop_vec_pairwise_plus(gs,vals,step); 997 998 /* tree */ 999 else if (gs->max_left_over) PCTFS_gs_gop_vec_tree_plus(gs,vals,step); 1000 } 1001 PetscFunctionReturn(0); 1002 } 1003 1004 /******************************************************************************/ 1005 static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step) 1006 { 1007 PetscInt *num, *map, **reduce; 1008 PetscScalar *base; 1009 1010 PetscFunctionBegin; 1011 num = gs->num_local_reduce; 1012 reduce = gs->local_reduce; 1013 while ((map = *reduce)) { 1014 base = vals + map[0] * step; 1015 1016 /* wall */ 1017 if (*num == 2) { 1018 num++; reduce++; 1019 PCTFS_rvec_add (base,vals+map[1]*step,step); 1020 PCTFS_rvec_copy(vals+map[1]*step,base,step); 1021 } else if (*num == 3) { /* corner shared by three elements */ 1022 num++; reduce++; 1023 PCTFS_rvec_add (base,vals+map[1]*step,step); 1024 PCTFS_rvec_add (base,vals+map[2]*step,step); 1025 PCTFS_rvec_copy(vals+map[2]*step,base,step); 1026 PCTFS_rvec_copy(vals+map[1]*step,base,step); 1027 } else if (*num == 4) { /* corner shared by four elements */ 1028 num++; reduce++; 1029 PCTFS_rvec_add (base,vals+map[1]*step,step); 1030 PCTFS_rvec_add (base,vals+map[2]*step,step); 1031 PCTFS_rvec_add (base,vals+map[3]*step,step); 1032 PCTFS_rvec_copy(vals+map[3]*step,base,step); 1033 PCTFS_rvec_copy(vals+map[2]*step,base,step); 1034 PCTFS_rvec_copy(vals+map[1]*step,base,step); 1035 } else { /* general case ... odd geoms ... 3D */ 1036 num++; 1037 while (*++map >= 0) PCTFS_rvec_add (base,vals+*map*step,step); 1038 1039 map = *reduce; 1040 while (*++map >= 0) PCTFS_rvec_copy(vals+*map*step,base,step); 1041 1042 reduce++; 1043 } 1044 } 1045 PetscFunctionReturn(0); 1046 } 1047 1048 /******************************************************************************/ 1049 static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step) 1050 { 1051 PetscInt *num, *map, **reduce; 1052 PetscScalar *base; 1053 1054 PetscFunctionBegin; 1055 num = gs->num_gop_local_reduce; 1056 reduce = gs->gop_local_reduce; 1057 while ((map = *reduce++)) { 1058 base = vals + map[0] * step; 1059 1060 /* wall */ 1061 if (*num == 2) { 1062 num++; 1063 PCTFS_rvec_add(base,vals+map[1]*step,step); 1064 } else if (*num == 3) { /* corner shared by three elements */ 1065 num++; 1066 PCTFS_rvec_add(base,vals+map[1]*step,step); 1067 PCTFS_rvec_add(base,vals+map[2]*step,step); 1068 } else if (*num == 4) { /* corner shared by four elements */ 1069 num++; 1070 PCTFS_rvec_add(base,vals+map[1]*step,step); 1071 PCTFS_rvec_add(base,vals+map[2]*step,step); 1072 PCTFS_rvec_add(base,vals+map[3]*step,step); 1073 } else { /* general case ... odd geoms ... 3D*/ 1074 num++; 1075 while (*++map >= 0) PCTFS_rvec_add(base,vals+*map*step,step); 1076 } 1077 } 1078 PetscFunctionReturn(0); 1079 } 1080 1081 /******************************************************************************/ 1082 static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step) 1083 { 1084 PetscInt *num, *map, **reduce; 1085 PetscScalar *base; 1086 1087 PetscFunctionBegin; 1088 num = gs->num_gop_local_reduce; 1089 reduce = gs->gop_local_reduce; 1090 while ((map = *reduce++)) { 1091 base = vals + map[0] * step; 1092 1093 /* wall */ 1094 if (*num == 2) { 1095 num++; 1096 PCTFS_rvec_copy(vals+map[1]*step,base,step); 1097 } else if (*num == 3) { /* corner shared by three elements */ 1098 num++; 1099 PCTFS_rvec_copy(vals+map[1]*step,base,step); 1100 PCTFS_rvec_copy(vals+map[2]*step,base,step); 1101 } else if (*num == 4) { /* corner shared by four elements */ 1102 num++; 1103 PCTFS_rvec_copy(vals+map[1]*step,base,step); 1104 PCTFS_rvec_copy(vals+map[2]*step,base,step); 1105 PCTFS_rvec_copy(vals+map[3]*step,base,step); 1106 } else { /* general case ... odd geoms ... 3D*/ 1107 num++; 1108 while (*++map >= 0) PCTFS_rvec_copy(vals+*map*step,base,step); 1109 } 1110 } 1111 PetscFunctionReturn(0); 1112 } 1113 1114 /******************************************************************************/ 1115 static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step) 1116 { 1117 PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2; 1118 PetscInt *iptr, *msg_list, *msg_size, **msg_nodes; 1119 PetscInt *pw, *list, *size, **nodes; 1120 MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out; 1121 MPI_Status status; 1122 PetscBLASInt i1 = 1,dstep; 1123 PetscErrorCode ierr; 1124 1125 PetscFunctionBegin; 1126 /* strip and load s */ 1127 msg_list = list = gs->pair_list; 1128 msg_size = size = gs->msg_sizes; 1129 msg_nodes = nodes = gs->node_list; 1130 iptr = pw = gs->pw_elm_list; 1131 dptr1 = dptr3 = gs->pw_vals; 1132 msg_ids_in = ids_in = gs->msg_ids_in; 1133 msg_ids_out = ids_out = gs->msg_ids_out; 1134 dptr2 = gs->out; 1135 in1=in2 = gs->in; 1136 1137 /* post the receives */ 1138 /* msg_nodes=nodes; */ 1139 do { 1140 /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the 1141 second one *list and do list++ afterwards */ 1142 ierr = MPI_Irecv(in1, *size *step, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in);CHKERRMPI(ierr); 1143 list++;msg_ids_in++; 1144 in1 += *size++ *step; 1145 } while (*++msg_nodes); 1146 msg_nodes=nodes; 1147 1148 /* load gs values into in out gs buffers */ 1149 while (*iptr >= 0) { 1150 PCTFS_rvec_copy(dptr3,in_vals + *iptr*step,step); 1151 dptr3+=step; 1152 iptr++; 1153 } 1154 1155 /* load out buffers and post the sends */ 1156 while ((iptr = *msg_nodes++)) { 1157 dptr3 = dptr2; 1158 while (*iptr >= 0) { 1159 PCTFS_rvec_copy(dptr2,dptr1 + *iptr*step,step); 1160 dptr2+=step; 1161 iptr++; 1162 } 1163 ierr = MPI_Isend(dptr3, *msg_size *step, MPIU_SCALAR, *msg_list, MSGTAG1+PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out);CHKERRMPI(ierr); 1164 msg_size++; msg_list++;msg_ids_out++; 1165 } 1166 1167 /* tree */ 1168 if (gs->max_left_over) PCTFS_gs_gop_vec_tree_plus(gs,in_vals,step); 1169 1170 /* process the received data */ 1171 msg_nodes=nodes; 1172 while ((iptr = *nodes++)) { 1173 PetscScalar d1 = 1.0; 1174 1175 /* Should I check the return value of MPI_Wait() or status? */ 1176 /* Can this loop be replaced by a call to MPI_Waitall()? */ 1177 ierr = MPI_Wait(ids_in, &status);CHKERRMPI(ierr); 1178 ids_in++; 1179 while (*iptr >= 0) { 1180 ierr = PetscBLASIntCast(step,&dstep);CHKERRQ(ierr); 1181 PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&dstep,&d1,in2,&i1,dptr1 + *iptr*step,&i1)); 1182 in2+=step; 1183 iptr++; 1184 } 1185 } 1186 1187 /* replace vals */ 1188 while (*pw >= 0) { 1189 PCTFS_rvec_copy(in_vals + *pw*step,dptr1,step); 1190 dptr1+=step; 1191 pw++; 1192 } 1193 1194 /* clear isend message handles */ 1195 /* This changed for clarity though it could be the same */ 1196 1197 /* Should I check the return value of MPI_Wait() or status? */ 1198 /* Can this loop be replaced by a call to MPI_Waitall()? */ 1199 while (*msg_nodes++) { 1200 ierr = MPI_Wait(ids_out, &status);CHKERRMPI(ierr); 1201 ids_out++; 1202 } 1203 PetscFunctionReturn(0); 1204 } 1205 1206 /******************************************************************************/ 1207 static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step) 1208 { 1209 PetscInt size, *in, *out; 1210 PetscScalar *buf, *work; 1211 PetscInt op[] = {GL_ADD,0}; 1212 PetscBLASInt i1 = 1; 1213 PetscErrorCode ierr; 1214 PetscBLASInt dstep; 1215 1216 PetscFunctionBegin; 1217 /* copy over to local variables */ 1218 in = gs->tree_map_in; 1219 out = gs->tree_map_out; 1220 buf = gs->tree_buf; 1221 work = gs->tree_work; 1222 size = gs->tree_nel*step; 1223 1224 /* zero out collection buffer */ 1225 PCTFS_rvec_zero(buf,size); 1226 1227 /* copy over my contributions */ 1228 while (*in >= 0) { 1229 ierr = PetscBLASIntCast(step,&dstep);CHKERRQ(ierr); 1230 PetscStackCallBLAS("BLAScopy",BLAScopy_(&dstep,vals + *in++ * step,&i1,buf + *out++ * step,&i1)); 1231 } 1232 1233 /* perform fan in/out on full buffer */ 1234 /* must change PCTFS_grop to handle the blas */ 1235 PCTFS_grop(buf,work,size,op); 1236 1237 /* reset */ 1238 in = gs->tree_map_in; 1239 out = gs->tree_map_out; 1240 1241 /* get the portion of the results I need */ 1242 while (*in >= 0) { 1243 ierr = PetscBLASIntCast(step,&dstep);CHKERRQ(ierr); 1244 PetscStackCallBLAS("BLAScopy",BLAScopy_(&dstep,buf + *out++ * step,&i1,vals + *in++ * step,&i1)); 1245 } 1246 PetscFunctionReturn(0); 1247 } 1248 1249 /******************************************************************************/ 1250 PetscErrorCode PCTFS_gs_gop_hc(PCTFS_gs_id *gs, PetscScalar *vals, const char *op, PetscInt dim) 1251 { 1252 PetscErrorCode ierr; 1253 1254 PetscFunctionBegin; 1255 switch (*op) { 1256 case '+': 1257 PCTFS_gs_gop_plus_hc(gs,vals,dim); 1258 break; 1259 default: 1260 ierr = PetscInfo1(0,"PCTFS_gs_gop_hc() :: %c is not a valid op\n",op[0]);CHKERRQ(ierr); 1261 ierr = PetscInfo(0,"PCTFS_gs_gop_hc() :: default :: plus\n");CHKERRQ(ierr); 1262 PCTFS_gs_gop_plus_hc(gs,vals,dim); 1263 break; 1264 } 1265 PetscFunctionReturn(0); 1266 } 1267 1268 /******************************************************************************/ 1269 static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim) 1270 { 1271 PetscFunctionBegin; 1272 /* if there's nothing to do return */ 1273 if (dim<=0) PetscFunctionReturn(0); 1274 1275 /* can't do more dimensions then exist */ 1276 dim = PetscMin(dim,PCTFS_i_log2_num_nodes); 1277 1278 /* local only operations!!! */ 1279 if (gs->num_local) PCTFS_gs_gop_local_plus(gs,vals); 1280 1281 /* if intersection tree/pairwise and local isn't empty */ 1282 if (gs->num_local_gop) { 1283 PCTFS_gs_gop_local_in_plus(gs,vals); 1284 1285 /* pairwise will do tree inside ... */ 1286 if (gs->num_pairs) PCTFS_gs_gop_pairwise_plus_hc(gs,vals,dim); /* tree only */ 1287 else if (gs->max_left_over) PCTFS_gs_gop_tree_plus_hc(gs,vals,dim); 1288 1289 PCTFS_gs_gop_local_out(gs,vals); 1290 } else { /* if intersection tree/pairwise and local is empty */ 1291 /* pairwise will do tree inside */ 1292 if (gs->num_pairs) PCTFS_gs_gop_pairwise_plus_hc(gs,vals,dim); /* tree */ 1293 else if (gs->max_left_over) PCTFS_gs_gop_tree_plus_hc(gs,vals,dim); 1294 } 1295 PetscFunctionReturn(0); 1296 } 1297 1298 /******************************************************************************/ 1299 static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim) 1300 { 1301 PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2; 1302 PetscInt *iptr, *msg_list, *msg_size, **msg_nodes; 1303 PetscInt *pw, *list, *size, **nodes; 1304 MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out; 1305 MPI_Status status; 1306 PetscInt i, mask=1; 1307 PetscErrorCode ierr; 1308 1309 PetscFunctionBegin; 1310 for (i=1; i<dim; i++) { mask<<=1; mask++; } 1311 1312 /* strip and load s */ 1313 msg_list = list = gs->pair_list; 1314 msg_size = size = gs->msg_sizes; 1315 msg_nodes = nodes = gs->node_list; 1316 iptr = pw = gs->pw_elm_list; 1317 dptr1 = dptr3 = gs->pw_vals; 1318 msg_ids_in = ids_in = gs->msg_ids_in; 1319 msg_ids_out = ids_out = gs->msg_ids_out; 1320 dptr2 = gs->out; 1321 in1 = in2 = gs->in; 1322 1323 /* post the receives */ 1324 /* msg_nodes=nodes; */ 1325 do { 1326 /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the 1327 second one *list and do list++ afterwards */ 1328 if ((PCTFS_my_id|mask)==(*list|mask)) { 1329 ierr = MPI_Irecv(in1, *size, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in);CHKERRMPI(ierr); 1330 list++; msg_ids_in++;in1 += *size++; 1331 } else { list++; size++; } 1332 } while (*++msg_nodes); 1333 1334 /* load gs values into in out gs buffers */ 1335 while (*iptr >= 0) *dptr3++ = *(in_vals + *iptr++); 1336 1337 /* load out buffers and post the sends */ 1338 msg_nodes=nodes; 1339 list = msg_list; 1340 while ((iptr = *msg_nodes++)) { 1341 if ((PCTFS_my_id|mask)==(*list|mask)) { 1342 dptr3 = dptr2; 1343 while (*iptr >= 0) *dptr2++ = *(dptr1 + *iptr++); 1344 /* CHECK PERSISTENT COMMS MODE FOR ALL THIS STUFF */ 1345 /* is msg_ids_out++ correct? */ 1346 ierr = MPI_Isend(dptr3, *msg_size, MPIU_SCALAR, *list, MSGTAG1+PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out);CHKERRMPI(ierr); 1347 msg_size++;list++;msg_ids_out++; 1348 } else {list++; msg_size++;} 1349 } 1350 1351 /* do the tree while we're waiting */ 1352 if (gs->max_left_over) PCTFS_gs_gop_tree_plus_hc(gs,in_vals,dim); 1353 1354 /* process the received data */ 1355 msg_nodes=nodes; 1356 list = msg_list; 1357 while ((iptr = *nodes++)) { 1358 if ((PCTFS_my_id|mask)==(*list|mask)) { 1359 /* Should I check the return value of MPI_Wait() or status? */ 1360 /* Can this loop be replaced by a call to MPI_Waitall()? */ 1361 ierr = MPI_Wait(ids_in, &status);CHKERRMPI(ierr); 1362 ids_in++; 1363 while (*iptr >= 0) *(dptr1 + *iptr++) += *in2++; 1364 } 1365 list++; 1366 } 1367 1368 /* replace vals */ 1369 while (*pw >= 0) *(in_vals + *pw++) = *dptr1++; 1370 1371 /* clear isend message handles */ 1372 /* This changed for clarity though it could be the same */ 1373 while (*msg_nodes++) { 1374 if ((PCTFS_my_id|mask)==(*msg_list|mask)) { 1375 /* Should I check the return value of MPI_Wait() or status? */ 1376 /* Can this loop be replaced by a call to MPI_Waitall()? */ 1377 ierr = MPI_Wait(ids_out, &status);CHKERRMPI(ierr); 1378 ids_out++; 1379 } 1380 msg_list++; 1381 } 1382 PetscFunctionReturn(0); 1383 } 1384 1385 /******************************************************************************/ 1386 static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim) 1387 { 1388 PetscInt size; 1389 PetscInt *in, *out; 1390 PetscScalar *buf, *work; 1391 PetscInt op[] = {GL_ADD,0}; 1392 1393 PetscFunctionBegin; 1394 in = gs->tree_map_in; 1395 out = gs->tree_map_out; 1396 buf = gs->tree_buf; 1397 work = gs->tree_work; 1398 size = gs->tree_nel; 1399 1400 PCTFS_rvec_zero(buf,size); 1401 1402 while (*in >= 0) *(buf + *out++) = *(vals + *in++); 1403 1404 in = gs->tree_map_in; 1405 out = gs->tree_map_out; 1406 1407 PCTFS_grop_hc(buf,work,size,op,dim); 1408 1409 while (*in >= 0) *(vals + *in++) = *(buf + *out++); 1410 PetscFunctionReturn(0); 1411 } 1412 1413