1 2 /***********************************gs.c*************************************** 3 4 Author: Henry M. Tufo III 5 6 e-mail: hmt@cs.brown.edu 7 8 snail-mail: 9 Division of Applied Mathematics 10 Brown University 11 Providence, RI 02912 12 13 Last Modification: 14 6.21.97 15 ************************************gs.c**************************************/ 16 17 /***********************************gs.c*************************************** 18 File Description: 19 ----------------- 20 21 ************************************gs.c**************************************/ 22 23 #include <../src/ksp/pc/impls/tfs/tfs.h> 24 25 /* default length of number of items via tree - doubles if exceeded */ 26 #define TREE_BUF_SZ 2048; 27 #define GS_VEC_SZ 1 28 29 /***********************************gs.c*************************************** 30 Type: struct gather_scatter_id 31 ------------------------------ 32 33 ************************************gs.c**************************************/ 34 typedef struct gather_scatter_id { 35 PetscInt id; 36 PetscInt nel_min; 37 PetscInt nel_max; 38 PetscInt nel_sum; 39 PetscInt negl; 40 PetscInt gl_max; 41 PetscInt gl_min; 42 PetscInt repeats; 43 PetscInt ordered; 44 PetscInt positive; 45 PetscScalar *vals; 46 47 /* bit mask info */ 48 PetscInt *my_proc_mask; 49 PetscInt mask_sz; 50 PetscInt *ngh_buf; 51 PetscInt ngh_buf_sz; 52 PetscInt *nghs; 53 PetscInt num_nghs; 54 PetscInt max_nghs; 55 PetscInt *pw_nghs; 56 PetscInt num_pw_nghs; 57 PetscInt *tree_nghs; 58 PetscInt num_tree_nghs; 59 60 PetscInt num_loads; 61 62 /* repeats == true -> local info */ 63 PetscInt nel; /* number of unique elememts */ 64 PetscInt *elms; /* of size nel */ 65 PetscInt nel_total; 66 PetscInt *local_elms; /* of size nel_total */ 67 PetscInt *companion; /* of size nel_total */ 68 69 /* local info */ 70 PetscInt num_local_total; 71 PetscInt local_strength; 72 PetscInt num_local; 73 PetscInt *num_local_reduce; 74 PetscInt **local_reduce; 75 PetscInt num_local_gop; 76 PetscInt *num_gop_local_reduce; 77 PetscInt **gop_local_reduce; 78 79 /* pairwise info */ 80 PetscInt level; 81 PetscInt num_pairs; 82 PetscInt max_pairs; 83 PetscInt loc_node_pairs; 84 PetscInt max_node_pairs; 85 PetscInt min_node_pairs; 86 PetscInt avg_node_pairs; 87 PetscInt *pair_list; 88 PetscInt *msg_sizes; 89 PetscInt **node_list; 90 PetscInt len_pw_list; 91 PetscInt *pw_elm_list; 92 PetscScalar *pw_vals; 93 94 MPI_Request *msg_ids_in; 95 MPI_Request *msg_ids_out; 96 97 PetscScalar *out; 98 PetscScalar *in; 99 PetscInt msg_total; 100 101 /* tree - crystal accumulator info */ 102 PetscInt max_left_over; 103 PetscInt *pre; 104 PetscInt *in_num; 105 PetscInt *out_num; 106 PetscInt **in_list; 107 PetscInt **out_list; 108 109 /* new tree work*/ 110 PetscInt tree_nel; 111 PetscInt *tree_elms; 112 PetscScalar *tree_buf; 113 PetscScalar *tree_work; 114 115 PetscInt tree_map_sz; 116 PetscInt *tree_map_in; 117 PetscInt *tree_map_out; 118 119 /* current memory status */ 120 PetscInt gl_bss_min; 121 PetscInt gl_perm_min; 122 123 /* max segment size for PCTFS_gs_gop_vec() */ 124 PetscInt vec_sz; 125 126 /* hack to make paul happy */ 127 MPI_Comm PCTFS_gs_comm; 128 129 } PCTFS_gs_id; 130 131 static PCTFS_gs_id *gsi_check_args(PetscInt *elms, PetscInt nel, PetscInt level); 132 static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs); 133 static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs); 134 static PetscErrorCode set_pairwise(PCTFS_gs_id *gs); 135 static PCTFS_gs_id *gsi_new(void); 136 static PetscErrorCode set_tree(PCTFS_gs_id *gs); 137 138 /* same for all but vector flavor */ 139 static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs, PetscScalar *vals); 140 /* vector flavor */ 141 static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step); 142 143 static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step); 144 static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step); 145 static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step); 146 static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step); 147 static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step); 148 149 static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs, PetscScalar *vals); 150 static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals); 151 152 static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim); 153 static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim); 154 static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim); 155 156 /* global vars */ 157 /* from comm.c module */ 158 159 static PetscInt num_gs_ids = 0; 160 161 /* should make this dynamic ... later */ 162 static PetscInt msg_buf =MAX_MSG_BUF; 163 static PetscInt vec_sz =GS_VEC_SZ; 164 static PetscInt *tree_buf =NULL; 165 static PetscInt tree_buf_sz=0; 166 static PetscInt ntree =0; 167 168 /***************************************************************************/ 169 PetscErrorCode PCTFS_gs_init_vec_sz(PetscInt size) 170 { 171 PetscFunctionBegin; 172 vec_sz = size; 173 PetscFunctionReturn(0); 174 } 175 176 /******************************************************************************/ 177 PetscErrorCode PCTFS_gs_init_msg_buf_sz(PetscInt buf_size) 178 { 179 PetscFunctionBegin; 180 msg_buf = buf_size; 181 PetscFunctionReturn(0); 182 } 183 184 /******************************************************************************/ 185 PCTFS_gs_id *PCTFS_gs_init(PetscInt *elms, PetscInt nel, PetscInt level) 186 { 187 PCTFS_gs_id *gs; 188 MPI_Group PCTFS_gs_group; 189 MPI_Comm PCTFS_gs_comm; 190 PetscErrorCode ierr; 191 192 /* ensure that communication package has been initialized */ 193 PCTFS_comm_init(); 194 195 /* determines if we have enough dynamic/semi-static memory */ 196 /* checks input, allocs and sets gd_id template */ 197 gs = gsi_check_args(elms,nel,level); 198 199 /* only bit mask version up and working for the moment */ 200 /* LATER :: get int list version working for sparse pblms */ 201 ierr = gsi_via_bit_mask(gs);CHKERRABORT(PETSC_COMM_WORLD,ierr); 202 203 ierr = MPI_Comm_group(MPI_COMM_WORLD,&PCTFS_gs_group);CHKERRABORT(PETSC_COMM_WORLD,ierr); 204 ierr = MPI_Comm_create(MPI_COMM_WORLD,PCTFS_gs_group,&PCTFS_gs_comm);CHKERRABORT(PETSC_COMM_WORLD,ierr); 205 ierr = MPI_Group_free(&PCTFS_gs_group);CHKERRABORT(PETSC_COMM_WORLD,ierr); 206 207 gs->PCTFS_gs_comm=PCTFS_gs_comm; 208 209 return(gs); 210 } 211 212 /******************************************************************************/ 213 static PCTFS_gs_id *gsi_new(void) 214 { 215 PetscErrorCode ierr; 216 PCTFS_gs_id *gs; 217 gs = (PCTFS_gs_id*) malloc(sizeof(PCTFS_gs_id)); 218 ierr = PetscMemzero(gs,sizeof(PCTFS_gs_id));CHKERRABORT(PETSC_COMM_WORLD,ierr); 219 return(gs); 220 } 221 222 /******************************************************************************/ 223 static PCTFS_gs_id *gsi_check_args(PetscInt *in_elms, PetscInt nel, PetscInt level) 224 { 225 PetscInt i, j, k, t2; 226 PetscInt *companion, *elms, *unique, *iptr; 227 PetscInt num_local=0, *num_to_reduce, **local_reduce; 228 PetscInt oprs[] = {NON_UNIFORM,GL_MIN,GL_MAX,GL_ADD,GL_MIN,GL_MAX,GL_MIN,GL_B_AND}; 229 PetscInt vals[sizeof(oprs)/sizeof(oprs[0])-1]; 230 PetscInt work[sizeof(oprs)/sizeof(oprs[0])-1]; 231 PCTFS_gs_id *gs; 232 PetscErrorCode ierr; 233 234 if (!in_elms) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"elms point to nothing!!!\n"); 235 if (nel<0) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"can't have fewer than 0 elms!!!\n"); 236 237 if (nel==0) { ierr = PetscInfo(0,"I don't have any elements!!!\n");CHKERRABORT(PETSC_COMM_WORLD,ierr); } 238 239 /* get space for gs template */ 240 gs = gsi_new(); 241 gs->id = ++num_gs_ids; 242 243 /* hmt 6.4.99 */ 244 /* caller can set global ids that don't participate to 0 */ 245 /* PCTFS_gs_init ignores all zeros in elm list */ 246 /* negative global ids are still invalid */ 247 for (i=j=0; i<nel; i++) { 248 if (in_elms[i]!=0) j++; 249 } 250 251 k=nel; nel=j; 252 253 /* copy over in_elms list and create inverse map */ 254 elms = (PetscInt*) malloc((nel+1)*sizeof(PetscInt)); 255 companion = (PetscInt*) malloc(nel*sizeof(PetscInt)); 256 257 for (i=j=0; i<k; i++) { 258 if (in_elms[i]!=0) { elms[j] = in_elms[i]; companion[j++] = i; } 259 } 260 261 if (j!=nel) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"nel j mismatch!\n"); 262 263 /* pre-pass ... check to see if sorted */ 264 elms[nel] = INT_MAX; 265 iptr = elms; 266 unique = elms+1; 267 j =0; 268 while (*iptr!=INT_MAX) { 269 if (*iptr++>*unique++) { j=1; break; } 270 } 271 272 /* set up inverse map */ 273 if (j) { 274 ierr = PetscInfo(0,"gsi_check_args() :: elm list *not* sorted!\n");CHKERRABORT(PETSC_COMM_WORLD,ierr); 275 ierr = PCTFS_SMI_sort((void*)elms, (void*)companion, nel, SORT_INTEGER);CHKERRABORT(PETSC_COMM_WORLD,ierr); 276 } else { ierr = PetscInfo(0,"gsi_check_args() :: elm list sorted!\n");CHKERRABORT(PETSC_COMM_WORLD,ierr); } 277 elms[nel] = INT_MIN; 278 279 /* first pass */ 280 /* determine number of unique elements, check pd */ 281 for (i=k=0; i<nel; i+=j) { 282 t2 = elms[i]; 283 j = ++i; 284 285 /* clump 'em for now */ 286 while (elms[j]==t2) j++; 287 288 /* how many together and num local */ 289 if (j-=i) { num_local++; k+=j; } 290 } 291 292 /* how many unique elements? */ 293 gs->repeats = k; 294 gs->nel = nel-k; 295 296 /* number of repeats? */ 297 gs->num_local = num_local; 298 num_local += 2; 299 gs->local_reduce = local_reduce=(PetscInt**)malloc(num_local*sizeof(PetscInt*)); 300 gs->num_local_reduce = num_to_reduce=(PetscInt*) malloc(num_local*sizeof(PetscInt)); 301 302 unique = (PetscInt*) malloc((gs->nel+1)*sizeof(PetscInt)); 303 gs->elms = unique; 304 gs->nel_total = nel; 305 gs->local_elms = elms; 306 gs->companion = companion; 307 308 /* compess map as well as keep track of local ops */ 309 for (num_local=i=j=0; i<gs->nel; i++) { 310 k = j; 311 t2 = unique[i] = elms[j]; 312 companion[i] = companion[j]; 313 314 while (elms[j]==t2) j++; 315 316 if ((t2=(j-k))>1) { 317 /* number together */ 318 num_to_reduce[num_local] = t2++; 319 320 iptr = local_reduce[num_local++] = (PetscInt*)malloc(t2*sizeof(PetscInt)); 321 322 /* to use binary searching don't remap until we check intersection */ 323 *iptr++ = i; 324 325 /* note that we're skipping the first one */ 326 while (++k<j) *(iptr++) = companion[k]; 327 *iptr = -1; 328 } 329 } 330 331 /* sentinel for ngh_buf */ 332 unique[gs->nel]=INT_MAX; 333 334 /* for two partition sort hack */ 335 num_to_reduce[num_local] = 0; 336 local_reduce[num_local] = NULL; 337 num_to_reduce[++num_local] = 0; 338 local_reduce[num_local] = NULL; 339 340 /* load 'em up */ 341 /* note one extra to hold NON_UNIFORM flag!!! */ 342 vals[2] = vals[1] = vals[0] = nel; 343 if (gs->nel>0) { 344 vals[3] = unique[0]; 345 vals[4] = unique[gs->nel-1]; 346 } else { 347 vals[3] = INT_MAX; 348 vals[4] = INT_MIN; 349 } 350 vals[5] = level; 351 vals[6] = num_gs_ids; 352 353 /* GLOBAL: send 'em out */ 354 ierr = PCTFS_giop(vals,work,sizeof(oprs)/sizeof(oprs[0])-1,oprs);CHKERRABORT(PETSC_COMM_WORLD,ierr); 355 356 /* must be semi-pos def - only pairwise depends on this */ 357 /* LATER - remove this restriction */ 358 if (vals[3]<0) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system not semi-pos def \n"); 359 if (vals[4]==INT_MAX) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system ub too large !\n"); 360 361 gs->nel_min = vals[0]; 362 gs->nel_max = vals[1]; 363 gs->nel_sum = vals[2]; 364 gs->gl_min = vals[3]; 365 gs->gl_max = vals[4]; 366 gs->negl = vals[4]-vals[3]+1; 367 368 if (gs->negl<=0) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system empty or neg :: %d\n"); 369 370 /* LATER :: add level == -1 -> program selects level */ 371 if (vals[5]<0) vals[5]=0; 372 else if (vals[5]>PCTFS_num_nodes) vals[5]=PCTFS_num_nodes; 373 gs->level = vals[5]; 374 375 return(gs); 376 } 377 378 /******************************************************************************/ 379 static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs) 380 { 381 PetscInt i, nel, *elms; 382 PetscInt t1; 383 PetscInt **reduce; 384 PetscInt *map; 385 PetscErrorCode ierr; 386 387 PetscFunctionBegin; 388 /* totally local removes ... PCTFS_ct_bits == 0 */ 389 get_ngh_buf(gs); 390 391 if (gs->level) set_pairwise(gs); 392 if (gs->max_left_over) set_tree(gs); 393 394 /* intersection local and pairwise/tree? */ 395 gs->num_local_total = gs->num_local; 396 gs->gop_local_reduce = gs->local_reduce; 397 gs->num_gop_local_reduce = gs->num_local_reduce; 398 399 map = gs->companion; 400 401 /* is there any local compression */ 402 if (!gs->num_local) { 403 gs->local_strength = NONE; 404 gs->num_local_gop = 0; 405 } else { 406 /* ok find intersection */ 407 map = gs->companion; 408 reduce = gs->local_reduce; 409 for (i=0, t1=0; i<gs->num_local; i++, reduce++) { 410 if ((PCTFS_ivec_binary_search(**reduce,gs->pw_elm_list,gs->len_pw_list)>=0) || PCTFS_ivec_binary_search(**reduce,gs->tree_map_in,gs->tree_map_sz)>=0) { 411 t1++; 412 if (gs->num_local_reduce[i]<=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"nobody in list?"); 413 gs->num_local_reduce[i] *= -1; 414 } 415 **reduce=map[**reduce]; 416 } 417 418 /* intersection is empty */ 419 if (!t1) { 420 gs->local_strength = FULL; 421 gs->num_local_gop = 0; 422 } else { /* intersection not empty */ 423 gs->local_strength = PARTIAL; 424 425 ierr = PCTFS_SMI_sort((void*)gs->num_local_reduce, (void*)gs->local_reduce, gs->num_local + 1, SORT_INT_PTR);CHKERRQ(ierr); 426 427 gs->num_local_gop = t1; 428 gs->num_local_total = gs->num_local; 429 gs->num_local -= t1; 430 gs->gop_local_reduce = gs->local_reduce; 431 gs->num_gop_local_reduce = gs->num_local_reduce; 432 433 for (i=0; i<t1; i++) { 434 if (gs->num_gop_local_reduce[i]>=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"they aren't negative?"); 435 gs->num_gop_local_reduce[i] *= -1; 436 gs->local_reduce++; 437 gs->num_local_reduce++; 438 } 439 gs->local_reduce++; 440 gs->num_local_reduce++; 441 } 442 } 443 444 elms = gs->pw_elm_list; 445 nel = gs->len_pw_list; 446 for (i=0; i<nel; i++) elms[i] = map[elms[i]]; 447 448 elms = gs->tree_map_in; 449 nel = gs->tree_map_sz; 450 for (i=0; i<nel; i++) elms[i] = map[elms[i]]; 451 452 /* clean up */ 453 free((void*) gs->local_elms); 454 free((void*) gs->companion); 455 free((void*) gs->elms); 456 free((void*) gs->ngh_buf); 457 gs->local_elms = gs->companion = gs->elms = gs->ngh_buf = NULL; 458 PetscFunctionReturn(0); 459 } 460 461 /******************************************************************************/ 462 static PetscErrorCode place_in_tree(PetscInt elm) 463 { 464 PetscInt *tp, n; 465 466 PetscFunctionBegin; 467 if (ntree==tree_buf_sz) { 468 if (tree_buf_sz) { 469 tp = tree_buf; 470 n = tree_buf_sz; 471 tree_buf_sz<<=1; 472 tree_buf = (PetscInt*)malloc(tree_buf_sz*sizeof(PetscInt)); 473 PCTFS_ivec_copy(tree_buf,tp,n); 474 free(tp); 475 } else { 476 tree_buf_sz = TREE_BUF_SZ; 477 tree_buf = (PetscInt*)malloc(tree_buf_sz*sizeof(PetscInt)); 478 } 479 } 480 481 tree_buf[ntree++] = elm; 482 PetscFunctionReturn(0); 483 } 484 485 /******************************************************************************/ 486 static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs) 487 { 488 PetscInt i, j, npw=0, ntree_map=0; 489 PetscInt p_mask_size, ngh_buf_size, buf_size; 490 PetscInt *p_mask, *sh_proc_mask, *pw_sh_proc_mask; 491 PetscInt *ngh_buf, *buf1, *buf2; 492 PetscInt offset, per_load, num_loads, or_ct, start, end; 493 PetscInt *ptr1, *ptr2, i_start, negl, nel, *elms; 494 PetscInt oper=GL_B_OR; 495 PetscInt *ptr3, *t_mask, level, ct1, ct2; 496 PetscErrorCode ierr; 497 498 PetscFunctionBegin; 499 /* to make life easier */ 500 nel = gs->nel; 501 elms = gs->elms; 502 level = gs->level; 503 504 /* det #bytes needed for processor bit masks and init w/mask cor. to PCTFS_my_id */ 505 p_mask = (PetscInt*) malloc(p_mask_size=PCTFS_len_bit_mask(PCTFS_num_nodes)); 506 ierr = PCTFS_set_bit_mask(p_mask,p_mask_size,PCTFS_my_id);CHKERRQ(ierr); 507 508 /* allocate space for masks and info bufs */ 509 gs->nghs = sh_proc_mask = (PetscInt*) malloc(p_mask_size); 510 gs->pw_nghs = pw_sh_proc_mask = (PetscInt*) malloc(p_mask_size); 511 gs->ngh_buf_sz = ngh_buf_size = p_mask_size*nel; 512 t_mask = (PetscInt*) malloc(p_mask_size); 513 gs->ngh_buf = ngh_buf = (PetscInt*) malloc(ngh_buf_size); 514 515 /* comm buffer size ... memory usage bounded by ~2*msg_buf */ 516 /* had thought I could exploit rendezvous threshold */ 517 518 /* default is one pass */ 519 per_load = negl = gs->negl; 520 gs->num_loads = num_loads = 1; 521 i = p_mask_size*negl; 522 523 /* possible overflow on buffer size */ 524 /* overflow hack */ 525 if (i<0) i=INT_MAX; 526 527 buf_size = PetscMin(msg_buf,i); 528 529 /* can we do it? */ 530 if (p_mask_size>buf_size) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"get_ngh_buf() :: buf<pms :: %d>%d\n",p_mask_size,buf_size); 531 532 /* get PCTFS_giop buf space ... make *only* one malloc */ 533 buf1 = (PetscInt*) malloc(buf_size<<1); 534 535 /* more than one gior exchange needed? */ 536 if (buf_size!=i) { 537 per_load = buf_size/p_mask_size; 538 buf_size = per_load*p_mask_size; 539 gs->num_loads = num_loads = negl/per_load + (negl%per_load>0); 540 } 541 542 /* convert buf sizes from #bytes to #ints - 32 bit only! */ 543 p_mask_size/=sizeof(PetscInt); ngh_buf_size/=sizeof(PetscInt); buf_size/=sizeof(PetscInt); 544 545 /* find PCTFS_giop work space */ 546 buf2 = buf1+buf_size; 547 548 /* hold #ints needed for processor masks */ 549 gs->mask_sz=p_mask_size; 550 551 /* init buffers */ 552 ierr = PCTFS_ivec_zero(sh_proc_mask,p_mask_size);CHKERRQ(ierr); 553 ierr = PCTFS_ivec_zero(pw_sh_proc_mask,p_mask_size);CHKERRQ(ierr); 554 ierr = PCTFS_ivec_zero(ngh_buf,ngh_buf_size);CHKERRQ(ierr); 555 556 /* HACK reset tree info */ 557 tree_buf = NULL; 558 tree_buf_sz = ntree = 0; 559 560 /* ok do it */ 561 for (ptr1=ngh_buf,ptr2=elms,end=gs->gl_min,or_ct=i=0; or_ct<num_loads; or_ct++) { 562 /* identity for bitwise or is 000...000 */ 563 PCTFS_ivec_zero(buf1,buf_size); 564 565 /* load msg buffer */ 566 for (start=end,end+=per_load,i_start=i; (offset=*ptr2)<end; i++, ptr2++) { 567 offset = (offset-start)*p_mask_size; 568 PCTFS_ivec_copy(buf1+offset,p_mask,p_mask_size); 569 } 570 571 /* GLOBAL: pass buffer */ 572 ierr = PCTFS_giop(buf1,buf2,buf_size,&oper);CHKERRQ(ierr); 573 574 /* unload buffer into ngh_buf */ 575 ptr2=(elms+i_start); 576 for (ptr3=buf1,j=start; j<end; ptr3+=p_mask_size,j++) { 577 /* I own it ... may have to pairwise it */ 578 if (j==*ptr2) { 579 /* do i share it w/anyone? */ 580 ct1 = PCTFS_ct_bits((char*)ptr3,p_mask_size*sizeof(PetscInt)); 581 /* guess not */ 582 if (ct1<2) { ptr2++; ptr1+=p_mask_size; continue; } 583 584 /* i do ... so keep info and turn off my bit */ 585 PCTFS_ivec_copy(ptr1,ptr3,p_mask_size); 586 ierr = PCTFS_ivec_xor(ptr1,p_mask,p_mask_size);CHKERRQ(ierr); 587 ierr = PCTFS_ivec_or(sh_proc_mask,ptr1,p_mask_size);CHKERRQ(ierr); 588 589 /* is it to be done pairwise? */ 590 if (--ct1<=level) { 591 npw++; 592 593 /* turn on high bit to indicate pw need to process */ 594 *ptr2++ |= TOP_BIT; 595 ierr = PCTFS_ivec_or(pw_sh_proc_mask,ptr1,p_mask_size);CHKERRQ(ierr); 596 ptr1 += p_mask_size; 597 continue; 598 } 599 600 /* get set for next and note that I have a tree contribution */ 601 /* could save exact elm index for tree here -> save a search */ 602 ptr2++; ptr1+=p_mask_size; ntree_map++; 603 } else { /* i don't but still might be involved in tree */ 604 605 /* shared by how many? */ 606 ct1 = PCTFS_ct_bits((char*)ptr3,p_mask_size*sizeof(PetscInt)); 607 608 /* none! */ 609 if (ct1<2) continue; 610 611 /* is it going to be done pairwise? but not by me of course!*/ 612 if (--ct1<=level) continue; 613 } 614 /* LATER we're going to have to process it NOW */ 615 /* nope ... tree it */ 616 ierr = place_in_tree(j);CHKERRQ(ierr); 617 } 618 } 619 620 free((void*)t_mask); 621 free((void*)buf1); 622 623 gs->len_pw_list = npw; 624 gs->num_nghs = PCTFS_ct_bits((char*)sh_proc_mask,p_mask_size*sizeof(PetscInt)); 625 626 /* expand from bit mask list to int list and save ngh list */ 627 gs->nghs = (PetscInt*) malloc(gs->num_nghs * sizeof(PetscInt)); 628 PCTFS_bm_to_proc((char*)sh_proc_mask,p_mask_size*sizeof(PetscInt),gs->nghs); 629 630 gs->num_pw_nghs = PCTFS_ct_bits((char*)pw_sh_proc_mask,p_mask_size*sizeof(PetscInt)); 631 632 oper = GL_MAX; 633 ct1 = gs->num_nghs; 634 ierr = PCTFS_giop(&ct1,&ct2,1,&oper);CHKERRQ(ierr); 635 gs->max_nghs = ct1; 636 637 gs->tree_map_sz = ntree_map; 638 gs->max_left_over=ntree; 639 640 free((void*)p_mask); 641 free((void*)sh_proc_mask); 642 PetscFunctionReturn(0); 643 } 644 645 /******************************************************************************/ 646 static PetscErrorCode set_pairwise(PCTFS_gs_id *gs) 647 { 648 PetscInt i, j; 649 PetscInt p_mask_size; 650 PetscInt *p_mask, *sh_proc_mask, *tmp_proc_mask; 651 PetscInt *ngh_buf, *buf2; 652 PetscInt offset; 653 PetscInt *msg_list, *msg_size, **msg_nodes, nprs; 654 PetscInt *pairwise_elm_list, len_pair_list=0; 655 PetscInt *iptr, t1, i_start, nel, *elms; 656 PetscInt ct; 657 PetscErrorCode ierr; 658 659 PetscFunctionBegin; 660 /* to make life easier */ 661 nel = gs->nel; 662 elms = gs->elms; 663 ngh_buf = gs->ngh_buf; 664 sh_proc_mask = gs->pw_nghs; 665 666 /* need a few temp masks */ 667 p_mask_size = PCTFS_len_bit_mask(PCTFS_num_nodes); 668 p_mask = (PetscInt*) malloc(p_mask_size); 669 tmp_proc_mask = (PetscInt*) malloc(p_mask_size); 670 671 /* set mask to my PCTFS_my_id's bit mask */ 672 ierr = PCTFS_set_bit_mask(p_mask,p_mask_size,PCTFS_my_id);CHKERRQ(ierr); 673 674 p_mask_size /= sizeof(PetscInt); 675 676 len_pair_list = gs->len_pw_list; 677 gs->pw_elm_list = pairwise_elm_list=(PetscInt*)malloc((len_pair_list+1)*sizeof(PetscInt)); 678 679 /* how many processors (nghs) do we have to exchange with? */ 680 nprs = gs->num_pairs = PCTFS_ct_bits((char*)sh_proc_mask,p_mask_size*sizeof(PetscInt)); 681 682 /* allocate space for PCTFS_gs_gop() info */ 683 gs->pair_list = msg_list = (PetscInt*) malloc(sizeof(PetscInt)*nprs); 684 gs->msg_sizes = msg_size = (PetscInt*) malloc(sizeof(PetscInt)*nprs); 685 gs->node_list = msg_nodes = (PetscInt**) malloc(sizeof(PetscInt*)*(nprs+1)); 686 687 /* init msg_size list */ 688 ierr = PCTFS_ivec_zero(msg_size,nprs);CHKERRQ(ierr); 689 690 /* expand from bit mask list to int list */ 691 ierr = PCTFS_bm_to_proc((char*)sh_proc_mask,p_mask_size*sizeof(PetscInt),msg_list);CHKERRQ(ierr); 692 693 /* keep list of elements being handled pairwise */ 694 for (i=j=0; i<nel; i++) { 695 if (elms[i] & TOP_BIT) { elms[i] ^= TOP_BIT; pairwise_elm_list[j++] = i; } 696 } 697 pairwise_elm_list[j] = -1; 698 699 gs->msg_ids_out = (MPI_Request*) malloc(sizeof(MPI_Request)*(nprs+1)); 700 gs->msg_ids_out[nprs] = MPI_REQUEST_NULL; 701 gs->msg_ids_in = (MPI_Request*) malloc(sizeof(MPI_Request)*(nprs+1)); 702 gs->msg_ids_in[nprs] = MPI_REQUEST_NULL; 703 gs->pw_vals = (PetscScalar*) malloc(sizeof(PetscScalar)*len_pair_list*vec_sz); 704 705 /* find who goes to each processor */ 706 for (i_start=i=0; i<nprs; i++) { 707 /* processor i's mask */ 708 ierr = PCTFS_set_bit_mask(p_mask,p_mask_size*sizeof(PetscInt),msg_list[i]);CHKERRQ(ierr); 709 710 /* det # going to processor i */ 711 for (ct=j=0; j<len_pair_list; j++) { 712 buf2 = ngh_buf+(pairwise_elm_list[j]*p_mask_size); 713 ierr = PCTFS_ivec_and3(tmp_proc_mask,p_mask,buf2,p_mask_size);CHKERRQ(ierr); 714 if (PCTFS_ct_bits((char*)tmp_proc_mask,p_mask_size*sizeof(PetscInt))) ct++; 715 } 716 msg_size[i] = ct; 717 i_start = PetscMax(i_start,ct); 718 719 /*space to hold nodes in message to first neighbor */ 720 msg_nodes[i] = iptr = (PetscInt*) malloc(sizeof(PetscInt)*(ct+1)); 721 722 for (j=0;j<len_pair_list;j++) { 723 buf2 = ngh_buf+(pairwise_elm_list[j]*p_mask_size); 724 ierr = PCTFS_ivec_and3(tmp_proc_mask,p_mask,buf2,p_mask_size);CHKERRQ(ierr); 725 if (PCTFS_ct_bits((char*)tmp_proc_mask,p_mask_size*sizeof(PetscInt))) *iptr++ = j; 726 } 727 *iptr = -1; 728 } 729 msg_nodes[nprs] = NULL; 730 731 j = gs->loc_node_pairs=i_start; 732 t1 = GL_MAX; 733 ierr = PCTFS_giop(&i_start,&offset,1,&t1);CHKERRQ(ierr); 734 gs->max_node_pairs = i_start; 735 736 i_start = j; 737 t1 = GL_MIN; 738 ierr = PCTFS_giop(&i_start,&offset,1,&t1);CHKERRQ(ierr); 739 gs->min_node_pairs = i_start; 740 741 i_start = j; 742 t1 = GL_ADD; 743 ierr = PCTFS_giop(&i_start,&offset,1,&t1);CHKERRQ(ierr); 744 gs->avg_node_pairs = i_start/PCTFS_num_nodes + 1; 745 746 i_start = nprs; 747 t1 = GL_MAX; 748 PCTFS_giop(&i_start,&offset,1,&t1); 749 gs->max_pairs = i_start; 750 751 /* remap pairwise in tail of gsi_via_bit_mask() */ 752 gs->msg_total = PCTFS_ivec_sum(gs->msg_sizes,nprs); 753 gs->out = (PetscScalar*) malloc(sizeof(PetscScalar)*gs->msg_total*vec_sz); 754 gs->in = (PetscScalar*) malloc(sizeof(PetscScalar)*gs->msg_total*vec_sz); 755 756 /* reset malloc pool */ 757 free((void*)p_mask); 758 free((void*)tmp_proc_mask); 759 PetscFunctionReturn(0); 760 } 761 762 /* to do pruned tree just save ngh buf copy for each one and decode here! 763 ******************************************************************************/ 764 static PetscErrorCode set_tree(PCTFS_gs_id *gs) 765 { 766 PetscInt i, j, n, nel; 767 PetscInt *iptr_in, *iptr_out, *tree_elms, *elms; 768 769 PetscFunctionBegin; 770 /* local work ptrs */ 771 elms = gs->elms; 772 nel = gs->nel; 773 774 /* how many via tree */ 775 gs->tree_nel = n = ntree; 776 gs->tree_elms = tree_elms = iptr_in = tree_buf; 777 gs->tree_buf = (PetscScalar*) malloc(sizeof(PetscScalar)*n*vec_sz); 778 gs->tree_work = (PetscScalar*) malloc(sizeof(PetscScalar)*n*vec_sz); 779 j = gs->tree_map_sz; 780 gs->tree_map_in = iptr_in = (PetscInt*) malloc(sizeof(PetscInt)*(j+1)); 781 gs->tree_map_out = iptr_out = (PetscInt*) malloc(sizeof(PetscInt)*(j+1)); 782 783 /* search the longer of the two lists */ 784 /* note ... could save this info in get_ngh_buf and save searches */ 785 if (n<=nel) { 786 /* bijective fct w/remap - search elm list */ 787 for (i=0; i<n; i++) { 788 if ((j=PCTFS_ivec_binary_search(*tree_elms++,elms,nel))>=0) {*iptr_in++ = j; *iptr_out++ = i;} 789 } 790 } else { 791 for (i=0; i<nel; i++) { 792 if ((j=PCTFS_ivec_binary_search(*elms++,tree_elms,n))>=0) {*iptr_in++ = i; *iptr_out++ = j;} 793 } 794 } 795 796 /* sentinel */ 797 *iptr_in = *iptr_out = -1; 798 PetscFunctionReturn(0); 799 } 800 801 /******************************************************************************/ 802 static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs, PetscScalar *vals) 803 { 804 PetscInt *num, *map, **reduce; 805 PetscScalar tmp; 806 807 PetscFunctionBegin; 808 num = gs->num_gop_local_reduce; 809 reduce = gs->gop_local_reduce; 810 while ((map = *reduce++)) { 811 /* wall */ 812 if (*num == 2) { 813 num++; 814 vals[map[1]] = vals[map[0]]; 815 } else if (*num == 3) { /* corner shared by three elements */ 816 num++; 817 vals[map[2]] = vals[map[1]] = vals[map[0]]; 818 } else if (*num == 4) { /* corner shared by four elements */ 819 num++; 820 vals[map[3]] = vals[map[2]] = vals[map[1]] = vals[map[0]]; 821 } else { /* general case ... odd geoms ... 3D*/ 822 num++; 823 tmp = *(vals + *map++); 824 while (*map >= 0) *(vals + *map++) = tmp; 825 } 826 } 827 PetscFunctionReturn(0); 828 } 829 830 /******************************************************************************/ 831 static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs, PetscScalar *vals) 832 { 833 PetscInt *num, *map, **reduce; 834 PetscScalar tmp; 835 836 PetscFunctionBegin; 837 num = gs->num_local_reduce; 838 reduce = gs->local_reduce; 839 while ((map = *reduce)) { 840 /* wall */ 841 if (*num == 2) { 842 num++; reduce++; 843 vals[map[1]] = vals[map[0]] += vals[map[1]]; 844 } else if (*num == 3) { /* corner shared by three elements */ 845 num++; reduce++; 846 vals[map[2]]=vals[map[1]]=vals[map[0]]+=(vals[map[1]]+vals[map[2]]); 847 } else if (*num == 4) { /* corner shared by four elements */ 848 num++; reduce++; 849 vals[map[1]]=vals[map[2]]=vals[map[3]]=vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]); 850 } else { /* general case ... odd geoms ... 3D*/ 851 num++; 852 tmp = 0.0; 853 while (*map >= 0) tmp += *(vals + *map++); 854 855 map = *reduce++; 856 while (*map >= 0) *(vals + *map++) = tmp; 857 } 858 } 859 PetscFunctionReturn(0); 860 } 861 862 /******************************************************************************/ 863 static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals) 864 { 865 PetscInt *num, *map, **reduce; 866 PetscScalar *base; 867 868 PetscFunctionBegin; 869 num = gs->num_gop_local_reduce; 870 reduce = gs->gop_local_reduce; 871 while ((map = *reduce++)) { 872 /* wall */ 873 if (*num == 2) { 874 num++; 875 vals[map[0]] += vals[map[1]]; 876 } else if (*num == 3) { /* corner shared by three elements */ 877 num++; 878 vals[map[0]] += (vals[map[1]] + vals[map[2]]); 879 } else if (*num == 4) { /* corner shared by four elements */ 880 num++; 881 vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]); 882 } else { /* general case ... odd geoms ... 3D*/ 883 num++; 884 base = vals + *map++; 885 while (*map >= 0) *base += *(vals + *map++); 886 } 887 } 888 PetscFunctionReturn(0); 889 } 890 891 /******************************************************************************/ 892 PetscErrorCode PCTFS_gs_free(PCTFS_gs_id *gs) 893 { 894 PetscInt i; 895 PetscErrorCode ierr; 896 897 PetscFunctionBegin; 898 ierr = MPI_Comm_free(&gs->PCTFS_gs_comm);CHKERRMPI(ierr); 899 if (gs->nghs) free((void*) gs->nghs); 900 if (gs->pw_nghs) free((void*) gs->pw_nghs); 901 902 /* tree */ 903 if (gs->max_left_over) { 904 if (gs->tree_elms) free((void*) gs->tree_elms); 905 if (gs->tree_buf) free((void*) gs->tree_buf); 906 if (gs->tree_work) free((void*) gs->tree_work); 907 if (gs->tree_map_in) free((void*) gs->tree_map_in); 908 if (gs->tree_map_out) free((void*) gs->tree_map_out); 909 } 910 911 /* pairwise info */ 912 if (gs->num_pairs) { 913 /* should be NULL already */ 914 if (gs->ngh_buf) free((void*) gs->ngh_buf); 915 if (gs->elms) free((void*) gs->elms); 916 if (gs->local_elms) free((void*) gs->local_elms); 917 if (gs->companion) free((void*) gs->companion); 918 919 /* only set if pairwise */ 920 if (gs->vals) free((void*) gs->vals); 921 if (gs->in) free((void*) gs->in); 922 if (gs->out) free((void*) gs->out); 923 if (gs->msg_ids_in) free((void*) gs->msg_ids_in); 924 if (gs->msg_ids_out) free((void*) gs->msg_ids_out); 925 if (gs->pw_vals) free((void*) gs->pw_vals); 926 if (gs->pw_elm_list) free((void*) gs->pw_elm_list); 927 if (gs->node_list) { 928 for (i=0;i<gs->num_pairs;i++) { 929 if (gs->node_list[i]) { 930 free((void*) gs->node_list[i]); 931 } 932 } 933 free((void*) gs->node_list); 934 } 935 if (gs->msg_sizes) free((void*) gs->msg_sizes); 936 if (gs->pair_list) free((void*) gs->pair_list); 937 } 938 939 /* local info */ 940 if (gs->num_local_total>=0) { 941 for (i=0;i<gs->num_local_total+1;i++) { 942 if (gs->num_gop_local_reduce[i]) free((void*) gs->gop_local_reduce[i]); 943 } 944 } 945 946 /* if intersection tree/pairwise and local isn't empty */ 947 if (gs->gop_local_reduce) free((void*) gs->gop_local_reduce); 948 if (gs->num_gop_local_reduce) free((void*) gs->num_gop_local_reduce); 949 950 free((void*) gs); 951 PetscFunctionReturn(0); 952 } 953 954 /******************************************************************************/ 955 PetscErrorCode PCTFS_gs_gop_vec(PCTFS_gs_id *gs, PetscScalar *vals, const char *op, PetscInt step) 956 { 957 PetscErrorCode ierr; 958 959 PetscFunctionBegin; 960 switch (*op) { 961 case '+': 962 PCTFS_gs_gop_vec_plus(gs,vals,step); 963 break; 964 default: 965 ierr = PetscInfo1(0,"PCTFS_gs_gop_vec() :: %c is not a valid op\n",op[0]);CHKERRQ(ierr); 966 ierr = PetscInfo(0,"PCTFS_gs_gop_vec() :: default :: plus\n");CHKERRQ(ierr); 967 PCTFS_gs_gop_vec_plus(gs,vals,step); 968 break; 969 } 970 PetscFunctionReturn(0); 971 } 972 973 /******************************************************************************/ 974 static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step) 975 { 976 PetscFunctionBegin; 977 if (!gs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"PCTFS_gs_gop_vec() passed NULL gs handle!!!"); 978 979 /* local only operations!!! */ 980 if (gs->num_local) PCTFS_gs_gop_vec_local_plus(gs,vals,step); 981 982 /* if intersection tree/pairwise and local isn't empty */ 983 if (gs->num_local_gop) { 984 PCTFS_gs_gop_vec_local_in_plus(gs,vals,step); 985 986 /* pairwise */ 987 if (gs->num_pairs) PCTFS_gs_gop_vec_pairwise_plus(gs,vals,step); 988 989 /* tree */ 990 else if (gs->max_left_over) PCTFS_gs_gop_vec_tree_plus(gs,vals,step); 991 992 PCTFS_gs_gop_vec_local_out(gs,vals,step); 993 } else { /* if intersection tree/pairwise and local is empty */ 994 /* pairwise */ 995 if (gs->num_pairs) PCTFS_gs_gop_vec_pairwise_plus(gs,vals,step); 996 997 /* tree */ 998 else if (gs->max_left_over) PCTFS_gs_gop_vec_tree_plus(gs,vals,step); 999 } 1000 PetscFunctionReturn(0); 1001 } 1002 1003 /******************************************************************************/ 1004 static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step) 1005 { 1006 PetscInt *num, *map, **reduce; 1007 PetscScalar *base; 1008 1009 PetscFunctionBegin; 1010 num = gs->num_local_reduce; 1011 reduce = gs->local_reduce; 1012 while ((map = *reduce)) { 1013 base = vals + map[0] * step; 1014 1015 /* wall */ 1016 if (*num == 2) { 1017 num++; reduce++; 1018 PCTFS_rvec_add (base,vals+map[1]*step,step); 1019 PCTFS_rvec_copy(vals+map[1]*step,base,step); 1020 } else if (*num == 3) { /* corner shared by three elements */ 1021 num++; reduce++; 1022 PCTFS_rvec_add (base,vals+map[1]*step,step); 1023 PCTFS_rvec_add (base,vals+map[2]*step,step); 1024 PCTFS_rvec_copy(vals+map[2]*step,base,step); 1025 PCTFS_rvec_copy(vals+map[1]*step,base,step); 1026 } else if (*num == 4) { /* corner shared by four elements */ 1027 num++; reduce++; 1028 PCTFS_rvec_add (base,vals+map[1]*step,step); 1029 PCTFS_rvec_add (base,vals+map[2]*step,step); 1030 PCTFS_rvec_add (base,vals+map[3]*step,step); 1031 PCTFS_rvec_copy(vals+map[3]*step,base,step); 1032 PCTFS_rvec_copy(vals+map[2]*step,base,step); 1033 PCTFS_rvec_copy(vals+map[1]*step,base,step); 1034 } else { /* general case ... odd geoms ... 3D */ 1035 num++; 1036 while (*++map >= 0) PCTFS_rvec_add (base,vals+*map*step,step); 1037 1038 map = *reduce; 1039 while (*++map >= 0) PCTFS_rvec_copy(vals+*map*step,base,step); 1040 1041 reduce++; 1042 } 1043 } 1044 PetscFunctionReturn(0); 1045 } 1046 1047 /******************************************************************************/ 1048 static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step) 1049 { 1050 PetscInt *num, *map, **reduce; 1051 PetscScalar *base; 1052 1053 PetscFunctionBegin; 1054 num = gs->num_gop_local_reduce; 1055 reduce = gs->gop_local_reduce; 1056 while ((map = *reduce++)) { 1057 base = vals + map[0] * step; 1058 1059 /* wall */ 1060 if (*num == 2) { 1061 num++; 1062 PCTFS_rvec_add(base,vals+map[1]*step,step); 1063 } else if (*num == 3) { /* corner shared by three elements */ 1064 num++; 1065 PCTFS_rvec_add(base,vals+map[1]*step,step); 1066 PCTFS_rvec_add(base,vals+map[2]*step,step); 1067 } else if (*num == 4) { /* corner shared by four elements */ 1068 num++; 1069 PCTFS_rvec_add(base,vals+map[1]*step,step); 1070 PCTFS_rvec_add(base,vals+map[2]*step,step); 1071 PCTFS_rvec_add(base,vals+map[3]*step,step); 1072 } else { /* general case ... odd geoms ... 3D*/ 1073 num++; 1074 while (*++map >= 0) PCTFS_rvec_add(base,vals+*map*step,step); 1075 } 1076 } 1077 PetscFunctionReturn(0); 1078 } 1079 1080 /******************************************************************************/ 1081 static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step) 1082 { 1083 PetscInt *num, *map, **reduce; 1084 PetscScalar *base; 1085 1086 PetscFunctionBegin; 1087 num = gs->num_gop_local_reduce; 1088 reduce = gs->gop_local_reduce; 1089 while ((map = *reduce++)) { 1090 base = vals + map[0] * step; 1091 1092 /* wall */ 1093 if (*num == 2) { 1094 num++; 1095 PCTFS_rvec_copy(vals+map[1]*step,base,step); 1096 } else if (*num == 3) { /* corner shared by three elements */ 1097 num++; 1098 PCTFS_rvec_copy(vals+map[1]*step,base,step); 1099 PCTFS_rvec_copy(vals+map[2]*step,base,step); 1100 } else if (*num == 4) { /* corner shared by four elements */ 1101 num++; 1102 PCTFS_rvec_copy(vals+map[1]*step,base,step); 1103 PCTFS_rvec_copy(vals+map[2]*step,base,step); 1104 PCTFS_rvec_copy(vals+map[3]*step,base,step); 1105 } else { /* general case ... odd geoms ... 3D*/ 1106 num++; 1107 while (*++map >= 0) PCTFS_rvec_copy(vals+*map*step,base,step); 1108 } 1109 } 1110 PetscFunctionReturn(0); 1111 } 1112 1113 /******************************************************************************/ 1114 static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step) 1115 { 1116 PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2; 1117 PetscInt *iptr, *msg_list, *msg_size, **msg_nodes; 1118 PetscInt *pw, *list, *size, **nodes; 1119 MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out; 1120 MPI_Status status; 1121 PetscBLASInt i1 = 1,dstep; 1122 PetscErrorCode ierr; 1123 1124 PetscFunctionBegin; 1125 /* strip and load s */ 1126 msg_list = list = gs->pair_list; 1127 msg_size = size = gs->msg_sizes; 1128 msg_nodes = nodes = gs->node_list; 1129 iptr = pw = gs->pw_elm_list; 1130 dptr1 = dptr3 = gs->pw_vals; 1131 msg_ids_in = ids_in = gs->msg_ids_in; 1132 msg_ids_out = ids_out = gs->msg_ids_out; 1133 dptr2 = gs->out; 1134 in1=in2 = gs->in; 1135 1136 /* post the receives */ 1137 /* msg_nodes=nodes; */ 1138 do { 1139 /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the 1140 second one *list and do list++ afterwards */ 1141 ierr = MPI_Irecv(in1, *size *step, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in);CHKERRMPI(ierr); 1142 list++;msg_ids_in++; 1143 in1 += *size++ *step; 1144 } while (*++msg_nodes); 1145 msg_nodes=nodes; 1146 1147 /* load gs values into in out gs buffers */ 1148 while (*iptr >= 0) { 1149 PCTFS_rvec_copy(dptr3,in_vals + *iptr*step,step); 1150 dptr3+=step; 1151 iptr++; 1152 } 1153 1154 /* load out buffers and post the sends */ 1155 while ((iptr = *msg_nodes++)) { 1156 dptr3 = dptr2; 1157 while (*iptr >= 0) { 1158 PCTFS_rvec_copy(dptr2,dptr1 + *iptr*step,step); 1159 dptr2+=step; 1160 iptr++; 1161 } 1162 ierr = MPI_Isend(dptr3, *msg_size *step, MPIU_SCALAR, *msg_list, MSGTAG1+PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out);CHKERRMPI(ierr); 1163 msg_size++; msg_list++;msg_ids_out++; 1164 } 1165 1166 /* tree */ 1167 if (gs->max_left_over) PCTFS_gs_gop_vec_tree_plus(gs,in_vals,step); 1168 1169 /* process the received data */ 1170 msg_nodes=nodes; 1171 while ((iptr = *nodes++)) { 1172 PetscScalar d1 = 1.0; 1173 1174 /* Should I check the return value of MPI_Wait() or status? */ 1175 /* Can this loop be replaced by a call to MPI_Waitall()? */ 1176 ierr = MPI_Wait(ids_in, &status);CHKERRMPI(ierr); 1177 ids_in++; 1178 while (*iptr >= 0) { 1179 ierr = PetscBLASIntCast(step,&dstep);CHKERRQ(ierr); 1180 PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&dstep,&d1,in2,&i1,dptr1 + *iptr*step,&i1)); 1181 in2+=step; 1182 iptr++; 1183 } 1184 } 1185 1186 /* replace vals */ 1187 while (*pw >= 0) { 1188 PCTFS_rvec_copy(in_vals + *pw*step,dptr1,step); 1189 dptr1+=step; 1190 pw++; 1191 } 1192 1193 /* clear isend message handles */ 1194 /* This changed for clarity though it could be the same */ 1195 1196 /* Should I check the return value of MPI_Wait() or status? */ 1197 /* Can this loop be replaced by a call to MPI_Waitall()? */ 1198 while (*msg_nodes++) { 1199 ierr = MPI_Wait(ids_out, &status);CHKERRMPI(ierr); 1200 ids_out++; 1201 } 1202 PetscFunctionReturn(0); 1203 } 1204 1205 /******************************************************************************/ 1206 static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step) 1207 { 1208 PetscInt size, *in, *out; 1209 PetscScalar *buf, *work; 1210 PetscInt op[] = {GL_ADD,0}; 1211 PetscBLASInt i1 = 1; 1212 PetscErrorCode ierr; 1213 PetscBLASInt dstep; 1214 1215 PetscFunctionBegin; 1216 /* copy over to local variables */ 1217 in = gs->tree_map_in; 1218 out = gs->tree_map_out; 1219 buf = gs->tree_buf; 1220 work = gs->tree_work; 1221 size = gs->tree_nel*step; 1222 1223 /* zero out collection buffer */ 1224 PCTFS_rvec_zero(buf,size); 1225 1226 /* copy over my contributions */ 1227 while (*in >= 0) { 1228 ierr = PetscBLASIntCast(step,&dstep);CHKERRQ(ierr); 1229 PetscStackCallBLAS("BLAScopy",BLAScopy_(&dstep,vals + *in++ * step,&i1,buf + *out++ * step,&i1)); 1230 } 1231 1232 /* perform fan in/out on full buffer */ 1233 /* must change PCTFS_grop to handle the blas */ 1234 PCTFS_grop(buf,work,size,op); 1235 1236 /* reset */ 1237 in = gs->tree_map_in; 1238 out = gs->tree_map_out; 1239 1240 /* get the portion of the results I need */ 1241 while (*in >= 0) { 1242 ierr = PetscBLASIntCast(step,&dstep);CHKERRQ(ierr); 1243 PetscStackCallBLAS("BLAScopy",BLAScopy_(&dstep,buf + *out++ * step,&i1,vals + *in++ * step,&i1)); 1244 } 1245 PetscFunctionReturn(0); 1246 } 1247 1248 /******************************************************************************/ 1249 PetscErrorCode PCTFS_gs_gop_hc(PCTFS_gs_id *gs, PetscScalar *vals, const char *op, PetscInt dim) 1250 { 1251 PetscErrorCode ierr; 1252 1253 PetscFunctionBegin; 1254 switch (*op) { 1255 case '+': 1256 PCTFS_gs_gop_plus_hc(gs,vals,dim); 1257 break; 1258 default: 1259 ierr = PetscInfo1(0,"PCTFS_gs_gop_hc() :: %c is not a valid op\n",op[0]);CHKERRQ(ierr); 1260 ierr = PetscInfo(0,"PCTFS_gs_gop_hc() :: default :: plus\n");CHKERRQ(ierr); 1261 PCTFS_gs_gop_plus_hc(gs,vals,dim); 1262 break; 1263 } 1264 PetscFunctionReturn(0); 1265 } 1266 1267 /******************************************************************************/ 1268 static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim) 1269 { 1270 PetscFunctionBegin; 1271 /* if there's nothing to do return */ 1272 if (dim<=0) PetscFunctionReturn(0); 1273 1274 /* can't do more dimensions then exist */ 1275 dim = PetscMin(dim,PCTFS_i_log2_num_nodes); 1276 1277 /* local only operations!!! */ 1278 if (gs->num_local) PCTFS_gs_gop_local_plus(gs,vals); 1279 1280 /* if intersection tree/pairwise and local isn't empty */ 1281 if (gs->num_local_gop) { 1282 PCTFS_gs_gop_local_in_plus(gs,vals); 1283 1284 /* pairwise will do tree inside ... */ 1285 if (gs->num_pairs) PCTFS_gs_gop_pairwise_plus_hc(gs,vals,dim); /* tree only */ 1286 else if (gs->max_left_over) PCTFS_gs_gop_tree_plus_hc(gs,vals,dim); 1287 1288 PCTFS_gs_gop_local_out(gs,vals); 1289 } else { /* if intersection tree/pairwise and local is empty */ 1290 /* pairwise will do tree inside */ 1291 if (gs->num_pairs) PCTFS_gs_gop_pairwise_plus_hc(gs,vals,dim); /* tree */ 1292 else if (gs->max_left_over) PCTFS_gs_gop_tree_plus_hc(gs,vals,dim); 1293 } 1294 PetscFunctionReturn(0); 1295 } 1296 1297 /******************************************************************************/ 1298 static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim) 1299 { 1300 PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2; 1301 PetscInt *iptr, *msg_list, *msg_size, **msg_nodes; 1302 PetscInt *pw, *list, *size, **nodes; 1303 MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out; 1304 MPI_Status status; 1305 PetscInt i, mask=1; 1306 PetscErrorCode ierr; 1307 1308 PetscFunctionBegin; 1309 for (i=1; i<dim; i++) { mask<<=1; mask++; } 1310 1311 /* strip and load s */ 1312 msg_list = list = gs->pair_list; 1313 msg_size = size = gs->msg_sizes; 1314 msg_nodes = nodes = gs->node_list; 1315 iptr = pw = gs->pw_elm_list; 1316 dptr1 = dptr3 = gs->pw_vals; 1317 msg_ids_in = ids_in = gs->msg_ids_in; 1318 msg_ids_out = ids_out = gs->msg_ids_out; 1319 dptr2 = gs->out; 1320 in1 = in2 = gs->in; 1321 1322 /* post the receives */ 1323 /* msg_nodes=nodes; */ 1324 do { 1325 /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the 1326 second one *list and do list++ afterwards */ 1327 if ((PCTFS_my_id|mask)==(*list|mask)) { 1328 ierr = MPI_Irecv(in1, *size, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in);CHKERRMPI(ierr); 1329 list++; msg_ids_in++;in1 += *size++; 1330 } else { list++; size++; } 1331 } while (*++msg_nodes); 1332 1333 /* load gs values into in out gs buffers */ 1334 while (*iptr >= 0) *dptr3++ = *(in_vals + *iptr++); 1335 1336 /* load out buffers and post the sends */ 1337 msg_nodes=nodes; 1338 list = msg_list; 1339 while ((iptr = *msg_nodes++)) { 1340 if ((PCTFS_my_id|mask)==(*list|mask)) { 1341 dptr3 = dptr2; 1342 while (*iptr >= 0) *dptr2++ = *(dptr1 + *iptr++); 1343 /* CHECK PERSISTENT COMMS MODE FOR ALL THIS STUFF */ 1344 /* is msg_ids_out++ correct? */ 1345 ierr = MPI_Isend(dptr3, *msg_size, MPIU_SCALAR, *list, MSGTAG1+PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out);CHKERRMPI(ierr); 1346 msg_size++;list++;msg_ids_out++; 1347 } else {list++; msg_size++;} 1348 } 1349 1350 /* do the tree while we're waiting */ 1351 if (gs->max_left_over) PCTFS_gs_gop_tree_plus_hc(gs,in_vals,dim); 1352 1353 /* process the received data */ 1354 msg_nodes=nodes; 1355 list = msg_list; 1356 while ((iptr = *nodes++)) { 1357 if ((PCTFS_my_id|mask)==(*list|mask)) { 1358 /* Should I check the return value of MPI_Wait() or status? */ 1359 /* Can this loop be replaced by a call to MPI_Waitall()? */ 1360 ierr = MPI_Wait(ids_in, &status);CHKERRMPI(ierr); 1361 ids_in++; 1362 while (*iptr >= 0) *(dptr1 + *iptr++) += *in2++; 1363 } 1364 list++; 1365 } 1366 1367 /* replace vals */ 1368 while (*pw >= 0) *(in_vals + *pw++) = *dptr1++; 1369 1370 /* clear isend message handles */ 1371 /* This changed for clarity though it could be the same */ 1372 while (*msg_nodes++) { 1373 if ((PCTFS_my_id|mask)==(*msg_list|mask)) { 1374 /* Should I check the return value of MPI_Wait() or status? */ 1375 /* Can this loop be replaced by a call to MPI_Waitall()? */ 1376 ierr = MPI_Wait(ids_out, &status);CHKERRMPI(ierr); 1377 ids_out++; 1378 } 1379 msg_list++; 1380 } 1381 PetscFunctionReturn(0); 1382 } 1383 1384 /******************************************************************************/ 1385 static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim) 1386 { 1387 PetscInt size; 1388 PetscInt *in, *out; 1389 PetscScalar *buf, *work; 1390 PetscInt op[] = {GL_ADD,0}; 1391 1392 PetscFunctionBegin; 1393 in = gs->tree_map_in; 1394 out = gs->tree_map_out; 1395 buf = gs->tree_buf; 1396 work = gs->tree_work; 1397 size = gs->tree_nel; 1398 1399 PCTFS_rvec_zero(buf,size); 1400 1401 while (*in >= 0) *(buf + *out++) = *(vals + *in++); 1402 1403 in = gs->tree_map_in; 1404 out = gs->tree_map_out; 1405 1406 PCTFS_grop_hc(buf,work,size,op,dim); 1407 1408 while (*in >= 0) *(vals + *in++) = *(buf + *out++); 1409 PetscFunctionReturn(0); 1410 } 1411 1412