1 #define PETSCKSP_DLL 2 3 /*************************************xyt.c************************************ 4 Module Name: xyt 5 Module Info: 6 7 author: Henry M. Tufo III 8 e-mail: hmt@asci.uchicago.edu 9 contact: 10 +--------------------------------+--------------------------------+ 11 |MCS Division - Building 221 |Department of Computer Science | 12 |Argonne National Laboratory |Ryerson 152 | 13 |9700 S. Cass Avenue |The University of Chicago | 14 |Argonne, IL 60439 |Chicago, IL 60637 | 15 |(630) 252-5354/5986 ph/fx |(773) 702-6019/8487 ph/fx | 16 +--------------------------------+--------------------------------+ 17 18 Last Modification: 3.20.01 19 **************************************xyt.c***********************************/ 20 21 22 /*************************************xyt.c************************************ 23 NOTES ON USAGE: 24 25 **************************************xyt.c***********************************/ 26 #include "src/ksp/pc/impls/tfs/tfs.h" 27 28 #define LEFT -1 29 #define RIGHT 1 30 #define BOTH 0 31 #define MAX_FORTRAN_HANDLES 10 32 33 typedef struct xyt_solver_info { 34 int n, m, n_global, m_global; 35 int nnz, max_nnz, msg_buf_sz; 36 int *nsep, *lnsep, *fo, nfo, *stages; 37 int *xcol_sz, *xcol_indices; 38 PetscScalar **xcol_vals, *x, *solve_uu, *solve_w; 39 int *ycol_sz, *ycol_indices; 40 PetscScalar **ycol_vals, *y; 41 int nsolves; 42 PetscScalar tot_solve_time; 43 } xyt_info; 44 45 46 typedef struct matvec_info { 47 int n, m, n_global, m_global; 48 int *local2global; 49 gs_ADT gs_handle; 50 PetscErrorCode (*matvec)(struct matvec_info*,PetscScalar*,PetscScalar*); 51 void *grid_data; 52 } mv_info; 53 54 struct xyt_CDT{ 55 int id; 56 int ns; 57 int level; 58 xyt_info *info; 59 mv_info *mvi; 60 }; 61 62 static int n_xyt=0; 63 static int n_xyt_handles=0; 64 65 /* prototypes */ 66 static void do_xyt_solve(xyt_ADT xyt_handle, PetscScalar *rhs); 67 static void check_init(void); 68 static void check_handle(xyt_ADT xyt_handle); 69 static void det_separators(xyt_ADT xyt_handle); 70 static void do_matvec(mv_info *A, PetscScalar *v, PetscScalar *u); 71 static int xyt_generate(xyt_ADT xyt_handle); 72 static int do_xyt_factor(xyt_ADT xyt_handle); 73 static mv_info *set_mvi(int *local2global, int n, int m, void *matvec, void *grid_data); 74 75 76 /*************************************xyt.c************************************ 77 Function: XYT_new() 78 79 Input : 80 Output: 81 Return: 82 Description: 83 **************************************xyt.c***********************************/ 84 xyt_ADT 85 XYT_new(void) 86 { 87 xyt_ADT xyt_handle; 88 89 90 91 /* rolling count on n_xyt ... pot. problem here */ 92 n_xyt_handles++; 93 xyt_handle = (xyt_ADT)malloc(sizeof(struct xyt_CDT)); 94 xyt_handle->id = ++n_xyt; 95 xyt_handle->info = NULL; 96 xyt_handle->mvi = NULL; 97 98 return(xyt_handle); 99 } 100 101 102 /*************************************xyt.c************************************ 103 Function: XYT_factor() 104 105 Input : 106 Output: 107 Return: 108 Description: 109 **************************************xyt.c***********************************/ 110 int 111 XYT_factor(xyt_ADT xyt_handle, /* prev. allocated xyt handle */ 112 int *local2global, /* global column mapping */ 113 int n, /* local num rows */ 114 int m, /* local num cols */ 115 void *matvec, /* b_loc=A_local.x_loc */ 116 void *grid_data /* grid data for matvec */ 117 ) 118 { 119 120 check_init(); 121 check_handle(xyt_handle); 122 123 /* only 2^k for now and all nodes participating */ 124 if ((1<<(xyt_handle->level=i_log2_num_nodes))!=num_nodes) 125 {error_msg_fatal("only 2^k for now and MPI_COMM_WORLD!!! %d != %d\n",1<<i_log2_num_nodes,num_nodes);} 126 127 /* space for X info */ 128 xyt_handle->info = (xyt_info*)malloc(sizeof(xyt_info)); 129 130 /* set up matvec handles */ 131 xyt_handle->mvi = set_mvi(local2global, n, m, matvec, grid_data); 132 133 /* matrix is assumed to be of full rank */ 134 /* LATER we can reset to indicate rank def. */ 135 xyt_handle->ns=0; 136 137 /* determine separators and generate firing order - NB xyt info set here */ 138 det_separators(xyt_handle); 139 140 return(do_xyt_factor(xyt_handle)); 141 } 142 143 144 /*************************************xyt.c************************************ 145 Function: XYT_solve 146 147 Input : 148 Output: 149 Return: 150 Description: 151 **************************************xyt.c***********************************/ 152 int 153 XYT_solve(xyt_ADT xyt_handle, double *x, double *b) 154 { 155 check_init(); 156 check_handle(xyt_handle); 157 158 /* need to copy b into x? */ 159 if (b) 160 {rvec_copy(x,b,xyt_handle->mvi->n);} 161 do_xyt_solve(xyt_handle,x); 162 163 return(0); 164 } 165 166 167 /*************************************xyt.c************************************ 168 Function: XYT_free() 169 170 Input : 171 Output: 172 Return: 173 Description: 174 **************************************xyt.c***********************************/ 175 int 176 XYT_free(xyt_ADT xyt_handle) 177 { 178 check_init(); 179 check_handle(xyt_handle); 180 n_xyt_handles--; 181 182 free(xyt_handle->info->nsep); 183 free(xyt_handle->info->lnsep); 184 free(xyt_handle->info->fo); 185 free(xyt_handle->info->stages); 186 free(xyt_handle->info->solve_uu); 187 free(xyt_handle->info->solve_w); 188 free(xyt_handle->info->x); 189 free(xyt_handle->info->xcol_vals); 190 free(xyt_handle->info->xcol_sz); 191 free(xyt_handle->info->xcol_indices); 192 free(xyt_handle->info->y); 193 free(xyt_handle->info->ycol_vals); 194 free(xyt_handle->info->ycol_sz); 195 free(xyt_handle->info->ycol_indices); 196 free(xyt_handle->info); 197 free(xyt_handle->mvi->local2global); 198 gs_free(xyt_handle->mvi->gs_handle); 199 free(xyt_handle->mvi); 200 free(xyt_handle); 201 202 203 /* if the check fails we nuke */ 204 /* if NULL pointer passed to free we nuke */ 205 /* if the calls to free fail that's not my problem */ 206 return(0); 207 } 208 209 210 211 /*************************************xyt.c************************************ 212 Function: 213 214 Input : 215 Output: 216 Return: 217 Description: 218 **************************************xyt.c***********************************/ 219 int 220 XYT_stats(xyt_ADT xyt_handle) 221 { 222 int op[] = {NON_UNIFORM,GL_MIN,GL_MAX,GL_ADD,GL_MIN,GL_MAX,GL_ADD,GL_MIN,GL_MAX,GL_ADD}; 223 int fop[] = {NON_UNIFORM,GL_MIN,GL_MAX,GL_ADD}; 224 int vals[9], work[9]; 225 PetscScalar fvals[3], fwork[3]; 226 227 228 check_init(); 229 check_handle(xyt_handle); 230 231 /* if factorization not done there are no stats */ 232 if (!xyt_handle->info||!xyt_handle->mvi) 233 { 234 if (!my_id) 235 {printf("XYT_stats() :: no stats available!\n");} 236 return 1; 237 } 238 239 vals[0]=vals[1]=vals[2]=xyt_handle->info->nnz; 240 vals[3]=vals[4]=vals[5]=xyt_handle->mvi->n; 241 vals[6]=vals[7]=vals[8]=xyt_handle->info->msg_buf_sz; 242 giop(vals,work,sizeof(op)/sizeof(op[0])-1,op); 243 244 fvals[0]=fvals[1]=fvals[2] 245 =xyt_handle->info->tot_solve_time/xyt_handle->info->nsolves++; 246 grop(fvals,fwork,sizeof(fop)/sizeof(fop[0])-1,fop); 247 248 if (!my_id) 249 { 250 printf("%d :: min xyt_nnz=%d\n",my_id,vals[0]); 251 printf("%d :: max xyt_nnz=%d\n",my_id,vals[1]); 252 printf("%d :: avg xyt_nnz=%g\n",my_id,1.0*vals[2]/num_nodes); 253 printf("%d :: tot xyt_nnz=%d\n",my_id,vals[2]); 254 printf("%d :: xyt C(2d) =%g\n",my_id,vals[2]/(pow(1.0*vals[5],1.5))); 255 printf("%d :: xyt C(3d) =%g\n",my_id,vals[2]/(pow(1.0*vals[5],1.6667))); 256 printf("%d :: min xyt_n =%d\n",my_id,vals[3]); 257 printf("%d :: max xyt_n =%d\n",my_id,vals[4]); 258 printf("%d :: avg xyt_n =%g\n",my_id,1.0*vals[5]/num_nodes); 259 printf("%d :: tot xyt_n =%d\n",my_id,vals[5]); 260 printf("%d :: min xyt_buf=%d\n",my_id,vals[6]); 261 printf("%d :: max xyt_buf=%d\n",my_id,vals[7]); 262 printf("%d :: avg xyt_buf=%g\n",my_id,1.0*vals[8]/num_nodes); 263 printf("%d :: min xyt_slv=%g\n",my_id,fvals[0]); 264 printf("%d :: max xyt_slv=%g\n",my_id,fvals[1]); 265 printf("%d :: avg xyt_slv=%g\n",my_id,fvals[2]/num_nodes); 266 } 267 268 return(0); 269 } 270 271 272 /*************************************xyt.c************************************ 273 Function: do_xyt_factor 274 275 Input : 276 Output: 277 Return: 278 Description: get A_local, local portion of global coarse matrix which 279 is a row dist. nxm matrix w/ n<m. 280 o my_ml holds address of ML struct associated w/A_local and coarse grid 281 o local2global holds global number of column i (i=0,...,m-1) 282 o local2global holds global number of row i (i=0,...,n-1) 283 o mylocmatvec performs A_local . vec_local (note that gs is performed using 284 gs_init/gop). 285 286 mylocmatvec = my_ml->Amat[grid_tag].matvec->external; 287 mylocmatvec (void :: void *data, double *in, double *out) 288 **************************************xyt.c***********************************/ 289 static 290 int 291 do_xyt_factor(xyt_ADT xyt_handle) 292 { 293 int flag; 294 295 296 flag=xyt_generate(xyt_handle); 297 return(flag); 298 } 299 300 301 /*************************************xyt.c************************************ 302 Function: 303 304 Input : 305 Output: 306 Return: 307 Description: 308 **************************************xyt.c***********************************/ 309 static 310 int 311 xyt_generate(xyt_ADT xyt_handle) 312 { 313 int i,j,k,idx; 314 int dim, col; 315 PetscScalar *u, *uu, *v, *z, *w, alpha, alpha_w; 316 int *segs; 317 int op[] = {GL_ADD,0}; 318 int off, len; 319 PetscScalar *x_ptr, *y_ptr; 320 int *iptr, flag; 321 int start=0, end, work; 322 int op2[] = {GL_MIN,0}; 323 gs_ADT gs_handle; 324 int *nsep, *lnsep, *fo; 325 int a_n=xyt_handle->mvi->n; 326 int a_m=xyt_handle->mvi->m; 327 int *a_local2global=xyt_handle->mvi->local2global; 328 int level; 329 int n, m; 330 int *xcol_sz, *xcol_indices, *stages; 331 PetscScalar **xcol_vals, *x; 332 int *ycol_sz, *ycol_indices; 333 PetscScalar **ycol_vals, *y; 334 int n_global; 335 int xt_nnz=0, xt_max_nnz=0; 336 int yt_nnz=0, yt_max_nnz=0; 337 int xt_zero_nnz =0; 338 int xt_zero_nnz_0=0; 339 int yt_zero_nnz =0; 340 int yt_zero_nnz_0=0; 341 PetscBLASInt i1 = 1; 342 PetscScalar dm1 = -1.0; 343 344 n=xyt_handle->mvi->n; 345 nsep=xyt_handle->info->nsep; 346 lnsep=xyt_handle->info->lnsep; 347 fo=xyt_handle->info->fo; 348 end=lnsep[0]; 349 level=xyt_handle->level; 350 gs_handle=xyt_handle->mvi->gs_handle; 351 352 /* is there a null space? */ 353 /* LATER add in ability to detect null space by checking alpha */ 354 for (i=0, j=0; i<=level; i++) 355 {j+=nsep[i];} 356 357 m = j-xyt_handle->ns; 358 if (m!=j) 359 {printf("xyt_generate() :: null space exists %d %d %d\n",m,j,xyt_handle->ns);} 360 361 error_msg_warning("xyt_generate() :: X(%d,%d)\n",n,m); 362 363 /* get and initialize storage for x local */ 364 /* note that x local is nxm and stored by columns */ 365 xcol_sz = (int*) malloc(m*sizeof(PetscInt)); 366 xcol_indices = (int*) malloc((2*m+1)*sizeof(int)); 367 xcol_vals = (PetscScalar **) malloc(m*sizeof(PetscScalar *)); 368 for (i=j=0; i<m; i++, j+=2) 369 { 370 xcol_indices[j]=xcol_indices[j+1]=xcol_sz[i]=-1; 371 xcol_vals[i] = NULL; 372 } 373 xcol_indices[j]=-1; 374 375 /* get and initialize storage for y local */ 376 /* note that y local is nxm and stored by columns */ 377 ycol_sz = (int*) malloc(m*sizeof(PetscInt)); 378 ycol_indices = (int*) malloc((2*m+1)*sizeof(int)); 379 ycol_vals = (PetscScalar **) malloc(m*sizeof(PetscScalar *)); 380 for (i=j=0; i<m; i++, j+=2) 381 { 382 ycol_indices[j]=ycol_indices[j+1]=ycol_sz[i]=-1; 383 ycol_vals[i] = NULL; 384 } 385 ycol_indices[j]=-1; 386 387 /* size of separators for each sub-hc working from bottom of tree to top */ 388 /* this looks like nsep[]=segments */ 389 stages = (int*) malloc((level+1)*sizeof(PetscInt)); 390 segs = (int*) malloc((level+1)*sizeof(PetscInt)); 391 ivec_zero(stages,level+1); 392 ivec_copy(segs,nsep,level+1); 393 for (i=0; i<level; i++) 394 {segs[i+1] += segs[i];} 395 stages[0] = segs[0]; 396 397 /* temporary vectors */ 398 u = (PetscScalar *) malloc(n*sizeof(PetscScalar)); 399 z = (PetscScalar *) malloc(n*sizeof(PetscScalar)); 400 v = (PetscScalar *) malloc(a_m*sizeof(PetscScalar)); 401 uu = (PetscScalar *) malloc(m*sizeof(PetscScalar)); 402 w = (PetscScalar *) malloc(m*sizeof(PetscScalar)); 403 404 /* extra nnz due to replication of vertices across separators */ 405 for (i=1, j=0; i<=level; i++) 406 {j+=nsep[i];} 407 408 /* storage for sparse x values */ 409 n_global = xyt_handle->info->n_global; 410 xt_max_nnz = yt_max_nnz = (int)(2.5*pow(1.0*n_global,1.6667) + j*n/2)/num_nodes; 411 x = (PetscScalar *) malloc(xt_max_nnz*sizeof(PetscScalar)); 412 y = (PetscScalar *) malloc(yt_max_nnz*sizeof(PetscScalar)); 413 414 /* LATER - can embed next sep to fire in gs */ 415 /* time to make the donuts - generate X factor */ 416 for (dim=i=j=0;i<m;i++) 417 { 418 /* time to move to the next level? */ 419 while (i==segs[dim]) 420 { 421 #ifdef SAFE 422 if (dim==level) 423 {error_msg_fatal("dim about to exceed level\n"); break;} 424 #endif 425 426 stages[dim++]=i; 427 end+=lnsep[dim]; 428 } 429 stages[dim]=i; 430 431 /* which column are we firing? */ 432 /* i.e. set v_l */ 433 /* use new seps and do global min across hc to determine which one to fire */ 434 (start<end) ? (col=fo[start]) : (col=INT_MAX); 435 giop_hc(&col,&work,1,op2,dim); 436 437 /* shouldn't need this */ 438 if (col==INT_MAX) 439 { 440 error_msg_warning("hey ... col==INT_MAX??\n"); 441 continue; 442 } 443 444 /* do I own it? I should */ 445 rvec_zero(v ,a_m); 446 if (col==fo[start]) 447 { 448 start++; 449 idx=ivec_linear_search(col, a_local2global, a_n); 450 if (idx!=-1) 451 {v[idx] = 1.0; j++;} 452 else 453 {error_msg_fatal("NOT FOUND!\n");} 454 } 455 else 456 { 457 idx=ivec_linear_search(col, a_local2global, a_m); 458 if (idx!=-1) 459 {v[idx] = 1.0;} 460 } 461 462 /* perform u = A.v_l */ 463 rvec_zero(u,n); 464 do_matvec(xyt_handle->mvi,v,u); 465 466 /* uu = X^T.u_l (local portion) */ 467 /* technically only need to zero out first i entries */ 468 /* later turn this into an XYT_solve call ? */ 469 rvec_zero(uu,m); 470 y_ptr=y; 471 iptr = ycol_indices; 472 for (k=0; k<i; k++) 473 { 474 off = *iptr++; 475 len = *iptr++; 476 477 uu[k] = BLASdot_(&len,u+off,&i1,y_ptr,&i1); 478 y_ptr+=len; 479 } 480 481 /* uu = X^T.u_l (comm portion) */ 482 ssgl_radd (uu, w, dim, stages); 483 484 /* z = X.uu */ 485 rvec_zero(z,n); 486 x_ptr=x; 487 iptr = xcol_indices; 488 for (k=0; k<i; k++) 489 { 490 off = *iptr++; 491 len = *iptr++; 492 493 BLASaxpy_(&len,&uu[k],x_ptr,&i1,z+off,&i1); 494 x_ptr+=len; 495 } 496 497 /* compute v_l = v_l - z */ 498 rvec_zero(v+a_n,a_m-a_n); 499 BLASaxpy_(&n,&dm1,z,&i1,v,&i1); 500 501 /* compute u_l = A.v_l */ 502 if (a_n!=a_m) 503 {gs_gop_hc(gs_handle,v,"+\0",dim);} 504 rvec_zero(u,n); 505 do_matvec(xyt_handle->mvi,v,u); 506 507 /* compute sqrt(alpha) = sqrt(u_l^T.u_l) - local portion */ 508 alpha = BLASdot_(&n,u,&i1,u,&i1); 509 /* compute sqrt(alpha) = sqrt(u_l^T.u_l) - comm portion */ 510 grop_hc(&alpha, &alpha_w, 1, op, dim); 511 512 alpha = (PetscScalar) sqrt((double)alpha); 513 514 /* check for small alpha */ 515 /* LATER use this to detect and determine null space */ 516 if (fabs(alpha)<1.0e-14) 517 {error_msg_fatal("bad alpha! %g\n",alpha);} 518 519 /* compute v_l = v_l/sqrt(alpha) */ 520 rvec_scale(v,1.0/alpha,n); 521 rvec_scale(u,1.0/alpha,n); 522 523 /* add newly generated column, v_l, to X */ 524 flag = 1; 525 off=len=0; 526 for (k=0; k<n; k++) 527 { 528 if (v[k]!=0.0) 529 { 530 len=k; 531 if (flag) 532 {off=k; flag=0;} 533 } 534 } 535 536 len -= (off-1); 537 538 if (len>0) 539 { 540 if ((xt_nnz+len)>xt_max_nnz) 541 { 542 error_msg_warning("increasing space for X by 2x!\n"); 543 xt_max_nnz *= 2; 544 x_ptr = (PetscScalar *) malloc(xt_max_nnz*sizeof(PetscScalar)); 545 rvec_copy(x_ptr,x,xt_nnz); 546 free(x); 547 x = x_ptr; 548 x_ptr+=xt_nnz; 549 } 550 xt_nnz += len; 551 rvec_copy(x_ptr,v+off,len); 552 553 /* keep track of number of zeros */ 554 if (dim) 555 { 556 for (k=0; k<len; k++) 557 { 558 if (x_ptr[k]==0.0) 559 {xt_zero_nnz++;} 560 } 561 } 562 else 563 { 564 for (k=0; k<len; k++) 565 { 566 if (x_ptr[k]==0.0) 567 {xt_zero_nnz_0++;} 568 } 569 } 570 xcol_indices[2*i] = off; 571 xcol_sz[i] = xcol_indices[2*i+1] = len; 572 xcol_vals[i] = x_ptr; 573 } 574 else 575 { 576 xcol_indices[2*i] = 0; 577 xcol_sz[i] = xcol_indices[2*i+1] = 0; 578 xcol_vals[i] = x_ptr; 579 } 580 581 582 /* add newly generated column, u_l, to Y */ 583 flag = 1; 584 off=len=0; 585 for (k=0; k<n; k++) 586 { 587 if (u[k]!=0.0) 588 { 589 len=k; 590 if (flag) 591 {off=k; flag=0;} 592 } 593 } 594 595 len -= (off-1); 596 597 if (len>0) 598 { 599 if ((yt_nnz+len)>yt_max_nnz) 600 { 601 error_msg_warning("increasing space for Y by 2x!\n"); 602 yt_max_nnz *= 2; 603 y_ptr = (PetscScalar *) malloc(yt_max_nnz*sizeof(PetscScalar)); 604 rvec_copy(y_ptr,y,yt_nnz); 605 free(y); 606 y = y_ptr; 607 y_ptr+=yt_nnz; 608 } 609 yt_nnz += len; 610 rvec_copy(y_ptr,u+off,len); 611 612 /* keep track of number of zeros */ 613 if (dim) 614 { 615 for (k=0; k<len; k++) 616 { 617 if (y_ptr[k]==0.0) 618 {yt_zero_nnz++;} 619 } 620 } 621 else 622 { 623 for (k=0; k<len; k++) 624 { 625 if (y_ptr[k]==0.0) 626 {yt_zero_nnz_0++;} 627 } 628 } 629 ycol_indices[2*i] = off; 630 ycol_sz[i] = ycol_indices[2*i+1] = len; 631 ycol_vals[i] = y_ptr; 632 } 633 else 634 { 635 ycol_indices[2*i] = 0; 636 ycol_sz[i] = ycol_indices[2*i+1] = 0; 637 ycol_vals[i] = y_ptr; 638 } 639 } 640 641 /* close off stages for execution phase */ 642 while (dim!=level) 643 { 644 stages[dim++]=i; 645 error_msg_warning("disconnected!!! dim(%d)!=level(%d)\n",dim,level); 646 } 647 stages[dim]=i; 648 649 xyt_handle->info->n=xyt_handle->mvi->n; 650 xyt_handle->info->m=m; 651 xyt_handle->info->nnz=xt_nnz + yt_nnz; 652 xyt_handle->info->max_nnz=xt_max_nnz + yt_max_nnz; 653 xyt_handle->info->msg_buf_sz=stages[level]-stages[0]; 654 xyt_handle->info->solve_uu = (PetscScalar *) malloc(m*sizeof(PetscScalar)); 655 xyt_handle->info->solve_w = (PetscScalar *) malloc(m*sizeof(PetscScalar)); 656 xyt_handle->info->x=x; 657 xyt_handle->info->xcol_vals=xcol_vals; 658 xyt_handle->info->xcol_sz=xcol_sz; 659 xyt_handle->info->xcol_indices=xcol_indices; 660 xyt_handle->info->stages=stages; 661 xyt_handle->info->y=y; 662 xyt_handle->info->ycol_vals=ycol_vals; 663 xyt_handle->info->ycol_sz=ycol_sz; 664 xyt_handle->info->ycol_indices=ycol_indices; 665 666 free(segs); 667 free(u); 668 free(v); 669 free(uu); 670 free(z); 671 free(w); 672 673 return(0); 674 } 675 676 677 /*************************************xyt.c************************************ 678 Function: 679 680 Input : 681 Output: 682 Return: 683 Description: 684 **************************************xyt.c***********************************/ 685 static 686 void 687 do_xyt_solve(xyt_ADT xyt_handle, PetscScalar *uc) 688 { 689 int off, len, *iptr; 690 int level =xyt_handle->level; 691 int n =xyt_handle->info->n; 692 int m =xyt_handle->info->m; 693 int *stages =xyt_handle->info->stages; 694 int *xcol_indices=xyt_handle->info->xcol_indices; 695 int *ycol_indices=xyt_handle->info->ycol_indices; 696 PetscScalar *x_ptr, *y_ptr, *uu_ptr; 697 PetscScalar *solve_uu=xyt_handle->info->solve_uu; 698 PetscScalar *solve_w =xyt_handle->info->solve_w; 699 PetscScalar *x =xyt_handle->info->x; 700 PetscScalar *y =xyt_handle->info->y; 701 PetscBLASInt i1 = 1; 702 703 704 uu_ptr=solve_uu; 705 rvec_zero(uu_ptr,m); 706 707 /* x = X.Y^T.b */ 708 /* uu = Y^T.b */ 709 for (y_ptr=y,iptr=ycol_indices; *iptr!=-1; y_ptr+=len) 710 { 711 off=*iptr++; len=*iptr++; 712 *uu_ptr++ = BLASdot_(&len,uc+off,&i1,y_ptr,&i1); 713 } 714 715 /* comunication of beta */ 716 uu_ptr=solve_uu; 717 if (level) {ssgl_radd(uu_ptr, solve_w, level, stages);} 718 719 rvec_zero(uc,n); 720 721 /* x = X.uu */ 722 for (x_ptr=x,iptr=xcol_indices; *iptr!=-1; x_ptr+=len) 723 { 724 off=*iptr++; len=*iptr++; 725 BLASaxpy_(&len,uu_ptr++,x_ptr,&i1,uc+off,&i1); 726 } 727 728 } 729 730 731 /*************************************Xyt.c************************************ 732 Function: check_init 733 734 Input : 735 Output: 736 Return: 737 Description: 738 **************************************xyt.c***********************************/ 739 static 740 void 741 check_init(void) 742 { 743 comm_init(); 744 745 746 } 747 748 749 /*************************************xyt.c************************************ 750 Function: check_handle() 751 752 Input : 753 Output: 754 Return: 755 Description: 756 **************************************xyt.c***********************************/ 757 static 758 void 759 check_handle(xyt_ADT xyt_handle) 760 { 761 #ifdef SAFE 762 int vals[2], work[2], op[] = {NON_UNIFORM,GL_MIN,GL_MAX}; 763 #endif 764 765 if (xyt_handle==NULL) 766 {error_msg_fatal("check_handle() :: bad handle :: NULL %d\n",xyt_handle);} 767 768 #ifdef SAFE 769 vals[0]=vals[1]=xyt_handle->id; 770 giop(vals,work,sizeof(op)/sizeof(op[0])-1,op); 771 if ((vals[0]!=vals[1])||(xyt_handle->id<=0)) 772 {error_msg_fatal("check_handle() :: bad handle :: id mismatch min/max %d/%d %d\n", 773 vals[0],vals[1], xyt_handle->id);} 774 #endif 775 776 } 777 778 779 /*************************************xyt.c************************************ 780 Function: det_separators 781 782 Input : 783 Output: 784 Return: 785 Description: 786 det_separators(xyt_handle, local2global, n, m, mylocmatvec, grid_data); 787 **************************************xyt.c***********************************/ 788 static 789 void 790 det_separators(xyt_ADT xyt_handle) 791 { 792 int i, ct, id; 793 int mask, edge, *iptr; 794 int *dir, *used; 795 int sum[4], w[4]; 796 PetscScalar rsum[4], rw[4]; 797 int op[] = {GL_ADD,0}; 798 PetscScalar *lhs, *rhs; 799 int *nsep, *lnsep, *fo, nfo=0; 800 gs_ADT gs_handle=xyt_handle->mvi->gs_handle; 801 int *local2global=xyt_handle->mvi->local2global; 802 int n=xyt_handle->mvi->n; 803 int m=xyt_handle->mvi->m; 804 int level=xyt_handle->level; 805 int shared=FALSE; 806 807 dir = (int*)malloc(sizeof(PetscInt)*(level+1)); 808 nsep = (int*)malloc(sizeof(PetscInt)*(level+1)); 809 lnsep= (int*)malloc(sizeof(PetscInt)*(level+1)); 810 fo = (int*)malloc(sizeof(PetscInt)*(n+1)); 811 used = (int*)malloc(sizeof(PetscInt)*n); 812 813 ivec_zero(dir ,level+1); 814 ivec_zero(nsep ,level+1); 815 ivec_zero(lnsep,level+1); 816 ivec_set (fo ,-1,n+1); 817 ivec_zero(used,n); 818 819 lhs = (double*)malloc(sizeof(PetscScalar)*m); 820 rhs = (double*)malloc(sizeof(PetscScalar)*m); 821 822 /* determine the # of unique dof */ 823 rvec_zero(lhs,m); 824 rvec_set(lhs,1.0,n); 825 gs_gop_hc(gs_handle,lhs,"+\0",level); 826 error_msg_warning("done first gs_gop_hc\n"); 827 rvec_zero(rsum,2); 828 for (ct=i=0;i<n;i++) 829 { 830 if (lhs[i]!=0.0) 831 {rsum[0]+=1.0/lhs[i]; rsum[1]+=lhs[i];} 832 833 if (lhs[i]!=1.0) 834 { 835 shared=TRUE; 836 } 837 } 838 839 grop_hc(rsum,rw,2,op,level); 840 rsum[0]+=0.1; 841 rsum[1]+=0.1; 842 843 /* 844 if (!my_id) 845 { 846 printf("xyt n unique = %d (%g)\n",(int) rsum[0], rsum[0]); 847 printf("xyt n shared = %d (%g)\n",(int) rsum[1], rsum[1]); 848 } 849 */ 850 851 xyt_handle->info->n_global=xyt_handle->info->m_global=(int) rsum[0]; 852 xyt_handle->mvi->n_global =xyt_handle->mvi->m_global =(int) rsum[0]; 853 854 /* determine separator sets top down */ 855 if (shared) 856 { 857 /* solution is to do as in the symmetric shared case but then */ 858 /* pick the sub-hc with the most free dofs and do a mat-vec */ 859 /* and pick up the responses on the other sub-hc from the */ 860 /* initial separator set obtained from the symm. shared case */ 861 error_msg_fatal("shared dof separator determination not ready ... see hmt!!!\n"); 862 for (iptr=fo+n,id=my_id,mask=num_nodes>>1,edge=level;edge>0;edge--,mask>>=1) 863 { 864 /* set rsh of hc, fire, and collect lhs responses */ 865 (id<mask) ? rvec_zero(lhs,m) : rvec_set(lhs,1.0,m); 866 gs_gop_hc(gs_handle,lhs,"+\0",edge); 867 868 /* set lsh of hc, fire, and collect rhs responses */ 869 (id<mask) ? rvec_set(rhs,1.0,m) : rvec_zero(rhs,m); 870 gs_gop_hc(gs_handle,rhs,"+\0",edge); 871 872 for (i=0;i<n;i++) 873 { 874 if (id< mask) 875 { 876 if (lhs[i]!=0.0) 877 {lhs[i]=1.0;} 878 } 879 if (id>=mask) 880 { 881 if (rhs[i]!=0.0) 882 {rhs[i]=1.0;} 883 } 884 } 885 886 if (id< mask) 887 {gs_gop_hc(gs_handle,lhs,"+\0",edge-1);} 888 else 889 {gs_gop_hc(gs_handle,rhs,"+\0",edge-1);} 890 891 /* count number of dofs I own that have signal and not in sep set */ 892 rvec_zero(rsum,4); 893 for (ivec_zero(sum,4),ct=i=0;i<n;i++) 894 { 895 if (!used[i]) 896 { 897 /* number of unmarked dofs on node */ 898 ct++; 899 /* number of dofs to be marked on lhs hc */ 900 if (id< mask) 901 { 902 if (lhs[i]!=0.0) 903 {sum[0]++; rsum[0]+=1.0/lhs[i];} 904 } 905 /* number of dofs to be marked on rhs hc */ 906 if (id>=mask) 907 { 908 if (rhs[i]!=0.0) 909 {sum[1]++; rsum[1]+=1.0/rhs[i];} 910 } 911 } 912 } 913 914 /* go for load balance - choose half with most unmarked dofs, bias LHS */ 915 (id<mask) ? (sum[2]=ct) : (sum[3]=ct); 916 (id<mask) ? (rsum[2]=ct) : (rsum[3]=ct); 917 giop_hc(sum,w,4,op,edge); 918 grop_hc(rsum,rw,4,op,edge); 919 rsum[0]+=0.1; rsum[1]+=0.1; rsum[2]+=0.1; rsum[3]+=0.1; 920 921 if (id<mask) 922 { 923 /* mark dofs I own that have signal and not in sep set */ 924 for (ct=i=0;i<n;i++) 925 { 926 if ((!used[i])&&(lhs[i]!=0.0)) 927 { 928 ct++; nfo++; 929 930 if (nfo>n) 931 {error_msg_fatal("nfo about to exceed n\n");} 932 933 *--iptr = local2global[i]; 934 used[i]=edge; 935 } 936 } 937 if (ct>1) {ivec_sort(iptr,ct);} 938 939 lnsep[edge]=ct; 940 nsep[edge]=(int) rsum[0]; 941 dir [edge]=LEFT; 942 } 943 944 if (id>=mask) 945 { 946 /* mark dofs I own that have signal and not in sep set */ 947 for (ct=i=0;i<n;i++) 948 { 949 if ((!used[i])&&(rhs[i]!=0.0)) 950 { 951 ct++; nfo++; 952 953 if (nfo>n) 954 {error_msg_fatal("nfo about to exceed n\n");} 955 956 *--iptr = local2global[i]; 957 used[i]=edge; 958 } 959 } 960 if (ct>1) {ivec_sort(iptr,ct);} 961 962 lnsep[edge]=ct; 963 nsep[edge]= (int) rsum[1]; 964 dir [edge]=RIGHT; 965 } 966 967 /* LATER or we can recur on these to order seps at this level */ 968 /* do we need full set of separators for this? */ 969 970 /* fold rhs hc into lower */ 971 if (id>=mask) 972 {id-=mask;} 973 } 974 } 975 else 976 { 977 for (iptr=fo+n,id=my_id,mask=num_nodes>>1,edge=level;edge>0;edge--,mask>>=1) 978 { 979 /* set rsh of hc, fire, and collect lhs responses */ 980 (id<mask) ? rvec_zero(lhs,m) : rvec_set(lhs,1.0,m); 981 gs_gop_hc(gs_handle,lhs,"+\0",edge); 982 983 /* set lsh of hc, fire, and collect rhs responses */ 984 (id<mask) ? rvec_set(rhs,1.0,m) : rvec_zero(rhs,m); 985 gs_gop_hc(gs_handle,rhs,"+\0",edge); 986 987 /* count number of dofs I own that have signal and not in sep set */ 988 for (ivec_zero(sum,4),ct=i=0;i<n;i++) 989 { 990 if (!used[i]) 991 { 992 /* number of unmarked dofs on node */ 993 ct++; 994 /* number of dofs to be marked on lhs hc */ 995 if ((id< mask)&&(lhs[i]!=0.0)) {sum[0]++;} 996 /* number of dofs to be marked on rhs hc */ 997 if ((id>=mask)&&(rhs[i]!=0.0)) {sum[1]++;} 998 } 999 } 1000 1001 /* for the non-symmetric case we need separators of width 2 */ 1002 /* so take both sides */ 1003 (id<mask) ? (sum[2]=ct) : (sum[3]=ct); 1004 giop_hc(sum,w,4,op,edge); 1005 1006 ct=0; 1007 if (id<mask) 1008 { 1009 /* mark dofs I own that have signal and not in sep set */ 1010 for (i=0;i<n;i++) 1011 { 1012 if ((!used[i])&&(lhs[i]!=0.0)) 1013 { 1014 ct++; nfo++; 1015 *--iptr = local2global[i]; 1016 used[i]=edge; 1017 } 1018 } 1019 /* LSH hc summation of ct should be sum[0] */ 1020 } 1021 else 1022 { 1023 /* mark dofs I own that have signal and not in sep set */ 1024 for (i=0;i<n;i++) 1025 { 1026 if ((!used[i])&&(rhs[i]!=0.0)) 1027 { 1028 ct++; nfo++; 1029 *--iptr = local2global[i]; 1030 used[i]=edge; 1031 } 1032 } 1033 /* RSH hc summation of ct should be sum[1] */ 1034 } 1035 1036 if (ct>1) {ivec_sort(iptr,ct);} 1037 lnsep[edge]=ct; 1038 nsep[edge]=sum[0]+sum[1]; 1039 dir [edge]=BOTH; 1040 1041 /* LATER or we can recur on these to order seps at this level */ 1042 /* do we need full set of separators for this? */ 1043 1044 /* fold rhs hc into lower */ 1045 if (id>=mask) 1046 {id-=mask;} 1047 } 1048 } 1049 1050 /* level 0 is on processor case - so mark the remainder */ 1051 for (ct=i=0;i<n;i++) 1052 { 1053 if (!used[i]) 1054 { 1055 ct++; nfo++; 1056 *--iptr = local2global[i]; 1057 used[i]=edge; 1058 } 1059 } 1060 if (ct>1) {ivec_sort(iptr,ct);} 1061 lnsep[edge]=ct; 1062 nsep [edge]=ct; 1063 dir [edge]=BOTH; 1064 1065 xyt_handle->info->nsep=nsep; 1066 xyt_handle->info->lnsep=lnsep; 1067 xyt_handle->info->fo=fo; 1068 xyt_handle->info->nfo=nfo; 1069 1070 free(dir); 1071 free(lhs); 1072 free(rhs); 1073 free(used); 1074 1075 } 1076 1077 1078 /*************************************xyt.c************************************ 1079 Function: set_mvi 1080 1081 Input : 1082 Output: 1083 Return: 1084 Description: 1085 **************************************xyt.c***********************************/ 1086 static 1087 mv_info *set_mvi(int *local2global, int n, int m, void *matvec, void *grid_data) 1088 { 1089 mv_info *mvi; 1090 1091 1092 mvi = (mv_info*)malloc(sizeof(mv_info)); 1093 mvi->n=n; 1094 mvi->m=m; 1095 mvi->n_global=-1; 1096 mvi->m_global=-1; 1097 mvi->local2global=(int*)malloc((m+1)*sizeof(PetscInt)); 1098 ivec_copy(mvi->local2global,local2global,m); 1099 mvi->local2global[m] = INT_MAX; 1100 mvi->matvec=(PetscErrorCode (*)(mv_info*,PetscScalar*,PetscScalar*))matvec; 1101 mvi->grid_data=grid_data; 1102 1103 /* set xyt communication handle to perform restricted matvec */ 1104 mvi->gs_handle = gs_init(local2global, m, num_nodes); 1105 1106 return(mvi); 1107 } 1108 1109 1110 /*************************************xyt.c************************************ 1111 Function: set_mvi 1112 1113 Input : 1114 Output: 1115 Return: 1116 Description: 1117 1118 computes u = A.v 1119 do_matvec(xyt_handle->mvi,v,u); 1120 **************************************xyt.c***********************************/ 1121 static void do_matvec(mv_info *A, PetscScalar *v, PetscScalar *u) 1122 { 1123 A->matvec((mv_info*)A->grid_data,v,u); 1124 } 1125 1126 1127 1128