1 2 #include <petsc/private/matimpl.h> 3 4 #define DEFAULT_STASH_SIZE 10000 5 6 static PetscErrorCode MatStashScatterBegin_Ref(Mat,MatStash*,PetscInt*); 7 PETSC_INTERN PetscErrorCode MatStashScatterGetMesg_Ref(MatStash*,PetscMPIInt*,PetscInt**,PetscInt**,PetscScalar**,PetscInt*); 8 PETSC_INTERN PetscErrorCode MatStashScatterEnd_Ref(MatStash*); 9 #if !defined(PETSC_HAVE_MPIUNI) 10 static PetscErrorCode MatStashScatterBegin_BTS(Mat,MatStash*,PetscInt*); 11 static PetscErrorCode MatStashScatterGetMesg_BTS(MatStash*,PetscMPIInt*,PetscInt**,PetscInt**,PetscScalar**,PetscInt*); 12 static PetscErrorCode MatStashScatterEnd_BTS(MatStash*); 13 #endif 14 15 /* 16 MatStashCreate_Private - Creates a stash,currently used for all the parallel 17 matrix implementations. The stash is where elements of a matrix destined 18 to be stored on other processors are kept until matrix assembly is done. 19 20 This is a simple minded stash. Simply adds entries to end of stash. 21 22 Input Parameters: 23 comm - communicator, required for scatters. 24 bs - stash block size. 
used when stashing blocks of values 25 26 Output Parameters: 27 stash - the newly created stash 28 */ 29 PetscErrorCode MatStashCreate_Private(MPI_Comm comm,PetscInt bs,MatStash *stash) 30 { 31 PetscInt max,*opt,nopt,i; 32 PetscBool flg; 33 34 PetscFunctionBegin; 35 /* Require 2 tags,get the second using PetscCommGetNewTag() */ 36 stash->comm = comm; 37 38 PetscCall(PetscCommGetNewTag(stash->comm,&stash->tag1)); 39 PetscCall(PetscCommGetNewTag(stash->comm,&stash->tag2)); 40 PetscCallMPI(MPI_Comm_size(stash->comm,&stash->size)); 41 PetscCallMPI(MPI_Comm_rank(stash->comm,&stash->rank)); 42 PetscCall(PetscMalloc1(2*stash->size,&stash->flg_v)); 43 for (i=0; i<2*stash->size; i++) stash->flg_v[i] = -1; 44 45 nopt = stash->size; 46 PetscCall(PetscMalloc1(nopt,&opt)); 47 PetscCall(PetscOptionsGetIntArray(NULL,NULL,"-matstash_initial_size",opt,&nopt,&flg)); 48 if (flg) { 49 if (nopt == 1) max = opt[0]; 50 else if (nopt == stash->size) max = opt[stash->rank]; 51 else if (stash->rank < nopt) max = opt[stash->rank]; 52 else max = 0; /* Use default */ 53 stash->umax = max; 54 } else { 55 stash->umax = 0; 56 } 57 PetscCall(PetscFree(opt)); 58 if (bs <= 0) bs = 1; 59 60 stash->bs = bs; 61 stash->nmax = 0; 62 stash->oldnmax = 0; 63 stash->n = 0; 64 stash->reallocs = -1; 65 stash->space_head = NULL; 66 stash->space = NULL; 67 68 stash->send_waits = NULL; 69 stash->recv_waits = NULL; 70 stash->send_status = NULL; 71 stash->nsends = 0; 72 stash->nrecvs = 0; 73 stash->svalues = NULL; 74 stash->rvalues = NULL; 75 stash->rindices = NULL; 76 stash->nprocessed = 0; 77 stash->reproduce = PETSC_FALSE; 78 stash->blocktype = MPI_DATATYPE_NULL; 79 80 PetscCall(PetscOptionsGetBool(NULL,NULL,"-matstash_reproduce",&stash->reproduce,NULL)); 81 #if !defined(PETSC_HAVE_MPIUNI) 82 flg = PETSC_FALSE; 83 PetscCall(PetscOptionsGetBool(NULL,NULL,"-matstash_legacy",&flg,NULL)); 84 if (!flg) { 85 stash->ScatterBegin = MatStashScatterBegin_BTS; 86 stash->ScatterGetMesg = MatStashScatterGetMesg_BTS; 87 
stash->ScatterEnd = MatStashScatterEnd_BTS; 88 stash->ScatterDestroy = MatStashScatterDestroy_BTS; 89 } else { 90 #endif 91 stash->ScatterBegin = MatStashScatterBegin_Ref; 92 stash->ScatterGetMesg = MatStashScatterGetMesg_Ref; 93 stash->ScatterEnd = MatStashScatterEnd_Ref; 94 stash->ScatterDestroy = NULL; 95 #if !defined(PETSC_HAVE_MPIUNI) 96 } 97 #endif 98 PetscFunctionReturn(0); 99 } 100 101 /* 102 MatStashDestroy_Private - Destroy the stash 103 */ 104 PetscErrorCode MatStashDestroy_Private(MatStash *stash) 105 { 106 PetscFunctionBegin; 107 PetscCall(PetscMatStashSpaceDestroy(&stash->space_head)); 108 if (stash->ScatterDestroy) PetscCall((*stash->ScatterDestroy)(stash)); 109 110 stash->space = NULL; 111 112 PetscCall(PetscFree(stash->flg_v)); 113 PetscFunctionReturn(0); 114 } 115 116 /* 117 MatStashScatterEnd_Private - This is called as the final stage of 118 scatter. The final stages of message passing is done here, and 119 all the memory used for message passing is cleaned up. This 120 routine also resets the stash, and deallocates the memory used 121 for the stash. It also keeps track of the current memory usage 122 so that the same value can be used the next time through. 123 */ 124 PetscErrorCode MatStashScatterEnd_Private(MatStash *stash) 125 { 126 PetscFunctionBegin; 127 PetscCall((*stash->ScatterEnd)(stash)); 128 PetscFunctionReturn(0); 129 } 130 131 PETSC_INTERN PetscErrorCode MatStashScatterEnd_Ref(MatStash *stash) 132 { 133 PetscInt nsends=stash->nsends,bs2,oldnmax,i; 134 MPI_Status *send_status; 135 136 PetscFunctionBegin; 137 for (i=0; i<2*stash->size; i++) stash->flg_v[i] = -1; 138 /* wait on sends */ 139 if (nsends) { 140 PetscCall(PetscMalloc1(2*nsends,&send_status)); 141 PetscCallMPI(MPI_Waitall(2*nsends,stash->send_waits,send_status)); 142 PetscCall(PetscFree(send_status)); 143 } 144 145 /* Now update nmaxold to be app 10% more than max n used, this way the 146 wastage of space is reduced the next time this stash is used. 
147 Also update the oldmax, only if it increases */ 148 if (stash->n) { 149 bs2 = stash->bs*stash->bs; 150 oldnmax = ((int)(stash->n * 1.1) + 5)*bs2; 151 if (oldnmax > stash->oldnmax) stash->oldnmax = oldnmax; 152 } 153 154 stash->nmax = 0; 155 stash->n = 0; 156 stash->reallocs = -1; 157 stash->nprocessed = 0; 158 159 PetscCall(PetscMatStashSpaceDestroy(&stash->space_head)); 160 161 stash->space = NULL; 162 163 PetscCall(PetscFree(stash->send_waits)); 164 PetscCall(PetscFree(stash->recv_waits)); 165 PetscCall(PetscFree2(stash->svalues,stash->sindices)); 166 PetscCall(PetscFree(stash->rvalues[0])); 167 PetscCall(PetscFree(stash->rvalues)); 168 PetscCall(PetscFree(stash->rindices[0])); 169 PetscCall(PetscFree(stash->rindices)); 170 PetscFunctionReturn(0); 171 } 172 173 /* 174 MatStashGetInfo_Private - Gets the relevant statistics of the stash 175 176 Input Parameters: 177 stash - the stash 178 nstash - the size of the stash. Indicates the number of values stored. 179 reallocs - the number of additional mallocs incurred. 180 181 */ 182 PetscErrorCode MatStashGetInfo_Private(MatStash *stash,PetscInt *nstash,PetscInt *reallocs) 183 { 184 PetscInt bs2 = stash->bs*stash->bs; 185 186 PetscFunctionBegin; 187 if (nstash) *nstash = stash->n*bs2; 188 if (reallocs) { 189 if (stash->reallocs < 0) *reallocs = 0; 190 else *reallocs = stash->reallocs; 191 } 192 PetscFunctionReturn(0); 193 } 194 195 /* 196 MatStashSetInitialSize_Private - Sets the initial size of the stash 197 198 Input Parameters: 199 stash - the stash 200 max - the value that is used as the max size of the stash. 201 this value is used while allocating memory. 202 */ 203 PetscErrorCode MatStashSetInitialSize_Private(MatStash *stash,PetscInt max) 204 { 205 PetscFunctionBegin; 206 stash->umax = max; 207 PetscFunctionReturn(0); 208 } 209 210 /* MatStashExpand_Private - Expand the stash. This function is called 211 when the space in the stash is not sufficient to add the new values 212 being inserted into the stash. 
213 214 Input Parameters: 215 stash - the stash 216 incr - the minimum increase requested 217 218 Notes: 219 This routine doubles the currently used memory. 220 */ 221 static PetscErrorCode MatStashExpand_Private(MatStash *stash,PetscInt incr) 222 { 223 PetscInt newnmax,bs2= stash->bs*stash->bs; 224 225 PetscFunctionBegin; 226 /* allocate a larger stash */ 227 if (!stash->oldnmax && !stash->nmax) { /* new stash */ 228 if (stash->umax) newnmax = stash->umax/bs2; 229 else newnmax = DEFAULT_STASH_SIZE/bs2; 230 } else if (!stash->nmax) { /* resuing stash */ 231 if (stash->umax > stash->oldnmax) newnmax = stash->umax/bs2; 232 else newnmax = stash->oldnmax/bs2; 233 } else newnmax = stash->nmax*2; 234 if (newnmax < (stash->nmax + incr)) newnmax += 2*incr; 235 236 /* Get a MatStashSpace and attach it to stash */ 237 PetscCall(PetscMatStashSpaceGet(bs2,newnmax,&stash->space)); 238 if (!stash->space_head) { /* new stash or resuing stash->oldnmax */ 239 stash->space_head = stash->space; 240 } 241 242 stash->reallocs++; 243 stash->nmax = newnmax; 244 PetscFunctionReturn(0); 245 } 246 /* 247 MatStashValuesRow_Private - inserts values into the stash. This function 248 expects the values to be roworiented. Multiple columns belong to the same row 249 can be inserted with a single call to this function. 250 251 Input Parameters: 252 stash - the stash 253 row - the global row correspoiding to the values 254 n - the number of elements inserted. All elements belong to the above row. 255 idxn - the global column indices corresponding to each of the values. 
256 values - the values inserted 257 */ 258 PetscErrorCode MatStashValuesRow_Private(MatStash *stash,PetscInt row,PetscInt n,const PetscInt idxn[],const PetscScalar values[],PetscBool ignorezeroentries) 259 { 260 PetscInt i,k,cnt = 0; 261 PetscMatStashSpace space=stash->space; 262 263 PetscFunctionBegin; 264 /* Check and see if we have sufficient memory */ 265 if (!space || space->local_remaining < n) { 266 PetscCall(MatStashExpand_Private(stash,n)); 267 } 268 space = stash->space; 269 k = space->local_used; 270 for (i=0; i<n; i++) { 271 if (ignorezeroentries && values && values[i] == 0.0) continue; 272 space->idx[k] = row; 273 space->idy[k] = idxn[i]; 274 space->val[k] = values ? values[i] : 0.0; 275 k++; 276 cnt++; 277 } 278 stash->n += cnt; 279 space->local_used += cnt; 280 space->local_remaining -= cnt; 281 PetscFunctionReturn(0); 282 } 283 284 /* 285 MatStashValuesCol_Private - inserts values into the stash. This function 286 expects the values to be columnoriented. Multiple columns belong to the same row 287 can be inserted with a single call to this function. 288 289 Input Parameters: 290 stash - the stash 291 row - the global row correspoiding to the values 292 n - the number of elements inserted. All elements belong to the above row. 293 idxn - the global column indices corresponding to each of the values. 294 values - the values inserted 295 stepval - the consecutive values are sepated by a distance of stepval. 296 this happens because the input is columnoriented. 
297 */ 298 PetscErrorCode MatStashValuesCol_Private(MatStash *stash,PetscInt row,PetscInt n,const PetscInt idxn[],const PetscScalar values[],PetscInt stepval,PetscBool ignorezeroentries) 299 { 300 PetscInt i,k,cnt = 0; 301 PetscMatStashSpace space=stash->space; 302 303 PetscFunctionBegin; 304 /* Check and see if we have sufficient memory */ 305 if (!space || space->local_remaining < n) { 306 PetscCall(MatStashExpand_Private(stash,n)); 307 } 308 space = stash->space; 309 k = space->local_used; 310 for (i=0; i<n; i++) { 311 if (ignorezeroentries && values && values[i*stepval] == 0.0) continue; 312 space->idx[k] = row; 313 space->idy[k] = idxn[i]; 314 space->val[k] = values ? values[i*stepval] : 0.0; 315 k++; 316 cnt++; 317 } 318 stash->n += cnt; 319 space->local_used += cnt; 320 space->local_remaining -= cnt; 321 PetscFunctionReturn(0); 322 } 323 324 /* 325 MatStashValuesRowBlocked_Private - inserts blocks of values into the stash. 326 This function expects the values to be roworiented. Multiple columns belong 327 to the same block-row can be inserted with a single call to this function. 328 This function extracts the sub-block of values based on the dimensions of 329 the original input block, and the row,col values corresponding to the blocks. 330 331 Input Parameters: 332 stash - the stash 333 row - the global block-row correspoiding to the values 334 n - the number of elements inserted. All elements belong to the above row. 335 idxn - the global block-column indices corresponding to each of the blocks of 336 values. Each block is of size bs*bs. 337 values - the values inserted 338 rmax - the number of block-rows in the original block. 339 cmax - the number of block-columns on the original block. 340 idx - the index of the current block-row in the original block. 
341 */ 342 PetscErrorCode MatStashValuesRowBlocked_Private(MatStash *stash,PetscInt row,PetscInt n,const PetscInt idxn[],const PetscScalar values[],PetscInt rmax,PetscInt cmax,PetscInt idx) 343 { 344 PetscInt i,j,k,bs2,bs=stash->bs,l; 345 const PetscScalar *vals; 346 PetscScalar *array; 347 PetscMatStashSpace space=stash->space; 348 349 PetscFunctionBegin; 350 if (!space || space->local_remaining < n) { 351 PetscCall(MatStashExpand_Private(stash,n)); 352 } 353 space = stash->space; 354 l = space->local_used; 355 bs2 = bs*bs; 356 for (i=0; i<n; i++) { 357 space->idx[l] = row; 358 space->idy[l] = idxn[i]; 359 /* Now copy over the block of values. Store the values column oriented. 360 This enables inserting multiple blocks belonging to a row with a single 361 funtion call */ 362 array = space->val + bs2*l; 363 vals = values + idx*bs2*n + bs*i; 364 for (j=0; j<bs; j++) { 365 for (k=0; k<bs; k++) array[k*bs] = values ? vals[k] : 0.0; 366 array++; 367 vals += cmax*bs; 368 } 369 l++; 370 } 371 stash->n += n; 372 space->local_used += n; 373 space->local_remaining -= n; 374 PetscFunctionReturn(0); 375 } 376 377 /* 378 MatStashValuesColBlocked_Private - inserts blocks of values into the stash. 379 This function expects the values to be roworiented. Multiple columns belong 380 to the same block-row can be inserted with a single call to this function. 381 This function extracts the sub-block of values based on the dimensions of 382 the original input block, and the row,col values corresponding to the blocks. 383 384 Input Parameters: 385 stash - the stash 386 row - the global block-row correspoiding to the values 387 n - the number of elements inserted. All elements belong to the above row. 388 idxn - the global block-column indices corresponding to each of the blocks of 389 values. Each block is of size bs*bs. 390 values - the values inserted 391 rmax - the number of block-rows in the original block. 392 cmax - the number of block-columns on the original block. 
393 idx - the index of the current block-row in the original block. 394 */ 395 PetscErrorCode MatStashValuesColBlocked_Private(MatStash *stash,PetscInt row,PetscInt n,const PetscInt idxn[],const PetscScalar values[],PetscInt rmax,PetscInt cmax,PetscInt idx) 396 { 397 PetscInt i,j,k,bs2,bs=stash->bs,l; 398 const PetscScalar *vals; 399 PetscScalar *array; 400 PetscMatStashSpace space=stash->space; 401 402 PetscFunctionBegin; 403 if (!space || space->local_remaining < n) { 404 PetscCall(MatStashExpand_Private(stash,n)); 405 } 406 space = stash->space; 407 l = space->local_used; 408 bs2 = bs*bs; 409 for (i=0; i<n; i++) { 410 space->idx[l] = row; 411 space->idy[l] = idxn[i]; 412 /* Now copy over the block of values. Store the values column oriented. 413 This enables inserting multiple blocks belonging to a row with a single 414 funtion call */ 415 array = space->val + bs2*l; 416 vals = values + idx*bs2*n + bs*i; 417 for (j=0; j<bs; j++) { 418 for (k=0; k<bs; k++) array[k] = values ? vals[k] : 0.0; 419 array += bs; 420 vals += rmax*bs; 421 } 422 l++; 423 } 424 stash->n += n; 425 space->local_used += n; 426 space->local_remaining -= n; 427 PetscFunctionReturn(0); 428 } 429 /* 430 MatStashScatterBegin_Private - Initiates the transfer of values to the 431 correct owners. This function goes through the stash, and check the 432 owners of each stashed value, and sends the values off to the owner 433 processors. 434 435 Input Parameters: 436 stash - the stash 437 owners - an array of size 'no-of-procs' which gives the ownership range 438 for each node. 439 440 Notes: 441 The 'owners' array in the cased of the blocked-stash has the 442 ranges specified blocked global indices, and for the regular stash in 443 the proper global indices. 
  Input Parameters:
  stash  - the stash
  owners - an array of size 'no-of-procs' which gives the ownership range
           for each node.

  Notes:
  The 'owners' array in the case of the blocked-stash has the
  ranges specified in blocked global indices, and for the regular stash in
  the proper global indices.
*/
PetscErrorCode MatStashScatterBegin_Private(Mat mat,MatStash *stash,PetscInt *owners)
{
  PetscFunctionBegin;
  /* Dispatch to the implementation selected in MatStashCreate_Private() */
  PetscCall((*stash->ScatterBegin)(mat,stash,owners));
  PetscFunctionReturn(0);
}

/* Reference implementation: sends each destination two messages, one with the
   (row,col) index pairs (tag1) and one with the values (tag2). */
static PetscErrorCode MatStashScatterBegin_Ref(Mat mat,MatStash *stash,PetscInt *owners)
{
  PetscInt           *owner,*startv,*starti,tag1=stash->tag1,tag2=stash->tag2,bs2;
  PetscInt           size=stash->size,nsends;
  PetscInt           count,*sindices,**rindices,i,j,idx,lastidx,l;
  PetscScalar        **rvalues,*svalues;
  MPI_Comm           comm = stash->comm;
  MPI_Request        *send_waits,*recv_waits,*recv_waits1,*recv_waits2;
  PetscMPIInt        *sizes,*nlengths,nreceives;
  PetscInt           *sp_idx,*sp_idy;
  PetscScalar        *sp_val;
  PetscMatStashSpace space,space_next;

  PetscFunctionBegin;
  { /* make sure all processors are either in INSERTMODE or ADDMODE */
    InsertMode addv;
    PetscCall(MPIU_Allreduce((PetscEnum*)&mat->insertmode,(PetscEnum*)&addv,1,MPIU_ENUM,MPI_BOR,PetscObjectComm((PetscObject)mat)));
    PetscCheckFalse(addv == (ADD_VALUES|INSERT_VALUES),PetscObjectComm((PetscObject)mat),PETSC_ERR_ARG_WRONGSTATE,"Some processors inserted others added");
    mat->insertmode = addv; /* in case this processor had no cache */
  }

  bs2 = stash->bs*stash->bs;

  /* first count number of contributors to each processor */
  PetscCall(PetscCalloc1(size,&nlengths));
  PetscCall(PetscMalloc1(stash->n+1,&owner));

  i       = j = 0;
  lastidx = -1;
  space   = stash->space_head;
  while (space) {
    space_next = space->next;
    sp_idx     = space->idx;
    for (l=0; l<space->local_used; l++) {
      /* if indices are NOT locally sorted, need to start search at the beginning */
      if (lastidx > (idx = sp_idx[l])) j = 0;
      lastidx = idx;
      for (; j<size; j++) {
        if (idx >= owners[j] && idx < owners[j+1]) {
          nlengths[j]++; owner[i] = j; break;
        }
      }
      i++;
    }
    space = space_next;
  }

  /* Now check what procs get messages - and compute nsends. */
  PetscCall(PetscCalloc1(size,&sizes));
  for (i=0, nsends=0; i<size; i++) {
    if (nlengths[i]) {
      sizes[i] = 1; nsends++;
    }
  }

  {PetscMPIInt *onodes,*olengths;
   /* Determine the number of messages to expect, their lengths, from from-ids */
   PetscCall(PetscGatherNumberOfMessages(comm,sizes,nlengths,&nreceives));
   PetscCall(PetscGatherMessageLengths(comm,nsends,nreceives,nlengths,&onodes,&olengths));
   /* since clubbing row,col - lengths are multiplied by 2 */
   for (i=0; i<nreceives; i++) olengths[i] *=2;
   PetscCall(PetscPostIrecvInt(comm,tag1,nreceives,onodes,olengths,&rindices,&recv_waits1));
   /* values are size 'bs2' lengths (and remove earlier factor 2) */
   for (i=0; i<nreceives; i++) olengths[i] = olengths[i]*bs2/2;
   PetscCall(PetscPostIrecvScalar(comm,tag2,nreceives,onodes,olengths,&rvalues,&recv_waits2));
   PetscCall(PetscFree(onodes));
   PetscCall(PetscFree(olengths));}

  /* do sends:
      1) starts[i] gives the starting index in svalues for stuff going to
         the ith processor
  */
  PetscCall(PetscMalloc2(bs2*stash->n,&svalues,2*(stash->n+1),&sindices));
  PetscCall(PetscMalloc1(2*nsends,&send_waits));
  PetscCall(PetscMalloc2(size,&startv,size,&starti));
  /* use 2 sends the first with all_a, the next with all_i and all_j */
  startv[0] = 0; starti[0] = 0;
  for (i=1; i<size; i++) {
    startv[i] = startv[i-1] + nlengths[i-1];
    starti[i] = starti[i-1] + 2*nlengths[i-1];
  }

  /* pack the stashed entries into the per-destination send buffers */
  i     = 0;
  space = stash->space_head;
  while (space) {
    space_next = space->next;
    sp_idx     = space->idx;
    sp_idy     = space->idy;
    sp_val     = space->val;
    for (l=0; l<space->local_used; l++) {
      j = owner[i];
      if (bs2 == 1) {
        svalues[startv[j]] = sp_val[l];
      } else {
        PetscInt    k;
        PetscScalar *buf1,*buf2;
        buf1 = svalues+bs2*startv[j];
        buf2 = space->val + bs2*l;
        for (k=0; k<bs2; k++) buf1[k] = buf2[k];
      }
      /* rows in the first half of each destination's index segment, cols in the second */
      sindices[starti[j]]             = sp_idx[l];
      sindices[starti[j]+nlengths[j]] = sp_idy[l];
      startv[j]++;
      starti[j]++;
      i++;
    }
    space = space_next;
  }
  /* startv was consumed while packing; rebuild the per-destination offsets */
  startv[0] = 0;
  for (i=1; i<size; i++) startv[i] = startv[i-1] + nlengths[i-1];

  for (i=0,count=0; i<size; i++) {
    if (sizes[i]) {
      PetscCallMPI(MPI_Isend(sindices+2*startv[i],2*nlengths[i],MPIU_INT,i,tag1,comm,send_waits+count++));
      PetscCallMPI(MPI_Isend(svalues+bs2*startv[i],bs2*nlengths[i],MPIU_SCALAR,i,tag2,comm,send_waits+count++));
    }
  }
#if defined(PETSC_USE_INFO)
  PetscCall(PetscInfo(NULL,"No of messages: %" PetscInt_FMT " \n",nsends));
  for (i=0; i<size; i++) {
    if (sizes[i]) {
      PetscCall(PetscInfo(NULL,"Mesg_to: %" PetscInt_FMT ": size: %zu bytes\n",i,(size_t)(nlengths[i]*(bs2*sizeof(PetscScalar)+2*sizeof(PetscInt)))));
    }
  }
#endif
  PetscCall(PetscFree(nlengths));
  PetscCall(PetscFree(owner));
  PetscCall(PetscFree2(startv,starti));
  PetscCall(PetscFree(sizes));

  /* recv_waits need to be contiguous for MatStashScatterGetMesg_Private() */
  PetscCall(PetscMalloc1(2*nreceives,&recv_waits));

  /* interleave: even slots are index-message requests, odd slots value-message requests */
  for (i=0; i<nreceives; i++) {
    recv_waits[2*i]   = recv_waits1[i];
    recv_waits[2*i+1] = recv_waits2[i];
  }
  stash->recv_waits = recv_waits;

  PetscCall(PetscFree(recv_waits1));
  PetscCall(PetscFree(recv_waits2));

  stash->svalues         = svalues;
  stash->sindices        = sindices;
  stash->rvalues         = rvalues;
  stash->rindices        = rindices;
  stash->send_waits      = send_waits;
  stash->nsends          = nsends;
  stash->nrecvs          = nreceives;
  stash->reproduce_count = 0;
  PetscFunctionReturn(0);
}

/*
  MatStashScatterGetMesg_Private - This function waits on the receives posted
  in the function MatStashScatterBegin_Private() and returns one message at
  a time to the calling function. If no messages are left, it indicates this
  by setting flg = 0, else it sets flg = 1.
  Input Parameters:
  stash - the stash

  Output Parameters:
  nvals - the number of entries in the current message.
  rows  - an array of row indices (or blocked indices) corresponding to the values
  cols  - an array of column indices (or blocked indices) corresponding to the values
  vals  - the values
  flg   - 0 indicates no more message left, and the current call has no values associated.
          1 indicates that the current call successfully received a message, and the
          other output parameters nvals,rows,cols,vals are set appropriately.
*/
PetscErrorCode MatStashScatterGetMesg_Private(MatStash *stash,PetscMPIInt *nvals,PetscInt **rows,PetscInt **cols,PetscScalar **vals,PetscInt *flg)
{
  PetscFunctionBegin;
  /* Dispatch to the implementation selected in MatStashCreate_Private() */
  PetscCall((*stash->ScatterGetMesg)(stash,nvals,rows,cols,vals,flg));
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatStashScatterGetMesg_Ref(MatStash *stash,PetscMPIInt *nvals,PetscInt **rows,PetscInt **cols,PetscScalar **vals,PetscInt *flg)
{
  PetscMPIInt i,*flg_v = stash->flg_v,i1,i2;
  PetscInt    bs2;
  MPI_Status  recv_status;
  PetscBool   match_found = PETSC_FALSE;

  PetscFunctionBegin;
  *flg = 0; /* When a message is discovered this is reset to 1 */
  /* Return if no more messages to process */
  if (stash->nprocessed == stash->nrecvs) PetscFunctionReturn(0);

  bs2 = stash->bs*stash->bs;
  /* If a matching pair of receives are found, process them, and return the data to
     the calling function. Until then keep receiving messages */
  while (!match_found) {
    if (stash->reproduce) {
      /* complete receives in a fixed order so results are reproducible */
      i = stash->reproduce_count++;
      PetscCallMPI(MPI_Wait(stash->recv_waits+i,&recv_status));
    } else {
      PetscCallMPI(MPI_Waitany(2*stash->nrecvs,stash->recv_waits,&i,&recv_status));
    }
    PetscCheckFalse(recv_status.MPI_SOURCE < 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Negative MPI source!");

    /* Now pack the received message into a structure which is usable by others.
       Odd request slots hold value messages, even slots hold index messages
       (see the interleaving in MatStashScatterBegin_Ref). */
    if (i % 2) {
      PetscCallMPI(MPI_Get_count(&recv_status,MPIU_SCALAR,nvals));

      flg_v[2*recv_status.MPI_SOURCE] = i/2;

      *nvals = *nvals/bs2;
    } else {
      PetscCallMPI(MPI_Get_count(&recv_status,MPIU_INT,nvals));

      flg_v[2*recv_status.MPI_SOURCE+1] = i/2;

      *nvals = *nvals/2; /* This message has both row indices and col indices */
    }

    /* Check if we have both messages from this proc */
    i1 = flg_v[2*recv_status.MPI_SOURCE];
    i2 = flg_v[2*recv_status.MPI_SOURCE+1];
    if (i1 != -1 && i2 != -1) {
      *rows = stash->rindices[i2];
      *cols = *rows + *nvals; /* cols are stored in the second half of the index buffer */
      *vals = stash->rvalues[i1];
      *flg  = 1;
      stash->nprocessed++;
      match_found = PETSC_TRUE;
    }
  }
  PetscFunctionReturn(0);
}

#if !defined(PETSC_HAVE_MPIUNI)
typedef struct {
  PetscInt    row;
  PetscInt    col;
  PetscScalar vals[1]; /* Actually an array of length bs2 */
} MatStashBlock;

/* Sort the stashed entries by (row,col), combine duplicates (adding or
   overwriting according to insertmode), and pack them into segsendblocks. */
static PetscErrorCode MatStashSortCompress_Private(MatStash *stash,InsertMode insertmode)
{
  PetscMatStashSpace space;
  PetscInt           n = stash->n,bs = stash->bs,bs2 = bs*bs,cnt,*row,*col,*perm,rowstart,i;
  PetscScalar        **valptr;

  PetscFunctionBegin;
  PetscCall(PetscMalloc4(n,&row,n,&col,n,&valptr,n,&perm));
  for (space=stash->space_head,cnt=0; space; space=space->next) {
    for (i=0; i<space->local_used; i++) {
      row[cnt]    = space->idx[i];
      col[cnt]    = space->idy[i];
      valptr[cnt] = &space->val[i*bs2];
      perm[cnt]   = cnt; /* Will tell us where to find valptr after sorting row[] and col[] */
      cnt++;
    }
  }
  PetscCheckFalse(cnt != n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"MatStash n %" PetscInt_FMT ", but counted %" PetscInt_FMT " entries",n,cnt);
  PetscCall(PetscSortIntWithArrayPair(n,row,col,perm));
  /* Scan through the rows, sorting each one, combining duplicates, and packing send buffers */
  for (rowstart=0,cnt=0,i=1; i<=n; i++) {
    if (i == n || row[i] != row[rowstart]) { /* Sort the last row. */
      PetscInt colstart;
      PetscCall(PetscSortIntWithArray(i-rowstart,&col[rowstart],&perm[rowstart]));
      for (colstart=rowstart; colstart<i;) { /* Compress multiple insertions to the same location */
        PetscInt      j,l;
        MatStashBlock *block;
        PetscCall(PetscSegBufferGet(stash->segsendblocks,1,&block));
        block->row = row[rowstart];
        block->col = col[colstart];
        PetscCall(PetscArraycpy(block->vals,valptr[perm[colstart]],bs2));
        for (j=colstart+1; j<i && col[j] == col[colstart]; j++) { /* Add any extra stashed blocks at the same (row,col) */
          if (insertmode == ADD_VALUES) {
            for (l=0; l<bs2; l++) block->vals[l] += valptr[perm[j]][l];
          } else {
            /* INSERT_VALUES: the last insertion wins */
            PetscCall(PetscArraycpy(block->vals,valptr[perm[j]],bs2));
          }
        }
        colstart = j;
      }
      rowstart = i;
    }
  }
  PetscCall(PetscFree4(row,col,valptr,perm));
  PetscFunctionReturn(0);
}

/* Lazily build the MPI datatype describing one stashed block (row, col, bs2 values)
   and create the segmented buffers used by the BTS scatter. */
static PetscErrorCode MatStashBlockTypeSetUp(MatStash *stash)
{
  PetscFunctionBegin;
  if (stash->blocktype == MPI_DATATYPE_NULL) {
    PetscInt     bs2 = PetscSqr(stash->bs);
    PetscMPIInt  blocklens[2];
    MPI_Aint     displs[2];
    MPI_Datatype types[2],stype;
    /* Note that DummyBlock is a type having standard layout, even when PetscScalar is C++ std::complex.
       std::complex itself has standard layout, so does DummyBlock, recursively.
       To be compatible with C++ std::complex, complex implementations on GPUs must also have standard layout,
       though they can have different alignment, e.g, 16 bytes for double complex, instead of 8 bytes as in GCC stdlibc++.
       offsetof(type, member) only requires type to have standard layout. Ref. https://en.cppreference.com/w/cpp/types/offsetof.

       We can test if std::complex has standard layout with the following code:
       #include <iostream>
       #include <type_traits>
       #include <complex>
       int main() {
         std::cout << std::boolalpha;
         std::cout << std::is_standard_layout<std::complex<double> >::value << '\n';
       }
       Output: true
    */
    struct DummyBlock {PetscInt row,col; PetscScalar vals;};

    stash->blocktype_size = offsetof(struct DummyBlock,vals) + bs2*sizeof(PetscScalar);
    if (stash->blocktype_size % sizeof(PetscInt)) { /* Implies that PetscInt is larger and does not satisfy alignment without padding */
      stash->blocktype_size += sizeof(PetscInt) - stash->blocktype_size % sizeof(PetscInt);
    }
    PetscCall(PetscSegBufferCreate(stash->blocktype_size,1,&stash->segsendblocks));
    PetscCall(PetscSegBufferCreate(stash->blocktype_size,1,&stash->segrecvblocks));
    PetscCall(PetscSegBufferCreate(sizeof(MatStashFrame),1,&stash->segrecvframe));
    blocklens[0] = 2;
    blocklens[1] = bs2;
    displs[0]    = offsetof(struct DummyBlock,row);
    displs[1]    = offsetof(struct DummyBlock,vals);
    types[0]     = MPIU_INT;
    types[1]     = MPIU_SCALAR;
    PetscCallMPI(MPI_Type_create_struct(2,blocklens,displs,types,&stype));
    PetscCallMPI(MPI_Type_commit(&stype));
    /* resize so consecutive blocks in a buffer are blocktype_size apart (accounts for padding) */
    PetscCallMPI(MPI_Type_create_resized(stype,0,stash->blocktype_size,&stash->blocktype));
    PetscCallMPI(MPI_Type_commit(&stash->blocktype));
    PetscCallMPI(MPI_Type_free(&stype));
  }
  PetscFunctionReturn(0);
}

/* Callback invoked after target rank has initiated receive of rendezvous message.
 * Here we post the main sends.
 */
static PetscErrorCode MatStashBTSSend_Private(MPI_Comm comm,const PetscMPIInt tag[],PetscMPIInt rankid,PetscMPIInt rank,void *sdata,MPI_Request req[],void *ctx)
{
  MatStash       *stash = (MatStash*)ctx;
  MatStashHeader *hdr   = (MatStashHeader*)sdata;

  PetscFunctionBegin;
  PetscCheckFalse(rank != stash->sendranks[rankid],comm,PETSC_ERR_PLIB,"BTS Send rank %d does not match sendranks[%d] %d",rank,rankid,stash->sendranks[rankid]);
  PetscCallMPI(MPI_Isend(stash->sendframes[rankid].buffer,hdr->count,stash->blocktype,rank,tag[0],comm,&req[0]));
  stash->sendframes[rankid].count   = hdr->count;
  stash->sendframes[rankid].pending = 1;
  PetscFunctionReturn(0);
}

/* Callback invoked by target after receiving rendezvous message.
 * Here we post the main recvs.
 */
static PetscErrorCode MatStashBTSRecv_Private(MPI_Comm comm,const PetscMPIInt tag[],PetscMPIInt rank,void *rdata,MPI_Request req[],void *ctx)
{
  MatStash       *stash = (MatStash*)ctx;
  MatStashHeader *hdr   = (MatStashHeader*)rdata;
  MatStashFrame  *frame;

  PetscFunctionBegin;
  /* The sender's header tells us how many blocks to expect; stage a frame and buffer for them */
  PetscCall(PetscSegBufferGet(stash->segrecvframe,1,&frame));
  PetscCall(PetscSegBufferGet(stash->segrecvblocks,hdr->count,&frame->buffer));
  PetscCallMPI(MPI_Irecv(frame->buffer,hdr->count,stash->blocktype,rank,tag[0],comm,&req[0]));
  frame->count   = hdr->count;
  frame->pending = 1;
  PetscFunctionReturn(0);
}

/*
 * owners[] contains the ownership ranges; may be indexed by either blocks or scalars
 */
static PetscErrorCode MatStashScatterBegin_BTS(Mat mat,MatStash *stash,PetscInt owners[])
{
  PetscErrorCode ierr;
  size_t         nblocks;
  char           *sendblocks;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) { /* make sure all processors are either in INSERTMODE or ADDMODE */
    InsertMode addv;
    PetscCall(MPIU_Allreduce((PetscEnum*)&mat->insertmode,(PetscEnum*)&addv,1,MPIU_ENUM,MPI_BOR,PetscObjectComm((PetscObject)mat)));
    PetscCheckFalse(addv == (ADD_VALUES|INSERT_VALUES),PetscObjectComm((PetscObject)mat),PETSC_ERR_ARG_WRONGSTATE,"Some processors inserted others added");
  }

  PetscCall(MatStashBlockTypeSetUp(stash));
  PetscCall(MatStashSortCompress_Private(stash,mat->insertmode));
  PetscCall(PetscSegBufferGetSize(stash->segsendblocks,&nblocks));
  PetscCall(PetscSegBufferExtractInPlace(stash->segsendblocks,&sendblocks));
  if (stash->first_assembly_done) { /* Set up sendhdrs and sendframes for each rank that we sent before */
    PetscInt i;
    size_t   b;
    for (i=0,b=0; i<stash->nsendranks; i++) {
      stash->sendframes[i].buffer = &sendblocks[b*stash->blocktype_size];
      /* sendhdr is never actually sent, but the count is used by MatStashBTSSend_Private */
      stash->sendhdr[i].count = 0; /* Might remain empty (in which case we send a zero-sized message) if no values are communicated to that process */
      for (; b<nblocks; b++) {
        MatStashBlock *sendblock_b = (MatStashBlock*)&sendblocks[b*stash->blocktype_size];
        PetscCheckFalse(sendblock_b->row < owners[stash->sendranks[i]],stash->comm,PETSC_ERR_ARG_WRONG,"MAT_SUBSET_OFF_PROC_ENTRIES set, but row %" PetscInt_FMT " owned by %d not communicated in initial assembly",sendblock_b->row,stash->sendranks[i]);
        if (sendblock_b->row >= owners[stash->sendranks[i]+1]) break;
        stash->sendhdr[i].count++;
      }
    }
  } else { /* Dynamically count and pack (first time) */
    PetscInt sendno;
    size_t   i,rowstart;

    /* Count number of send ranks and allocate for sends */
    stash->nsendranks = 0;
    for (rowstart=0; rowstart<nblocks;) {
      PetscInt      owner;
      MatStashBlock *sendblock_rowstart = (MatStashBlock*)&sendblocks[rowstart*stash->blocktype_size];
      PetscCall(PetscFindInt(sendblock_rowstart->row,stash->size+1,owners,&owner));
      if (owner < 0) owner = -(owner+2); /* PetscFindInt returns -(insert_location+1) when not found exactly */
      for (i=rowstart+1; i<nblocks; i++) { /* Move forward through a run of blocks with the same owner */
        MatStashBlock *sendblock_i = (MatStashBlock*)&sendblocks[i*stash->blocktype_size];
        if (sendblock_i->row >= owners[owner+1]) break;
      }
      stash->nsendranks++;
      rowstart = i;
    }
    PetscCall(PetscMalloc3(stash->nsendranks,&stash->sendranks,stash->nsendranks,&stash->sendhdr,stash->nsendranks,&stash->sendframes));

    /* Set up sendhdrs and sendframes */
    sendno = 0;
    for (rowstart=0; rowstart<nblocks;) {
      PetscInt      owner;
      MatStashBlock *sendblock_rowstart = (MatStashBlock*)&sendblocks[rowstart*stash->blocktype_size];
      PetscCall(PetscFindInt(sendblock_rowstart->row,stash->size+1,owners,&owner));
      if (owner < 0) owner = -(owner+2);
      stash->sendranks[sendno] = owner;
      for (i=rowstart+1; i<nblocks; i++) { /* Move forward through a run of blocks with the same owner */
        MatStashBlock *sendblock_i = (MatStashBlock*)&sendblocks[i*stash->blocktype_size];
        if (sendblock_i->row >= owners[owner+1]) break;
      }
      stash->sendframes[sendno].buffer  = sendblock_rowstart;
      stash->sendframes[sendno].pending = 0;
      stash->sendhdr[sendno].count      = i - rowstart;
      sendno++;
      rowstart = i;
    }
    PetscCheckFalse(sendno != stash->nsendranks,stash->comm,PETSC_ERR_PLIB,"BTS counted %d sendranks, but %" PetscInt_FMT " sends",stash->nsendranks,sendno);
  }

  /* Encode insertmode on the outgoing messages. If we want to support more than two options, we would need a new
   * message or a dummy entry of some sort. */
  if (mat->insertmode == INSERT_VALUES) {
    size_t i;
    for (i=0; i<nblocks; i++) {
      MatStashBlock *sendblock_i = (MatStashBlock*)&sendblocks[i*stash->blocktype_size];
      sendblock_i->row = -(sendblock_i->row+1); /* negative row flags INSERT_VALUES to the receiver */
    }
  }

  if (stash->first_assembly_done) {
    PetscMPIInt i,tag;
    PetscCall(PetscCommGetNewTag(stash->comm,&tag));
    for (i=0; i<stash->nrecvranks; i++) {
      PetscCall(MatStashBTSRecv_Private(stash->comm,&tag,stash->recvranks[i],&stash->recvhdr[i],&stash->recvreqs[i],stash));
    }
    for (i=0; i<stash->nsendranks; i++) {
      PetscCall(MatStashBTSSend_Private(stash->comm,&tag,i,stash->sendranks[i],&stash->sendhdr[i],&stash->sendreqs[i],stash));
    }
    stash->use_status = PETSC_TRUE; /* Use count from message status. */
  } else {
    ierr = PetscCommBuildTwoSidedFReq(stash->comm,1,MPIU_INT,stash->nsendranks,stash->sendranks,(PetscInt*)stash->sendhdr,
                                      &stash->nrecvranks,&stash->recvranks,(PetscInt*)&stash->recvhdr,1,&stash->sendreqs,&stash->recvreqs,
                                      MatStashBTSSend_Private,MatStashBTSRecv_Private,stash);PetscCall(ierr);
    PetscCall(PetscMalloc2(stash->nrecvranks,&stash->some_indices,stash->nrecvranks,&stash->some_statuses));
    stash->use_status = PETSC_FALSE; /* Use count from header instead of from message.
*/ 921 } 922 923 PetscCall(PetscSegBufferExtractInPlace(stash->segrecvframe,&stash->recvframes)); 924 stash->recvframe_active = NULL; 925 stash->recvframe_i = 0; 926 stash->some_i = 0; 927 stash->some_count = 0; 928 stash->recvcount = 0; 929 stash->first_assembly_done = mat->assembly_subset; /* See the same logic in VecAssemblyBegin_MPI_BTS */ 930 stash->insertmode = &mat->insertmode; 931 PetscFunctionReturn(0); 932 } 933 934 static PetscErrorCode MatStashScatterGetMesg_BTS(MatStash *stash,PetscMPIInt *n,PetscInt **row,PetscInt **col,PetscScalar **val,PetscInt *flg) 935 { 936 MatStashBlock *block; 937 938 PetscFunctionBegin; 939 *flg = 0; 940 while (!stash->recvframe_active || stash->recvframe_i == stash->recvframe_count) { 941 if (stash->some_i == stash->some_count) { 942 if (stash->recvcount == stash->nrecvranks) PetscFunctionReturn(0); /* Done */ 943 PetscCallMPI(MPI_Waitsome(stash->nrecvranks,stash->recvreqs,&stash->some_count,stash->some_indices,stash->use_status?stash->some_statuses:MPI_STATUSES_IGNORE)); 944 stash->some_i = 0; 945 } 946 stash->recvframe_active = &stash->recvframes[stash->some_indices[stash->some_i]]; 947 stash->recvframe_count = stash->recvframe_active->count; /* From header; maximum count */ 948 if (stash->use_status) { /* Count what was actually sent */ 949 PetscCallMPI(MPI_Get_count(&stash->some_statuses[stash->some_i],stash->blocktype,&stash->recvframe_count)); 950 } 951 if (stash->recvframe_count > 0) { /* Check for InsertMode consistency */ 952 block = (MatStashBlock*)&((char*)stash->recvframe_active->buffer)[0]; 953 if (PetscUnlikely(*stash->insertmode == NOT_SET_VALUES)) *stash->insertmode = block->row < 0 ? 
INSERT_VALUES : ADD_VALUES; 954 PetscCheckFalse(*stash->insertmode == INSERT_VALUES && block->row >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Assembling INSERT_VALUES, but rank %d requested ADD_VALUES",stash->recvranks[stash->some_indices[stash->some_i]]); 955 PetscCheckFalse(*stash->insertmode == ADD_VALUES && block->row < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Assembling ADD_VALUES, but rank %d requested INSERT_VALUES",stash->recvranks[stash->some_indices[stash->some_i]]); 956 } 957 stash->some_i++; 958 stash->recvcount++; 959 stash->recvframe_i = 0; 960 } 961 *n = 1; 962 block = (MatStashBlock*)&((char*)stash->recvframe_active->buffer)[stash->recvframe_i*stash->blocktype_size]; 963 if (block->row < 0) block->row = -(block->row + 1); 964 *row = &block->row; 965 *col = &block->col; 966 *val = block->vals; 967 stash->recvframe_i++; 968 *flg = 1; 969 PetscFunctionReturn(0); 970 } 971 972 static PetscErrorCode MatStashScatterEnd_BTS(MatStash *stash) 973 { 974 PetscFunctionBegin; 975 PetscCallMPI(MPI_Waitall(stash->nsendranks,stash->sendreqs,MPI_STATUSES_IGNORE)); 976 if (stash->first_assembly_done) { /* Reuse the communication contexts, so consolidate and reset segrecvblocks */ 977 void *dummy; 978 PetscCall(PetscSegBufferExtractInPlace(stash->segrecvblocks,&dummy)); 979 } else { /* No reuse, so collect everything. */ 980 PetscCall(MatStashScatterDestroy_BTS(stash)); 981 } 982 983 /* Now update nmaxold to be app 10% more than max n used, this way the 984 wastage of space is reduced the next time this stash is used. 
985 Also update the oldmax, only if it increases */ 986 if (stash->n) { 987 PetscInt bs2 = stash->bs*stash->bs; 988 PetscInt oldnmax = ((int)(stash->n * 1.1) + 5)*bs2; 989 if (oldnmax > stash->oldnmax) stash->oldnmax = oldnmax; 990 } 991 992 stash->nmax = 0; 993 stash->n = 0; 994 stash->reallocs = -1; 995 stash->nprocessed = 0; 996 997 PetscCall(PetscMatStashSpaceDestroy(&stash->space_head)); 998 999 stash->space = NULL; 1000 1001 PetscFunctionReturn(0); 1002 } 1003 1004 PetscErrorCode MatStashScatterDestroy_BTS(MatStash *stash) 1005 { 1006 PetscFunctionBegin; 1007 PetscCall(PetscSegBufferDestroy(&stash->segsendblocks)); 1008 PetscCall(PetscSegBufferDestroy(&stash->segrecvframe)); 1009 stash->recvframes = NULL; 1010 PetscCall(PetscSegBufferDestroy(&stash->segrecvblocks)); 1011 if (stash->blocktype != MPI_DATATYPE_NULL) { 1012 PetscCallMPI(MPI_Type_free(&stash->blocktype)); 1013 } 1014 stash->nsendranks = 0; 1015 stash->nrecvranks = 0; 1016 PetscCall(PetscFree3(stash->sendranks,stash->sendhdr,stash->sendframes)); 1017 PetscCall(PetscFree(stash->sendreqs)); 1018 PetscCall(PetscFree(stash->recvreqs)); 1019 PetscCall(PetscFree(stash->recvranks)); 1020 PetscCall(PetscFree(stash->recvhdr)); 1021 PetscCall(PetscFree2(stash->some_indices,stash->some_statuses)); 1022 PetscFunctionReturn(0); 1023 } 1024 #endif 1025