1 2 #include <petsc/private/matimpl.h> 3 4 #define DEFAULT_STASH_SIZE 10000 5 6 static PetscErrorCode MatStashScatterBegin_Ref(Mat,MatStash*,PetscInt*); 7 PETSC_INTERN PetscErrorCode MatStashScatterGetMesg_Ref(MatStash*,PetscMPIInt*,PetscInt**,PetscInt**,PetscScalar**,PetscInt*); 8 PETSC_INTERN PetscErrorCode MatStashScatterEnd_Ref(MatStash*); 9 #if !defined(PETSC_HAVE_MPIUNI) 10 static PetscErrorCode MatStashScatterBegin_BTS(Mat,MatStash*,PetscInt*); 11 static PetscErrorCode MatStashScatterGetMesg_BTS(MatStash*,PetscMPIInt*,PetscInt**,PetscInt**,PetscScalar**,PetscInt*); 12 static PetscErrorCode MatStashScatterEnd_BTS(MatStash*); 13 #endif 14 15 /* 16 MatStashCreate_Private - Creates a stash,currently used for all the parallel 17 matrix implementations. The stash is where elements of a matrix destined 18 to be stored on other processors are kept until matrix assembly is done. 19 20 This is a simple minded stash. Simply adds entries to end of stash. 21 22 Input Parameters: 23 comm - communicator, required for scatters. 24 bs - stash block size. 
used when stashing blocks of values 25 26 Output Parameters: 27 stash - the newly created stash 28 */ 29 PetscErrorCode MatStashCreate_Private(MPI_Comm comm,PetscInt bs,MatStash *stash) 30 { 31 PetscInt max,*opt,nopt,i; 32 PetscBool flg; 33 34 PetscFunctionBegin; 35 /* Require 2 tags,get the second using PetscCommGetNewTag() */ 36 stash->comm = comm; 37 38 PetscCall(PetscCommGetNewTag(stash->comm,&stash->tag1)); 39 PetscCall(PetscCommGetNewTag(stash->comm,&stash->tag2)); 40 PetscCallMPI(MPI_Comm_size(stash->comm,&stash->size)); 41 PetscCallMPI(MPI_Comm_rank(stash->comm,&stash->rank)); 42 PetscCall(PetscMalloc1(2*stash->size,&stash->flg_v)); 43 for (i=0; i<2*stash->size; i++) stash->flg_v[i] = -1; 44 45 nopt = stash->size; 46 PetscCall(PetscMalloc1(nopt,&opt)); 47 PetscCall(PetscOptionsGetIntArray(NULL,NULL,"-matstash_initial_size",opt,&nopt,&flg)); 48 if (flg) { 49 if (nopt == 1) max = opt[0]; 50 else if (nopt == stash->size) max = opt[stash->rank]; 51 else if (stash->rank < nopt) max = opt[stash->rank]; 52 else max = 0; /* Use default */ 53 stash->umax = max; 54 } else { 55 stash->umax = 0; 56 } 57 PetscCall(PetscFree(opt)); 58 if (bs <= 0) bs = 1; 59 60 stash->bs = bs; 61 stash->nmax = 0; 62 stash->oldnmax = 0; 63 stash->n = 0; 64 stash->reallocs = -1; 65 stash->space_head = NULL; 66 stash->space = NULL; 67 68 stash->send_waits = NULL; 69 stash->recv_waits = NULL; 70 stash->send_status = NULL; 71 stash->nsends = 0; 72 stash->nrecvs = 0; 73 stash->svalues = NULL; 74 stash->rvalues = NULL; 75 stash->rindices = NULL; 76 stash->nprocessed = 0; 77 stash->reproduce = PETSC_FALSE; 78 stash->blocktype = MPI_DATATYPE_NULL; 79 80 PetscCall(PetscOptionsGetBool(NULL,NULL,"-matstash_reproduce",&stash->reproduce,NULL)); 81 #if !defined(PETSC_HAVE_MPIUNI) 82 flg = PETSC_FALSE; 83 PetscCall(PetscOptionsGetBool(NULL,NULL,"-matstash_legacy",&flg,NULL)); 84 if (!flg) { 85 stash->ScatterBegin = MatStashScatterBegin_BTS; 86 stash->ScatterGetMesg = MatStashScatterGetMesg_BTS; 87 
stash->ScatterEnd = MatStashScatterEnd_BTS; 88 stash->ScatterDestroy = MatStashScatterDestroy_BTS; 89 } else { 90 #endif 91 stash->ScatterBegin = MatStashScatterBegin_Ref; 92 stash->ScatterGetMesg = MatStashScatterGetMesg_Ref; 93 stash->ScatterEnd = MatStashScatterEnd_Ref; 94 stash->ScatterDestroy = NULL; 95 #if !defined(PETSC_HAVE_MPIUNI) 96 } 97 #endif 98 PetscFunctionReturn(0); 99 } 100 101 /* 102 MatStashDestroy_Private - Destroy the stash 103 */ 104 PetscErrorCode MatStashDestroy_Private(MatStash *stash) 105 { 106 PetscFunctionBegin; 107 PetscCall(PetscMatStashSpaceDestroy(&stash->space_head)); 108 if (stash->ScatterDestroy) PetscCall((*stash->ScatterDestroy)(stash)); 109 110 stash->space = NULL; 111 112 PetscCall(PetscFree(stash->flg_v)); 113 PetscFunctionReturn(0); 114 } 115 116 /* 117 MatStashScatterEnd_Private - This is called as the final stage of 118 scatter. The final stages of message passing is done here, and 119 all the memory used for message passing is cleaned up. This 120 routine also resets the stash, and deallocates the memory used 121 for the stash. It also keeps track of the current memory usage 122 so that the same value can be used the next time through. 123 */ 124 PetscErrorCode MatStashScatterEnd_Private(MatStash *stash) 125 { 126 PetscFunctionBegin; 127 PetscCall((*stash->ScatterEnd)(stash)); 128 PetscFunctionReturn(0); 129 } 130 131 PETSC_INTERN PetscErrorCode MatStashScatterEnd_Ref(MatStash *stash) 132 { 133 PetscInt nsends=stash->nsends,bs2,oldnmax,i; 134 MPI_Status *send_status; 135 136 PetscFunctionBegin; 137 for (i=0; i<2*stash->size; i++) stash->flg_v[i] = -1; 138 /* wait on sends */ 139 if (nsends) { 140 PetscCall(PetscMalloc1(2*nsends,&send_status)); 141 PetscCallMPI(MPI_Waitall(2*nsends,stash->send_waits,send_status)); 142 PetscCall(PetscFree(send_status)); 143 } 144 145 /* Now update nmaxold to be app 10% more than max n used, this way the 146 wastage of space is reduced the next time this stash is used. 
147 Also update the oldmax, only if it increases */ 148 if (stash->n) { 149 bs2 = stash->bs*stash->bs; 150 oldnmax = ((int)(stash->n * 1.1) + 5)*bs2; 151 if (oldnmax > stash->oldnmax) stash->oldnmax = oldnmax; 152 } 153 154 stash->nmax = 0; 155 stash->n = 0; 156 stash->reallocs = -1; 157 stash->nprocessed = 0; 158 159 PetscCall(PetscMatStashSpaceDestroy(&stash->space_head)); 160 161 stash->space = NULL; 162 163 PetscCall(PetscFree(stash->send_waits)); 164 PetscCall(PetscFree(stash->recv_waits)); 165 PetscCall(PetscFree2(stash->svalues,stash->sindices)); 166 PetscCall(PetscFree(stash->rvalues[0])); 167 PetscCall(PetscFree(stash->rvalues)); 168 PetscCall(PetscFree(stash->rindices[0])); 169 PetscCall(PetscFree(stash->rindices)); 170 PetscFunctionReturn(0); 171 } 172 173 /* 174 MatStashGetInfo_Private - Gets the relevant statistics of the stash 175 176 Input Parameters: 177 stash - the stash 178 nstash - the size of the stash. Indicates the number of values stored. 179 reallocs - the number of additional mallocs incurred. 180 181 */ 182 PetscErrorCode MatStashGetInfo_Private(MatStash *stash,PetscInt *nstash,PetscInt *reallocs) 183 { 184 PetscInt bs2 = stash->bs*stash->bs; 185 186 PetscFunctionBegin; 187 if (nstash) *nstash = stash->n*bs2; 188 if (reallocs) { 189 if (stash->reallocs < 0) *reallocs = 0; 190 else *reallocs = stash->reallocs; 191 } 192 PetscFunctionReturn(0); 193 } 194 195 /* 196 MatStashSetInitialSize_Private - Sets the initial size of the stash 197 198 Input Parameters: 199 stash - the stash 200 max - the value that is used as the max size of the stash. 201 this value is used while allocating memory. 202 */ 203 PetscErrorCode MatStashSetInitialSize_Private(MatStash *stash,PetscInt max) 204 { 205 PetscFunctionBegin; 206 stash->umax = max; 207 PetscFunctionReturn(0); 208 } 209 210 /* MatStashExpand_Private - Expand the stash. This function is called 211 when the space in the stash is not sufficient to add the new values 212 being inserted into the stash. 
213 214 Input Parameters: 215 stash - the stash 216 incr - the minimum increase requested 217 218 Notes: 219 This routine doubles the currently used memory. 220 */ 221 static PetscErrorCode MatStashExpand_Private(MatStash *stash,PetscInt incr) 222 { 223 PetscInt newnmax,bs2= stash->bs*stash->bs; 224 225 PetscFunctionBegin; 226 /* allocate a larger stash */ 227 if (!stash->oldnmax && !stash->nmax) { /* new stash */ 228 if (stash->umax) newnmax = stash->umax/bs2; 229 else newnmax = DEFAULT_STASH_SIZE/bs2; 230 } else if (!stash->nmax) { /* resuing stash */ 231 if (stash->umax > stash->oldnmax) newnmax = stash->umax/bs2; 232 else newnmax = stash->oldnmax/bs2; 233 } else newnmax = stash->nmax*2; 234 if (newnmax < (stash->nmax + incr)) newnmax += 2*incr; 235 236 /* Get a MatStashSpace and attach it to stash */ 237 PetscCall(PetscMatStashSpaceGet(bs2,newnmax,&stash->space)); 238 if (!stash->space_head) { /* new stash or resuing stash->oldnmax */ 239 stash->space_head = stash->space; 240 } 241 242 stash->reallocs++; 243 stash->nmax = newnmax; 244 PetscFunctionReturn(0); 245 } 246 /* 247 MatStashValuesRow_Private - inserts values into the stash. This function 248 expects the values to be roworiented. Multiple columns belong to the same row 249 can be inserted with a single call to this function. 250 251 Input Parameters: 252 stash - the stash 253 row - the global row correspoiding to the values 254 n - the number of elements inserted. All elements belong to the above row. 255 idxn - the global column indices corresponding to each of the values. 
256 values - the values inserted 257 */ 258 PetscErrorCode MatStashValuesRow_Private(MatStash *stash,PetscInt row,PetscInt n,const PetscInt idxn[],const PetscScalar values[],PetscBool ignorezeroentries) 259 { 260 PetscInt i,k,cnt = 0; 261 PetscMatStashSpace space=stash->space; 262 263 PetscFunctionBegin; 264 /* Check and see if we have sufficient memory */ 265 if (!space || space->local_remaining < n) { 266 PetscCall(MatStashExpand_Private(stash,n)); 267 } 268 space = stash->space; 269 k = space->local_used; 270 for (i=0; i<n; i++) { 271 if (ignorezeroentries && values && values[i] == 0.0) continue; 272 space->idx[k] = row; 273 space->idy[k] = idxn[i]; 274 space->val[k] = values ? values[i] : 0.0; 275 k++; 276 cnt++; 277 } 278 stash->n += cnt; 279 space->local_used += cnt; 280 space->local_remaining -= cnt; 281 PetscFunctionReturn(0); 282 } 283 284 /* 285 MatStashValuesCol_Private - inserts values into the stash. This function 286 expects the values to be columnoriented. Multiple columns belong to the same row 287 can be inserted with a single call to this function. 288 289 Input Parameters: 290 stash - the stash 291 row - the global row correspoiding to the values 292 n - the number of elements inserted. All elements belong to the above row. 293 idxn - the global column indices corresponding to each of the values. 294 values - the values inserted 295 stepval - the consecutive values are sepated by a distance of stepval. 296 this happens because the input is columnoriented. 
297 */ 298 PetscErrorCode MatStashValuesCol_Private(MatStash *stash,PetscInt row,PetscInt n,const PetscInt idxn[],const PetscScalar values[],PetscInt stepval,PetscBool ignorezeroentries) 299 { 300 PetscInt i,k,cnt = 0; 301 PetscMatStashSpace space=stash->space; 302 303 PetscFunctionBegin; 304 /* Check and see if we have sufficient memory */ 305 if (!space || space->local_remaining < n) { 306 PetscCall(MatStashExpand_Private(stash,n)); 307 } 308 space = stash->space; 309 k = space->local_used; 310 for (i=0; i<n; i++) { 311 if (ignorezeroentries && values && values[i*stepval] == 0.0) continue; 312 space->idx[k] = row; 313 space->idy[k] = idxn[i]; 314 space->val[k] = values ? values[i*stepval] : 0.0; 315 k++; 316 cnt++; 317 } 318 stash->n += cnt; 319 space->local_used += cnt; 320 space->local_remaining -= cnt; 321 PetscFunctionReturn(0); 322 } 323 324 /* 325 MatStashValuesRowBlocked_Private - inserts blocks of values into the stash. 326 This function expects the values to be roworiented. Multiple columns belong 327 to the same block-row can be inserted with a single call to this function. 328 This function extracts the sub-block of values based on the dimensions of 329 the original input block, and the row,col values corresponding to the blocks. 330 331 Input Parameters: 332 stash - the stash 333 row - the global block-row correspoiding to the values 334 n - the number of elements inserted. All elements belong to the above row. 335 idxn - the global block-column indices corresponding to each of the blocks of 336 values. Each block is of size bs*bs. 337 values - the values inserted 338 rmax - the number of block-rows in the original block. 339 cmax - the number of block-columns on the original block. 340 idx - the index of the current block-row in the original block. 
341 */ 342 PetscErrorCode MatStashValuesRowBlocked_Private(MatStash *stash,PetscInt row,PetscInt n,const PetscInt idxn[],const PetscScalar values[],PetscInt rmax,PetscInt cmax,PetscInt idx) 343 { 344 PetscInt i,j,k,bs2,bs=stash->bs,l; 345 const PetscScalar *vals; 346 PetscScalar *array; 347 PetscMatStashSpace space=stash->space; 348 349 PetscFunctionBegin; 350 if (!space || space->local_remaining < n) { 351 PetscCall(MatStashExpand_Private(stash,n)); 352 } 353 space = stash->space; 354 l = space->local_used; 355 bs2 = bs*bs; 356 for (i=0; i<n; i++) { 357 space->idx[l] = row; 358 space->idy[l] = idxn[i]; 359 /* Now copy over the block of values. Store the values column oriented. 360 This enables inserting multiple blocks belonging to a row with a single 361 funtion call */ 362 array = space->val + bs2*l; 363 vals = values + idx*bs2*n + bs*i; 364 for (j=0; j<bs; j++) { 365 for (k=0; k<bs; k++) array[k*bs] = values ? vals[k] : 0.0; 366 array++; 367 vals += cmax*bs; 368 } 369 l++; 370 } 371 stash->n += n; 372 space->local_used += n; 373 space->local_remaining -= n; 374 PetscFunctionReturn(0); 375 } 376 377 /* 378 MatStashValuesColBlocked_Private - inserts blocks of values into the stash. 379 This function expects the values to be roworiented. Multiple columns belong 380 to the same block-row can be inserted with a single call to this function. 381 This function extracts the sub-block of values based on the dimensions of 382 the original input block, and the row,col values corresponding to the blocks. 383 384 Input Parameters: 385 stash - the stash 386 row - the global block-row correspoiding to the values 387 n - the number of elements inserted. All elements belong to the above row. 388 idxn - the global block-column indices corresponding to each of the blocks of 389 values. Each block is of size bs*bs. 390 values - the values inserted 391 rmax - the number of block-rows in the original block. 392 cmax - the number of block-columns on the original block. 
393 idx - the index of the current block-row in the original block. 394 */ 395 PetscErrorCode MatStashValuesColBlocked_Private(MatStash *stash,PetscInt row,PetscInt n,const PetscInt idxn[],const PetscScalar values[],PetscInt rmax,PetscInt cmax,PetscInt idx) 396 { 397 PetscInt i,j,k,bs2,bs=stash->bs,l; 398 const PetscScalar *vals; 399 PetscScalar *array; 400 PetscMatStashSpace space=stash->space; 401 402 PetscFunctionBegin; 403 if (!space || space->local_remaining < n) { 404 PetscCall(MatStashExpand_Private(stash,n)); 405 } 406 space = stash->space; 407 l = space->local_used; 408 bs2 = bs*bs; 409 for (i=0; i<n; i++) { 410 space->idx[l] = row; 411 space->idy[l] = idxn[i]; 412 /* Now copy over the block of values. Store the values column oriented. 413 This enables inserting multiple blocks belonging to a row with a single 414 funtion call */ 415 array = space->val + bs2*l; 416 vals = values + idx*bs2*n + bs*i; 417 for (j=0; j<bs; j++) { 418 for (k=0; k<bs; k++) array[k] = values ? vals[k] : 0.0; 419 array += bs; 420 vals += rmax*bs; 421 } 422 l++; 423 } 424 stash->n += n; 425 space->local_used += n; 426 space->local_remaining -= n; 427 PetscFunctionReturn(0); 428 } 429 /* 430 MatStashScatterBegin_Private - Initiates the transfer of values to the 431 correct owners. This function goes through the stash, and check the 432 owners of each stashed value, and sends the values off to the owner 433 processors. 434 435 Input Parameters: 436 stash - the stash 437 owners - an array of size 'no-of-procs' which gives the ownership range 438 for each node. 439 440 Notes: 441 The 'owners' array in the cased of the blocked-stash has the 442 ranges specified blocked global indices, and for the regular stash in 443 the proper global indices. 
*/
PetscErrorCode MatStashScatterBegin_Private(Mat mat,MatStash *stash,PetscInt *owners)
{
  PetscFunctionBegin;
  /* Dispatch to the implementation selected at MatStashCreate_Private() time (BTS or Ref) */
  PetscCall((*stash->ScatterBegin)(mat,stash,owners));
  PetscFunctionReturn(0);
}

/* Reference (legacy) implementation: two messages per destination rank are posted,
   one carrying the (row,col) index pairs (tag1) and one carrying the values (tag2). */
static PetscErrorCode MatStashScatterBegin_Ref(Mat mat,MatStash *stash,PetscInt *owners)
{
  PetscInt           *owner,*startv,*starti,tag1=stash->tag1,tag2=stash->tag2,bs2;
  PetscInt           size=stash->size,nsends;
  PetscInt           count,*sindices,**rindices,i,j,idx,lastidx,l;
  PetscScalar        **rvalues,*svalues;
  MPI_Comm           comm = stash->comm;
  MPI_Request        *send_waits,*recv_waits,*recv_waits1,*recv_waits2;
  PetscMPIInt        *sizes,*nlengths,nreceives;
  PetscInt           *sp_idx,*sp_idy;
  PetscScalar        *sp_val;
  PetscMatStashSpace space,space_next;

  PetscFunctionBegin;
  { /* make sure all processors are either in INSERTMODE or ADDMODE */
    InsertMode addv;
    PetscCall(MPIU_Allreduce((PetscEnum*)&mat->insertmode,(PetscEnum*)&addv,1,MPIU_ENUM,MPI_BOR,PetscObjectComm((PetscObject)mat)));
    PetscCheck(addv != (ADD_VALUES|INSERT_VALUES),PetscObjectComm((PetscObject)mat),PETSC_ERR_ARG_WRONGSTATE,"Some processors inserted others added");
    mat->insertmode = addv; /* in case this processor had no cache */
  }

  bs2 = stash->bs*stash->bs;

  /* first count number of contributors to each processor */
  PetscCall(PetscCalloc1(size,&nlengths));
  PetscCall(PetscMalloc1(stash->n+1,&owner));

  /* Walk every stashed entry and record its owning rank in owner[]; j is carried
     between iterations so mostly-sorted indices resume the ownership search where
     the previous entry left off */
  i       = j = 0;
  lastidx = -1;
  space   = stash->space_head;
  while (space) {
    space_next = space->next;
    sp_idx     = space->idx;
    for (l=0; l<space->local_used; l++) {
      /* if indices are NOT locally sorted, need to start search at the beginning */
      if (lastidx > (idx = sp_idx[l])) j = 0;
      lastidx = idx;
      for (; j<size; j++) {
        if (idx >= owners[j] && idx < owners[j+1]) {
          nlengths[j]++; owner[i] = j; break;
        }
      }
      i++;
    }
    space = space_next;
  }

  /* Now check what procs get messages - and compute nsends. */
  PetscCall(PetscCalloc1(size,&sizes));
  for (i=0, nsends=0; i<size; i++) {
    if (nlengths[i]) {
      sizes[i] = 1; nsends++;
    }
  }

  {PetscMPIInt *onodes,*olengths;
   /* Determine the number of messages to expect, their lengths, from from-ids */
   PetscCall(PetscGatherNumberOfMessages(comm,sizes,nlengths,&nreceives));
   PetscCall(PetscGatherMessageLengths(comm,nsends,nreceives,nlengths,&onodes,&olengths));
   /* since clubbing row,col - lengths are multiplied by 2 */
   for (i=0; i<nreceives; i++) olengths[i] *=2;
   PetscCall(PetscPostIrecvInt(comm,tag1,nreceives,onodes,olengths,&rindices,&recv_waits1));
   /* values are size 'bs2' lengths (and remove earlier factor 2 */
   for (i=0; i<nreceives; i++) olengths[i] = olengths[i]*bs2/2;
   PetscCall(PetscPostIrecvScalar(comm,tag2,nreceives,onodes,olengths,&rvalues,&recv_waits2));
   PetscCall(PetscFree(onodes));
   PetscCall(PetscFree(olengths));}

  /* do sends:
      1) starts[i] gives the starting index in svalues for stuff going to
         the ith processor
  */
  PetscCall(PetscMalloc2(bs2*stash->n,&svalues,2*(stash->n+1),&sindices));
  PetscCall(PetscMalloc1(2*nsends,&send_waits));
  PetscCall(PetscMalloc2(size,&startv,size,&starti));
  /* use 2 sends the first with all_a, the next with all_i and all_j */
  startv[0] = 0; starti[0] = 0;
  for (i=1; i<size; i++) {
    startv[i] = startv[i-1] + nlengths[i-1];
    starti[i] = starti[i-1] + 2*nlengths[i-1];
  }

  /* Pack the send buffers; per destination rank j the row indices go at
     starti[j] and the matching column indices nlengths[j] slots later */
  i     = 0;
  space = stash->space_head;
  while (space) {
    space_next = space->next;
    sp_idx     = space->idx;
    sp_idy     = space->idy;
    sp_val     = space->val;
    for (l=0; l<space->local_used; l++) {
      j = owner[i];
      if (bs2 == 1) {
        svalues[startv[j]] = sp_val[l];
      } else {
        PetscInt    k;
        PetscScalar *buf1,*buf2;
        buf1 = svalues+bs2*startv[j];
        buf2 = space->val + bs2*l;
        for (k=0; k<bs2; k++) buf1[k] = buf2[k];
      }
      sindices[starti[j]]             = sp_idx[l];
      sindices[starti[j]+nlengths[j]] = sp_idy[l];
      startv[j]++;
      starti[j]++;
      i++;
    }
    space = space_next;
  }
  /* startv[] was consumed while packing; rebuild it for the Isend offsets */
  startv[0] = 0;
  for (i=1; i<size; i++) startv[i] = startv[i-1] + nlengths[i-1];

  for (i=0,count=0; i<size; i++) {
    if (sizes[i]) {
      PetscCallMPI(MPI_Isend(sindices+2*startv[i],2*nlengths[i],MPIU_INT,i,tag1,comm,send_waits+count++));
      PetscCallMPI(MPI_Isend(svalues+bs2*startv[i],bs2*nlengths[i],MPIU_SCALAR,i,tag2,comm,send_waits+count++));
    }
  }
#if defined(PETSC_USE_INFO)
  PetscCall(PetscInfo(NULL,"No of messages: %" PetscInt_FMT " \n",nsends));
  for (i=0; i<size; i++) {
    if (sizes[i]) {
      PetscCall(PetscInfo(NULL,"Mesg_to: %" PetscInt_FMT ": size: %zu bytes\n",i,(size_t)(nlengths[i]*(bs2*sizeof(PetscScalar)+2*sizeof(PetscInt)))));
    }
  }
#endif
  PetscCall(PetscFree(nlengths));
  PetscCall(PetscFree(owner));
  PetscCall(PetscFree2(startv,starti));
  PetscCall(PetscFree(sizes));

  /* recv_waits need to be contiguous for MatStashScatterGetMesg_Private() */
  PetscCall(PetscMalloc1(2*nreceives,&recv_waits));

  /* Interleave the index-message and value-message requests so that request 2*i
     and 2*i+1 belong to the same source rank pair */
  for (i=0; i<nreceives; i++) {
    recv_waits[2*i]   = recv_waits1[i];
    recv_waits[2*i+1] = recv_waits2[i];
  }
  stash->recv_waits = recv_waits;

  PetscCall(PetscFree(recv_waits1));
  PetscCall(PetscFree(recv_waits2));

  stash->svalues         = svalues;
  stash->sindices        = sindices;
  stash->rvalues         = rvalues;
  stash->rindices        = rindices;
  stash->send_waits      = send_waits;
  stash->nsends          = nsends;
  stash->nrecvs          = nreceives;
  stash->reproduce_count = 0;
  PetscFunctionReturn(0);
}

/*
  MatStashScatterGetMesg_Private - This function waits on the receives posted
  in the function MatStashScatterBegin_Private() and returns one message at
  a time to the calling function. If no messages are left, it indicates this
  by setting flg = 0, else it sets flg = 1.
  Input Parameters:
  stash - the stash

  Output Parameters:
  nvals - the number of entries in the current message.
  rows  - an array of row indices (or blocked indices) corresponding to the values
  cols  - an array of column indices (or blocked indices) corresponding to the values
  vals  - the values
  flg   - 0 indicates no more message left, and the current call has no values associated.
          1 indicates that the current call successfully received a message, and the
          other output parameters nvals,rows,cols,vals are set appropriately.
*/
PetscErrorCode MatStashScatterGetMesg_Private(MatStash *stash,PetscMPIInt *nvals,PetscInt **rows,PetscInt **cols,PetscScalar **vals,PetscInt *flg)
{
  PetscFunctionBegin;
  /* Dispatch to the implementation selected at MatStashCreate_Private() time (BTS or Ref) */
  PetscCall((*stash->ScatterGetMesg)(stash,nvals,rows,cols,vals,flg));
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatStashScatterGetMesg_Ref(MatStash *stash,PetscMPIInt *nvals,PetscInt **rows,PetscInt **cols,PetscScalar **vals,PetscInt *flg)
{
  PetscMPIInt i,*flg_v = stash->flg_v,i1,i2;
  PetscInt    bs2;
  MPI_Status  recv_status;
  PetscBool   match_found = PETSC_FALSE;

  PetscFunctionBegin;
  *flg = 0; /* When a message is discovered this is reset to 1 */
  /* Return if no more messages to process */
  if (stash->nprocessed == stash->nrecvs) PetscFunctionReturn(0);

  bs2 = stash->bs*stash->bs;
  /* If a matching pair of receives are found, process them, and return the data to
     the calling function. Until then keep receiving messages */
  while (!match_found) {
    if (stash->reproduce) {
      /* reproduce mode: complete receives in posted order for deterministic results */
      i = stash->reproduce_count++;
      PetscCallMPI(MPI_Wait(stash->recv_waits+i,&recv_status));
    } else {
      PetscCallMPI(MPI_Waitany(2*stash->nrecvs,stash->recv_waits,&i,&recv_status));
    }
    PetscCheck(recv_status.MPI_SOURCE >= 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Negative MPI source!");

    /* Now pack the received message into a structure which is usable by others.
       Requests were interleaved in MatStashScatterBegin_Ref: odd index = values
       message, even index = (row,col) indices message; i/2 is the receive-buffer
       slot for this source rank */
    if (i % 2) {
      PetscCallMPI(MPI_Get_count(&recv_status,MPIU_SCALAR,nvals));
      flg_v[2*recv_status.MPI_SOURCE] = i/2;
      *nvals = *nvals/bs2;
    } else {
      PetscCallMPI(MPI_Get_count(&recv_status,MPIU_INT,nvals));
      flg_v[2*recv_status.MPI_SOURCE+1] = i/2;
      *nvals = *nvals/2; /* This message has both row indices and col indices */
    }

    /* Check if we have both messages from this proc */
    i1 = flg_v[2*recv_status.MPI_SOURCE];
    i2 = flg_v[2*recv_status.MPI_SOURCE+1];
    if (i1 != -1 && i2 != -1) {
      *rows = stash->rindices[i2];
      *cols = *rows + *nvals; /* cols were packed immediately after the rows */
      *vals = stash->rvalues[i1];
      *flg  = 1;
      stash->nprocessed++;
      match_found = PETSC_TRUE;
    }
  }
  PetscFunctionReturn(0);
}

#if !defined(PETSC_HAVE_MPIUNI)
/* One stashed (row,col,block-of-values) entry as sent on the wire by the BTS path */
typedef struct {
  PetscInt    row;
  PetscInt    col;
  PetscScalar vals[1]; /* Actually an array of length bs2 */
} MatStashBlock;

/* Gather all stashed entries, sort them by (row,col), merge duplicates (adding or
   overwriting according to insertmode), and pack the result into segsendblocks */
static PetscErrorCode MatStashSortCompress_Private(MatStash *stash,InsertMode insertmode)
{
  PetscMatStashSpace space;
  PetscInt           n = stash->n,bs = stash->bs,bs2 = bs*bs,cnt,*row,*col,*perm,rowstart,i;
  PetscScalar        **valptr;

  PetscFunctionBegin;
  PetscCall(PetscMalloc4(n,&row,n,&col,n,&valptr,n,&perm));
  for (space=stash->space_head,cnt=0; space; space=space->next) {
    for (i=0; i<space->local_used; i++) {
      row[cnt]    = space->idx[i];
      col[cnt]    = space->idy[i];
      valptr[cnt] = &space->val[i*bs2];
      perm[cnt]   = cnt; /* Will tell us where to find valptr after sorting row[] and col[] */
      cnt++;
    }
  }
  PetscCheck(cnt == n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"MatStash n %" PetscInt_FMT ", but counted %" PetscInt_FMT " entries",n,cnt);
  PetscCall(PetscSortIntWithArrayPair(n,row,col,perm));
  /* Scan through the rows, sorting each one, combining duplicates, and packing send buffers */
  for (rowstart=0,cnt=0,i=1; i<=n; i++) {
    if (i == n || row[i] != row[rowstart]) { /* Sort the last row. */
      PetscInt colstart;
      PetscCall(PetscSortIntWithArray(i-rowstart,&col[rowstart],&perm[rowstart]));
      for (colstart=rowstart; colstart<i;) { /* Compress multiple insertions to the same location */
        PetscInt      j,l;
        MatStashBlock *block;
        PetscCall(PetscSegBufferGet(stash->segsendblocks,1,&block));
        block->row = row[rowstart];
        block->col = col[colstart];
        PetscCall(PetscArraycpy(block->vals,valptr[perm[colstart]],bs2));
        for (j=colstart+1; j<i && col[j] == col[colstart]; j++) { /* Add any extra stashed blocks at the same (row,col) */
          if (insertmode == ADD_VALUES) {
            for (l=0; l<bs2; l++) block->vals[l] += valptr[perm[j]][l];
          } else {
            /* INSERT_VALUES: the last insertion at this (row,col) wins */
            PetscCall(PetscArraycpy(block->vals,valptr[perm[j]],bs2));
          }
        }
        colstart = j;
      }
      rowstart = i;
    }
  }
  PetscCall(PetscFree4(row,col,valptr,perm));
  PetscFunctionReturn(0);
}

/* Lazily build the MPI datatype describing one MatStashBlock (two PetscInt followed
   by bs2 PetscScalar, padded so consecutive blocks stay PetscInt-aligned) and the
   segmented buffers used by the BTS scatter */
static PetscErrorCode MatStashBlockTypeSetUp(MatStash *stash)
{
  PetscFunctionBegin;
  if (stash->blocktype == MPI_DATATYPE_NULL) {
    PetscInt     bs2 = PetscSqr(stash->bs);
    PetscMPIInt  blocklens[2];
    MPI_Aint     displs[2];
    MPI_Datatype types[2],stype;
    /*
      DummyBlock is a type having standard layout, even when PetscScalar is C++ std::complex.
      std::complex itself has standard layout, so does DummyBlock, recursively.
      To be compatible with C++ std::complex, complex implementations on GPUs must also have standard layout,
      though they can have different alignment, e.g, 16 bytes for double complex, instead of 8 bytes as in GCC stdlibc++.
      offsetof(type, member) only requires type to have standard layout. Ref. https://en.cppreference.com/w/cpp/types/offsetof.

      We can test if std::complex has standard layout with the following code:
      #include <iostream>
      #include <type_traits>
      #include <complex>
      int main() {
        std::cout << std::boolalpha;
        std::cout << std::is_standard_layout<std::complex<double> >::value << '\n';
      }
      Output: true
    */
    struct DummyBlock {PetscInt row,col; PetscScalar vals;};

    stash->blocktype_size = offsetof(struct DummyBlock,vals) + bs2*sizeof(PetscScalar);
    if (stash->blocktype_size % sizeof(PetscInt)) { /* Implies that PetscInt is larger and does not satisfy alignment without padding */
      stash->blocktype_size += sizeof(PetscInt) - stash->blocktype_size % sizeof(PetscInt);
    }
    PetscCall(PetscSegBufferCreate(stash->blocktype_size,1,&stash->segsendblocks));
    PetscCall(PetscSegBufferCreate(stash->blocktype_size,1,&stash->segrecvblocks));
    PetscCall(PetscSegBufferCreate(sizeof(MatStashFrame),1,&stash->segrecvframe));
    blocklens[0] = 2;
    blocklens[1] = bs2;
    displs[0]    = offsetof(struct DummyBlock,row);
    displs[1]    = offsetof(struct DummyBlock,vals);
    types[0]     = MPIU_INT;
    types[1]     = MPIU_SCALAR;
    PetscCallMPI(MPI_Type_create_struct(2,blocklens,displs,types,&stype));
    PetscCallMPI(MPI_Type_commit(&stype));
    /* Resize so the extent matches the padded blocktype_size computed above */
    PetscCallMPI(MPI_Type_create_resized(stype,0,stash->blocktype_size,&stash->blocktype));
    PetscCallMPI(MPI_Type_commit(&stash->blocktype));
    PetscCallMPI(MPI_Type_free(&stype));
  }
  PetscFunctionReturn(0);
}

/* Callback invoked after target rank has initiated receive of rendezvous message.
 * Here we post the main sends.
 */
static PetscErrorCode MatStashBTSSend_Private(MPI_Comm comm,const PetscMPIInt tag[],PetscMPIInt rankid,PetscMPIInt rank,void *sdata,MPI_Request req[],void *ctx)
{
  MatStash       *stash = (MatStash*)ctx;
  MatStashHeader *hdr   = (MatStashHeader*)sdata;

  PetscFunctionBegin;
  PetscCheck(rank == stash->sendranks[rankid],comm,PETSC_ERR_PLIB,"BTS Send rank %d does not match sendranks[%d] %d",rank,rankid,stash->sendranks[rankid]);
  /* Send hdr->count blocks (of stash->blocktype each) to the destination rank */
  PetscCallMPI(MPI_Isend(stash->sendframes[rankid].buffer,hdr->count,stash->blocktype,rank,tag[0],comm,&req[0]));
  stash->sendframes[rankid].count   = hdr->count;
  stash->sendframes[rankid].pending = 1;
  PetscFunctionReturn(0);
}

/*
  Callback invoked by target after receiving rendezvous message.
  Here we post the main recvs.
*/
static PetscErrorCode MatStashBTSRecv_Private(MPI_Comm comm,const PetscMPIInt tag[],PetscMPIInt rank,void *rdata,MPI_Request req[],void *ctx)
{
  MatStash       *stash = (MatStash*)ctx;
  MatStashHeader *hdr   = (MatStashHeader*)rdata;
  MatStashFrame  *frame;

  PetscFunctionBegin;
  /* Reserve a frame and a buffer of hdr->count blocks, then post the receive into it */
  PetscCall(PetscSegBufferGet(stash->segrecvframe,1,&frame));
  PetscCall(PetscSegBufferGet(stash->segrecvblocks,hdr->count,&frame->buffer));
  PetscCallMPI(MPI_Irecv(frame->buffer,hdr->count,stash->blocktype,rank,tag[0],comm,&req[0]));
  frame->count   = hdr->count;
  frame->pending = 1;
  PetscFunctionReturn(0);
}

/*
 * owners[] contains the ownership ranges; may be indexed by either blocks or scalars
 */
static PetscErrorCode MatStashScatterBegin_BTS(Mat mat,MatStash *stash,PetscInt owners[])
{
  size_t nblocks;
  char   *sendblocks;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) { /* make sure all processors are either in INSERTMODE or ADDMODE */
    InsertMode addv;
    PetscCall(MPIU_Allreduce((PetscEnum*)&mat->insertmode,(PetscEnum*)&addv,1,MPIU_ENUM,MPI_BOR,PetscObjectComm((PetscObject)mat)));
    PetscCheck(addv !=
(ADD_VALUES|INSERT_VALUES),PetscObjectComm((PetscObject)mat),PETSC_ERR_ARG_WRONGSTATE,"Some processors inserted others added"); 831 } 832 833 PetscCall(MatStashBlockTypeSetUp(stash)); 834 PetscCall(MatStashSortCompress_Private(stash,mat->insertmode)); 835 PetscCall(PetscSegBufferGetSize(stash->segsendblocks,&nblocks)); 836 PetscCall(PetscSegBufferExtractInPlace(stash->segsendblocks,&sendblocks)); 837 if (stash->first_assembly_done) { /* Set up sendhdrs and sendframes for each rank that we sent before */ 838 PetscInt i; 839 size_t b; 840 for (i=0,b=0; i<stash->nsendranks; i++) { 841 stash->sendframes[i].buffer = &sendblocks[b*stash->blocktype_size]; 842 /* sendhdr is never actually sent, but the count is used by MatStashBTSSend_Private */ 843 stash->sendhdr[i].count = 0; /* Might remain empty (in which case we send a zero-sized message) if no values are communicated to that process */ 844 for (; b<nblocks; b++) { 845 MatStashBlock *sendblock_b = (MatStashBlock*)&sendblocks[b*stash->blocktype_size]; 846 PetscCheck(sendblock_b->row >= owners[stash->sendranks[i]],stash->comm,PETSC_ERR_ARG_WRONG,"MAT_SUBSET_OFF_PROC_ENTRIES set, but row %" PetscInt_FMT " owned by %d not communicated in initial assembly",sendblock_b->row,stash->sendranks[i]); 847 if (sendblock_b->row >= owners[stash->sendranks[i]+1]) break; 848 stash->sendhdr[i].count++; 849 } 850 } 851 } else { /* Dynamically count and pack (first time) */ 852 PetscInt sendno; 853 size_t i,rowstart; 854 855 /* Count number of send ranks and allocate for sends */ 856 stash->nsendranks = 0; 857 for (rowstart=0; rowstart<nblocks;) { 858 PetscInt owner; 859 MatStashBlock *sendblock_rowstart = (MatStashBlock*)&sendblocks[rowstart*stash->blocktype_size]; 860 PetscCall(PetscFindInt(sendblock_rowstart->row,stash->size+1,owners,&owner)); 861 if (owner < 0) owner = -(owner+2); 862 for (i=rowstart+1; i<nblocks; i++) { /* Move forward through a run of blocks with the same owner */ 863 MatStashBlock *sendblock_i = 
(MatStashBlock*)&sendblocks[i*stash->blocktype_size]; 864 if (sendblock_i->row >= owners[owner+1]) break; 865 } 866 stash->nsendranks++; 867 rowstart = i; 868 } 869 PetscCall(PetscMalloc3(stash->nsendranks,&stash->sendranks,stash->nsendranks,&stash->sendhdr,stash->nsendranks,&stash->sendframes)); 870 871 /* Set up sendhdrs and sendframes */ 872 sendno = 0; 873 for (rowstart=0; rowstart<nblocks;) { 874 PetscInt owner; 875 MatStashBlock *sendblock_rowstart = (MatStashBlock*)&sendblocks[rowstart*stash->blocktype_size]; 876 PetscCall(PetscFindInt(sendblock_rowstart->row,stash->size+1,owners,&owner)); 877 if (owner < 0) owner = -(owner+2); 878 stash->sendranks[sendno] = owner; 879 for (i=rowstart+1; i<nblocks; i++) { /* Move forward through a run of blocks with the same owner */ 880 MatStashBlock *sendblock_i = (MatStashBlock*)&sendblocks[i*stash->blocktype_size]; 881 if (sendblock_i->row >= owners[owner+1]) break; 882 } 883 stash->sendframes[sendno].buffer = sendblock_rowstart; 884 stash->sendframes[sendno].pending = 0; 885 stash->sendhdr[sendno].count = i - rowstart; 886 sendno++; 887 rowstart = i; 888 } 889 PetscCheck(sendno == stash->nsendranks,stash->comm,PETSC_ERR_PLIB,"BTS counted %d sendranks, but %" PetscInt_FMT " sends",stash->nsendranks,sendno); 890 } 891 892 /* Encode insertmode on the outgoing messages. If we want to support more than two options, we would need a new 893 * message or a dummy entry of some sort. 
*/ 894 if (mat->insertmode == INSERT_VALUES) { 895 size_t i; 896 for (i=0; i<nblocks; i++) { 897 MatStashBlock *sendblock_i = (MatStashBlock*)&sendblocks[i*stash->blocktype_size]; 898 sendblock_i->row = -(sendblock_i->row+1); 899 } 900 } 901 902 if (stash->first_assembly_done) { 903 PetscMPIInt i,tag; 904 PetscCall(PetscCommGetNewTag(stash->comm,&tag)); 905 for (i=0; i<stash->nrecvranks; i++) { 906 PetscCall(MatStashBTSRecv_Private(stash->comm,&tag,stash->recvranks[i],&stash->recvhdr[i],&stash->recvreqs[i],stash)); 907 } 908 for (i=0; i<stash->nsendranks; i++) { 909 PetscCall(MatStashBTSSend_Private(stash->comm,&tag,i,stash->sendranks[i],&stash->sendhdr[i],&stash->sendreqs[i],stash)); 910 } 911 stash->use_status = PETSC_TRUE; /* Use count from message status. */ 912 } else { 913 PetscCall(PetscCommBuildTwoSidedFReq(stash->comm,1,MPIU_INT,stash->nsendranks,stash->sendranks,(PetscInt*)stash->sendhdr, 914 &stash->nrecvranks,&stash->recvranks,(PetscInt*)&stash->recvhdr,1,&stash->sendreqs,&stash->recvreqs, 915 MatStashBTSSend_Private,MatStashBTSRecv_Private,stash)); 916 PetscCall(PetscMalloc2(stash->nrecvranks,&stash->some_indices,stash->nrecvranks,&stash->some_statuses)); 917 stash->use_status = PETSC_FALSE; /* Use count from header instead of from message. 
*/ 918 } 919 920 PetscCall(PetscSegBufferExtractInPlace(stash->segrecvframe,&stash->recvframes)); 921 stash->recvframe_active = NULL; 922 stash->recvframe_i = 0; 923 stash->some_i = 0; 924 stash->some_count = 0; 925 stash->recvcount = 0; 926 stash->first_assembly_done = mat->assembly_subset; /* See the same logic in VecAssemblyBegin_MPI_BTS */ 927 stash->insertmode = &mat->insertmode; 928 PetscFunctionReturn(0); 929 } 930 931 static PetscErrorCode MatStashScatterGetMesg_BTS(MatStash *stash,PetscMPIInt *n,PetscInt **row,PetscInt **col,PetscScalar **val,PetscInt *flg) 932 { 933 MatStashBlock *block; 934 935 PetscFunctionBegin; 936 *flg = 0; 937 while (!stash->recvframe_active || stash->recvframe_i == stash->recvframe_count) { 938 if (stash->some_i == stash->some_count) { 939 if (stash->recvcount == stash->nrecvranks) PetscFunctionReturn(0); /* Done */ 940 PetscCallMPI(MPI_Waitsome(stash->nrecvranks,stash->recvreqs,&stash->some_count,stash->some_indices,stash->use_status?stash->some_statuses:MPI_STATUSES_IGNORE)); 941 stash->some_i = 0; 942 } 943 stash->recvframe_active = &stash->recvframes[stash->some_indices[stash->some_i]]; 944 stash->recvframe_count = stash->recvframe_active->count; /* From header; maximum count */ 945 if (stash->use_status) { /* Count what was actually sent */ 946 PetscCallMPI(MPI_Get_count(&stash->some_statuses[stash->some_i],stash->blocktype,&stash->recvframe_count)); 947 } 948 if (stash->recvframe_count > 0) { /* Check for InsertMode consistency */ 949 block = (MatStashBlock*)&((char*)stash->recvframe_active->buffer)[0]; 950 if (PetscUnlikely(*stash->insertmode == NOT_SET_VALUES)) *stash->insertmode = block->row < 0 ? 
INSERT_VALUES : ADD_VALUES; 951 PetscCheck(*stash->insertmode != INSERT_VALUES || block->row < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Assembling INSERT_VALUES, but rank %d requested ADD_VALUES",stash->recvranks[stash->some_indices[stash->some_i]]); 952 PetscCheck(*stash->insertmode != ADD_VALUES || block->row >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Assembling ADD_VALUES, but rank %d requested INSERT_VALUES",stash->recvranks[stash->some_indices[stash->some_i]]); 953 } 954 stash->some_i++; 955 stash->recvcount++; 956 stash->recvframe_i = 0; 957 } 958 *n = 1; 959 block = (MatStashBlock*)&((char*)stash->recvframe_active->buffer)[stash->recvframe_i*stash->blocktype_size]; 960 if (block->row < 0) block->row = -(block->row + 1); 961 *row = &block->row; 962 *col = &block->col; 963 *val = block->vals; 964 stash->recvframe_i++; 965 *flg = 1; 966 PetscFunctionReturn(0); 967 } 968 969 static PetscErrorCode MatStashScatterEnd_BTS(MatStash *stash) 970 { 971 PetscFunctionBegin; 972 PetscCallMPI(MPI_Waitall(stash->nsendranks,stash->sendreqs,MPI_STATUSES_IGNORE)); 973 if (stash->first_assembly_done) { /* Reuse the communication contexts, so consolidate and reset segrecvblocks */ 974 PetscCall(PetscSegBufferExtractInPlace(stash->segrecvblocks,NULL)); 975 } else { /* No reuse, so collect everything. */ 976 PetscCall(MatStashScatterDestroy_BTS(stash)); 977 } 978 979 /* Now update nmaxold to be app 10% more than max n used, this way the 980 wastage of space is reduced the next time this stash is used. 
981 Also update the oldmax, only if it increases */ 982 if (stash->n) { 983 PetscInt bs2 = stash->bs*stash->bs; 984 PetscInt oldnmax = ((int)(stash->n * 1.1) + 5)*bs2; 985 if (oldnmax > stash->oldnmax) stash->oldnmax = oldnmax; 986 } 987 988 stash->nmax = 0; 989 stash->n = 0; 990 stash->reallocs = -1; 991 stash->nprocessed = 0; 992 993 PetscCall(PetscMatStashSpaceDestroy(&stash->space_head)); 994 995 stash->space = NULL; 996 997 PetscFunctionReturn(0); 998 } 999 1000 PetscErrorCode MatStashScatterDestroy_BTS(MatStash *stash) 1001 { 1002 PetscFunctionBegin; 1003 PetscCall(PetscSegBufferDestroy(&stash->segsendblocks)); 1004 PetscCall(PetscSegBufferDestroy(&stash->segrecvframe)); 1005 stash->recvframes = NULL; 1006 PetscCall(PetscSegBufferDestroy(&stash->segrecvblocks)); 1007 if (stash->blocktype != MPI_DATATYPE_NULL) { 1008 PetscCallMPI(MPI_Type_free(&stash->blocktype)); 1009 } 1010 stash->nsendranks = 0; 1011 stash->nrecvranks = 0; 1012 PetscCall(PetscFree3(stash->sendranks,stash->sendhdr,stash->sendframes)); 1013 PetscCall(PetscFree(stash->sendreqs)); 1014 PetscCall(PetscFree(stash->recvreqs)); 1015 PetscCall(PetscFree(stash->recvranks)); 1016 PetscCall(PetscFree(stash->recvhdr)); 1017 PetscCall(PetscFree2(stash->some_indices,stash->some_statuses)); 1018 PetscFunctionReturn(0); 1019 } 1020 #endif 1021