#include <petsc/private/matimpl.h>

#define DEFAULT_STASH_SIZE 10000

static PetscErrorCode MatStashScatterBegin_Ref(Mat, MatStash *, PetscInt *);
PETSC_INTERN PetscErrorCode MatStashScatterGetMesg_Ref(MatStash *, PetscMPIInt *, PetscInt **, PetscInt **, PetscScalar **, PetscInt *);
PETSC_INTERN PetscErrorCode MatStashScatterEnd_Ref(MatStash *);
#if !defined(PETSC_HAVE_MPIUNI)
static PetscErrorCode MatStashScatterBegin_BTS(Mat, MatStash *, PetscInt *);
static PetscErrorCode MatStashScatterGetMesg_BTS(MatStash *, PetscMPIInt *, PetscInt **, PetscInt **, PetscScalar **, PetscInt *);
static PetscErrorCode MatStashScatterEnd_BTS(MatStash *);
#endif

/*
  MatStashCreate_Private - Creates a stash, currently used for all the parallel
  matrix implementations. The stash is where elements of a matrix destined
  to be stored on other processors are kept until matrix assembly is done.

  This is a simple minded stash. Simply adds entries to end of stash.

  Input Parameters:
  comm - communicator, required for scatters.
  bs   - stash block size; used when stashing blocks of values

  Output Parameter:
  stash - the newly created stash
*/
PetscErrorCode MatStashCreate_Private(MPI_Comm comm, PetscInt bs, MatStash *stash)
{
  PetscInt  max, *opt, nopt, i;
  PetscBool flg;

  PetscFunctionBegin;
  /* Require 2 tags, get the second using PetscCommGetNewTag() */
  stash->comm = comm;

  PetscCall(PetscCommGetNewTag(stash->comm, &stash->tag1));
  PetscCall(PetscCommGetNewTag(stash->comm, &stash->tag2));
  PetscCallMPI(MPI_Comm_size(stash->comm, &stash->size));
  PetscCallMPI(MPI_Comm_rank(stash->comm, &stash->rank));
  /* flg_v holds two slots per rank (one per message of the pair); -1 means "not yet received" */
  PetscCall(PetscMalloc1(2 * stash->size, &stash->flg_v));
  for (i = 0; i < 2 * stash->size; i++) stash->flg_v[i] = -1;

  /* -matstash_initial_size may give one global value, one value per rank, or a partial list */
  nopt = stash->size;
  PetscCall(PetscMalloc1(nopt, &opt));
  PetscCall(PetscOptionsGetIntArray(NULL, NULL, "-matstash_initial_size", opt, &nopt, &flg));
  if (flg) {
    if (nopt == 1) max = opt[0];
    else if (nopt == stash->size) max = opt[stash->rank];
    else if (stash->rank < nopt) max = opt[stash->rank];
    else max = 0; /* Use default */
    stash->umax = max;
  } else {
    stash->umax = 0;
  }
  PetscCall(PetscFree(opt));
  if (bs <= 0) bs = 1; /* treat the scalar (non-blocked) case as block size 1 */

  stash->bs         = bs;
  stash->nmax       = 0;
  stash->oldnmax    = 0;
  stash->n          = 0;
  stash->reallocs   = -1;
  stash->space_head = NULL;
  stash->space      = NULL;

  stash->send_waits  = NULL;
  stash->recv_waits  = NULL;
  stash->send_status = NULL;
  stash->nsends      = 0;
  stash->nrecvs      = 0;
  stash->svalues     = NULL;
  stash->rvalues     = NULL;
  stash->rindices    = NULL;
  stash->nprocessed  = 0;
  stash->reproduce   = PETSC_FALSE;
  stash->blocktype   = MPI_DATATYPE_NULL;

  PetscCall(PetscOptionsGetBool(NULL, NULL, "-matstash_reproduce", &stash->reproduce, NULL));
#if !defined(PETSC_HAVE_MPIUNI)
  /* Default to the BTS (build-two-sided) implementation; -matstash_legacy selects the Ref one */
  flg = PETSC_FALSE;
  PetscCall(PetscOptionsGetBool(NULL, NULL, "-matstash_legacy", &flg, NULL));
  if (!flg) {
    stash->ScatterBegin   = MatStashScatterBegin_BTS;
    stash->ScatterGetMesg = MatStashScatterGetMesg_BTS;
    stash->ScatterEnd     = MatStashScatterEnd_BTS;
    stash->ScatterDestroy = MatStashScatterDestroy_BTS;
  } else {
#endif
    stash->ScatterBegin   = MatStashScatterBegin_Ref;
    stash->ScatterGetMesg = MatStashScatterGetMesg_Ref;
    stash->ScatterEnd     = MatStashScatterEnd_Ref;
    stash->ScatterDestroy = NULL;
#if !defined(PETSC_HAVE_MPIUNI)
  }
#endif
  PetscFunctionReturn(0);
}

/*
  MatStashDestroy_Private - Destroy the stash
*/
PetscErrorCode MatStashDestroy_Private(MatStash *stash)
{
  PetscFunctionBegin;
  PetscCall(PetscMatStashSpaceDestroy(&stash->space_head));
  /* ScatterDestroy is only set for the BTS implementation (NULL for Ref) */
  if (stash->ScatterDestroy) PetscCall((*stash->ScatterDestroy)(stash));

  stash->space = NULL;

  PetscCall(PetscFree(stash->flg_v));
  PetscFunctionReturn(0);
}

/*
  MatStashScatterEnd_Private - This is called as the final stage of
  scatter. The final stages of message passing is done here, and
  all the memory used for message passing is cleaned up. This
  routine also resets the stash, and deallocates the memory used
  for the stash. It also keeps track of the current memory usage
  so that the same value can be used the next time through.
*/
PetscErrorCode MatStashScatterEnd_Private(MatStash *stash)
{
  PetscFunctionBegin;
  /* Dispatch to the implementation chosen in MatStashCreate_Private() */
  PetscCall((*stash->ScatterEnd)(stash));
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatStashScatterEnd_Ref(MatStash *stash)
{
  PetscInt    nsends = stash->nsends, bs2, oldnmax, i;
  MPI_Status *send_status;

  PetscFunctionBegin;
  /* reset the "message received" markers for the next assembly */
  for (i = 0; i < 2 * stash->size; i++) stash->flg_v[i] = -1;
  /* wait on sends (two sends per destination rank: indices and values) */
  if (nsends) {
    PetscCall(PetscMalloc1(2 * nsends, &send_status));
    PetscCallMPI(MPI_Waitall(2 * nsends, stash->send_waits, send_status));
    PetscCall(PetscFree(send_status));
  }

  /* Now update nmaxold to be app 10% more than max n used, this way the
     wastage of space is reduced the next time this stash is used.
     Also update the oldmax, only if it increases */
  if (stash->n) {
    bs2     = stash->bs * stash->bs;
    oldnmax = ((int)(stash->n * 1.1) + 5) * bs2;
    if (oldnmax > stash->oldnmax) stash->oldnmax = oldnmax;
  }

  stash->nmax       = 0;
  stash->n          = 0;
  stash->reallocs   = -1;
  stash->nprocessed = 0;

  PetscCall(PetscMatStashSpaceDestroy(&stash->space_head));

  stash->space = NULL;

  PetscCall(PetscFree(stash->send_waits));
  PetscCall(PetscFree(stash->recv_waits));
  PetscCall(PetscFree2(stash->svalues, stash->sindices));
  /* NOTE(review): assumes MatStashScatterBegin_Ref() ran, so rvalues/rindices are
     non-NULL arrays whose element 0 owns the contiguous receive buffer — confirm callers */
  PetscCall(PetscFree(stash->rvalues[0]));
  PetscCall(PetscFree(stash->rvalues));
  PetscCall(PetscFree(stash->rindices[0]));
  PetscCall(PetscFree(stash->rindices));
  PetscFunctionReturn(0);
}

/*
  MatStashGetInfo_Private - Gets the relevant statistics of the stash

  Input Parameters:
  stash    - the stash
  nstash   - the size of the stash. Indicates the number of values stored.
  reallocs - the number of additional mallocs incurred.
180 181 */ 182 PetscErrorCode MatStashGetInfo_Private(MatStash *stash, PetscInt *nstash, PetscInt *reallocs) 183 { 184 PetscInt bs2 = stash->bs * stash->bs; 185 186 PetscFunctionBegin; 187 if (nstash) *nstash = stash->n * bs2; 188 if (reallocs) { 189 if (stash->reallocs < 0) *reallocs = 0; 190 else *reallocs = stash->reallocs; 191 } 192 PetscFunctionReturn(0); 193 } 194 195 /* 196 MatStashSetInitialSize_Private - Sets the initial size of the stash 197 198 Input Parameters: 199 stash - the stash 200 max - the value that is used as the max size of the stash. 201 this value is used while allocating memory. 202 */ 203 PetscErrorCode MatStashSetInitialSize_Private(MatStash *stash, PetscInt max) 204 { 205 PetscFunctionBegin; 206 stash->umax = max; 207 PetscFunctionReturn(0); 208 } 209 210 /* MatStashExpand_Private - Expand the stash. This function is called 211 when the space in the stash is not sufficient to add the new values 212 being inserted into the stash. 213 214 Input Parameters: 215 stash - the stash 216 incr - the minimum increase requested 217 218 Note: 219 This routine doubles the currently used memory. 
*/
static PetscErrorCode MatStashExpand_Private(MatStash *stash, PetscInt incr)
{
  PetscInt newnmax, bs2 = stash->bs * stash->bs;

  PetscFunctionBegin;
  /* allocate a larger stash */
  if (!stash->oldnmax && !stash->nmax) { /* new stash */
    if (stash->umax) newnmax = stash->umax / bs2;
    else newnmax = DEFAULT_STASH_SIZE / bs2;
  } else if (!stash->nmax) { /* reusing stash: start from the high-water mark of the previous assembly */
    if (stash->umax > stash->oldnmax) newnmax = stash->umax / bs2;
    else newnmax = stash->oldnmax / bs2;
  } else newnmax = stash->nmax * 2; /* growing an active stash: double it */
  /* make sure the requested minimum increase is honored even when doubling is not enough */
  if (newnmax < (stash->nmax + incr)) newnmax += 2 * incr;

  /* Get a MatStashSpace and attach it to stash */
  PetscCall(PetscMatStashSpaceGet(bs2, newnmax, &stash->space));
  if (!stash->space_head) { /* new stash or reusing stash->oldnmax */
    stash->space_head = stash->space;
  }

  stash->reallocs++;
  stash->nmax = newnmax;
  PetscFunctionReturn(0);
}
/*
  MatStashValuesRow_Private - inserts values into the stash. This function
  expects the values to be row-oriented. Multiple columns belonging to the same
  row can be inserted with a single call to this function.

  Input Parameters:
  stash - the stash
  row   - the global row corresponding to the values
  n     - the number of elements inserted. All elements belong to the above row.
  idxn  - the global column indices corresponding to each of the values.
256 values - the values inserted 257 */ 258 PetscErrorCode MatStashValuesRow_Private(MatStash *stash, PetscInt row, PetscInt n, const PetscInt idxn[], const PetscScalar values[], PetscBool ignorezeroentries) 259 { 260 PetscInt i, k, cnt = 0; 261 PetscMatStashSpace space = stash->space; 262 263 PetscFunctionBegin; 264 /* Check and see if we have sufficient memory */ 265 if (!space || space->local_remaining < n) PetscCall(MatStashExpand_Private(stash, n)); 266 space = stash->space; 267 k = space->local_used; 268 for (i = 0; i < n; i++) { 269 if (ignorezeroentries && values && values[i] == 0.0) continue; 270 space->idx[k] = row; 271 space->idy[k] = idxn[i]; 272 space->val[k] = values ? values[i] : 0.0; 273 k++; 274 cnt++; 275 } 276 stash->n += cnt; 277 space->local_used += cnt; 278 space->local_remaining -= cnt; 279 PetscFunctionReturn(0); 280 } 281 282 /* 283 MatStashValuesCol_Private - inserts values into the stash. This function 284 expects the values to be columnoriented. Multiple columns belong to the same row 285 can be inserted with a single call to this function. 286 287 Input Parameters: 288 stash - the stash 289 row - the global row correspoiding to the values 290 n - the number of elements inserted. All elements belong to the above row. 291 idxn - the global column indices corresponding to each of the values. 292 values - the values inserted 293 stepval - the consecutive values are sepated by a distance of stepval. 294 this happens because the input is columnoriented. 
295 */ 296 PetscErrorCode MatStashValuesCol_Private(MatStash *stash, PetscInt row, PetscInt n, const PetscInt idxn[], const PetscScalar values[], PetscInt stepval, PetscBool ignorezeroentries) 297 { 298 PetscInt i, k, cnt = 0; 299 PetscMatStashSpace space = stash->space; 300 301 PetscFunctionBegin; 302 /* Check and see if we have sufficient memory */ 303 if (!space || space->local_remaining < n) PetscCall(MatStashExpand_Private(stash, n)); 304 space = stash->space; 305 k = space->local_used; 306 for (i = 0; i < n; i++) { 307 if (ignorezeroentries && values && values[i * stepval] == 0.0) continue; 308 space->idx[k] = row; 309 space->idy[k] = idxn[i]; 310 space->val[k] = values ? values[i * stepval] : 0.0; 311 k++; 312 cnt++; 313 } 314 stash->n += cnt; 315 space->local_used += cnt; 316 space->local_remaining -= cnt; 317 PetscFunctionReturn(0); 318 } 319 320 /* 321 MatStashValuesRowBlocked_Private - inserts blocks of values into the stash. 322 This function expects the values to be roworiented. Multiple columns belong 323 to the same block-row can be inserted with a single call to this function. 324 This function extracts the sub-block of values based on the dimensions of 325 the original input block, and the row,col values corresponding to the blocks. 326 327 Input Parameters: 328 stash - the stash 329 row - the global block-row correspoiding to the values 330 n - the number of elements inserted. All elements belong to the above row. 331 idxn - the global block-column indices corresponding to each of the blocks of 332 values. Each block is of size bs*bs. 333 values - the values inserted 334 rmax - the number of block-rows in the original block. 335 cmax - the number of block-columns on the original block. 336 idx - the index of the current block-row in the original block. 
337 */ 338 PetscErrorCode MatStashValuesRowBlocked_Private(MatStash *stash, PetscInt row, PetscInt n, const PetscInt idxn[], const PetscScalar values[], PetscInt rmax, PetscInt cmax, PetscInt idx) 339 { 340 PetscInt i, j, k, bs2, bs = stash->bs, l; 341 const PetscScalar *vals; 342 PetscScalar *array; 343 PetscMatStashSpace space = stash->space; 344 345 PetscFunctionBegin; 346 if (!space || space->local_remaining < n) PetscCall(MatStashExpand_Private(stash, n)); 347 space = stash->space; 348 l = space->local_used; 349 bs2 = bs * bs; 350 for (i = 0; i < n; i++) { 351 space->idx[l] = row; 352 space->idy[l] = idxn[i]; 353 /* Now copy over the block of values. Store the values column oriented. 354 This enables inserting multiple blocks belonging to a row with a single 355 funtion call */ 356 array = space->val + bs2 * l; 357 vals = values + idx * bs2 * n + bs * i; 358 for (j = 0; j < bs; j++) { 359 for (k = 0; k < bs; k++) array[k * bs] = values ? vals[k] : 0.0; 360 array++; 361 vals += cmax * bs; 362 } 363 l++; 364 } 365 stash->n += n; 366 space->local_used += n; 367 space->local_remaining -= n; 368 PetscFunctionReturn(0); 369 } 370 371 /* 372 MatStashValuesColBlocked_Private - inserts blocks of values into the stash. 373 This function expects the values to be roworiented. Multiple columns belong 374 to the same block-row can be inserted with a single call to this function. 375 This function extracts the sub-block of values based on the dimensions of 376 the original input block, and the row,col values corresponding to the blocks. 377 378 Input Parameters: 379 stash - the stash 380 row - the global block-row correspoiding to the values 381 n - the number of elements inserted. All elements belong to the above row. 382 idxn - the global block-column indices corresponding to each of the blocks of 383 values. Each block is of size bs*bs. 384 values - the values inserted 385 rmax - the number of block-rows in the original block. 
386 cmax - the number of block-columns on the original block. 387 idx - the index of the current block-row in the original block. 388 */ 389 PetscErrorCode MatStashValuesColBlocked_Private(MatStash *stash, PetscInt row, PetscInt n, const PetscInt idxn[], const PetscScalar values[], PetscInt rmax, PetscInt cmax, PetscInt idx) 390 { 391 PetscInt i, j, k, bs2, bs = stash->bs, l; 392 const PetscScalar *vals; 393 PetscScalar *array; 394 PetscMatStashSpace space = stash->space; 395 396 PetscFunctionBegin; 397 if (!space || space->local_remaining < n) PetscCall(MatStashExpand_Private(stash, n)); 398 space = stash->space; 399 l = space->local_used; 400 bs2 = bs * bs; 401 for (i = 0; i < n; i++) { 402 space->idx[l] = row; 403 space->idy[l] = idxn[i]; 404 /* Now copy over the block of values. Store the values column oriented. 405 This enables inserting multiple blocks belonging to a row with a single 406 funtion call */ 407 array = space->val + bs2 * l; 408 vals = values + idx * bs2 * n + bs * i; 409 for (j = 0; j < bs; j++) { 410 for (k = 0; k < bs; k++) array[k] = values ? vals[k] : 0.0; 411 array += bs; 412 vals += rmax * bs; 413 } 414 l++; 415 } 416 stash->n += n; 417 space->local_used += n; 418 space->local_remaining -= n; 419 PetscFunctionReturn(0); 420 } 421 /* 422 MatStashScatterBegin_Private - Initiates the transfer of values to the 423 correct owners. This function goes through the stash, and check the 424 owners of each stashed value, and sends the values off to the owner 425 processors. 426 427 Input Parameters: 428 stash - the stash 429 owners - an array of size 'no-of-procs' which gives the ownership range 430 for each node. 431 432 Note: 433 The 'owners' array in the cased of the blocked-stash has the 434 ranges specified blocked global indices, and for the regular stash in 435 the proper global indices. 
*/
PetscErrorCode MatStashScatterBegin_Private(Mat mat, MatStash *stash, PetscInt *owners)
{
  PetscFunctionBegin;
  /* Dispatch to the implementation chosen in MatStashCreate_Private() */
  PetscCall((*stash->ScatterBegin)(mat, stash, owners));
  PetscFunctionReturn(0);
}

static PetscErrorCode MatStashScatterBegin_Ref(Mat mat, MatStash *stash, PetscInt *owners)
{
  PetscInt           *owner, *startv, *starti, tag1 = stash->tag1, tag2 = stash->tag2, bs2;
  PetscInt            size = stash->size, nsends;
  PetscInt            count, *sindices, **rindices, i, j, idx, lastidx, l;
  PetscScalar       **rvalues, *svalues;
  MPI_Comm            comm = stash->comm;
  MPI_Request        *send_waits, *recv_waits, *recv_waits1, *recv_waits2;
  PetscMPIInt        *sizes, *nlengths, nreceives;
  PetscInt           *sp_idx, *sp_idy;
  PetscScalar        *sp_val;
  PetscMatStashSpace  space, space_next;

  PetscFunctionBegin;
  { /* make sure all processors are either in INSERTMODE or ADDMODE */
    InsertMode addv;
    PetscCall(MPIU_Allreduce((PetscEnum *)&mat->insertmode, (PetscEnum *)&addv, 1, MPIU_ENUM, MPI_BOR, PetscObjectComm((PetscObject)mat)));
    PetscCheck(addv != (ADD_VALUES | INSERT_VALUES), PetscObjectComm((PetscObject)mat), PETSC_ERR_ARG_WRONGSTATE, "Some processors inserted others added");
    mat->insertmode = addv; /* in case this processor had no cache */
  }

  bs2 = stash->bs * stash->bs;

  /* first count number of contributors to each processor */
  PetscCall(PetscCalloc1(size, &nlengths));
  PetscCall(PetscMalloc1(stash->n + 1, &owner));

  i       = j = 0;
  lastidx = -1;
  space   = stash->space_head;
  while (space) {
    space_next = space->next;
    sp_idx     = space->idx;
    for (l = 0; l < space->local_used; l++) {
      /* if indices are NOT locally sorted, need to start search at the beginning */
      if (lastidx > (idx = sp_idx[l])) j = 0;
      lastidx = idx;
      for (; j < size; j++) {
        if (idx >= owners[j] && idx < owners[j + 1]) {
          nlengths[j]++;
          owner[i] = j;
          break;
        }
      }
      i++;
    }
    space = space_next;
  }

  /* Now check what procs get messages - and compute nsends. */
  PetscCall(PetscCalloc1(size, &sizes));
  for (i = 0, nsends = 0; i < size; i++) {
    if (nlengths[i]) {
      sizes[i] = 1;
      nsends++;
    }
  }

  {
    PetscMPIInt *onodes, *olengths;
    /* Determine the number of messages to expect, their lengths, from from-ids */
    PetscCall(PetscGatherNumberOfMessages(comm, sizes, nlengths, &nreceives));
    PetscCall(PetscGatherMessageLengths(comm, nsends, nreceives, nlengths, &onodes, &olengths));
    /* since clubbing row,col - lengths are multiplied by 2 */
    for (i = 0; i < nreceives; i++) olengths[i] *= 2;
    PetscCall(PetscPostIrecvInt(comm, tag1, nreceives, onodes, olengths, &rindices, &recv_waits1));
    /* values are size 'bs2' lengths (and remove earlier factor 2 */
    for (i = 0; i < nreceives; i++) olengths[i] = olengths[i] * bs2 / 2;
    PetscCall(PetscPostIrecvScalar(comm, tag2, nreceives, onodes, olengths, &rvalues, &recv_waits2));
    PetscCall(PetscFree(onodes));
    PetscCall(PetscFree(olengths));
  }

  /* do sends:
      1) starts[i] gives the starting index in svalues for stuff going to
         the ith processor
  */
  PetscCall(PetscMalloc2(bs2 * stash->n, &svalues, 2 * (stash->n + 1), &sindices));
  PetscCall(PetscMalloc1(2 * nsends, &send_waits));
  PetscCall(PetscMalloc2(size, &startv, size, &starti));
  /* use 2 sends the first with all_a, the next with all_i and all_j */
  startv[0] = 0;
  starti[0] = 0;
  for (i = 1; i < size; i++) {
    startv[i] = startv[i - 1] + nlengths[i - 1];
    starti[i] = starti[i - 1] + 2 * nlengths[i - 1];
  }

  /* pack the send buffers; startv/starti are advanced while packing and rebuilt below */
  i     = 0;
  space = stash->space_head;
  while (space) {
    space_next = space->next;
    sp_idx     = space->idx;
    sp_idy     = space->idy;
    sp_val     = space->val;
    for (l = 0; l < space->local_used; l++) {
      j = owner[i];
      if (bs2 == 1) {
        svalues[startv[j]] = sp_val[l];
      } else {
        PetscInt     k;
        PetscScalar *buf1, *buf2;
        buf1 = svalues + bs2 * startv[j];
        buf2 = space->val + bs2 * l;
        for (k = 0; k < bs2; k++) buf1[k] = buf2[k];
      }
      /* rows in the first half of each rank's index segment, columns in the second */
      sindices[starti[j]]               = sp_idx[l];
      sindices[starti[j] + nlengths[j]] = sp_idy[l];
      startv[j]++;
      starti[j]++;
      i++;
    }
    space = space_next;
  }
  /* rebuild the per-rank start offsets that were consumed while packing */
  startv[0] = 0;
  for (i = 1; i < size; i++) startv[i] = startv[i - 1] + nlengths[i - 1];

  for (i = 0, count = 0; i < size; i++) {
    if (sizes[i]) {
      PetscCallMPI(MPI_Isend(sindices + 2 * startv[i], 2 * nlengths[i], MPIU_INT, i, tag1, comm, send_waits + count++));
      PetscCallMPI(MPI_Isend(svalues + bs2 * startv[i], bs2 * nlengths[i], MPIU_SCALAR, i, tag2, comm, send_waits + count++));
    }
  }
#if defined(PETSC_USE_INFO)
  PetscCall(PetscInfo(NULL, "No of messages: %" PetscInt_FMT " \n", nsends));
  for (i = 0; i < size; i++) {
    if (sizes[i]) PetscCall(PetscInfo(NULL, "Mesg_to: %" PetscInt_FMT ": size: %zu bytes\n", i, (size_t)(nlengths[i] * (bs2 * sizeof(PetscScalar) + 2 * sizeof(PetscInt)))));
  }
#endif
  PetscCall(PetscFree(nlengths));
  PetscCall(PetscFree(owner));
  PetscCall(PetscFree2(startv, starti));
  PetscCall(PetscFree(sizes));

  /* recv_waits need to be contiguous for MatStashScatterGetMesg_Private() */
  PetscCall(PetscMalloc1(2 * nreceives, &recv_waits));

  /* interleave: even slots are index-message requests, odd slots are value-message requests */
  for (i = 0; i < nreceives; i++) {
    recv_waits[2 * i]     = recv_waits1[i];
    recv_waits[2 * i + 1] = recv_waits2[i];
  }
  stash->recv_waits = recv_waits;

  PetscCall(PetscFree(recv_waits1));
  PetscCall(PetscFree(recv_waits2));

  stash->svalues         = svalues;
  stash->sindices        = sindices;
  stash->rvalues         = rvalues;
  stash->rindices        = rindices;
  stash->send_waits      = send_waits;
  stash->nsends          = nsends;
  stash->nrecvs          = nreceives;
  stash->reproduce_count = 0;
  PetscFunctionReturn(0);
}

/*
  MatStashScatterGetMesg_Private - This function waits on the receives posted
  in the function 
MatStashScatterBegin_Private() and returns one message at
  a time to the calling function. If no messages are left, it indicates this
  by setting flg = 0, else it sets flg = 1.

  Input Parameter:
  stash - the stash

  Output Parameters:
  nvals - the number of entries in the current message.
  rows  - an array of row indices (or blocked indices) corresponding to the values
  cols  - an array of column indices (or blocked indices) corresponding to the values
  vals  - the values
  flg   - 0 indicates no more message left, and the current call has no values associated.
          1 indicates that the current call successfully received a message, and the
          other output parameters nvals,rows,cols,vals are set appropriately.
*/
PetscErrorCode MatStashScatterGetMesg_Private(MatStash *stash, PetscMPIInt *nvals, PetscInt **rows, PetscInt **cols, PetscScalar **vals, PetscInt *flg)
{
  PetscFunctionBegin;
  /* Dispatch to the implementation chosen in MatStashCreate_Private() */
  PetscCall((*stash->ScatterGetMesg)(stash, nvals, rows, cols, vals, flg));
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatStashScatterGetMesg_Ref(MatStash *stash, PetscMPIInt *nvals, PetscInt **rows, PetscInt **cols, PetscScalar **vals, PetscInt *flg)
{
  PetscMPIInt i, *flg_v = stash->flg_v, i1, i2;
  PetscInt    bs2;
  MPI_Status  recv_status;
  PetscBool   match_found = PETSC_FALSE;

  PetscFunctionBegin;
  *flg = 0; /* When a message is discovered this is reset to 1 */
  /* Return if no more messages to process */
  if (stash->nprocessed == stash->nrecvs) PetscFunctionReturn(0);

  bs2 = stash->bs * stash->bs;
  /* If a matching pair of receives are found, process them, and return the data to
     the calling function. Until then keep receiving messages */
  while (!match_found) {
    if (stash->reproduce) {
      /* deterministic order: complete receives in the order they were posted */
      i = stash->reproduce_count++;
      PetscCallMPI(MPI_Wait(stash->recv_waits + i, &recv_status));
    } else {
      PetscCallMPI(MPI_Waitany(2 * stash->nrecvs, stash->recv_waits, &i, &recv_status));
    }
    PetscCheck(recv_status.MPI_SOURCE >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Negative MPI source!");

    /* Now pack the received message into a structure which is usable by others */
    if (i % 2) { /* odd request slots carry the scalar values (see MatStashScatterBegin_Ref) */
      PetscCallMPI(MPI_Get_count(&recv_status, MPIU_SCALAR, nvals));
      flg_v[2 * recv_status.MPI_SOURCE] = i / 2;
      *nvals = *nvals / bs2;
    } else { /* even request slots carry the indices */
      PetscCallMPI(MPI_Get_count(&recv_status, MPIU_INT, nvals));
      flg_v[2 * recv_status.MPI_SOURCE + 1] = i / 2;
      *nvals = *nvals / 2; /* This message has both row indices and col indices */
    }

    /* Check if we have both messages from this proc */
    i1 = flg_v[2 * recv_status.MPI_SOURCE];
    i2 = flg_v[2 * recv_status.MPI_SOURCE + 1];
    if (i1 != -1 && i2 != -1) {
      *rows = stash->rindices[i2];
      *cols = *rows + *nvals; /* columns are stored immediately after the rows in the same buffer */
      *vals = stash->rvalues[i1];
      *flg  = 1;
      stash->nprocessed++;
      match_found = PETSC_TRUE;
    }
  }
  PetscFunctionReturn(0);
}

#if !defined(PETSC_HAVE_MPIUNI)
typedef struct {
  PetscInt    row;
  PetscInt    col;
  PetscScalar vals[1]; /* Actually an array of length bs2 */
} MatStashBlock;

/* Sort stashed entries by (row, col), combine duplicates according to insertmode,
   and pack the resulting blocks into stash->segsendblocks */
static PetscErrorCode MatStashSortCompress_Private(MatStash *stash, InsertMode insertmode)
{
  PetscMatStashSpace space;
  PetscInt           n = stash->n, bs = stash->bs, bs2 = bs * bs, cnt, *row, *col, *perm, rowstart, i;
  PetscScalar      **valptr;

  PetscFunctionBegin;
  PetscCall(PetscMalloc4(n, &row, n, &col, n, &valptr, n, &perm));
  for (space = stash->space_head, cnt = 0; space; space = space->next) {
    for (i = 0; i < space->local_used; i++) {
      row[cnt]    = space->idx[i];
      col[cnt]    = space->idy[i];
      valptr[cnt] = &space->val[i * bs2];
      perm[cnt]   = cnt; /* Will tell us where to find valptr after sorting row[] and col[] */
      cnt++;
    }
  }
  PetscCheck(cnt == n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "MatStash n %" PetscInt_FMT ", but counted %" PetscInt_FMT " entries", n, cnt);
  PetscCall(PetscSortIntWithArrayPair(n, row, col, perm));
  /* Scan through the rows, sorting each one, combining duplicates, and packing send buffers */
  for (rowstart = 0, cnt = 0, i = 1; i <= n; i++) {
    if (i == n || row[i] != row[rowstart]) { /* Sort the last row. */
      PetscInt colstart;
      PetscCall(PetscSortIntWithArray(i - rowstart, &col[rowstart], &perm[rowstart]));
      for (colstart = rowstart; colstart < i;) { /* Compress multiple insertions to the same location */
        PetscInt       j, l;
        MatStashBlock *block;
        PetscCall(PetscSegBufferGet(stash->segsendblocks, 1, &block));
        block->row = row[rowstart];
        block->col = col[colstart];
        PetscCall(PetscArraycpy(block->vals, valptr[perm[colstart]], bs2));
        for (j = colstart + 1; j < i && col[j] == col[colstart]; j++) { /* Add any extra stashed blocks at the same (row,col) */
          if (insertmode == ADD_VALUES) {
            for (l = 0; l < bs2; l++) block->vals[l] += valptr[perm[j]][l];
          } else {
            /* INSERT_VALUES: the last insertion at this location wins */
            PetscCall(PetscArraycpy(block->vals, valptr[perm[j]], bs2));
          }
        }
        colstart = j;
      }
      rowstart = i;
    }
  }
  PetscCall(PetscFree4(row, col, valptr, perm));
  PetscFunctionReturn(0);
}

/* Lazily build the MPI datatype describing one stashed block (row, col, bs2 values) */
static PetscErrorCode MatStashBlockTypeSetUp(MatStash *stash)
{
  PetscFunctionBegin;
  if (stash->blocktype == MPI_DATATYPE_NULL) {
    PetscInt     bs2 = PetscSqr(stash->bs);
    PetscMPIInt  blocklens[2];
    MPI_Aint     displs[2];
    MPI_Datatype types[2], stype;
    /*
       DummyBlock is a type having standard layout, even when PetscScalar is C++ std::complex.
       std::complex itself has standard layout, so does DummyBlock, recursively.
       To be compatible with C++ std::complex, complex implementations on GPUs must also have standard layout,
       though they can have different alignment, e.g, 16 bytes for double complex, instead of 8 bytes as in GCC stdlibc++.
       offsetof(type, member) only requires type to have standard layout. Ref. https://en.cppreference.com/w/cpp/types/offsetof.

       We can test if std::complex has standard layout with the following code:
       #include <iostream>
       #include <type_traits>
       #include <complex>
       int main() {
         std::cout << std::boolalpha;
         std::cout << std::is_standard_layout<std::complex<double> >::value << '\n';
       }
       Output: true
     */
    struct DummyBlock {
      PetscInt    row, col;
      PetscScalar vals;
    };

    stash->blocktype_size = offsetof(struct DummyBlock, vals) + bs2 * sizeof(PetscScalar);
    if (stash->blocktype_size % sizeof(PetscInt)) { /* Implies that PetscInt is larger and does not satisfy alignment without padding */
      stash->blocktype_size += sizeof(PetscInt) - stash->blocktype_size % sizeof(PetscInt);
    }
    PetscCall(PetscSegBufferCreate(stash->blocktype_size, 1, &stash->segsendblocks));
    PetscCall(PetscSegBufferCreate(stash->blocktype_size, 1, &stash->segrecvblocks));
    PetscCall(PetscSegBufferCreate(sizeof(MatStashFrame), 1, &stash->segrecvframe));
    blocklens[0] = 2;
    blocklens[1] = bs2;
    displs[0]    = offsetof(struct DummyBlock, row);
    displs[1]    = offsetof(struct DummyBlock, vals);
    types[0]     = MPIU_INT;
    types[1]     = MPIU_SCALAR;
    PetscCallMPI(MPI_Type_create_struct(2, blocklens, displs, types, &stype));
    PetscCallMPI(MPI_Type_commit(&stype));
    /* resize so consecutive blocks in a buffer are stash->blocktype_size apart */
    PetscCallMPI(MPI_Type_create_resized(stype, 0, stash->blocktype_size, &stash->blocktype));
    PetscCallMPI(MPI_Type_commit(&stash->blocktype));
    PetscCallMPI(MPI_Type_free(&stype));
  }
  PetscFunctionReturn(0);
}

/* Callback invoked after target rank has initiated receive of rendezvous message.
 * Here we post the main sends.
 */
static PetscErrorCode MatStashBTSSend_Private(MPI_Comm comm, const PetscMPIInt tag[], PetscMPIInt rankid, PetscMPIInt rank, void *sdata, MPI_Request req[], void *ctx)
{
  MatStash       *stash = (MatStash *)ctx;
  MatStashHeader *hdr   = (MatStashHeader *)sdata;

  PetscFunctionBegin;
  PetscCheck(rank == stash->sendranks[rankid], comm, PETSC_ERR_PLIB, "BTS Send rank %d does not match sendranks[%d] %d", rank, rankid, stash->sendranks[rankid]);
  PetscCallMPI(MPI_Isend(stash->sendframes[rankid].buffer, hdr->count, stash->blocktype, rank, tag[0], comm, &req[0]));
  stash->sendframes[rankid].count   = hdr->count;
  stash->sendframes[rankid].pending = 1;
  PetscFunctionReturn(0);
}

/*
  Callback invoked by target after receiving rendezvous message.
  Here we post the main recvs.
*/
static PetscErrorCode MatStashBTSRecv_Private(MPI_Comm comm, const PetscMPIInt tag[], PetscMPIInt rank, void *rdata, MPI_Request req[], void *ctx)
{
  MatStash       *stash = (MatStash *)ctx;
  MatStashHeader *hdr   = (MatStashHeader *)rdata;
  MatStashFrame  *frame;

  PetscFunctionBegin;
  /* allocate a frame plus a receive buffer sized by the rendezvous header's count */
  PetscCall(PetscSegBufferGet(stash->segrecvframe, 1, &frame));
  PetscCall(PetscSegBufferGet(stash->segrecvblocks, hdr->count, &frame->buffer));
  PetscCallMPI(MPI_Irecv(frame->buffer, hdr->count, stash->blocktype, rank, tag[0], comm, &req[0]));
  frame->count   = hdr->count;
  frame->pending = 1;
  PetscFunctionReturn(0);
}

/*
 * owners[] contains the ownership ranges; may be indexed by either blocks or scalars
 */
static PetscErrorCode MatStashScatterBegin_BTS(Mat mat, MatStash *stash, PetscInt owners[])
{
  size_t nblocks;
  char  *sendblocks;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) { /* make sure all processors are either in INSERTMODE or ADDMODE */
    InsertMode addv;
    PetscCall(MPIU_Allreduce((PetscEnum *)&mat->insertmode, (PetscEnum *)&addv, 1, 
MPIU_ENUM, MPI_BOR, PetscObjectComm((PetscObject)mat)));
    PetscCheck(addv != (ADD_VALUES | INSERT_VALUES), PetscObjectComm((PetscObject)mat), PETSC_ERR_ARG_WRONGSTATE, "Some processors inserted others added");
  }

  PetscCall(MatStashBlockTypeSetUp(stash));
  PetscCall(MatStashSortCompress_Private(stash, mat->insertmode));
  PetscCall(PetscSegBufferGetSize(stash->segsendblocks, &nblocks));
  PetscCall(PetscSegBufferExtractInPlace(stash->segsendblocks, &sendblocks));
  if (stash->first_assembly_done) { /* Set up sendhdrs and sendframes for each rank that we sent before */
    PetscInt i;
    size_t   b;
    for (i = 0, b = 0; i < stash->nsendranks; i++) {
      stash->sendframes[i].buffer = &sendblocks[b * stash->blocktype_size];
      /* sendhdr is never actually sent, but the count is used by MatStashBTSSend_Private */
      stash->sendhdr[i].count = 0; /* Might remain empty (in which case we send a zero-sized message) if no values are communicated to that process */
      for (; b < nblocks; b++) {
        MatStashBlock *sendblock_b = (MatStashBlock *)&sendblocks[b * stash->blocktype_size];
        PetscCheck(sendblock_b->row >= owners[stash->sendranks[i]], stash->comm, PETSC_ERR_ARG_WRONG, "MAT_SUBSET_OFF_PROC_ENTRIES set, but row %" PetscInt_FMT " owned by %d not communicated in initial assembly", sendblock_b->row, stash->sendranks[i]);
        if (sendblock_b->row >= owners[stash->sendranks[i] + 1]) break;
        stash->sendhdr[i].count++;
      }
    }
  } else { /* Dynamically count and pack (first time) */
    PetscInt sendno;
    size_t   i, rowstart;

    /* Count number of send ranks and allocate for sends */
    stash->nsendranks = 0;
    for (rowstart = 0; rowstart < nblocks;) {
      PetscInt       owner;
      MatStashBlock *sendblock_rowstart = (MatStashBlock *)&sendblocks[rowstart * stash->blocktype_size];
      PetscCall(PetscFindInt(sendblock_rowstart->row, stash->size + 1, owners, &owner));
      if (owner < 0) owner = -(owner + 2); /* PetscFindInt returns -(insert_location+1) when not found exactly */
      for (i = rowstart + 1; i < nblocks; i++) { /* Move forward through a run of blocks with the same owner */
        MatStashBlock *sendblock_i = (MatStashBlock *)&sendblocks[i * stash->blocktype_size];
        if (sendblock_i->row >= owners[owner + 1]) break;
      }
      stash->nsendranks++;
      rowstart = i;
    }
    PetscCall(PetscMalloc3(stash->nsendranks, &stash->sendranks, stash->nsendranks, &stash->sendhdr, stash->nsendranks, &stash->sendframes));

    /* Set up sendhdrs and sendframes */
    sendno = 0;
    for (rowstart = 0; rowstart < nblocks;) {
      PetscInt       owner;
      MatStashBlock *sendblock_rowstart = (MatStashBlock *)&sendblocks[rowstart * stash->blocktype_size];
      PetscCall(PetscFindInt(sendblock_rowstart->row, stash->size + 1, owners, &owner));
      if (owner < 0) owner = -(owner + 2);
      stash->sendranks[sendno] = owner;
      for (i = rowstart + 1; i < nblocks; i++) { /* Move forward through a run of blocks with the same owner */
        MatStashBlock *sendblock_i = (MatStashBlock *)&sendblocks[i * stash->blocktype_size];
        if (sendblock_i->row >= owners[owner + 1]) break;
      }
      stash->sendframes[sendno].buffer  = sendblock_rowstart;
      stash->sendframes[sendno].pending = 0;
      stash->sendhdr[sendno].count      = i - rowstart;
      sendno++;
      rowstart = i;
    }
    PetscCheck(sendno == stash->nsendranks, stash->comm, PETSC_ERR_PLIB, "BTS counted %d sendranks, but %" PetscInt_FMT " sends", stash->nsendranks, sendno);
  }

  /* Encode insertmode on the outgoing messages. If we want to support more than two options, we would need a new
   * message or a dummy entry of some sort. */
  if (mat->insertmode == INSERT_VALUES) {
    size_t i;
    for (i = 0; i < nblocks; i++) {
      MatStashBlock *sendblock_i = (MatStashBlock *)&sendblocks[i * stash->blocktype_size];
      sendblock_i->row = -(sendblock_i->row + 1); /* negative row signals INSERT_VALUES to the receiver */
    }
  }

  if (stash->first_assembly_done) {
    /* Communication pattern is known from the first assembly; reuse it with a fresh tag */
    PetscMPIInt i, tag;
    PetscCall(PetscCommGetNewTag(stash->comm, &tag));
    for (i = 0; i < stash->nrecvranks; i++) PetscCall(MatStashBTSRecv_Private(stash->comm, &tag, stash->recvranks[i], &stash->recvhdr[i], &stash->recvreqs[i], stash));
    for (i = 0; i < stash->nsendranks; i++) PetscCall(MatStashBTSSend_Private(stash->comm, &tag, i, stash->sendranks[i], &stash->sendhdr[i], &stash->sendreqs[i], stash));
    stash->use_status = PETSC_TRUE; /* Use count from message status. */
  } else {
    /* Discover the communication pattern with a rendezvous (build-two-sided) exchange */
    PetscCall(PetscCommBuildTwoSidedFReq(stash->comm, 1, MPIU_INT, stash->nsendranks, stash->sendranks, (PetscInt *)stash->sendhdr, &stash->nrecvranks, &stash->recvranks, (PetscInt *)&stash->recvhdr, 1, &stash->sendreqs, &stash->recvreqs, MatStashBTSSend_Private, MatStashBTSRecv_Private, stash));
    PetscCall(PetscMalloc2(stash->nrecvranks, &stash->some_indices, stash->nrecvranks, &stash->some_statuses));
    stash->use_status = PETSC_FALSE; /* Use count from header instead of from message. 
*/ 911 } 912 913 PetscCall(PetscSegBufferExtractInPlace(stash->segrecvframe, &stash->recvframes)); 914 stash->recvframe_active = NULL; 915 stash->recvframe_i = 0; 916 stash->some_i = 0; 917 stash->some_count = 0; 918 stash->recvcount = 0; 919 stash->first_assembly_done = mat->assembly_subset; /* See the same logic in VecAssemblyBegin_MPI_BTS */ 920 stash->insertmode = &mat->insertmode; 921 PetscFunctionReturn(0); 922 } 923 924 static PetscErrorCode MatStashScatterGetMesg_BTS(MatStash *stash, PetscMPIInt *n, PetscInt **row, PetscInt **col, PetscScalar **val, PetscInt *flg) 925 { 926 MatStashBlock *block; 927 928 PetscFunctionBegin; 929 *flg = 0; 930 while (!stash->recvframe_active || stash->recvframe_i == stash->recvframe_count) { 931 if (stash->some_i == stash->some_count) { 932 if (stash->recvcount == stash->nrecvranks) PetscFunctionReturn(0); /* Done */ 933 PetscCallMPI(MPI_Waitsome(stash->nrecvranks, stash->recvreqs, &stash->some_count, stash->some_indices, stash->use_status ? stash->some_statuses : MPI_STATUSES_IGNORE)); 934 stash->some_i = 0; 935 } 936 stash->recvframe_active = &stash->recvframes[stash->some_indices[stash->some_i]]; 937 stash->recvframe_count = stash->recvframe_active->count; /* From header; maximum count */ 938 if (stash->use_status) { /* Count what was actually sent */ 939 PetscCallMPI(MPI_Get_count(&stash->some_statuses[stash->some_i], stash->blocktype, &stash->recvframe_count)); 940 } 941 if (stash->recvframe_count > 0) { /* Check for InsertMode consistency */ 942 block = (MatStashBlock *)&((char *)stash->recvframe_active->buffer)[0]; 943 if (PetscUnlikely(*stash->insertmode == NOT_SET_VALUES)) *stash->insertmode = block->row < 0 ? 
INSERT_VALUES : ADD_VALUES; 944 PetscCheck(*stash->insertmode != INSERT_VALUES || block->row < 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Assembling INSERT_VALUES, but rank %d requested ADD_VALUES", stash->recvranks[stash->some_indices[stash->some_i]]); 945 PetscCheck(*stash->insertmode != ADD_VALUES || block->row >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Assembling ADD_VALUES, but rank %d requested INSERT_VALUES", stash->recvranks[stash->some_indices[stash->some_i]]); 946 } 947 stash->some_i++; 948 stash->recvcount++; 949 stash->recvframe_i = 0; 950 } 951 *n = 1; 952 block = (MatStashBlock *)&((char *)stash->recvframe_active->buffer)[stash->recvframe_i * stash->blocktype_size]; 953 if (block->row < 0) block->row = -(block->row + 1); 954 *row = &block->row; 955 *col = &block->col; 956 *val = block->vals; 957 stash->recvframe_i++; 958 *flg = 1; 959 PetscFunctionReturn(0); 960 } 961 962 static PetscErrorCode MatStashScatterEnd_BTS(MatStash *stash) 963 { 964 PetscFunctionBegin; 965 PetscCallMPI(MPI_Waitall(stash->nsendranks, stash->sendreqs, MPI_STATUSES_IGNORE)); 966 if (stash->first_assembly_done) { /* Reuse the communication contexts, so consolidate and reset segrecvblocks */ 967 PetscCall(PetscSegBufferExtractInPlace(stash->segrecvblocks, NULL)); 968 } else { /* No reuse, so collect everything. */ 969 PetscCall(MatStashScatterDestroy_BTS(stash)); 970 } 971 972 /* Now update nmaxold to be app 10% more than max n used, this way the 973 wastage of space is reduced the next time this stash is used. 
974 Also update the oldmax, only if it increases */ 975 if (stash->n) { 976 PetscInt bs2 = stash->bs * stash->bs; 977 PetscInt oldnmax = ((int)(stash->n * 1.1) + 5) * bs2; 978 if (oldnmax > stash->oldnmax) stash->oldnmax = oldnmax; 979 } 980 981 stash->nmax = 0; 982 stash->n = 0; 983 stash->reallocs = -1; 984 stash->nprocessed = 0; 985 986 PetscCall(PetscMatStashSpaceDestroy(&stash->space_head)); 987 988 stash->space = NULL; 989 990 PetscFunctionReturn(0); 991 } 992 993 PetscErrorCode MatStashScatterDestroy_BTS(MatStash *stash) 994 { 995 PetscFunctionBegin; 996 PetscCall(PetscSegBufferDestroy(&stash->segsendblocks)); 997 PetscCall(PetscSegBufferDestroy(&stash->segrecvframe)); 998 stash->recvframes = NULL; 999 PetscCall(PetscSegBufferDestroy(&stash->segrecvblocks)); 1000 if (stash->blocktype != MPI_DATATYPE_NULL) PetscCallMPI(MPI_Type_free(&stash->blocktype)); 1001 stash->nsendranks = 0; 1002 stash->nrecvranks = 0; 1003 PetscCall(PetscFree3(stash->sendranks, stash->sendhdr, stash->sendframes)); 1004 PetscCall(PetscFree(stash->sendreqs)); 1005 PetscCall(PetscFree(stash->recvreqs)); 1006 PetscCall(PetscFree(stash->recvranks)); 1007 PetscCall(PetscFree(stash->recvhdr)); 1008 PetscCall(PetscFree2(stash->some_indices, stash->some_statuses)); 1009 PetscFunctionReturn(0); 1010 } 1011 #endif 1012