#include <petsc/private/matimpl.h>

#define DEFAULT_STASH_SIZE 10000

static PetscErrorCode MatStashScatterBegin_Ref(Mat, MatStash *, PetscInt *);
PETSC_INTERN PetscErrorCode MatStashScatterGetMesg_Ref(MatStash *, PetscMPIInt *, PetscInt **, PetscInt **, PetscScalar **, PetscInt *);
PETSC_INTERN PetscErrorCode MatStashScatterEnd_Ref(MatStash *);
#if !defined(PETSC_HAVE_MPIUNI)
static PetscErrorCode MatStashScatterBegin_BTS(Mat, MatStash *, PetscInt *);
static PetscErrorCode MatStashScatterGetMesg_BTS(MatStash *, PetscMPIInt *, PetscInt **, PetscInt **, PetscScalar **, PetscInt *);
static PetscErrorCode MatStashScatterEnd_BTS(MatStash *);
#endif

/*
  MatStashCreate_Private - Creates a stash, currently used by all the parallel
  matrix implementations. The stash is where elements of a matrix destined
  to be stored on other processors are kept until matrix assembly is done.

  This is a simple-minded stash: it simply appends entries to the end of the stash.

  Input Parameters:
  comm - communicator, required for scatters
  bs   - stash block size, used when stashing blocks of values

  Output Parameter:
  stash - the newly created stash
*/
PetscErrorCode MatStashCreate_Private(MPI_Comm comm, PetscInt bs, MatStash *stash) {
  PetscInt  max, *opt, nopt, i;
  PetscBool flg;

  PetscFunctionBegin;
  /* Require 2 tags, get the second using PetscCommGetNewTag() */
  stash->comm = comm;

  PetscCall(PetscCommGetNewTag(stash->comm, &stash->tag1));
  PetscCall(PetscCommGetNewTag(stash->comm, &stash->tag2));
  PetscCallMPI(MPI_Comm_size(stash->comm, &stash->size));
  PetscCallMPI(MPI_Comm_rank(stash->comm, &stash->rank));
  PetscCall(PetscMalloc1(2 * stash->size, &stash->flg_v));
  for (i = 0; i < 2 * stash->size; i++) stash->flg_v[i] = -1;

  nopt = stash->size;
  PetscCall(PetscMalloc1(nopt, &opt));
  PetscCall(PetscOptionsGetIntArray(NULL, NULL, "-matstash_initial_size", opt, &nopt, &flg));
  if (flg) {
    if (nopt == 1) max = opt[0];
    else if (nopt == stash->size) max = opt[stash->rank];
    else if (stash->rank < nopt) max = opt[stash->rank];
    else max = 0; /* Use default */
    stash->umax = max;
  } else {
    stash->umax = 0;
  }
  PetscCall(PetscFree(opt));
  if (bs <= 0) bs = 1;

  stash->bs         = bs;
  stash->nmax       = 0;
  stash->oldnmax    = 0;
  stash->n          = 0;
  stash->reallocs   = -1;
  stash->space_head = NULL;
  stash->space      = NULL;

  stash->send_waits  = NULL;
  stash->recv_waits  = NULL;
  stash->send_status = NULL;
  stash->nsends      = 0;
  stash->nrecvs      = 0;
  stash->svalues     = NULL;
  stash->rvalues     = NULL;
  stash->rindices    = NULL;
  stash->nprocessed  = 0;
  stash->reproduce   = PETSC_FALSE;
  stash->blocktype   = MPI_DATATYPE_NULL;

  PetscCall(PetscOptionsGetBool(NULL, NULL, "-matstash_reproduce", &stash->reproduce, NULL));
#if !defined(PETSC_HAVE_MPIUNI)
  flg = PETSC_FALSE;
  PetscCall(PetscOptionsGetBool(NULL, NULL, "-matstash_legacy", &flg, NULL));
  if (!flg) {
    stash->ScatterBegin   = MatStashScatterBegin_BTS;
    stash->ScatterGetMesg = MatStashScatterGetMesg_BTS;
    stash->ScatterEnd     = MatStashScatterEnd_BTS;
    stash->ScatterDestroy = MatStashScatterDestroy_BTS;
  } else {
#endif
    stash->ScatterBegin   = MatStashScatterBegin_Ref;
    stash->ScatterGetMesg = MatStashScatterGetMesg_Ref;
    stash->ScatterEnd     = MatStashScatterEnd_Ref;
    stash->ScatterDestroy = NULL;
#if !defined(PETSC_HAVE_MPIUNI)
  }
#endif
  PetscFunctionReturn(0);
}
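/*
  Illustrative runtime usage of the options parsed above (a command-line sketch, not taken
  from a specific example):

    -matstash_initial_size 10000        same initial bound on every rank
    -matstash_initial_size 500,20,0,80  one value per rank; ranks past the end of a short
                                        list, or ranks given 0, fall back to the default
    -matstash_legacy                    select the reference (_Ref) scatter path
    -matstash_reproduce                 process received messages in a fixed order
*/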
/*
  MatStashDestroy_Private - Destroys the stash
*/
PetscErrorCode MatStashDestroy_Private(MatStash *stash) {
  PetscFunctionBegin;
  PetscCall(PetscMatStashSpaceDestroy(&stash->space_head));
  if (stash->ScatterDestroy) PetscCall((*stash->ScatterDestroy)(stash));

  stash->space = NULL;

  PetscCall(PetscFree(stash->flg_v));
  PetscFunctionReturn(0);
}

/*
  MatStashScatterEnd_Private - This is called as the final stage of the
  scatter. The final stages of message passing are done here, and
  all the memory used for message passing is cleaned up. This
  routine also resets the stash, and deallocates the memory used
  for the stash. It also keeps track of the current memory usage
  so that the same value can be used the next time through.
*/
PetscErrorCode MatStashScatterEnd_Private(MatStash *stash) {
  PetscFunctionBegin;
  PetscCall((*stash->ScatterEnd)(stash));
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatStashScatterEnd_Ref(MatStash *stash) {
  PetscInt    nsends = stash->nsends, bs2, oldnmax, i;
  MPI_Status *send_status;

  PetscFunctionBegin;
  for (i = 0; i < 2 * stash->size; i++) stash->flg_v[i] = -1;
  /* wait on sends */
  if (nsends) {
    PetscCall(PetscMalloc1(2 * nsends, &send_status));
    PetscCallMPI(MPI_Waitall(2 * nsends, stash->send_waits, send_status));
    PetscCall(PetscFree(send_status));
  }

  /* Now update oldnmax to be roughly 10% more than the n just used, so that the
     wasted space is reduced the next time this stash is used.
     Update oldnmax only if it increases. */
  if (stash->n) {
    bs2     = stash->bs * stash->bs;
    oldnmax = ((int)(stash->n * 1.1) + 5) * bs2;
    if (oldnmax > stash->oldnmax) stash->oldnmax = oldnmax;
  }

  stash->nmax       = 0;
  stash->n          = 0;
  stash->reallocs   = -1;
  stash->nprocessed = 0;

  PetscCall(PetscMatStashSpaceDestroy(&stash->space_head));

  stash->space = NULL;

  PetscCall(PetscFree(stash->send_waits));
  PetscCall(PetscFree(stash->recv_waits));
  PetscCall(PetscFree2(stash->svalues, stash->sindices));
  PetscCall(PetscFree(stash->rvalues[0]));
  PetscCall(PetscFree(stash->rvalues));
  PetscCall(PetscFree(stash->rindices[0]));
  PetscCall(PetscFree(stash->rindices));
  PetscFunctionReturn(0);
}

/*
  MatStashGetInfo_Private - Gets the relevant statistics of the stash

  Input Parameter:
  stash    - the stash

  Output Parameters:
  nstash   - the size of the stash, i.e. the number of values currently stored
  reallocs - the number of additional mallocs incurred
*/
PetscErrorCode MatStashGetInfo_Private(MatStash *stash, PetscInt *nstash, PetscInt *reallocs) {
  PetscInt bs2 = stash->bs * stash->bs;

  PetscFunctionBegin;
  if (nstash) *nstash = stash->n * bs2;
  if (reallocs) {
    if (stash->reallocs < 0) *reallocs = 0;
    else *reallocs = stash->reallocs;
  }
  PetscFunctionReturn(0);
}

/*
  MatStashSetInitialSize_Private - Sets the initial size of the stash

  Input Parameters:
  stash - the stash
  max   - the value that is used as the max size of the stash;
          this value is used while allocating memory.
*/
PetscErrorCode MatStashSetInitialSize_Private(MatStash *stash, PetscInt max) {
  PetscFunctionBegin;
  stash->umax = max;
  PetscFunctionReturn(0);
}
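/*
  Sizing policy in one worked example (derived from MatStashExpand_Private below, assuming
  bs = 1 so bs2 = 1): with no user bound (umax == 0) the first expansion allocates
  DEFAULT_STASH_SIZE = 10000 entries, and each subsequent expansion doubles nmax. After the
  scatter, MatStashScatterEnd_* records oldnmax ~ 1.1*n + 5, so a later assembly stashing a
  similar number of entries gets by with a single allocation of roughly the right size.
*/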
/*
  MatStashExpand_Private - Expands the stash. This function is called
  when the space in the stash is not sufficient to add the new values
  being inserted into the stash.

  Input Parameters:
  stash - the stash
  incr  - the minimum increase requested

  Note:
  This routine doubles the currently used memory.
*/
static PetscErrorCode MatStashExpand_Private(MatStash *stash, PetscInt incr) {
  PetscInt newnmax, bs2 = stash->bs * stash->bs;

  PetscFunctionBegin;
  /* allocate a larger stash */
  if (!stash->oldnmax && !stash->nmax) { /* new stash */
    if (stash->umax) newnmax = stash->umax / bs2;
    else newnmax = DEFAULT_STASH_SIZE / bs2;
  } else if (!stash->nmax) { /* reusing stash */
    if (stash->umax > stash->oldnmax) newnmax = stash->umax / bs2;
    else newnmax = stash->oldnmax / bs2;
  } else newnmax = stash->nmax * 2;
  if (newnmax < (stash->nmax + incr)) newnmax += 2 * incr;

  /* Get a MatStashSpace and attach it to stash */
  PetscCall(PetscMatStashSpaceGet(bs2, newnmax, &stash->space));
  if (!stash->space_head) { /* new stash or reusing stash->oldnmax */
    stash->space_head = stash->space;
  }

  stash->reallocs++;
  stash->nmax = newnmax;
  PetscFunctionReturn(0);
}

/*
  MatStashValuesRow_Private - inserts values into the stash. This function
  expects the values to be row-oriented. Multiple columns belonging to the same
  row can be inserted with a single call to this function.

  Input Parameters:
  stash  - the stash
  row    - the global row corresponding to the values
  n      - the number of elements inserted; all elements belong to the above row
  idxn   - the global column indices corresponding to each of the values
  values - the values inserted
*/
PetscErrorCode MatStashValuesRow_Private(MatStash *stash, PetscInt row, PetscInt n, const PetscInt idxn[], const PetscScalar values[], PetscBool ignorezeroentries) {
  PetscInt           i, k, cnt = 0;
  PetscMatStashSpace space = stash->space;

  PetscFunctionBegin;
  /* Check and see if we have sufficient memory */
  if (!space || space->local_remaining < n) PetscCall(MatStashExpand_Private(stash, n));
  space = stash->space;
  k     = space->local_used;
  for (i = 0; i < n; i++) {
    if (ignorezeroentries && values && values[i] == 0.0) continue;
    space->idx[k] = row;
    space->idy[k] = idxn[i];
    space->val[k] = values ? values[i] : 0.0;
    k++;
    cnt++;
  }
  stash->n += cnt;
  space->local_used += cnt;
  space->local_remaining -= cnt;
  PetscFunctionReturn(0);
}
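/*
  Example of the row-oriented insertion above: calling MatStashValuesRow_Private(stash, 5, 3,
  (PetscInt[]){1, 4, 7}, (PetscScalar[]){a, 0.0, c}, PETSC_TRUE) appends the pairs
  (5,1) -> a and (5,7) -> c to the current MatStashSpace and skips the explicit zero at
  column 4, so stash->n grows by 2.
*/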
/*
  MatStashValuesCol_Private - inserts values into the stash. This function
  expects the values to be column-oriented. Multiple columns belonging to the same
  row can be inserted with a single call to this function.

  Input Parameters:
  stash   - the stash
  row     - the global row corresponding to the values
  n       - the number of elements inserted; all elements belong to the above row
  idxn    - the global column indices corresponding to each of the values
  values  - the values inserted
  stepval - consecutive values are separated by a distance of stepval;
            this happens because the input is column-oriented.
*/
PetscErrorCode MatStashValuesCol_Private(MatStash *stash, PetscInt row, PetscInt n, const PetscInt idxn[], const PetscScalar values[], PetscInt stepval, PetscBool ignorezeroentries) {
  PetscInt           i, k, cnt = 0;
  PetscMatStashSpace space = stash->space;

  PetscFunctionBegin;
  /* Check and see if we have sufficient memory */
  if (!space || space->local_remaining < n) PetscCall(MatStashExpand_Private(stash, n));
  space = stash->space;
  k     = space->local_used;
  for (i = 0; i < n; i++) {
    if (ignorezeroentries && values && values[i * stepval] == 0.0) continue;
    space->idx[k] = row;
    space->idy[k] = idxn[i];
    space->val[k] = values ? values[i * stepval] : 0.0;
    k++;
    cnt++;
  }
  stash->n += cnt;
  space->local_used += cnt;
  space->local_remaining -= cnt;
  PetscFunctionReturn(0);
}
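/*
  The stepval stride above matches a column-oriented (Fortran-style) input array: the loop
  reads values[0], values[stepval], values[2*stepval], ..., i.e. one logical row of an array
  whose leading dimension is stepval. For a 2x3 block stored column by column (stepval = 2)
  this picks out values[0], values[2] and values[4], the entries of the first row.
*/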
/*
  MatStashValuesRowBlocked_Private - inserts blocks of values into the stash.
  This function expects the values to be row-oriented. Multiple columns belonging
  to the same block-row can be inserted with a single call to this function.
  This function extracts the sub-block of values based on the dimensions of
  the original input block, and the row,col values corresponding to the blocks.

  Input Parameters:
  stash  - the stash
  row    - the global block-row corresponding to the values
  n      - the number of elements inserted; all elements belong to the above row
  idxn   - the global block-column indices corresponding to each of the blocks of
           values; each block is of size bs*bs
  values - the values inserted
  rmax   - the number of block-rows in the original block
  cmax   - the number of block-columns in the original block
  idx    - the index of the current block-row in the original block
*/
PetscErrorCode MatStashValuesRowBlocked_Private(MatStash *stash, PetscInt row, PetscInt n, const PetscInt idxn[], const PetscScalar values[], PetscInt rmax, PetscInt cmax, PetscInt idx) {
  PetscInt           i, j, k, bs2, bs = stash->bs, l;
  const PetscScalar *vals;
  PetscScalar       *array;
  PetscMatStashSpace space = stash->space;

  PetscFunctionBegin;
  if (!space || space->local_remaining < n) PetscCall(MatStashExpand_Private(stash, n));
  space = stash->space;
  l     = space->local_used;
  bs2   = bs * bs;
  for (i = 0; i < n; i++) {
    space->idx[l] = row;
    space->idy[l] = idxn[i];
    /* Now copy over the block of values. Store the values column-oriented.
       This enables inserting multiple blocks belonging to a row with a single
       function call */
    array = space->val + bs2 * l;
    vals  = values + idx * bs2 * n + bs * i;
    for (j = 0; j < bs; j++) {
      for (k = 0; k < bs; k++) array[k * bs] = values ? vals[k] : 0.0;
      array++;
      vals += cmax * bs;
    }
    l++;
  }
  stash->n += n;
  space->local_used += n;
  space->local_remaining -= n;
  PetscFunctionReturn(0);
}

/*
  MatStashValuesColBlocked_Private - inserts blocks of values into the stash.
  This function expects the values to be column-oriented. Multiple columns belonging
  to the same block-row can be inserted with a single call to this function.
  This function extracts the sub-block of values based on the dimensions of
  the original input block, and the row,col values corresponding to the blocks.

  Input Parameters:
  stash  - the stash
  row    - the global block-row corresponding to the values
  n      - the number of elements inserted; all elements belong to the above row
  idxn   - the global block-column indices corresponding to each of the blocks of
           values; each block is of size bs*bs
  values - the values inserted
  rmax   - the number of block-rows in the original block
  cmax   - the number of block-columns in the original block
  idx    - the index of the current block-row in the original block
*/
PetscErrorCode MatStashValuesColBlocked_Private(MatStash *stash, PetscInt row, PetscInt n, const PetscInt idxn[], const PetscScalar values[], PetscInt rmax, PetscInt cmax, PetscInt idx) {
  PetscInt           i, j, k, bs2, bs = stash->bs, l;
  const PetscScalar *vals;
  PetscScalar       *array;
  PetscMatStashSpace space = stash->space;

  PetscFunctionBegin;
  if (!space || space->local_remaining < n) PetscCall(MatStashExpand_Private(stash, n));
  space = stash->space;
  l     = space->local_used;
  bs2   = bs * bs;
  for (i = 0; i < n; i++) {
    space->idx[l] = row;
    space->idy[l] = idxn[i];
    /* Now copy over the block of values. Store the values column-oriented.
       This enables inserting multiple blocks belonging to a row with a single
       function call */
    array = space->val + bs2 * l;
    vals  = values + idx * bs2 * n + bs * i;
    for (j = 0; j < bs; j++) {
      for (k = 0; k < bs; k++) array[k] = values ? vals[k] : 0.0;
      array += bs;
      vals += rmax * bs;
    }
    l++;
  }
  stash->n += n;
  space->local_used += n;
  space->local_remaining -= n;
  PetscFunctionReturn(0);
}
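/*
  Both blocked variants above store each bs x bs block column-oriented in space->val.
  MatStashValuesRowBlocked_Private reads a row-oriented input block (entries within a row
  are contiguous, successive rows are cmax*bs apart) and transposes while copying (it writes
  array[k * bs]); MatStashValuesColBlocked_Private reads a column-oriented input block
  (successive columns are rmax*bs apart) and copies columns directly (it writes array[k]).
*/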
/*
  MatStashScatterBegin_Private - Initiates the transfer of values to the
  correct owners. This function goes through the stash, checks the
  owner of each stashed value, and sends the values off to the owner
  processors.

  Input Parameters:
  stash  - the stash
  owners - an array of size 'no-of-procs' which gives the ownership range
           for each node.

  Note:
  The 'owners' array in the case of the blocked stash has the ranges
  specified in blocked global indices, and for the regular stash in
  the proper global indices.
*/
PetscErrorCode MatStashScatterBegin_Private(Mat mat, MatStash *stash, PetscInt *owners) {
  PetscFunctionBegin;
  PetscCall((*stash->ScatterBegin)(mat, stash, owners));
  PetscFunctionReturn(0);
}

static PetscErrorCode MatStashScatterBegin_Ref(Mat mat, MatStash *stash, PetscInt *owners) {
  PetscInt           *owner, *startv, *starti, tag1 = stash->tag1, tag2 = stash->tag2, bs2;
  PetscInt            size = stash->size, nsends;
  PetscInt            count, *sindices, **rindices, i, j, idx, lastidx, l;
  PetscScalar       **rvalues, *svalues;
  MPI_Comm            comm = stash->comm;
  MPI_Request        *send_waits, *recv_waits, *recv_waits1, *recv_waits2;
  PetscMPIInt        *sizes, *nlengths, nreceives;
  PetscInt           *sp_idx, *sp_idy;
  PetscScalar        *sp_val;
  PetscMatStashSpace  space, space_next;

  PetscFunctionBegin;
  { /* make sure all processors are either in INSERTMODE or ADDMODE */
    InsertMode addv;
    PetscCall(MPIU_Allreduce((PetscEnum *)&mat->insertmode, (PetscEnum *)&addv, 1, MPIU_ENUM, MPI_BOR, PetscObjectComm((PetscObject)mat)));
    PetscCheck(addv != (ADD_VALUES | INSERT_VALUES), PetscObjectComm((PetscObject)mat), PETSC_ERR_ARG_WRONGSTATE, "Some processors inserted others added");
    mat->insertmode = addv; /* in case this processor had no cache */
  }

  bs2 = stash->bs * stash->bs;

  /* first count number of contributors to each processor */
  PetscCall(PetscCalloc1(size, &nlengths));
  PetscCall(PetscMalloc1(stash->n + 1, &owner));

  i       = j = 0;
  lastidx = -1;
  space   = stash->space_head;
  while (space) {
    space_next = space->next;
    sp_idx     = space->idx;
    for (l = 0; l < space->local_used; l++) {
      /* if indices are NOT locally sorted, need to start search at the beginning */
      if (lastidx > (idx = sp_idx[l])) j = 0;
      lastidx = idx;
      for (; j < size; j++) {
        if (idx >= owners[j] && idx < owners[j + 1]) {
          nlengths[j]++;
          owner[i] = j;
          break;
        }
      }
      i++;
    }
    space = space_next;
  }
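  /* At this point nlengths[p] holds the number of stashed entries destined for rank p and
     owner[k] records the destination of the k-th stashed entry in traversal order. What
     follows: count the destination ranks, exchange message counts and lengths with
     PetscGatherNumberOfMessages()/PetscGatherMessageLengths(), post the receives, then pack
     and post two sends per destination (indices with tag1, values with tag2). */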
  /* Now check which procs get messages - and compute nsends. */
  PetscCall(PetscCalloc1(size, &sizes));
  for (i = 0, nsends = 0; i < size; i++) {
    if (nlengths[i]) {
      sizes[i] = 1;
      nsends++;
    }
  }

  {
    PetscMPIInt *onodes, *olengths;
    /* Determine the number of messages to expect, their lengths, and the from-ids */
    PetscCall(PetscGatherNumberOfMessages(comm, sizes, nlengths, &nreceives));
    PetscCall(PetscGatherMessageLengths(comm, nsends, nreceives, nlengths, &onodes, &olengths));
    /* row and col indices travel together, so the index-message lengths are multiplied by 2 */
    for (i = 0; i < nreceives; i++) olengths[i] *= 2;
    PetscCall(PetscPostIrecvInt(comm, tag1, nreceives, onodes, olengths, &rindices, &recv_waits1));
    /* values are 'bs2' scalars per entry (and remove the earlier factor of 2) */
    for (i = 0; i < nreceives; i++) olengths[i] = olengths[i] * bs2 / 2;
    PetscCall(PetscPostIrecvScalar(comm, tag2, nreceives, onodes, olengths, &rvalues, &recv_waits2));
    PetscCall(PetscFree(onodes));
    PetscCall(PetscFree(olengths));
  }

  /* do sends:
     1) startv[i] gives the starting index in svalues for stuff going to
        the ith processor
  */
  PetscCall(PetscMalloc2(bs2 * stash->n, &svalues, 2 * (stash->n + 1), &sindices));
  PetscCall(PetscMalloc1(2 * nsends, &send_waits));
  PetscCall(PetscMalloc2(size, &startv, size, &starti));
  /* use 2 sends per destination: one with the values (all_a), the other with the indices (all_i followed by all_j) */
  startv[0] = 0;
  starti[0] = 0;
  for (i = 1; i < size; i++) {
    startv[i] = startv[i - 1] + nlengths[i - 1];
    starti[i] = starti[i - 1] + 2 * nlengths[i - 1];
  }

  i     = 0;
  space = stash->space_head;
  while (space) {
    space_next = space->next;
    sp_idx     = space->idx;
    sp_idy     = space->idy;
    sp_val     = space->val;
    for (l = 0; l < space->local_used; l++) {
      j = owner[i];
      if (bs2 == 1) {
        svalues[startv[j]] = sp_val[l];
      } else {
        PetscInt     k;
        PetscScalar *buf1, *buf2;
        buf1 = svalues + bs2 * startv[j];
        buf2 = space->val + bs2 * l;
        for (k = 0; k < bs2; k++) buf1[k] = buf2[k];
      }
      sindices[starti[j]]               = sp_idx[l];
      sindices[starti[j] + nlengths[j]] = sp_idy[l];
      startv[j]++;
      starti[j]++;
      i++;
    }
    space = space_next;
  }
  startv[0] = 0;
  for (i = 1; i < size; i++) startv[i] = startv[i - 1] + nlengths[i - 1];

  for (i = 0, count = 0; i < size; i++) {
    if (sizes[i]) {
      PetscCallMPI(MPI_Isend(sindices + 2 * startv[i], 2 * nlengths[i], MPIU_INT, i, tag1, comm, send_waits + count++));
      PetscCallMPI(MPI_Isend(svalues + bs2 * startv[i], bs2 * nlengths[i], MPIU_SCALAR, i, tag2, comm, send_waits + count++));
    }
  }
#if defined(PETSC_USE_INFO)
  PetscCall(PetscInfo(NULL, "No of messages: %" PetscInt_FMT " \n", nsends));
  for (i = 0; i < size; i++) {
    if (sizes[i]) PetscCall(PetscInfo(NULL, "Mesg_to: %" PetscInt_FMT ": size: %zu bytes\n", i, (size_t)(nlengths[i] * (bs2 * sizeof(PetscScalar) + 2 * sizeof(PetscInt)))));
  }
#endif
  PetscCall(PetscFree(nlengths));
  PetscCall(PetscFree(owner));
  PetscCall(PetscFree2(startv, starti));
  PetscCall(PetscFree(sizes));

  /* recv_waits need to be contiguous for MatStashScatterGetMesg_Private() */
  PetscCall(PetscMalloc1(2 * nreceives, &recv_waits));

  for (i = 0; i < nreceives; i++) {
    recv_waits[2 * i]     = recv_waits1[i];
    recv_waits[2 * i + 1] = recv_waits2[i];
  }
  stash->recv_waits = recv_waits;

  PetscCall(PetscFree(recv_waits1));
  PetscCall(PetscFree(recv_waits2));

  stash->svalues = svalues;
  stash->sindices        = sindices;
  stash->rvalues         = rvalues;
  stash->rindices        = rindices;
  stash->send_waits      = send_waits;
  stash->nsends          = nsends;
  stash->nrecvs          = nreceives;
  stash->reproduce_count = 0;
  PetscFunctionReturn(0);
}

/*
  MatStashScatterGetMesg_Private - This function waits on the receives posted
  in the function MatStashScatterBegin_Private() and returns one message at
  a time to the calling function. If no messages are left, it indicates this
  by setting flg = 0, else it sets flg = 1.

  Input Parameter:
  stash - the stash

  Output Parameters:
  nvals - the number of entries in the current message
  rows  - an array of row indices (or blocked indices) corresponding to the values
  cols  - an array of column indices (or blocked indices) corresponding to the values
  vals  - the values
  flg   - 0 indicates no more messages are left, and the current call has no values associated;
          1 indicates that the current call successfully received a message, and the
          other output parameters nvals, rows, cols, vals are set appropriately.
*/
PetscErrorCode MatStashScatterGetMesg_Private(MatStash *stash, PetscMPIInt *nvals, PetscInt **rows, PetscInt **cols, PetscScalar **vals, PetscInt *flg) {
  PetscFunctionBegin;
  PetscCall((*stash->ScatterGetMesg)(stash, nvals, rows, cols, vals, flg));
  PetscFunctionReturn(0);
}
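/*
  Sketch of how the three entry points are typically driven during parallel assembly
  (variable declarations omitted, names illustrative):

    PetscCall(MatStashScatterBegin_Private(mat, &mat->stash, mat->rmap->range));
    for (;;) {
      PetscCall(MatStashScatterGetMesg_Private(&mat->stash, &n, &row, &col, &val, &flg));
      if (!flg) break;
      ... insert the n received (row[k], col[k]) -> val[k] entries into the local matrix ...
    }
    PetscCall(MatStashScatterEnd_Private(&mat->stash));
*/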
PETSC_INTERN PetscErrorCode MatStashScatterGetMesg_Ref(MatStash *stash, PetscMPIInt *nvals, PetscInt **rows, PetscInt **cols, PetscScalar **vals, PetscInt *flg) {
  PetscMPIInt i, *flg_v = stash->flg_v, i1, i2;
  PetscInt    bs2;
  MPI_Status  recv_status;
  PetscBool   match_found = PETSC_FALSE;

  PetscFunctionBegin;
  *flg = 0; /* When a message is discovered this is reset to 1 */
  /* Return if no more messages to process */
  if (stash->nprocessed == stash->nrecvs) PetscFunctionReturn(0);

  bs2 = stash->bs * stash->bs;
  /* If a matching pair of receives is found, process them, and return the data to
     the calling function. Until then keep receiving messages */
  while (!match_found) {
    if (stash->reproduce) {
      i = stash->reproduce_count++;
      PetscCallMPI(MPI_Wait(stash->recv_waits + i, &recv_status));
    } else {
      PetscCallMPI(MPI_Waitany(2 * stash->nrecvs, stash->recv_waits, &i, &recv_status));
    }
    PetscCheck(recv_status.MPI_SOURCE >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Negative MPI source!");

    /* Now pack the received message into a structure which is usable by others */
    if (i % 2) {
      PetscCallMPI(MPI_Get_count(&recv_status, MPIU_SCALAR, nvals));
      flg_v[2 * recv_status.MPI_SOURCE] = i / 2;
      *nvals = *nvals / bs2;
    } else {
      PetscCallMPI(MPI_Get_count(&recv_status, MPIU_INT, nvals));
      flg_v[2 * recv_status.MPI_SOURCE + 1] = i / 2;
      *nvals = *nvals / 2; /* This message has both row indices and col indices */
    }

    /* Check if we have both messages from this proc */
    i1 = flg_v[2 * recv_status.MPI_SOURCE];
    i2 = flg_v[2 * recv_status.MPI_SOURCE + 1];
    if (i1 != -1 && i2 != -1) {
      *rows = stash->rindices[i2];
      *cols = *rows + *nvals;
      *vals = stash->rvalues[i1];
      *flg  = 1;
      stash->nprocessed++;
      match_found = PETSC_TRUE;
    }
  }
  PetscFunctionReturn(0);
}

#if !defined(PETSC_HAVE_MPIUNI)
typedef struct {
  PetscInt    row;
  PetscInt    col;
  PetscScalar vals[1]; /* Actually an array of length bs2 */
} MatStashBlock;
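/*
  Example of the sort/compress step below: if a rank stashed (row 3, col 7) += a,
  (row 3, col 2) += b and then (row 3, col 7) += c, MatStashSortCompress_Private packs the
  send buffer as (3,2) -> b followed by (3,7) -> a+c when assembling with ADD_VALUES; with
  INSERT_VALUES the duplicate (3,7) entries collapse to a single value instead of summing.
*/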
static PetscErrorCode MatStashSortCompress_Private(MatStash *stash, InsertMode insertmode) {
  PetscMatStashSpace space;
  PetscInt           n = stash->n, bs = stash->bs, bs2 = bs * bs, cnt, *row, *col, *perm, rowstart, i;
  PetscScalar      **valptr;

  PetscFunctionBegin;
  PetscCall(PetscMalloc4(n, &row, n, &col, n, &valptr, n, &perm));
  for (space = stash->space_head, cnt = 0; space; space = space->next) {
    for (i = 0; i < space->local_used; i++) {
      row[cnt]    = space->idx[i];
      col[cnt]    = space->idy[i];
      valptr[cnt] = &space->val[i * bs2];
      perm[cnt]   = cnt; /* Will tell us where to find valptr after sorting row[] and col[] */
      cnt++;
    }
  }
  PetscCheck(cnt == n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "MatStash n %" PetscInt_FMT ", but counted %" PetscInt_FMT " entries", n, cnt);
  PetscCall(PetscSortIntWithArrayPair(n, row, col, perm));
  /* Scan through the rows, sorting each one, combining duplicates, and packing send buffers */
  for (rowstart = 0, cnt = 0, i = 1; i <= n; i++) {
    if (i == n || row[i] != row[rowstart]) { /* Sort the row that just ended */
      PetscInt colstart;
      PetscCall(PetscSortIntWithArray(i - rowstart, &col[rowstart], &perm[rowstart]));
      for (colstart = rowstart; colstart < i;) { /* Compress multiple insertions to the same location */
        PetscInt       j, l;
        MatStashBlock *block;
        PetscCall(PetscSegBufferGet(stash->segsendblocks, 1, &block));
        block->row = row[rowstart];
        block->col = col[colstart];
        PetscCall(PetscArraycpy(block->vals, valptr[perm[colstart]], bs2));
        for (j = colstart + 1; j < i && col[j] == col[colstart]; j++) { /* Add any extra stashed blocks at the same (row,col) */
          if (insertmode == ADD_VALUES) {
            for (l = 0; l < bs2; l++) block->vals[l] += valptr[perm[j]][l];
          } else {
            PetscCall(PetscArraycpy(block->vals, valptr[perm[j]], bs2));
          }
        }
        colstart = j;
      }
      rowstart = i;
    }
  }
  PetscCall(PetscFree4(row, col, valptr, perm));
  PetscFunctionReturn(0);
}

static PetscErrorCode MatStashBlockTypeSetUp(MatStash *stash) {
  PetscFunctionBegin;
  if (stash->blocktype == MPI_DATATYPE_NULL) {
    PetscInt     bs2 = PetscSqr(stash->bs);
    PetscMPIInt  blocklens[2];
    MPI_Aint     displs[2];
    MPI_Datatype types[2], stype;
    /*
      DummyBlock is a type having standard layout, even when PetscScalar is C++ std::complex.
      std::complex itself has standard layout, so DummyBlock does too, recursively.
      To be compatible with C++ std::complex, complex implementations on GPUs must also have standard layout,
      though they can have different alignment, e.g., 16 bytes for double complex, instead of 8 bytes as in GCC stdlibc++.
      offsetof(type, member) only requires type to have standard layout. Ref. https://en.cppreference.com/w/cpp/types/offsetof.

      We can test if std::complex has standard layout with the following code:
        #include <iostream>
        #include <type_traits>
        #include <complex>
        int main() {
          std::cout << std::boolalpha;
          std::cout << std::is_standard_layout<std::complex<double> >::value << '\n';
        }
      Output: true
    */
    struct DummyBlock {
      PetscInt    row, col;
      PetscScalar vals;
    };

    stash->blocktype_size = offsetof(struct DummyBlock, vals) + bs2 * sizeof(PetscScalar);
    if (stash->blocktype_size % sizeof(PetscInt)) { /* Implies that PetscInt is larger and does not satisfy alignment without padding */
      stash->blocktype_size += sizeof(PetscInt) - stash->blocktype_size % sizeof(PetscInt);
    }
    PetscCall(PetscSegBufferCreate(stash->blocktype_size, 1, &stash->segsendblocks));
    PetscCall(PetscSegBufferCreate(stash->blocktype_size, 1, &stash->segrecvblocks));
    PetscCall(PetscSegBufferCreate(sizeof(MatStashFrame), 1, &stash->segrecvframe));
    blocklens[0] = 2;
    blocklens[1] = bs2;
    displs[0]    = offsetof(struct DummyBlock, row);
    displs[1]    = offsetof(struct DummyBlock, vals);
    types[0]     = MPIU_INT;
    types[1]     = MPIU_SCALAR;
    PetscCallMPI(MPI_Type_create_struct(2, blocklens, displs, types, &stype));
    PetscCallMPI(MPI_Type_commit(&stype));
    PetscCallMPI(MPI_Type_create_resized(stype, 0, stash->blocktype_size, &stash->blocktype));
    PetscCallMPI(MPI_Type_commit(&stash->blocktype));
    PetscCallMPI(MPI_Type_free(&stype));
  }
  PetscFunctionReturn(0);
}
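/*
  Worked example of the sizes computed above (assuming 4-byte PetscInt and 8-byte real
  PetscScalar, so offsetof(struct DummyBlock, vals) is 8): for bs = 1 each wire block takes
  8 + 1*8 = 16 bytes, and for bs = 2 it takes 8 + 4*8 = 40 bytes. Both are multiples of
  sizeof(PetscInt), so no padding is added, and after MPI_Type_create_resized() the
  blocktype strides through a packed array of such blocks.
*/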
/* Callback invoked after the target rank has initiated receive of the rendezvous message.
 * Here we post the main sends.
 */
static PetscErrorCode MatStashBTSSend_Private(MPI_Comm comm, const PetscMPIInt tag[], PetscMPIInt rankid, PetscMPIInt rank, void *sdata, MPI_Request req[], void *ctx) {
  MatStash       *stash = (MatStash *)ctx;
  MatStashHeader *hdr   = (MatStashHeader *)sdata;

  PetscFunctionBegin;
  PetscCheck(rank == stash->sendranks[rankid], comm, PETSC_ERR_PLIB, "BTS Send rank %d does not match sendranks[%d] %d", rank, rankid, stash->sendranks[rankid]);
  PetscCallMPI(MPI_Isend(stash->sendframes[rankid].buffer, hdr->count, stash->blocktype, rank, tag[0], comm, &req[0]));
  stash->sendframes[rankid].count   = hdr->count;
  stash->sendframes[rankid].pending = 1;
  PetscFunctionReturn(0);
}

/*
  Callback invoked by the target after receiving the rendezvous message.
  Here we post the main recvs.
*/
static PetscErrorCode MatStashBTSRecv_Private(MPI_Comm comm, const PetscMPIInt tag[], PetscMPIInt rank, void *rdata, MPI_Request req[], void *ctx) {
  MatStash       *stash = (MatStash *)ctx;
  MatStashHeader *hdr   = (MatStashHeader *)rdata;
  MatStashFrame  *frame;

  PetscFunctionBegin;
  PetscCall(PetscSegBufferGet(stash->segrecvframe, 1, &frame));
  PetscCall(PetscSegBufferGet(stash->segrecvblocks, hdr->count, &frame->buffer));
  PetscCallMPI(MPI_Irecv(frame->buffer, hdr->count, stash->blocktype, rank, tag[0], comm, &req[0]));
  frame->count   = hdr->count;
  frame->pending = 1;
  PetscFunctionReturn(0);
}
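/*
  The two callbacks above are the hooks handed to PetscCommBuildTwoSidedFReq() in
  MatStashScatterBegin_BTS(): the rendezvous message carries MatStashHeader.count, i.e.
  how many blocks to expect, after which MatStashBTSRecv_Private posts the matching
  MPI_Irecv and MatStashBTSSend_Private posts the MPI_Isend of the packed blocks.
*/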
/*
 * owners[] contains the ownership ranges; it may be indexed by either blocks or scalars
 */
static PetscErrorCode MatStashScatterBegin_BTS(Mat mat, MatStash *stash, PetscInt owners[]) {
  size_t nblocks;
  char  *sendblocks;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) { /* make sure all processors are either in INSERTMODE or ADDMODE */
    InsertMode addv;
    PetscCall(MPIU_Allreduce((PetscEnum *)&mat->insertmode, (PetscEnum *)&addv, 1, MPIU_ENUM, MPI_BOR, PetscObjectComm((PetscObject)mat)));
    PetscCheck(addv != (ADD_VALUES | INSERT_VALUES), PetscObjectComm((PetscObject)mat), PETSC_ERR_ARG_WRONGSTATE, "Some processors inserted others added");
  }

  PetscCall(MatStashBlockTypeSetUp(stash));
  PetscCall(MatStashSortCompress_Private(stash, mat->insertmode));
  PetscCall(PetscSegBufferGetSize(stash->segsendblocks, &nblocks));
  PetscCall(PetscSegBufferExtractInPlace(stash->segsendblocks, &sendblocks));
  if (stash->first_assembly_done) { /* Set up sendhdrs and sendframes for each rank that we sent to before */
    PetscInt i;
    size_t   b;
    for (i = 0, b = 0; i < stash->nsendranks; i++) {
      stash->sendframes[i].buffer = &sendblocks[b * stash->blocktype_size];
      /* sendhdr is never actually sent, but the count is used by MatStashBTSSend_Private */
      stash->sendhdr[i].count = 0; /* Might remain empty (in which case we send a zero-sized message) if no values are communicated to that process */
      for (; b < nblocks; b++) {
        MatStashBlock *sendblock_b = (MatStashBlock *)&sendblocks[b * stash->blocktype_size];
        PetscCheck(sendblock_b->row >= owners[stash->sendranks[i]], stash->comm, PETSC_ERR_ARG_WRONG, "MAT_SUBSET_OFF_PROC_ENTRIES set, but row %" PetscInt_FMT " owned by %d not communicated in initial assembly", sendblock_b->row, stash->sendranks[i]);
        if (sendblock_b->row >= owners[stash->sendranks[i] + 1]) break;
        stash->sendhdr[i].count++;
      }
    }
  } else { /* Dynamically count and pack (first time) */
    PetscInt sendno;
    size_t   i, rowstart;

    /* Count the number of send ranks and allocate for the sends */
    stash->nsendranks = 0;
    for (rowstart = 0; rowstart < nblocks;) {
      PetscInt       owner;
      MatStashBlock *sendblock_rowstart = (MatStashBlock *)&sendblocks[rowstart * stash->blocktype_size];
      PetscCall(PetscFindInt(sendblock_rowstart->row, stash->size + 1, owners, &owner));
      if (owner < 0) owner = -(owner + 2);
      for (i = rowstart + 1; i < nblocks; i++) { /* Move forward through a run of blocks with the same owner */
        MatStashBlock *sendblock_i = (MatStashBlock *)&sendblocks[i * stash->blocktype_size];
        if (sendblock_i->row >= owners[owner + 1]) break;
      }
      stash->nsendranks++;
      rowstart = i;
    }
    PetscCall(PetscMalloc3(stash->nsendranks, &stash->sendranks, stash->nsendranks, &stash->sendhdr, stash->nsendranks, &stash->sendframes));

    /* Set up sendhdrs and sendframes */
    sendno = 0;
    for (rowstart = 0; rowstart < nblocks;) {
      PetscInt       owner;
      MatStashBlock *sendblock_rowstart = (MatStashBlock *)&sendblocks[rowstart * stash->blocktype_size];
      PetscCall(PetscFindInt(sendblock_rowstart->row, stash->size + 1, owners, &owner));
      if (owner < 0) owner = -(owner + 2);
      stash->sendranks[sendno] = owner;
      for (i = rowstart + 1; i < nblocks; i++) { /* Move forward through a run of blocks with the same owner */
        MatStashBlock *sendblock_i = (MatStashBlock *)&sendblocks[i * stash->blocktype_size];
        if (sendblock_i->row >= owners[owner + 1]) break;
      }
      stash->sendframes[sendno].buffer  = sendblock_rowstart;
      stash->sendframes[sendno].pending = 0;
      stash->sendhdr[sendno].count      = i - rowstart;
      sendno++;
      rowstart = i;
    }
    PetscCheck(sendno == stash->nsendranks, stash->comm, PETSC_ERR_PLIB, "BTS counted %d sendranks, but %" PetscInt_FMT " sends", stash->nsendranks, sendno);
  }

  /* Encode insertmode on the outgoing messages. If we want to support more than two options, we would need a new
   * message or a dummy entry of some sort. */
  if (mat->insertmode == INSERT_VALUES) {
    size_t i;
    for (i = 0; i < nblocks; i++) {
      MatStashBlock *sendblock_i = (MatStashBlock *)&sendblocks[i * stash->blocktype_size];
      sendblock_i->row = -(sendblock_i->row + 1);
    }
  }

  if (stash->first_assembly_done) {
    PetscMPIInt i, tag;
    PetscCall(PetscCommGetNewTag(stash->comm, &tag));
    for (i = 0; i < stash->nrecvranks; i++) PetscCall(MatStashBTSRecv_Private(stash->comm, &tag, stash->recvranks[i], &stash->recvhdr[i], &stash->recvreqs[i], stash));
    for (i = 0; i < stash->nsendranks; i++) PetscCall(MatStashBTSSend_Private(stash->comm, &tag, i, stash->sendranks[i], &stash->sendhdr[i], &stash->sendreqs[i], stash));
    stash->use_status = PETSC_TRUE; /* Use count from message status. */
  } else {
    PetscCall(PetscCommBuildTwoSidedFReq(stash->comm, 1, MPIU_INT, stash->nsendranks, stash->sendranks, (PetscInt *)stash->sendhdr, &stash->nrecvranks, &stash->recvranks, (PetscInt *)&stash->recvhdr, 1, &stash->sendreqs, &stash->recvreqs, MatStashBTSSend_Private, MatStashBTSRecv_Private, stash));
    PetscCall(PetscMalloc2(stash->nrecvranks, &stash->some_indices, stash->nrecvranks, &stash->some_statuses));
    stash->use_status = PETSC_FALSE; /* Use count from header instead of from message. */
  }
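  /* Final bookkeeping: expose the receive frames and reset the cursors used by
     MatStashScatterGetMesg_BTS(). first_assembly_done is carried over from
     mat->assembly_subset (set via MAT_SUBSET_OFF_PROC_ENTRIES), which lets the ranks,
     headers, and requests set up above be reused verbatim by subsequent assemblies. */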
  PetscCall(PetscSegBufferExtractInPlace(stash->segrecvframe, &stash->recvframes));
  stash->recvframe_active    = NULL;
  stash->recvframe_i         = 0;
  stash->some_i              = 0;
  stash->some_count          = 0;
  stash->recvcount           = 0;
  stash->first_assembly_done = mat->assembly_subset; /* See the same logic in VecAssemblyBegin_MPI_BTS */
  stash->insertmode          = &mat->insertmode;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatStashScatterGetMesg_BTS(MatStash *stash, PetscMPIInt *n, PetscInt **row, PetscInt **col, PetscScalar **val, PetscInt *flg) {
  MatStashBlock *block;

  PetscFunctionBegin;
  *flg = 0;
  while (!stash->recvframe_active || stash->recvframe_i == stash->recvframe_count) {
    if (stash->some_i == stash->some_count) {
      if (stash->recvcount == stash->nrecvranks) PetscFunctionReturn(0); /* Done */
      PetscCallMPI(MPI_Waitsome(stash->nrecvranks, stash->recvreqs, &stash->some_count, stash->some_indices, stash->use_status ? stash->some_statuses : MPI_STATUSES_IGNORE));
      stash->some_i = 0;
    }
    stash->recvframe_active = &stash->recvframes[stash->some_indices[stash->some_i]];
    stash->recvframe_count  = stash->recvframe_active->count; /* From header; maximum count */
    if (stash->use_status) { /* Count what was actually sent */
      PetscCallMPI(MPI_Get_count(&stash->some_statuses[stash->some_i], stash->blocktype, &stash->recvframe_count));
    }
    if (stash->recvframe_count > 0) { /* Check for InsertMode consistency */
      block = (MatStashBlock *)&((char *)stash->recvframe_active->buffer)[0];
      if (PetscUnlikely(*stash->insertmode == NOT_SET_VALUES)) *stash->insertmode = block->row < 0 ? INSERT_VALUES : ADD_VALUES;
      PetscCheck(*stash->insertmode != INSERT_VALUES || block->row < 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Assembling INSERT_VALUES, but rank %d requested ADD_VALUES", stash->recvranks[stash->some_indices[stash->some_i]]);
      PetscCheck(*stash->insertmode != ADD_VALUES || block->row >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Assembling ADD_VALUES, but rank %d requested INSERT_VALUES", stash->recvranks[stash->some_indices[stash->some_i]]);
    }
    stash->some_i++;
    stash->recvcount++;
    stash->recvframe_i = 0;
  }
  *n    = 1;
  block = (MatStashBlock *)&((char *)stash->recvframe_active->buffer)[stash->recvframe_i * stash->blocktype_size];
  if (block->row < 0) block->row = -(block->row + 1);
  *row = &block->row;
  *col = &block->col;
  *val = block->vals;
  stash->recvframe_i++;
  *flg = 1;
  PetscFunctionReturn(0);
}
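/*
  Note on the sign convention visible above: a sender assembling with INSERT_VALUES encodes
  every outgoing row as -(row + 1) in MatStashScatterBegin_BTS(). The receiver inspects the
  sign of the first row in each message to infer (and cross-check) the InsertMode, and
  MatStashScatterGetMesg_BTS() decodes the row back with row = -(row + 1) before handing it
  to the caller.
*/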
static PetscErrorCode MatStashScatterEnd_BTS(MatStash *stash) {
  PetscFunctionBegin;
  PetscCallMPI(MPI_Waitall(stash->nsendranks, stash->sendreqs, MPI_STATUSES_IGNORE));
  if (stash->first_assembly_done) { /* Reuse the communication contexts, so consolidate and reset segrecvblocks */
    PetscCall(PetscSegBufferExtractInPlace(stash->segrecvblocks, NULL));
  } else { /* No reuse, so collect everything. */
    PetscCall(MatStashScatterDestroy_BTS(stash));
  }

  /* Now update oldnmax to be roughly 10% more than the n just used, so that the
     wasted space is reduced the next time this stash is used.
     Update oldnmax only if it increases. */
  if (stash->n) {
    PetscInt bs2     = stash->bs * stash->bs;
    PetscInt oldnmax = ((int)(stash->n * 1.1) + 5) * bs2;
    if (oldnmax > stash->oldnmax) stash->oldnmax = oldnmax;
  }

  stash->nmax       = 0;
  stash->n          = 0;
  stash->reallocs   = -1;
  stash->nprocessed = 0;

  PetscCall(PetscMatStashSpaceDestroy(&stash->space_head));

  stash->space = NULL;

  PetscFunctionReturn(0);
}

PetscErrorCode MatStashScatterDestroy_BTS(MatStash *stash) {
  PetscFunctionBegin;
  PetscCall(PetscSegBufferDestroy(&stash->segsendblocks));
  PetscCall(PetscSegBufferDestroy(&stash->segrecvframe));
  stash->recvframes = NULL;
  PetscCall(PetscSegBufferDestroy(&stash->segrecvblocks));
  if (stash->blocktype != MPI_DATATYPE_NULL) PetscCallMPI(MPI_Type_free(&stash->blocktype));
  stash->nsendranks = 0;
  stash->nrecvranks = 0;
  PetscCall(PetscFree3(stash->sendranks, stash->sendhdr, stash->sendframes));
  PetscCall(PetscFree(stash->sendreqs));
  PetscCall(PetscFree(stash->recvreqs));
  PetscCall(PetscFree(stash->recvranks));
  PetscCall(PetscFree(stash->recvhdr));
  PetscCall(PetscFree2(stash->some_indices, stash->some_statuses));
  PetscFunctionReturn(0);
}
#endif