xref: /petsc/src/mat/utils/matstash.c (revision fbf9dbe564678ed6eff1806adbc4c4f01b9743f4)
1 
2 #include <petsc/private/matimpl.h>
3 
4 #define DEFAULT_STASH_SIZE 10000
5 
6 static PetscErrorCode       MatStashScatterBegin_Ref(Mat, MatStash *, PetscInt *);
7 PETSC_INTERN PetscErrorCode MatStashScatterGetMesg_Ref(MatStash *, PetscMPIInt *, PetscInt **, PetscInt **, PetscScalar **, PetscInt *);
8 PETSC_INTERN PetscErrorCode MatStashScatterEnd_Ref(MatStash *);
9 #if !defined(PETSC_HAVE_MPIUNI)
10 static PetscErrorCode MatStashScatterBegin_BTS(Mat, MatStash *, PetscInt *);
11 static PetscErrorCode MatStashScatterGetMesg_BTS(MatStash *, PetscMPIInt *, PetscInt **, PetscInt **, PetscScalar **, PetscInt *);
12 static PetscErrorCode MatStashScatterEnd_BTS(MatStash *);
13 #endif
14 
15 /*
16   MatStashCreate_Private - Creates a stash, currently used for all the parallel
17   matrix implementations. The stash is where elements of a matrix destined
18   to be stored on other processors are kept until matrix assembly is done.
19 
20   This is a simple-minded stash; it simply appends entries to the end of the stash.
21 
22   Input Parameters:
23   comm - communicator, required for scatters.
24   bs   - stash block size, used when stashing blocks of values
25 
26   Output Parameter:
27   stash    - the newly created stash
28 */
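/*
  A minimal sketch of the assumed typical call pattern (not taken from a specific caller;
  mat->stash, mat->bstash and mat->rmap->bs are the usual PETSc Mat/PetscLayout fields, and
  the exact call sites are in the MPI matrix constructors): a parallel matrix creates one
  scalar stash and, for blocked formats, one block stash,

    PetscCall(MatStashCreate_Private(PetscObjectComm((PetscObject)mat), 1, &mat->stash));
    PetscCall(MatStashCreate_Private(PetscObjectComm((PetscObject)mat), mat->rmap->bs, &mat->bstash));
*/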
29 PetscErrorCode MatStashCreate_Private(MPI_Comm comm, PetscInt bs, MatStash *stash)
30 {
31   PetscInt  max, *opt, nopt, i;
32   PetscBool flg;
33 
34   PetscFunctionBegin;
35   /* Require 2 tags, get the second using PetscCommGetNewTag() */
36   stash->comm = comm;
37 
38   PetscCall(PetscCommGetNewTag(stash->comm, &stash->tag1));
39   PetscCall(PetscCommGetNewTag(stash->comm, &stash->tag2));
40   PetscCallMPI(MPI_Comm_size(stash->comm, &stash->size));
41   PetscCallMPI(MPI_Comm_rank(stash->comm, &stash->rank));
42   PetscCall(PetscMalloc1(2 * stash->size, &stash->flg_v));
43   for (i = 0; i < 2 * stash->size; i++) stash->flg_v[i] = -1;
44 
45   nopt = stash->size;
46   PetscCall(PetscMalloc1(nopt, &opt));
47   PetscCall(PetscOptionsGetIntArray(NULL, NULL, "-matstash_initial_size", opt, &nopt, &flg));
48   if (flg) {
49     if (nopt == 1) max = opt[0];
50     else if (nopt == stash->size) max = opt[stash->rank];
51     else if (stash->rank < nopt) max = opt[stash->rank];
52     else max = 0; /* Use default */
53     stash->umax = max;
54   } else {
55     stash->umax = 0;
56   }
57   PetscCall(PetscFree(opt));
58   if (bs <= 0) bs = 1;
59 
60   stash->bs         = bs;
61   stash->nmax       = 0;
62   stash->oldnmax    = 0;
63   stash->n          = 0;
64   stash->reallocs   = -1;
65   stash->space_head = NULL;
66   stash->space      = NULL;
67 
68   stash->send_waits  = NULL;
69   stash->recv_waits  = NULL;
70   stash->send_status = NULL;
71   stash->nsends      = 0;
72   stash->nrecvs      = 0;
73   stash->svalues     = NULL;
74   stash->rvalues     = NULL;
75   stash->rindices    = NULL;
76   stash->nprocessed  = 0;
77   stash->reproduce   = PETSC_FALSE;
78   stash->blocktype   = MPI_DATATYPE_NULL;
79 
80   PetscCall(PetscOptionsGetBool(NULL, NULL, "-matstash_reproduce", &stash->reproduce, NULL));
81 #if !defined(PETSC_HAVE_MPIUNI)
82   flg = PETSC_FALSE;
83   PetscCall(PetscOptionsGetBool(NULL, NULL, "-matstash_legacy", &flg, NULL));
84   if (!flg) {
85     stash->ScatterBegin   = MatStashScatterBegin_BTS;
86     stash->ScatterGetMesg = MatStashScatterGetMesg_BTS;
87     stash->ScatterEnd     = MatStashScatterEnd_BTS;
88     stash->ScatterDestroy = MatStashScatterDestroy_BTS;
89   } else {
90 #endif
91     stash->ScatterBegin   = MatStashScatterBegin_Ref;
92     stash->ScatterGetMesg = MatStashScatterGetMesg_Ref;
93     stash->ScatterEnd     = MatStashScatterEnd_Ref;
94     stash->ScatterDestroy = NULL;
95 #if !defined(PETSC_HAVE_MPIUNI)
96   }
97 #endif
98   PetscFunctionReturn(PETSC_SUCCESS);
99 }
100 
101 /*
102    MatStashDestroy_Private - Destroy the stash
103 */
104 PetscErrorCode MatStashDestroy_Private(MatStash *stash)
105 {
106   PetscFunctionBegin;
107   PetscCall(PetscMatStashSpaceDestroy(&stash->space_head));
108   if (stash->ScatterDestroy) PetscCall((*stash->ScatterDestroy)(stash));
109   stash->space = NULL;
110   PetscCall(PetscFree(stash->flg_v));
111   PetscFunctionReturn(PETSC_SUCCESS);
112 }
113 
114 /*
115    MatStashScatterEnd_Private - This is called as the final stage of
116    scatter. The final stages of message passing are done here, and
117    all the memory used for message passing is cleaned up. This
118    routine also resets the stash, and deallocates the memory used
119    for the stash. It also keeps track of the current memory usage
120    so that the same value can be used the next time through.
121 */
122 PetscErrorCode MatStashScatterEnd_Private(MatStash *stash)
123 {
124   PetscFunctionBegin;
125   PetscCall((*stash->ScatterEnd)(stash));
126   PetscFunctionReturn(PETSC_SUCCESS);
127 }
128 
129 PETSC_INTERN PetscErrorCode MatStashScatterEnd_Ref(MatStash *stash)
130 {
131   PetscInt    nsends = stash->nsends, bs2, oldnmax, i;
132   MPI_Status *send_status;
133 
134   PetscFunctionBegin;
135   for (i = 0; i < 2 * stash->size; i++) stash->flg_v[i] = -1;
136   /* wait on sends */
137   if (nsends) {
138     PetscCall(PetscMalloc1(2 * nsends, &send_status));
139     PetscCallMPI(MPI_Waitall(2 * nsends, stash->send_waits, send_status));
140     PetscCall(PetscFree(send_status));
141   }
142 
143   /* Now update oldnmax to be approximately 10% more than the maximum n used; this way the
144      wasted space is reduced the next time this stash is used.
145      Also, only update oldnmax if it increases */
146   if (stash->n) {
147     bs2     = stash->bs * stash->bs;
148     oldnmax = ((int)(stash->n * 1.1) + 5) * bs2;
149     if (oldnmax > stash->oldnmax) stash->oldnmax = oldnmax;
150   }
151 
152   stash->nmax       = 0;
153   stash->n          = 0;
154   stash->reallocs   = -1;
155   stash->nprocessed = 0;
156 
157   PetscCall(PetscMatStashSpaceDestroy(&stash->space_head));
158 
159   stash->space = NULL;
160 
161   PetscCall(PetscFree(stash->send_waits));
162   PetscCall(PetscFree(stash->recv_waits));
163   PetscCall(PetscFree2(stash->svalues, stash->sindices));
164   PetscCall(PetscFree(stash->rvalues[0]));
165   PetscCall(PetscFree(stash->rvalues));
166   PetscCall(PetscFree(stash->rindices[0]));
167   PetscCall(PetscFree(stash->rindices));
168   PetscFunctionReturn(PETSC_SUCCESS);
169 }
170 
171 /*
172    MatStashGetInfo_Private - Gets the relevant statistics of the stash
173 
174    Input Parameter:
175    stash    - the stash
176
177    Output Parameters:
178    nstash   - the size of the stash. Indicates the number of values stored.
179    reallocs - the number of additional mallocs incurred. */
180 PetscErrorCode MatStashGetInfo_Private(MatStash *stash, PetscInt *nstash, PetscInt *reallocs)
181 {
182   PetscInt bs2 = stash->bs * stash->bs;
183 
184   PetscFunctionBegin;
185   if (nstash) *nstash = stash->n * bs2;
186   if (reallocs) {
187     if (stash->reallocs < 0) *reallocs = 0;
188     else *reallocs = stash->reallocs;
189   }
190   PetscFunctionReturn(PETSC_SUCCESS);
191 }
192 
193 /*
194    MatStashSetInitialSize_Private - Sets the initial size of the stash
195 
196    Input Parameters:
197    stash  - the stash
198    max    - the value that is used as the max size of the stash;
199             this value is used when allocating memory.
200 */
201 PetscErrorCode MatStashSetInitialSize_Private(MatStash *stash, PetscInt max)
202 {
203   PetscFunctionBegin;
204   stash->umax = max;
205   PetscFunctionReturn(PETSC_SUCCESS);
206 }
207 
208 /* MatStashExpand_Private - Expand the stash. This function is called
209    when the space in the stash is not sufficient to add the new values
210    being inserted into the stash.
211 
212    Input Parameters:
213    stash - the stash
214    incr  - the minimum increase requested
215 
216    Note:
217    This routine doubles the currently used memory.
218  */
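/*
  Worked example of the sizing logic below (numbers are illustrative only): with bs = 2
  (so bs2 = 4), a brand-new stash with no user-specified size gets
  newnmax = DEFAULT_STASH_SIZE / bs2 = 10000 / 4 = 2500 entries. Once those are full, the
  next expansion doubles it to 5000; and if even 2 * nmax were smaller than nmax + incr,
  an extra 2 * incr entries would be added on top.
*/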
219 static PetscErrorCode MatStashExpand_Private(MatStash *stash, PetscInt incr)
220 {
221   PetscInt newnmax, bs2 = stash->bs * stash->bs;
222 
223   PetscFunctionBegin;
224   /* allocate a larger stash */
225   if (!stash->oldnmax && !stash->nmax) { /* new stash */
226     if (stash->umax) newnmax = stash->umax / bs2;
227     else newnmax = DEFAULT_STASH_SIZE / bs2;
228   } else if (!stash->nmax) { /* reusing stash */
229     if (stash->umax > stash->oldnmax) newnmax = stash->umax / bs2;
230     else newnmax = stash->oldnmax / bs2;
231   } else newnmax = stash->nmax * 2;
232   if (newnmax < (stash->nmax + incr)) newnmax += 2 * incr;
233 
234   /* Get a MatStashSpace and attach it to stash */
235   PetscCall(PetscMatStashSpaceGet(bs2, newnmax, &stash->space));
236   if (!stash->space_head) { /* new stash or reusing stash->oldnmax */
237     stash->space_head = stash->space;
238   }
239 
240   stash->reallocs++;
241   stash->nmax = newnmax;
242   PetscFunctionReturn(PETSC_SUCCESS);
243 }
244 /*
245   MatStashValuesRow_Private - inserts values into the stash. This function
246   expects the values to be row-oriented. Multiple columns belonging to the same row
247   can be inserted with a single call to this function.
248 
249   Input Parameters:
250   stash  - the stash
251   row    - the global row corresponding to the values
252   n      - the number of elements inserted. All elements belong to the above row.
253   idxn   - the global column indices corresponding to each of the values.
254   values - the values inserted
255   ignorezeroentries - if PETSC_TRUE, values that are exactly zero are not stashed */
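/*
  A minimal usage sketch (the row/column numbers and the use of mat->stash are illustrative
  assumptions, not taken from a particular caller): stash three entries of global row 7
  that belong to another process,

    const PetscInt    cols[3] = {1, 4, 9};
    const PetscScalar vals[3] = {1.0, 2.0, 3.0};
    PetscCall(MatStashValuesRow_Private(&mat->stash, 7, 3, cols, vals, PETSC_FALSE));
*/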
256 PetscErrorCode MatStashValuesRow_Private(MatStash *stash, PetscInt row, PetscInt n, const PetscInt idxn[], const PetscScalar values[], PetscBool ignorezeroentries)
257 {
258   PetscInt           i, k, cnt = 0;
259   PetscMatStashSpace space = stash->space;
260 
261   PetscFunctionBegin;
262   /* Check and see if we have sufficient memory */
263   if (!space || space->local_remaining < n) PetscCall(MatStashExpand_Private(stash, n));
264   space = stash->space;
265   k     = space->local_used;
266   for (i = 0; i < n; i++) {
267     if (ignorezeroentries && values && values[i] == 0.0) continue;
268     space->idx[k] = row;
269     space->idy[k] = idxn[i];
270     space->val[k] = values ? values[i] : 0.0;
271     k++;
272     cnt++;
273   }
274   stash->n += cnt;
275   space->local_used += cnt;
276   space->local_remaining -= cnt;
277   PetscFunctionReturn(PETSC_SUCCESS);
278 }
279 
280 /*
281   MatStashValuesCol_Private - inserts values into the stash. This function
282   expects the values to be column-oriented. Multiple columns belonging to the same row
283   can be inserted with a single call to this function.
284 
285   Input Parameters:
286   stash   - the stash
287   row     - the global row corresponding to the values
288   n       - the number of elements inserted. All elements belong to the above row.
289   idxn    - the global column indices corresponding to each of the values.
290   values  - the values inserted
291   stepval - consecutive values are separated by a distance of stepval;
292             this happens because the input is column-oriented.
293   ignorezeroentries - if PETSC_TRUE, values that are exactly zero are not stashed */
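/*
  Illustration of stepval (an assumed layout, not from a specific caller): for a 2x3 dense
  input stored column-major as {a00, a10, a01, a11, a02, a12}, the entries of row 0 are
  a00, a01, a02 = values[0], values[2], values[4]; the caller therefore passes values
  pointing at a00 and stepval = 2 (the number of rows), matching the values[i * stepval]
  access in the loop below.
*/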
294 PetscErrorCode MatStashValuesCol_Private(MatStash *stash, PetscInt row, PetscInt n, const PetscInt idxn[], const PetscScalar values[], PetscInt stepval, PetscBool ignorezeroentries)
295 {
296   PetscInt           i, k, cnt = 0;
297   PetscMatStashSpace space = stash->space;
298 
299   PetscFunctionBegin;
300   /* Check and see if we have sufficient memory */
301   if (!space || space->local_remaining < n) PetscCall(MatStashExpand_Private(stash, n));
302   space = stash->space;
303   k     = space->local_used;
304   for (i = 0; i < n; i++) {
305     if (ignorezeroentries && values && values[i * stepval] == 0.0) continue;
306     space->idx[k] = row;
307     space->idy[k] = idxn[i];
308     space->val[k] = values ? values[i * stepval] : 0.0;
309     k++;
310     cnt++;
311   }
312   stash->n += cnt;
313   space->local_used += cnt;
314   space->local_remaining -= cnt;
315   PetscFunctionReturn(PETSC_SUCCESS);
316 }
317 
318 /*
319   MatStashValuesRowBlocked_Private - inserts blocks of values into the stash.
320   This function expects the values to be row-oriented. Multiple block-columns belonging
321   to the same block-row can be inserted with a single call to this function.
322   This function extracts the sub-block of values based on the dimensions of
323   the original input block, and the row,col values corresponding to the blocks.
324 
325   Input Parameters:
326   stash  - the stash
327   row    - the global block-row corresponding to the values
328   n      - the number of blocks inserted. All blocks belong to the above block-row.
329   idxn   - the global block-column indices corresponding to each of the blocks of
330            values. Each block is of size bs*bs.
331   values - the values inserted
332   rmax   - the number of block-rows in the original block.
333   cmax   - the number of block-columns of the original block.
334   idx    - the index of the current block-row in the original block.
335 */
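/*
  Reading of the copy loop below (a descriptive note, not part of the original source): the
  (idx,i) block starts at values + idx*bs2*n + bs*i; each subsequent row of that block lies
  cmax*bs scalars further on, and the bs x bs block is written into space->val in
  column-major order (element (j,k) of the block lands at offset j + k*bs), so that all
  stashed blocks share one storage convention.
*/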
336 PetscErrorCode MatStashValuesRowBlocked_Private(MatStash *stash, PetscInt row, PetscInt n, const PetscInt idxn[], const PetscScalar values[], PetscInt rmax, PetscInt cmax, PetscInt idx)
337 {
338   PetscInt           i, j, k, bs2, bs = stash->bs, l;
339   const PetscScalar *vals;
340   PetscScalar       *array;
341   PetscMatStashSpace space = stash->space;
342 
343   PetscFunctionBegin;
344   if (!space || space->local_remaining < n) PetscCall(MatStashExpand_Private(stash, n));
345   space = stash->space;
346   l     = space->local_used;
347   bs2   = bs * bs;
348   for (i = 0; i < n; i++) {
349     space->idx[l] = row;
350     space->idy[l] = idxn[i];
351     /* Now copy over the block of values. Store the values column oriented.
352        This enables inserting multiple blocks belonging to a row with a single
353        function call */
354     array = space->val + bs2 * l;
355     vals  = values + idx * bs2 * n + bs * i;
356     for (j = 0; j < bs; j++) {
357       for (k = 0; k < bs; k++) array[k * bs] = values ? vals[k] : 0.0;
358       array++;
359       vals += cmax * bs;
360     }
361     l++;
362   }
363   stash->n += n;
364   space->local_used += n;
365   space->local_remaining -= n;
366   PetscFunctionReturn(PETSC_SUCCESS);
367 }
368 
369 /*
370   MatStashValuesColBlocked_Private - inserts blocks of values into the stash.
371   This function expects the values to be column-oriented. Multiple block-columns belonging
372   to the same block-row can be inserted with a single call to this function.
373   This function extracts the sub-block of values based on the dimensions of
374   the original input block, and the row,col values corresponding to the blocks.
375 
376   Input Parameters:
377   stash  - the stash
378   row    - the global block-row corresponding to the values
379   n      - the number of blocks inserted. All blocks belong to the above block-row.
380   idxn   - the global block-column indices corresponding to each of the blocks of
381            values. Each block is of size bs*bs.
382   values - the values inserted
383   rmax   - the number of block-rows in the original block.
384   cmax   - the number of block-columns of the original block.
385   idx    - the index of the current block-row in the original block.
386 */
387 PetscErrorCode MatStashValuesColBlocked_Private(MatStash *stash, PetscInt row, PetscInt n, const PetscInt idxn[], const PetscScalar values[], PetscInt rmax, PetscInt cmax, PetscInt idx)
388 {
389   PetscInt           i, j, k, bs2, bs = stash->bs, l;
390   const PetscScalar *vals;
391   PetscScalar       *array;
392   PetscMatStashSpace space = stash->space;
393 
394   PetscFunctionBegin;
395   if (!space || space->local_remaining < n) PetscCall(MatStashExpand_Private(stash, n));
396   space = stash->space;
397   l     = space->local_used;
398   bs2   = bs * bs;
399   for (i = 0; i < n; i++) {
400     space->idx[l] = row;
401     space->idy[l] = idxn[i];
402     /* Now copy over the block of values. Store the values column oriented.
403      This enables inserting multiple blocks belonging to a row with a single
404      function call */
405     array = space->val + bs2 * l;
406     vals  = values + idx * bs2 * n + bs * i;
407     for (j = 0; j < bs; j++) {
408       for (k = 0; k < bs; k++) array[k] = values ? vals[k] : 0.0;
409       array += bs;
410       vals += rmax * bs;
411     }
412     l++;
413   }
414   stash->n += n;
415   space->local_used += n;
416   space->local_remaining -= n;
417   PetscFunctionReturn(PETSC_SUCCESS);
418 }
419 /*
420   MatStashScatterBegin_Private - Initiates the transfer of values to the
421   correct owners. This function goes through the stash, checks the
422   owner of each stashed value, and sends the values off to the owner
423   processors.
424 
425   Input Parameters:
426   stash  - the stash
427   owners - an array of size 'no-of-procs'+1 which gives the ownership range
428            for each process.
429 
430   Note:
431     In the case of the blocked stash, the 'owners' array gives the
432   ranges in blocked global indices, while for the regular stash it gives
433   them in the regular global indices.
434 */
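/*
  For example (illustrative numbers): with 3 processes owning rows [0,4), [4,9) and [9,12),
  the caller passes owners = {0, 4, 9, 12}; a stashed entry with global row 6 is then sent
  to process 1 because owners[1] <= 6 < owners[2].
*/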
435 PetscErrorCode MatStashScatterBegin_Private(Mat mat, MatStash *stash, PetscInt *owners)
436 {
437   PetscFunctionBegin;
438   PetscCall((*stash->ScatterBegin)(mat, stash, owners));
439   PetscFunctionReturn(PETSC_SUCCESS);
440 }
441 
442 static PetscErrorCode MatStashScatterBegin_Ref(Mat mat, MatStash *stash, PetscInt *owners)
443 {
444   PetscInt          *owner, *startv, *starti, tag1 = stash->tag1, tag2 = stash->tag2, bs2;
445   PetscInt           size = stash->size, nsends;
446   PetscInt           count, *sindices, **rindices, i, j, idx, lastidx, l;
447   PetscScalar      **rvalues, *svalues;
448   MPI_Comm           comm = stash->comm;
449   MPI_Request       *send_waits, *recv_waits, *recv_waits1, *recv_waits2;
450   PetscMPIInt       *sizes, *nlengths, nreceives;
451   PetscInt          *sp_idx, *sp_idy;
452   PetscScalar       *sp_val;
453   PetscMatStashSpace space, space_next;
454 
455   PetscFunctionBegin;
456   { /* make sure all processes are using either INSERT_VALUES or ADD_VALUES */
457     InsertMode addv;
458     PetscCall(MPIU_Allreduce((PetscEnum *)&mat->insertmode, (PetscEnum *)&addv, 1, MPIU_ENUM, MPI_BOR, PetscObjectComm((PetscObject)mat)));
459     PetscCheck(addv != (ADD_VALUES | INSERT_VALUES), PetscObjectComm((PetscObject)mat), PETSC_ERR_ARG_WRONGSTATE, "Some processors inserted others added");
460     mat->insertmode = addv; /* in case this processor had no cache */
461   }
462 
463   bs2 = stash->bs * stash->bs;
464 
465   /*  first count number of contributors to each processor */
466   PetscCall(PetscCalloc1(size, &nlengths));
467   PetscCall(PetscMalloc1(stash->n + 1, &owner));
468 
469   i = j   = 0;
470   lastidx = -1;
471   space   = stash->space_head;
472   while (space) {
473     space_next = space->next;
474     sp_idx     = space->idx;
475     for (l = 0; l < space->local_used; l++) {
476       /* if indices are NOT locally sorted, need to start search at the beginning */
477       if (lastidx > (idx = sp_idx[l])) j = 0;
478       lastidx = idx;
479       for (; j < size; j++) {
480         if (idx >= owners[j] && idx < owners[j + 1]) {
481           nlengths[j]++;
482           owner[i] = j;
483           break;
484         }
485       }
486       i++;
487     }
488     space = space_next;
489   }
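  /* At this point owner[i] holds the destination rank of the ith stashed entry and
     nlengths[j] counts how many stashed entries are headed for rank j */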
490 
491   /* Now check what procs get messages - and compute nsends. */
492   PetscCall(PetscCalloc1(size, &sizes));
493   for (i = 0, nsends = 0; i < size; i++) {
494     if (nlengths[i]) {
495       sizes[i] = 1;
496       nsends++;
497     }
498   }
499 
500   {
501     PetscMPIInt *onodes, *olengths;
502     /* Determine the number of messages to expect, their lengths, and their from-ids */
503     PetscCall(PetscGatherNumberOfMessages(comm, sizes, nlengths, &nreceives));
504     PetscCall(PetscGatherMessageLengths(comm, nsends, nreceives, nlengths, &onodes, &olengths));
505     /* since the row and column indices are clubbed together, lengths are multiplied by 2 */
506     for (i = 0; i < nreceives; i++) olengths[i] *= 2;
507     PetscCall(PetscPostIrecvInt(comm, tag1, nreceives, onodes, olengths, &rindices, &recv_waits1));
508     /* values have 'bs2' scalars per entry (and remove the earlier factor of 2) */
509     for (i = 0; i < nreceives; i++) olengths[i] = olengths[i] * bs2 / 2;
510     PetscCall(PetscPostIrecvScalar(comm, tag2, nreceives, onodes, olengths, &rvalues, &recv_waits2));
511     PetscCall(PetscFree(onodes));
512     PetscCall(PetscFree(olengths));
513   }
514 
515   /* do sends:
516       1) startv[i] gives the starting index in svalues (and starti[i] in sindices)
517          for the entries going to the ith processor
518   */
519   PetscCall(PetscMalloc2(bs2 * stash->n, &svalues, 2 * (stash->n + 1), &sindices));
520   PetscCall(PetscMalloc1(2 * nsends, &send_waits));
521   PetscCall(PetscMalloc2(size, &startv, size, &starti));
522   /* use 2 sends per processor: one with the indices (all_i and all_j), the other with the values (all_a) */
523   startv[0] = 0;
524   starti[0] = 0;
525   for (i = 1; i < size; i++) {
526     startv[i] = startv[i - 1] + nlengths[i - 1];
527     starti[i] = starti[i - 1] + 2 * nlengths[i - 1];
528   }
529 
530   i     = 0;
531   space = stash->space_head;
532   while (space) {
533     space_next = space->next;
534     sp_idx     = space->idx;
535     sp_idy     = space->idy;
536     sp_val     = space->val;
537     for (l = 0; l < space->local_used; l++) {
538       j = owner[i];
539       if (bs2 == 1) {
540         svalues[startv[j]] = sp_val[l];
541       } else {
542         PetscInt     k;
543         PetscScalar *buf1, *buf2;
544         buf1 = svalues + bs2 * startv[j];
545         buf2 = space->val + bs2 * l;
546         for (k = 0; k < bs2; k++) buf1[k] = buf2[k];
547       }
548       sindices[starti[j]]               = sp_idx[l];
549       sindices[starti[j] + nlengths[j]] = sp_idy[l];
550       startv[j]++;
551       starti[j]++;
552       i++;
553     }
554     space = space_next;
555   }
556   startv[0] = 0;
557   for (i = 1; i < size; i++) startv[i] = startv[i - 1] + nlengths[i - 1];
558 
559   for (i = 0, count = 0; i < size; i++) {
560     if (sizes[i]) {
561       PetscCallMPI(MPI_Isend(sindices + 2 * startv[i], 2 * nlengths[i], MPIU_INT, i, tag1, comm, send_waits + count++));
562       PetscCallMPI(MPI_Isend(svalues + bs2 * startv[i], bs2 * nlengths[i], MPIU_SCALAR, i, tag2, comm, send_waits + count++));
563     }
564   }
565 #if defined(PETSC_USE_INFO)
566   PetscCall(PetscInfo(NULL, "No of messages: %" PetscInt_FMT " \n", nsends));
567   for (i = 0; i < size; i++) {
568     if (sizes[i]) PetscCall(PetscInfo(NULL, "Mesg_to: %" PetscInt_FMT ": size: %zu bytes\n", i, (size_t)(nlengths[i] * (bs2 * sizeof(PetscScalar) + 2 * sizeof(PetscInt)))));
569   }
570 #endif
571   PetscCall(PetscFree(nlengths));
572   PetscCall(PetscFree(owner));
573   PetscCall(PetscFree2(startv, starti));
574   PetscCall(PetscFree(sizes));
575 
576   /* recv_waits need to be contiguous for MatStashScatterGetMesg_Private() */
577   PetscCall(PetscMalloc1(2 * nreceives, &recv_waits));
578 
579   for (i = 0; i < nreceives; i++) {
580     recv_waits[2 * i]     = recv_waits1[i];
581     recv_waits[2 * i + 1] = recv_waits2[i];
582   }
583   stash->recv_waits = recv_waits;
584 
585   PetscCall(PetscFree(recv_waits1));
586   PetscCall(PetscFree(recv_waits2));
587 
588   stash->svalues         = svalues;
589   stash->sindices        = sindices;
590   stash->rvalues         = rvalues;
591   stash->rindices        = rindices;
592   stash->send_waits      = send_waits;
593   stash->nsends          = nsends;
594   stash->nrecvs          = nreceives;
595   stash->reproduce_count = 0;
596   PetscFunctionReturn(PETSC_SUCCESS);
597 }
598 
599 /*
600    MatStashScatterGetMesg_Private - This function waits on the receives posted
601    in the function MatStashScatterBegin_Private() and returns one message at
602    a time to the calling function. If no messages are left, it indicates this
603    by setting flg = 0, else it sets flg = 1.
604 
605    Input Parameter:
606    stash - the stash
607 
608    Output Parameters:
609    nvals - the number of entries in the current message.
610    rows  - an array of row indices (or blocked indices) corresponding to the values
611    cols  - an array of column indices (or blocked indices) corresponding to the values
612    vals  - the values
613    flg   - 0 indicates no more messages are left, and the current call has no values associated.
614            1 indicates that the current call successfully received a message, and the
615              other output parameters nvals,rows,cols,vals are set appropriately.
616 */
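/*
  A minimal sketch of how an assembly routine might drain the stash (the use of mat->stash
  and mat->rmap->range is an assumption for illustration; the real loops live in the
  MatAssemblyBegin/MatAssemblyEnd implementations):

    PetscMPIInt  n;
    PetscInt    *row, *col, flg;
    PetscScalar *val;

    PetscCall(MatStashScatterBegin_Private(mat, &mat->stash, mat->rmap->range));
    PetscCall(MatStashScatterGetMesg_Private(&mat->stash, &n, &row, &col, &val, &flg));
    while (flg) {
      ... insert the n received (row, col, val) entries into the local matrix ...
      PetscCall(MatStashScatterGetMesg_Private(&mat->stash, &n, &row, &col, &val, &flg));
    }
    PetscCall(MatStashScatterEnd_Private(&mat->stash));
*/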
617 PetscErrorCode MatStashScatterGetMesg_Private(MatStash *stash, PetscMPIInt *nvals, PetscInt **rows, PetscInt **cols, PetscScalar **vals, PetscInt *flg)
618 {
619   PetscFunctionBegin;
620   PetscCall((*stash->ScatterGetMesg)(stash, nvals, rows, cols, vals, flg));
621   PetscFunctionReturn(PETSC_SUCCESS);
622 }
623 
624 PETSC_INTERN PetscErrorCode MatStashScatterGetMesg_Ref(MatStash *stash, PetscMPIInt *nvals, PetscInt **rows, PetscInt **cols, PetscScalar **vals, PetscInt *flg)
625 {
626   PetscMPIInt i, *flg_v = stash->flg_v, i1, i2;
627   PetscInt    bs2;
628   MPI_Status  recv_status;
629   PetscBool   match_found = PETSC_FALSE;
630 
631   PetscFunctionBegin;
632   *flg = 0; /* When a message is discovered this is reset to 1 */
633   /* Return if no more messages to process */
634   if (stash->nprocessed == stash->nrecvs) PetscFunctionReturn(PETSC_SUCCESS);
635 
636   bs2 = stash->bs * stash->bs;
637   /* If a matching pair of receives is found, process them, and return the data to
638      the calling function. Until then keep receiving messages */
639   while (!match_found) {
640     if (stash->reproduce) {
641       i = stash->reproduce_count++;
642       PetscCallMPI(MPI_Wait(stash->recv_waits + i, &recv_status));
643     } else {
644       PetscCallMPI(MPI_Waitany(2 * stash->nrecvs, stash->recv_waits, &i, &recv_status));
645     }
646     PetscCheck(recv_status.MPI_SOURCE >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Negative MPI source!");
647 
648     /* Now pack the received message into a structure which is usable by others */
649     if (i % 2) {
650       PetscCallMPI(MPI_Get_count(&recv_status, MPIU_SCALAR, nvals));
651       flg_v[2 * recv_status.MPI_SOURCE] = i / 2;
652       *nvals                            = *nvals / bs2;
653     } else {
654       PetscCallMPI(MPI_Get_count(&recv_status, MPIU_INT, nvals));
655       flg_v[2 * recv_status.MPI_SOURCE + 1] = i / 2;
656       *nvals                                = *nvals / 2; /* This message has both row indices and col indices */
657     }
658 
659     /* Check if we have both messages from this proc */
660     i1 = flg_v[2 * recv_status.MPI_SOURCE];
661     i2 = flg_v[2 * recv_status.MPI_SOURCE + 1];
662     if (i1 != -1 && i2 != -1) {
663       *rows = stash->rindices[i2];
664       *cols = *rows + *nvals;
665       *vals = stash->rvalues[i1];
666       *flg  = 1;
667       stash->nprocessed++;
668       match_found = PETSC_TRUE;
669     }
670   }
671   PetscFunctionReturn(PETSC_SUCCESS);
672 }
673 
674 #if !defined(PETSC_HAVE_MPIUNI)
675 typedef struct {
676   PetscInt    row;
677   PetscInt    col;
678   PetscScalar vals[1]; /* Actually an array of length bs2 */
679 } MatStashBlock;
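/* Blocks are stored back to back every stash->blocktype_size bytes inside the segmented
   send/receive buffers; vals is a struct-hack array that actually holds bs*bs scalars */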
680 
681 static PetscErrorCode MatStashSortCompress_Private(MatStash *stash, InsertMode insertmode)
682 {
683   PetscMatStashSpace space;
684   PetscInt           n = stash->n, bs = stash->bs, bs2 = bs * bs, cnt, *row, *col, *perm, rowstart, i;
685   PetscScalar      **valptr;
686 
687   PetscFunctionBegin;
688   PetscCall(PetscMalloc4(n, &row, n, &col, n, &valptr, n, &perm));
689   for (space = stash->space_head, cnt = 0; space; space = space->next) {
690     for (i = 0; i < space->local_used; i++) {
691       row[cnt]    = space->idx[i];
692       col[cnt]    = space->idy[i];
693       valptr[cnt] = &space->val[i * bs2];
694       perm[cnt]   = cnt; /* Will tell us where to find valptr after sorting row[] and col[] */
695       cnt++;
696     }
697   }
698   PetscCheck(cnt == n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "MatStash n %" PetscInt_FMT ", but counted %" PetscInt_FMT " entries", n, cnt);
699   PetscCall(PetscSortIntWithArrayPair(n, row, col, perm));
700   /* Scan through the rows, sorting each one, combining duplicates, and packing send buffers */
701   for (rowstart = 0, cnt = 0, i = 1; i <= n; i++) {
702     if (i == n || row[i] != row[rowstart]) { /* Sort the row that just ended. */
703       PetscInt colstart;
704       PetscCall(PetscSortIntWithArray(i - rowstart, &col[rowstart], &perm[rowstart]));
705       for (colstart = rowstart; colstart < i;) { /* Compress multiple insertions to the same location */
706         PetscInt       j, l;
707         MatStashBlock *block;
708         PetscCall(PetscSegBufferGet(stash->segsendblocks, 1, &block));
709         block->row = row[rowstart];
710         block->col = col[colstart];
711         PetscCall(PetscArraycpy(block->vals, valptr[perm[colstart]], bs2));
712         for (j = colstart + 1; j < i && col[j] == col[colstart]; j++) { /* Add any extra stashed blocks at the same (row,col) */
713           if (insertmode == ADD_VALUES) {
714             for (l = 0; l < bs2; l++) block->vals[l] += valptr[perm[j]][l];
715           } else {
716             PetscCall(PetscArraycpy(block->vals, valptr[perm[j]], bs2));
717           }
718         }
719         colstart = j;
720       }
721       rowstart = i;
722     }
723   }
724   PetscCall(PetscFree4(row, col, valptr, perm));
725   PetscFunctionReturn(PETSC_SUCCESS);
726 }
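/* After MatStashSortCompress_Private() the segmented buffer stash->segsendblocks contains one
   MatStashBlock per distinct (row,col) pair, ordered by row and then by column, with duplicate
   insertions already combined according to the InsertMode */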
727 
728 static PetscErrorCode MatStashBlockTypeSetUp(MatStash *stash)
729 {
730   PetscFunctionBegin;
731   if (stash->blocktype == MPI_DATATYPE_NULL) {
732     PetscInt     bs2 = PetscSqr(stash->bs);
733     PetscMPIInt  blocklens[2];
734     MPI_Aint     displs[2];
735     MPI_Datatype types[2], stype;
736     /*
737         DummyBlock is a type having standard layout, even when PetscScalar is C++ std::complex.
738        Since std::complex itself has standard layout, DummyBlock does too, recursively.
739        To be compatible with C++ std::complex, complex implementations on GPUs must also have standard layout,
740        though they can have different alignment, e.g., 16 bytes for double complex, instead of 8 bytes as in GCC's libstdc++.
741        offsetof(type, member) only requires type to have standard layout. Ref. https://en.cppreference.com/w/cpp/types/offsetof.
742 
743        We can test if std::complex has standard layout with the following code:
744        #include <iostream>
745        #include <type_traits>
746        #include <complex>
747        int main() {
748          std::cout << std::boolalpha;
749          std::cout << std::is_standard_layout<std::complex<double> >::value << '\n';
750        }
751        Output: true
752      */
753     struct DummyBlock {
754       PetscInt    row, col;
755       PetscScalar vals;
756     };
757 
758     stash->blocktype_size = offsetof(struct DummyBlock, vals) + bs2 * sizeof(PetscScalar);
759     if (stash->blocktype_size % sizeof(PetscInt)) { /* Implies that PetscInt is larger and does not satisfy alignment without padding */
760       stash->blocktype_size += sizeof(PetscInt) - stash->blocktype_size % sizeof(PetscInt);
761     }
762     PetscCall(PetscSegBufferCreate(stash->blocktype_size, 1, &stash->segsendblocks));
763     PetscCall(PetscSegBufferCreate(stash->blocktype_size, 1, &stash->segrecvblocks));
764     PetscCall(PetscSegBufferCreate(sizeof(MatStashFrame), 1, &stash->segrecvframe));
765     blocklens[0] = 2;
766     blocklens[1] = bs2;
767     displs[0]    = offsetof(struct DummyBlock, row);
768     displs[1]    = offsetof(struct DummyBlock, vals);
769     types[0]     = MPIU_INT;
770     types[1]     = MPIU_SCALAR;
771     PetscCallMPI(MPI_Type_create_struct(2, blocklens, displs, types, &stype));
772     PetscCallMPI(MPI_Type_commit(&stype));
773     PetscCallMPI(MPI_Type_create_resized(stype, 0, stash->blocktype_size, &stash->blocktype));
774     PetscCallMPI(MPI_Type_commit(&stash->blocktype));
775     PetscCallMPI(MPI_Type_free(&stype));
776   }
777   PetscFunctionReturn(PETSC_SUCCESS);
778 }
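/* Size example (platform dependent; assumes 4-byte PetscInt and real 8-byte PetscScalar):
   for bs = 2, offsetof(struct DummyBlock, vals) = 8 and bs2 * sizeof(PetscScalar) = 32, so
   blocktype_size = 40 bytes; 40 is already a multiple of sizeof(PetscInt), so the padding
   branch above adds nothing */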
779 
780 /* Callback invoked after target rank has initiated receive of rendezvous message.
781  * Here we post the main sends.
782  */
783 static PetscErrorCode MatStashBTSSend_Private(MPI_Comm comm, const PetscMPIInt tag[], PetscMPIInt rankid, PetscMPIInt rank, void *sdata, MPI_Request req[], void *ctx)
784 {
785   MatStash       *stash = (MatStash *)ctx;
786   MatStashHeader *hdr   = (MatStashHeader *)sdata;
787 
788   PetscFunctionBegin;
789   PetscCheck(rank == stash->sendranks[rankid], comm, PETSC_ERR_PLIB, "BTS Send rank %d does not match sendranks[%d] %d", rank, rankid, stash->sendranks[rankid]);
790   PetscCallMPI(MPI_Isend(stash->sendframes[rankid].buffer, hdr->count, stash->blocktype, rank, tag[0], comm, &req[0]));
791   stash->sendframes[rankid].count   = hdr->count;
792   stash->sendframes[rankid].pending = 1;
793   PetscFunctionReturn(PETSC_SUCCESS);
794 }
795 
796 /*
797     Callback invoked by target after receiving rendezvous message.
798     Here we post the main recvs.
799  */
800 static PetscErrorCode MatStashBTSRecv_Private(MPI_Comm comm, const PetscMPIInt tag[], PetscMPIInt rank, void *rdata, MPI_Request req[], void *ctx)
801 {
802   MatStash       *stash = (MatStash *)ctx;
803   MatStashHeader *hdr   = (MatStashHeader *)rdata;
804   MatStashFrame  *frame;
805 
806   PetscFunctionBegin;
807   PetscCall(PetscSegBufferGet(stash->segrecvframe, 1, &frame));
808   PetscCall(PetscSegBufferGet(stash->segrecvblocks, hdr->count, &frame->buffer));
809   PetscCallMPI(MPI_Irecv(frame->buffer, hdr->count, stash->blocktype, rank, tag[0], comm, &req[0]));
810   frame->count   = hdr->count;
811   frame->pending = 1;
812   PetscFunctionReturn(PETSC_SUCCESS);
813 }
814 
815 /*
816  * owners[] contains the ownership ranges; may be indexed by either blocks or scalars
817  */
818 static PetscErrorCode MatStashScatterBegin_BTS(Mat mat, MatStash *stash, PetscInt owners[])
819 {
820   size_t nblocks;
821   char  *sendblocks;
822 
823   PetscFunctionBegin;
824   if (PetscDefined(USE_DEBUG)) { /* make sure all processes are using either INSERT_VALUES or ADD_VALUES */
825     InsertMode addv;
826     PetscCall(MPIU_Allreduce((PetscEnum *)&mat->insertmode, (PetscEnum *)&addv, 1, MPIU_ENUM, MPI_BOR, PetscObjectComm((PetscObject)mat)));
827     PetscCheck(addv != (ADD_VALUES | INSERT_VALUES), PetscObjectComm((PetscObject)mat), PETSC_ERR_ARG_WRONGSTATE, "Some processors inserted others added");
828   }
829 
830   PetscCall(MatStashBlockTypeSetUp(stash));
831   PetscCall(MatStashSortCompress_Private(stash, mat->insertmode));
832   PetscCall(PetscSegBufferGetSize(stash->segsendblocks, &nblocks));
833   PetscCall(PetscSegBufferExtractInPlace(stash->segsendblocks, &sendblocks));
834   if (stash->first_assembly_done) { /* Set up sendhdrs and sendframes for each rank that we sent to before */
835     PetscInt i;
836     size_t   b;
837     for (i = 0, b = 0; i < stash->nsendranks; i++) {
838       stash->sendframes[i].buffer = &sendblocks[b * stash->blocktype_size];
839       /* sendhdr is never actually sent, but the count is used by MatStashBTSSend_Private */
840       stash->sendhdr[i].count = 0; /* Might remain empty (in which case we send a zero-sized message) if no values are communicated to that process */
841       for (; b < nblocks; b++) {
842         MatStashBlock *sendblock_b = (MatStashBlock *)&sendblocks[b * stash->blocktype_size];
843         PetscCheck(sendblock_b->row >= owners[stash->sendranks[i]], stash->comm, PETSC_ERR_ARG_WRONG, "MAT_SUBSET_OFF_PROC_ENTRIES set, but row %" PetscInt_FMT " owned by %d not communicated in initial assembly", sendblock_b->row, stash->sendranks[i]);
844         if (sendblock_b->row >= owners[stash->sendranks[i] + 1]) break;
845         stash->sendhdr[i].count++;
846       }
847     }
848   } else { /* Dynamically count and pack (first time) */
849     PetscInt sendno;
850     size_t   i, rowstart;
851 
852     /* Count number of send ranks and allocate for sends */
853     stash->nsendranks = 0;
854     for (rowstart = 0; rowstart < nblocks;) {
855       PetscInt       owner;
856       MatStashBlock *sendblock_rowstart = (MatStashBlock *)&sendblocks[rowstart * stash->blocktype_size];
857       PetscCall(PetscFindInt(sendblock_rowstart->row, stash->size + 1, owners, &owner));
858       if (owner < 0) owner = -(owner + 2);
859       for (i = rowstart + 1; i < nblocks; i++) { /* Move forward through a run of blocks with the same owner */
860         MatStashBlock *sendblock_i = (MatStashBlock *)&sendblocks[i * stash->blocktype_size];
861         if (sendblock_i->row >= owners[owner + 1]) break;
862       }
863       stash->nsendranks++;
864       rowstart = i;
865     }
866     PetscCall(PetscMalloc3(stash->nsendranks, &stash->sendranks, stash->nsendranks, &stash->sendhdr, stash->nsendranks, &stash->sendframes));
867 
868     /* Set up sendhdrs and sendframes */
869     sendno = 0;
870     for (rowstart = 0; rowstart < nblocks;) {
871       PetscInt       owner;
872       MatStashBlock *sendblock_rowstart = (MatStashBlock *)&sendblocks[rowstart * stash->blocktype_size];
873       PetscCall(PetscFindInt(sendblock_rowstart->row, stash->size + 1, owners, &owner));
874       if (owner < 0) owner = -(owner + 2);
875       stash->sendranks[sendno] = owner;
876       for (i = rowstart + 1; i < nblocks; i++) { /* Move forward through a run of blocks with the same owner */
877         MatStashBlock *sendblock_i = (MatStashBlock *)&sendblocks[i * stash->blocktype_size];
878         if (sendblock_i->row >= owners[owner + 1]) break;
879       }
880       stash->sendframes[sendno].buffer  = sendblock_rowstart;
881       stash->sendframes[sendno].pending = 0;
882       stash->sendhdr[sendno].count      = i - rowstart;
883       sendno++;
884       rowstart = i;
885     }
886     PetscCheck(sendno == stash->nsendranks, stash->comm, PETSC_ERR_PLIB, "BTS counted %d sendranks, but %" PetscInt_FMT " sends", stash->nsendranks, sendno);
887   }
888 
889   /* Encode insertmode on the outgoing messages. If we want to support more than two options, we would need a new
890    * message or a dummy entry of some sort. */
891   if (mat->insertmode == INSERT_VALUES) {
892     size_t i;
893     for (i = 0; i < nblocks; i++) {
894       MatStashBlock *sendblock_i = (MatStashBlock *)&sendblocks[i * stash->blocktype_size];
895       sendblock_i->row           = -(sendblock_i->row + 1);
896     }
897   }
898 
899   if (stash->first_assembly_done) {
900     PetscMPIInt i, tag;
901     PetscCall(PetscCommGetNewTag(stash->comm, &tag));
902     for (i = 0; i < stash->nrecvranks; i++) PetscCall(MatStashBTSRecv_Private(stash->comm, &tag, stash->recvranks[i], &stash->recvhdr[i], &stash->recvreqs[i], stash));
903     for (i = 0; i < stash->nsendranks; i++) PetscCall(MatStashBTSSend_Private(stash->comm, &tag, i, stash->sendranks[i], &stash->sendhdr[i], &stash->sendreqs[i], stash));
904     stash->use_status = PETSC_TRUE; /* Use count from message status. */
905   } else {
906     PetscCall(PetscCommBuildTwoSidedFReq(stash->comm, 1, MPIU_INT, stash->nsendranks, stash->sendranks, (PetscInt *)stash->sendhdr, &stash->nrecvranks, &stash->recvranks, (PetscInt *)&stash->recvhdr, 1, &stash->sendreqs, &stash->recvreqs, MatStashBTSSend_Private, MatStashBTSRecv_Private, stash));
907     PetscCall(PetscMalloc2(stash->nrecvranks, &stash->some_indices, stash->nrecvranks, &stash->some_statuses));
908     stash->use_status = PETSC_FALSE; /* Use count from header instead of from message. */
909   }
910 
911   PetscCall(PetscSegBufferExtractInPlace(stash->segrecvframe, &stash->recvframes));
912   stash->recvframe_active    = NULL;
913   stash->recvframe_i         = 0;
914   stash->some_i              = 0;
915   stash->some_count          = 0;
916   stash->recvcount           = 0;
917   stash->first_assembly_done = mat->assembly_subset; /* See the same logic in VecAssemblyBegin_MPI_BTS */
918   stash->insertmode          = &mat->insertmode;
919   PetscFunctionReturn(PETSC_SUCCESS);
920 }
921 
922 static PetscErrorCode MatStashScatterGetMesg_BTS(MatStash *stash, PetscMPIInt *n, PetscInt **row, PetscInt **col, PetscScalar **val, PetscInt *flg)
923 {
924   MatStashBlock *block;
925 
926   PetscFunctionBegin;
927   *flg = 0;
928   while (!stash->recvframe_active || stash->recvframe_i == stash->recvframe_count) {
929     if (stash->some_i == stash->some_count) {
930       if (stash->recvcount == stash->nrecvranks) PetscFunctionReturn(PETSC_SUCCESS); /* Done */
931       PetscCallMPI(MPI_Waitsome(stash->nrecvranks, stash->recvreqs, &stash->some_count, stash->some_indices, stash->use_status ? stash->some_statuses : MPI_STATUSES_IGNORE));
932       stash->some_i = 0;
933     }
934     stash->recvframe_active = &stash->recvframes[stash->some_indices[stash->some_i]];
935     stash->recvframe_count  = stash->recvframe_active->count; /* From header; maximum count */
936     if (stash->use_status) {                                  /* Count what was actually sent */
937       PetscCallMPI(MPI_Get_count(&stash->some_statuses[stash->some_i], stash->blocktype, &stash->recvframe_count));
938     }
939     if (stash->recvframe_count > 0) { /* Check for InsertMode consistency */
940       block = (MatStashBlock *)&((char *)stash->recvframe_active->buffer)[0];
941       if (PetscUnlikely(*stash->insertmode == NOT_SET_VALUES)) *stash->insertmode = block->row < 0 ? INSERT_VALUES : ADD_VALUES;
942       PetscCheck(*stash->insertmode != INSERT_VALUES || block->row < 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Assembling INSERT_VALUES, but rank %d requested ADD_VALUES", stash->recvranks[stash->some_indices[stash->some_i]]);
943       PetscCheck(*stash->insertmode != ADD_VALUES || block->row >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Assembling ADD_VALUES, but rank %d requested INSERT_VALUES", stash->recvranks[stash->some_indices[stash->some_i]]);
944     }
945     stash->some_i++;
946     stash->recvcount++;
947     stash->recvframe_i = 0;
948   }
949   *n    = 1;
950   block = (MatStashBlock *)&((char *)stash->recvframe_active->buffer)[stash->recvframe_i * stash->blocktype_size];
951   if (block->row < 0) block->row = -(block->row + 1);
952   *row = &block->row;
953   *col = &block->col;
954   *val = block->vals;
955   stash->recvframe_i++;
956   *flg = 1;
957   PetscFunctionReturn(PETSC_SUCCESS);
958 }
959 
960 static PetscErrorCode MatStashScatterEnd_BTS(MatStash *stash)
961 {
962   PetscFunctionBegin;
963   PetscCallMPI(MPI_Waitall(stash->nsendranks, stash->sendreqs, MPI_STATUSES_IGNORE));
964   if (stash->first_assembly_done) { /* Reuse the communication contexts, so consolidate and reset segrecvblocks  */
965     PetscCall(PetscSegBufferExtractInPlace(stash->segrecvblocks, NULL));
966   } else { /* No reuse, so collect everything. */
967     PetscCall(MatStashScatterDestroy_BTS(stash));
968   }
969 
970   /* Now update oldnmax to be approximately 10% more than the maximum n used; this way the
971      wasted space is reduced the next time this stash is used.
972      Also, only update oldnmax if it increases */
973   if (stash->n) {
974     PetscInt bs2     = stash->bs * stash->bs;
975     PetscInt oldnmax = ((int)(stash->n * 1.1) + 5) * bs2;
976     if (oldnmax > stash->oldnmax) stash->oldnmax = oldnmax;
977   }
978 
979   stash->nmax       = 0;
980   stash->n          = 0;
981   stash->reallocs   = -1;
982   stash->nprocessed = 0;
983 
984   PetscCall(PetscMatStashSpaceDestroy(&stash->space_head));
985 
986   stash->space = NULL;
987 
988   PetscFunctionReturn(PETSC_SUCCESS);
989 }
990 
991 PetscErrorCode MatStashScatterDestroy_BTS(MatStash *stash)
992 {
993   PetscFunctionBegin;
994   PetscCall(PetscSegBufferDestroy(&stash->segsendblocks));
995   PetscCall(PetscSegBufferDestroy(&stash->segrecvframe));
996   stash->recvframes = NULL;
997   PetscCall(PetscSegBufferDestroy(&stash->segrecvblocks));
998   if (stash->blocktype != MPI_DATATYPE_NULL) PetscCallMPI(MPI_Type_free(&stash->blocktype));
999   stash->nsendranks = 0;
1000   stash->nrecvranks = 0;
1001   PetscCall(PetscFree3(stash->sendranks, stash->sendhdr, stash->sendframes));
1002   PetscCall(PetscFree(stash->sendreqs));
1003   PetscCall(PetscFree(stash->recvreqs));
1004   PetscCall(PetscFree(stash->recvranks));
1005   PetscCall(PetscFree(stash->recvhdr));
1006   PetscCall(PetscFree2(stash->some_indices, stash->some_statuses));
1007   PetscFunctionReturn(PETSC_SUCCESS);
1008 }
1009 #endif
1010