xref: /petsc/src/mat/utils/matstash.c (revision 750b007cd8d816cecd9de99077bb0a703b4cf61a)
1 
2 #include <petsc/private/matimpl.h>
3 
4 #define DEFAULT_STASH_SIZE 10000
5 
6 static PetscErrorCode       MatStashScatterBegin_Ref(Mat, MatStash *, PetscInt *);
7 PETSC_INTERN PetscErrorCode MatStashScatterGetMesg_Ref(MatStash *, PetscMPIInt *, PetscInt **, PetscInt **, PetscScalar **, PetscInt *);
8 PETSC_INTERN PetscErrorCode MatStashScatterEnd_Ref(MatStash *);
9 #if !defined(PETSC_HAVE_MPIUNI)
10 static PetscErrorCode MatStashScatterBegin_BTS(Mat, MatStash *, PetscInt *);
11 static PetscErrorCode MatStashScatterGetMesg_BTS(MatStash *, PetscMPIInt *, PetscInt **, PetscInt **, PetscScalar **, PetscInt *);
12 static PetscErrorCode MatStashScatterEnd_BTS(MatStash *);
13 #endif
14 
15 /*
16   MatStashCreate_Private - Creates a stash,currently used for all the parallel
17   matrix implementations. The stash is where elements of a matrix destined
18   to be stored on other processors are kept until matrix assembly is done.
19 
20   This is a simple minded stash. Simply adds entries to end of stash.
21 
22   Input Parameters:
23   comm - communicator, required for scatters.
24   bs   - stash block size. used when stashing blocks of values
25 
26   Output Parameters:
27   stash    - the newly created stash
28 */
29 PetscErrorCode MatStashCreate_Private(MPI_Comm comm, PetscInt bs, MatStash *stash) {
30   PetscInt  max, *opt, nopt, i;
31   PetscBool flg;
32 
33   PetscFunctionBegin;
34   /* Require 2 tags,get the second using PetscCommGetNewTag() */
35   stash->comm = comm;
36 
37   PetscCall(PetscCommGetNewTag(stash->comm, &stash->tag1));
38   PetscCall(PetscCommGetNewTag(stash->comm, &stash->tag2));
39   PetscCallMPI(MPI_Comm_size(stash->comm, &stash->size));
40   PetscCallMPI(MPI_Comm_rank(stash->comm, &stash->rank));
41   PetscCall(PetscMalloc1(2 * stash->size, &stash->flg_v));
42   for (i = 0; i < 2 * stash->size; i++) stash->flg_v[i] = -1;
43 
44   nopt = stash->size;
45   PetscCall(PetscMalloc1(nopt, &opt));
46   PetscCall(PetscOptionsGetIntArray(NULL, NULL, "-matstash_initial_size", opt, &nopt, &flg));
47   if (flg) {
48     if (nopt == 1) max = opt[0];
49     else if (nopt == stash->size) max = opt[stash->rank];
50     else if (stash->rank < nopt) max = opt[stash->rank];
51     else max = 0; /* Use default */
52     stash->umax = max;
53   } else {
54     stash->umax = 0;
55   }
56   PetscCall(PetscFree(opt));
57   if (bs <= 0) bs = 1;
58 
59   stash->bs         = bs;
60   stash->nmax       = 0;
61   stash->oldnmax    = 0;
62   stash->n          = 0;
63   stash->reallocs   = -1;
64   stash->space_head = NULL;
65   stash->space      = NULL;
66 
67   stash->send_waits  = NULL;
68   stash->recv_waits  = NULL;
69   stash->send_status = NULL;
70   stash->nsends      = 0;
71   stash->nrecvs      = 0;
72   stash->svalues     = NULL;
73   stash->rvalues     = NULL;
74   stash->rindices    = NULL;
75   stash->nprocessed  = 0;
76   stash->reproduce   = PETSC_FALSE;
77   stash->blocktype   = MPI_DATATYPE_NULL;
78 
79   PetscCall(PetscOptionsGetBool(NULL, NULL, "-matstash_reproduce", &stash->reproduce, NULL));
80 #if !defined(PETSC_HAVE_MPIUNI)
81   flg = PETSC_FALSE;
82   PetscCall(PetscOptionsGetBool(NULL, NULL, "-matstash_legacy", &flg, NULL));
83   if (!flg) {
84     stash->ScatterBegin   = MatStashScatterBegin_BTS;
85     stash->ScatterGetMesg = MatStashScatterGetMesg_BTS;
86     stash->ScatterEnd     = MatStashScatterEnd_BTS;
87     stash->ScatterDestroy = MatStashScatterDestroy_BTS;
88   } else {
89 #endif
90     stash->ScatterBegin   = MatStashScatterBegin_Ref;
91     stash->ScatterGetMesg = MatStashScatterGetMesg_Ref;
92     stash->ScatterEnd     = MatStashScatterEnd_Ref;
93     stash->ScatterDestroy = NULL;
94 #if !defined(PETSC_HAVE_MPIUNI)
95   }
96 #endif
97   PetscFunctionReturn(0);
98 }
99 
100 /*
101    MatStashDestroy_Private - Destroy the stash
102 */
103 PetscErrorCode MatStashDestroy_Private(MatStash *stash) {
104   PetscFunctionBegin;
105   PetscCall(PetscMatStashSpaceDestroy(&stash->space_head));
106   if (stash->ScatterDestroy) PetscCall((*stash->ScatterDestroy)(stash));
107 
108   stash->space = NULL;
109 
110   PetscCall(PetscFree(stash->flg_v));
111   PetscFunctionReturn(0);
112 }
113 
114 /*
115    MatStashScatterEnd_Private - This is called as the final stage of
116    scatter. The final stages of message passing is done here, and
117    all the memory used for message passing is cleaned up. This
118    routine also resets the stash, and deallocates the memory used
119    for the stash. It also keeps track of the current memory usage
120    so that the same value can be used the next time through.
121 */
122 PetscErrorCode MatStashScatterEnd_Private(MatStash *stash) {
123   PetscFunctionBegin;
124   PetscCall((*stash->ScatterEnd)(stash));
125   PetscFunctionReturn(0);
126 }
127 
128 PETSC_INTERN PetscErrorCode MatStashScatterEnd_Ref(MatStash *stash) {
129   PetscInt    nsends = stash->nsends, bs2, oldnmax, i;
130   MPI_Status *send_status;
131 
132   PetscFunctionBegin;
133   for (i = 0; i < 2 * stash->size; i++) stash->flg_v[i] = -1;
134   /* wait on sends */
135   if (nsends) {
136     PetscCall(PetscMalloc1(2 * nsends, &send_status));
137     PetscCallMPI(MPI_Waitall(2 * nsends, stash->send_waits, send_status));
138     PetscCall(PetscFree(send_status));
139   }
140 
141   /* Now update nmaxold to be app 10% more than max n used, this way the
142      wastage of space is reduced the next time this stash is used.
143      Also update the oldmax, only if it increases */
144   if (stash->n) {
145     bs2     = stash->bs * stash->bs;
146     oldnmax = ((int)(stash->n * 1.1) + 5) * bs2;
147     if (oldnmax > stash->oldnmax) stash->oldnmax = oldnmax;
148   }
149 
150   stash->nmax       = 0;
151   stash->n          = 0;
152   stash->reallocs   = -1;
153   stash->nprocessed = 0;
154 
155   PetscCall(PetscMatStashSpaceDestroy(&stash->space_head));
156 
157   stash->space = NULL;
158 
159   PetscCall(PetscFree(stash->send_waits));
160   PetscCall(PetscFree(stash->recv_waits));
161   PetscCall(PetscFree2(stash->svalues, stash->sindices));
162   PetscCall(PetscFree(stash->rvalues[0]));
163   PetscCall(PetscFree(stash->rvalues));
164   PetscCall(PetscFree(stash->rindices[0]));
165   PetscCall(PetscFree(stash->rindices));
166   PetscFunctionReturn(0);
167 }
168 
169 /*
170    MatStashGetInfo_Private - Gets the relevant statistics of the stash
171 
172    Input Parameters:
173    stash    - the stash
174    nstash   - the size of the stash. Indicates the number of values stored.
175    reallocs - the number of additional mallocs incurred.
176 
177 */
178 PetscErrorCode MatStashGetInfo_Private(MatStash *stash, PetscInt *nstash, PetscInt *reallocs) {
179   PetscInt bs2 = stash->bs * stash->bs;
180 
181   PetscFunctionBegin;
182   if (nstash) *nstash = stash->n * bs2;
183   if (reallocs) {
184     if (stash->reallocs < 0) *reallocs = 0;
185     else *reallocs = stash->reallocs;
186   }
187   PetscFunctionReturn(0);
188 }
189 
190 /*
191    MatStashSetInitialSize_Private - Sets the initial size of the stash
192 
193    Input Parameters:
194    stash  - the stash
195    max    - the value that is used as the max size of the stash.
196             this value is used while allocating memory.
197 */
198 PetscErrorCode MatStashSetInitialSize_Private(MatStash *stash, PetscInt max) {
199   PetscFunctionBegin;
200   stash->umax = max;
201   PetscFunctionReturn(0);
202 }
203 
204 /* MatStashExpand_Private - Expand the stash. This function is called
205    when the space in the stash is not sufficient to add the new values
206    being inserted into the stash.
207 
208    Input Parameters:
209    stash - the stash
210    incr  - the minimum increase requested
211 
212    Note:
213    This routine doubles the currently used memory.
214  */
215 static PetscErrorCode MatStashExpand_Private(MatStash *stash, PetscInt incr) {
216   PetscInt newnmax, bs2 = stash->bs * stash->bs;
217 
218   PetscFunctionBegin;
219   /* allocate a larger stash */
220   if (!stash->oldnmax && !stash->nmax) { /* new stash */
221     if (stash->umax) newnmax = stash->umax / bs2;
222     else newnmax = DEFAULT_STASH_SIZE / bs2;
223   } else if (!stash->nmax) { /* resuing stash */
224     if (stash->umax > stash->oldnmax) newnmax = stash->umax / bs2;
225     else newnmax = stash->oldnmax / bs2;
226   } else newnmax = stash->nmax * 2;
227   if (newnmax < (stash->nmax + incr)) newnmax += 2 * incr;
228 
229   /* Get a MatStashSpace and attach it to stash */
230   PetscCall(PetscMatStashSpaceGet(bs2, newnmax, &stash->space));
231   if (!stash->space_head) { /* new stash or resuing stash->oldnmax */
232     stash->space_head = stash->space;
233   }
234 
235   stash->reallocs++;
236   stash->nmax = newnmax;
237   PetscFunctionReturn(0);
238 }
239 /*
240   MatStashValuesRow_Private - inserts values into the stash. This function
241   expects the values to be roworiented. Multiple columns belong to the same row
242   can be inserted with a single call to this function.
243 
244   Input Parameters:
245   stash  - the stash
246   row    - the global row correspoiding to the values
247   n      - the number of elements inserted. All elements belong to the above row.
248   idxn   - the global column indices corresponding to each of the values.
249   values - the values inserted
250 */
251 PetscErrorCode MatStashValuesRow_Private(MatStash *stash, PetscInt row, PetscInt n, const PetscInt idxn[], const PetscScalar values[], PetscBool ignorezeroentries) {
252   PetscInt           i, k, cnt = 0;
253   PetscMatStashSpace space = stash->space;
254 
255   PetscFunctionBegin;
256   /* Check and see if we have sufficient memory */
257   if (!space || space->local_remaining < n) PetscCall(MatStashExpand_Private(stash, n));
258   space = stash->space;
259   k     = space->local_used;
260   for (i = 0; i < n; i++) {
261     if (ignorezeroentries && values && values[i] == 0.0) continue;
262     space->idx[k] = row;
263     space->idy[k] = idxn[i];
264     space->val[k] = values ? values[i] : 0.0;
265     k++;
266     cnt++;
267   }
268   stash->n += cnt;
269   space->local_used += cnt;
270   space->local_remaining -= cnt;
271   PetscFunctionReturn(0);
272 }
273 
274 /*
275   MatStashValuesCol_Private - inserts values into the stash. This function
276   expects the values to be columnoriented. Multiple columns belong to the same row
277   can be inserted with a single call to this function.
278 
279   Input Parameters:
280   stash   - the stash
281   row     - the global row correspoiding to the values
282   n       - the number of elements inserted. All elements belong to the above row.
283   idxn    - the global column indices corresponding to each of the values.
284   values  - the values inserted
285   stepval - the consecutive values are sepated by a distance of stepval.
286             this happens because the input is columnoriented.
287 */
288 PetscErrorCode MatStashValuesCol_Private(MatStash *stash, PetscInt row, PetscInt n, const PetscInt idxn[], const PetscScalar values[], PetscInt stepval, PetscBool ignorezeroentries) {
289   PetscInt           i, k, cnt = 0;
290   PetscMatStashSpace space = stash->space;
291 
292   PetscFunctionBegin;
293   /* Check and see if we have sufficient memory */
294   if (!space || space->local_remaining < n) PetscCall(MatStashExpand_Private(stash, n));
295   space = stash->space;
296   k     = space->local_used;
297   for (i = 0; i < n; i++) {
298     if (ignorezeroentries && values && values[i * stepval] == 0.0) continue;
299     space->idx[k] = row;
300     space->idy[k] = idxn[i];
301     space->val[k] = values ? values[i * stepval] : 0.0;
302     k++;
303     cnt++;
304   }
305   stash->n += cnt;
306   space->local_used += cnt;
307   space->local_remaining -= cnt;
308   PetscFunctionReturn(0);
309 }
310 
311 /*
312   MatStashValuesRowBlocked_Private - inserts blocks of values into the stash.
313   This function expects the values to be roworiented. Multiple columns belong
314   to the same block-row can be inserted with a single call to this function.
315   This function extracts the sub-block of values based on the dimensions of
316   the original input block, and the row,col values corresponding to the blocks.
317 
318   Input Parameters:
319   stash  - the stash
320   row    - the global block-row correspoiding to the values
321   n      - the number of elements inserted. All elements belong to the above row.
322   idxn   - the global block-column indices corresponding to each of the blocks of
323            values. Each block is of size bs*bs.
324   values - the values inserted
325   rmax   - the number of block-rows in the original block.
326   cmax   - the number of block-columns on the original block.
327   idx    - the index of the current block-row in the original block.
328 */
329 PetscErrorCode MatStashValuesRowBlocked_Private(MatStash *stash, PetscInt row, PetscInt n, const PetscInt idxn[], const PetscScalar values[], PetscInt rmax, PetscInt cmax, PetscInt idx) {
330   PetscInt           i, j, k, bs2, bs = stash->bs, l;
331   const PetscScalar *vals;
332   PetscScalar       *array;
333   PetscMatStashSpace space = stash->space;
334 
335   PetscFunctionBegin;
336   if (!space || space->local_remaining < n) PetscCall(MatStashExpand_Private(stash, n));
337   space = stash->space;
338   l     = space->local_used;
339   bs2   = bs * bs;
340   for (i = 0; i < n; i++) {
341     space->idx[l] = row;
342     space->idy[l] = idxn[i];
343     /* Now copy over the block of values. Store the values column oriented.
344        This enables inserting multiple blocks belonging to a row with a single
345        funtion call */
346     array         = space->val + bs2 * l;
347     vals          = values + idx * bs2 * n + bs * i;
348     for (j = 0; j < bs; j++) {
349       for (k = 0; k < bs; k++) array[k * bs] = values ? vals[k] : 0.0;
350       array++;
351       vals += cmax * bs;
352     }
353     l++;
354   }
355   stash->n += n;
356   space->local_used += n;
357   space->local_remaining -= n;
358   PetscFunctionReturn(0);
359 }
360 
361 /*
362   MatStashValuesColBlocked_Private - inserts blocks of values into the stash.
363   This function expects the values to be roworiented. Multiple columns belong
364   to the same block-row can be inserted with a single call to this function.
365   This function extracts the sub-block of values based on the dimensions of
366   the original input block, and the row,col values corresponding to the blocks.
367 
368   Input Parameters:
369   stash  - the stash
370   row    - the global block-row correspoiding to the values
371   n      - the number of elements inserted. All elements belong to the above row.
372   idxn   - the global block-column indices corresponding to each of the blocks of
373            values. Each block is of size bs*bs.
374   values - the values inserted
375   rmax   - the number of block-rows in the original block.
376   cmax   - the number of block-columns on the original block.
377   idx    - the index of the current block-row in the original block.
378 */
379 PetscErrorCode MatStashValuesColBlocked_Private(MatStash *stash, PetscInt row, PetscInt n, const PetscInt idxn[], const PetscScalar values[], PetscInt rmax, PetscInt cmax, PetscInt idx) {
380   PetscInt           i, j, k, bs2, bs = stash->bs, l;
381   const PetscScalar *vals;
382   PetscScalar       *array;
383   PetscMatStashSpace space = stash->space;
384 
385   PetscFunctionBegin;
386   if (!space || space->local_remaining < n) PetscCall(MatStashExpand_Private(stash, n));
387   space = stash->space;
388   l     = space->local_used;
389   bs2   = bs * bs;
390   for (i = 0; i < n; i++) {
391     space->idx[l] = row;
392     space->idy[l] = idxn[i];
393     /* Now copy over the block of values. Store the values column oriented.
394      This enables inserting multiple blocks belonging to a row with a single
395      funtion call */
396     array         = space->val + bs2 * l;
397     vals          = values + idx * bs2 * n + bs * i;
398     for (j = 0; j < bs; j++) {
399       for (k = 0; k < bs; k++) array[k] = values ? vals[k] : 0.0;
400       array += bs;
401       vals += rmax * bs;
402     }
403     l++;
404   }
405   stash->n += n;
406   space->local_used += n;
407   space->local_remaining -= n;
408   PetscFunctionReturn(0);
409 }
410 /*
411   MatStashScatterBegin_Private - Initiates the transfer of values to the
412   correct owners. This function goes through the stash, and check the
413   owners of each stashed value, and sends the values off to the owner
414   processors.
415 
416   Input Parameters:
417   stash  - the stash
418   owners - an array of size 'no-of-procs' which gives the ownership range
419            for each node.
420 
421   Note:
422     The 'owners' array in the cased of the blocked-stash has the
423   ranges specified blocked global indices, and for the regular stash in
424   the proper global indices.
425 */
426 PetscErrorCode MatStashScatterBegin_Private(Mat mat, MatStash *stash, PetscInt *owners) {
427   PetscFunctionBegin;
428   PetscCall((*stash->ScatterBegin)(mat, stash, owners));
429   PetscFunctionReturn(0);
430 }
431 
432 static PetscErrorCode MatStashScatterBegin_Ref(Mat mat, MatStash *stash, PetscInt *owners) {
433   PetscInt          *owner, *startv, *starti, tag1 = stash->tag1, tag2 = stash->tag2, bs2;
434   PetscInt           size = stash->size, nsends;
435   PetscInt           count, *sindices, **rindices, i, j, idx, lastidx, l;
436   PetscScalar      **rvalues, *svalues;
437   MPI_Comm           comm = stash->comm;
438   MPI_Request       *send_waits, *recv_waits, *recv_waits1, *recv_waits2;
439   PetscMPIInt       *sizes, *nlengths, nreceives;
440   PetscInt          *sp_idx, *sp_idy;
441   PetscScalar       *sp_val;
442   PetscMatStashSpace space, space_next;
443 
444   PetscFunctionBegin;
445   { /* make sure all processors are either in INSERTMODE or ADDMODE */
446     InsertMode addv;
447     PetscCall(MPIU_Allreduce((PetscEnum *)&mat->insertmode, (PetscEnum *)&addv, 1, MPIU_ENUM, MPI_BOR, PetscObjectComm((PetscObject)mat)));
448     PetscCheck(addv != (ADD_VALUES | INSERT_VALUES), PetscObjectComm((PetscObject)mat), PETSC_ERR_ARG_WRONGSTATE, "Some processors inserted others added");
449     mat->insertmode = addv; /* in case this processor had no cache */
450   }
451 
452   bs2 = stash->bs * stash->bs;
453 
454   /*  first count number of contributors to each processor */
455   PetscCall(PetscCalloc1(size, &nlengths));
456   PetscCall(PetscMalloc1(stash->n + 1, &owner));
457 
458   i = j   = 0;
459   lastidx = -1;
460   space   = stash->space_head;
461   while (space) {
462     space_next = space->next;
463     sp_idx     = space->idx;
464     for (l = 0; l < space->local_used; l++) {
465       /* if indices are NOT locally sorted, need to start search at the beginning */
466       if (lastidx > (idx = sp_idx[l])) j = 0;
467       lastidx = idx;
468       for (; j < size; j++) {
469         if (idx >= owners[j] && idx < owners[j + 1]) {
470           nlengths[j]++;
471           owner[i] = j;
472           break;
473         }
474       }
475       i++;
476     }
477     space = space_next;
478   }
479 
480   /* Now check what procs get messages - and compute nsends. */
481   PetscCall(PetscCalloc1(size, &sizes));
482   for (i = 0, nsends = 0; i < size; i++) {
483     if (nlengths[i]) {
484       sizes[i] = 1;
485       nsends++;
486     }
487   }
488 
489   {
490     PetscMPIInt *onodes, *olengths;
491     /* Determine the number of messages to expect, their lengths, from from-ids */
492     PetscCall(PetscGatherNumberOfMessages(comm, sizes, nlengths, &nreceives));
493     PetscCall(PetscGatherMessageLengths(comm, nsends, nreceives, nlengths, &onodes, &olengths));
494     /* since clubbing row,col - lengths are multiplied by 2 */
495     for (i = 0; i < nreceives; i++) olengths[i] *= 2;
496     PetscCall(PetscPostIrecvInt(comm, tag1, nreceives, onodes, olengths, &rindices, &recv_waits1));
497     /* values are size 'bs2' lengths (and remove earlier factor 2 */
498     for (i = 0; i < nreceives; i++) olengths[i] = olengths[i] * bs2 / 2;
499     PetscCall(PetscPostIrecvScalar(comm, tag2, nreceives, onodes, olengths, &rvalues, &recv_waits2));
500     PetscCall(PetscFree(onodes));
501     PetscCall(PetscFree(olengths));
502   }
503 
504   /* do sends:
505       1) starts[i] gives the starting index in svalues for stuff going to
506          the ith processor
507   */
508   PetscCall(PetscMalloc2(bs2 * stash->n, &svalues, 2 * (stash->n + 1), &sindices));
509   PetscCall(PetscMalloc1(2 * nsends, &send_waits));
510   PetscCall(PetscMalloc2(size, &startv, size, &starti));
511   /* use 2 sends the first with all_a, the next with all_i and all_j */
512   startv[0] = 0;
513   starti[0] = 0;
514   for (i = 1; i < size; i++) {
515     startv[i] = startv[i - 1] + nlengths[i - 1];
516     starti[i] = starti[i - 1] + 2 * nlengths[i - 1];
517   }
518 
519   i     = 0;
520   space = stash->space_head;
521   while (space) {
522     space_next = space->next;
523     sp_idx     = space->idx;
524     sp_idy     = space->idy;
525     sp_val     = space->val;
526     for (l = 0; l < space->local_used; l++) {
527       j = owner[i];
528       if (bs2 == 1) {
529         svalues[startv[j]] = sp_val[l];
530       } else {
531         PetscInt     k;
532         PetscScalar *buf1, *buf2;
533         buf1 = svalues + bs2 * startv[j];
534         buf2 = space->val + bs2 * l;
535         for (k = 0; k < bs2; k++) buf1[k] = buf2[k];
536       }
537       sindices[starti[j]]               = sp_idx[l];
538       sindices[starti[j] + nlengths[j]] = sp_idy[l];
539       startv[j]++;
540       starti[j]++;
541       i++;
542     }
543     space = space_next;
544   }
545   startv[0] = 0;
546   for (i = 1; i < size; i++) startv[i] = startv[i - 1] + nlengths[i - 1];
547 
548   for (i = 0, count = 0; i < size; i++) {
549     if (sizes[i]) {
550       PetscCallMPI(MPI_Isend(sindices + 2 * startv[i], 2 * nlengths[i], MPIU_INT, i, tag1, comm, send_waits + count++));
551       PetscCallMPI(MPI_Isend(svalues + bs2 * startv[i], bs2 * nlengths[i], MPIU_SCALAR, i, tag2, comm, send_waits + count++));
552     }
553   }
554 #if defined(PETSC_USE_INFO)
555   PetscCall(PetscInfo(NULL, "No of messages: %" PetscInt_FMT " \n", nsends));
556   for (i = 0; i < size; i++) {
557     if (sizes[i]) PetscCall(PetscInfo(NULL, "Mesg_to: %" PetscInt_FMT ": size: %zu bytes\n", i, (size_t)(nlengths[i] * (bs2 * sizeof(PetscScalar) + 2 * sizeof(PetscInt)))));
558   }
559 #endif
560   PetscCall(PetscFree(nlengths));
561   PetscCall(PetscFree(owner));
562   PetscCall(PetscFree2(startv, starti));
563   PetscCall(PetscFree(sizes));
564 
565   /* recv_waits need to be contiguous for MatStashScatterGetMesg_Private() */
566   PetscCall(PetscMalloc1(2 * nreceives, &recv_waits));
567 
568   for (i = 0; i < nreceives; i++) {
569     recv_waits[2 * i]     = recv_waits1[i];
570     recv_waits[2 * i + 1] = recv_waits2[i];
571   }
572   stash->recv_waits = recv_waits;
573 
574   PetscCall(PetscFree(recv_waits1));
575   PetscCall(PetscFree(recv_waits2));
576 
577   stash->svalues         = svalues;
578   stash->sindices        = sindices;
579   stash->rvalues         = rvalues;
580   stash->rindices        = rindices;
581   stash->send_waits      = send_waits;
582   stash->nsends          = nsends;
583   stash->nrecvs          = nreceives;
584   stash->reproduce_count = 0;
585   PetscFunctionReturn(0);
586 }
587 
588 /*
589    MatStashScatterGetMesg_Private - This function waits on the receives posted
590    in the function MatStashScatterBegin_Private() and returns one message at
591    a time to the calling function. If no messages are left, it indicates this
592    by setting flg = 0, else it sets flg = 1.
593 
594    Input Parameters:
595    stash - the stash
596 
597    Output Parameters:
598    nvals - the number of entries in the current message.
599    rows  - an array of row indices (or blocked indices) corresponding to the values
600    cols  - an array of columnindices (or blocked indices) corresponding to the values
601    vals  - the values
602    flg   - 0 indicates no more message left, and the current call has no values associated.
603            1 indicates that the current call successfully received a message, and the
604              other output parameters nvals,rows,cols,vals are set appropriately.
605 */
606 PetscErrorCode MatStashScatterGetMesg_Private(MatStash *stash, PetscMPIInt *nvals, PetscInt **rows, PetscInt **cols, PetscScalar **vals, PetscInt *flg) {
607   PetscFunctionBegin;
608   PetscCall((*stash->ScatterGetMesg)(stash, nvals, rows, cols, vals, flg));
609   PetscFunctionReturn(0);
610 }
611 
612 PETSC_INTERN PetscErrorCode MatStashScatterGetMesg_Ref(MatStash *stash, PetscMPIInt *nvals, PetscInt **rows, PetscInt **cols, PetscScalar **vals, PetscInt *flg) {
613   PetscMPIInt i, *flg_v = stash->flg_v, i1, i2;
614   PetscInt    bs2;
615   MPI_Status  recv_status;
616   PetscBool   match_found = PETSC_FALSE;
617 
618   PetscFunctionBegin;
619   *flg = 0; /* When a message is discovered this is reset to 1 */
620   /* Return if no more messages to process */
621   if (stash->nprocessed == stash->nrecvs) PetscFunctionReturn(0);
622 
623   bs2 = stash->bs * stash->bs;
624   /* If a matching pair of receives are found, process them, and return the data to
625      the calling function. Until then keep receiving messages */
626   while (!match_found) {
627     if (stash->reproduce) {
628       i = stash->reproduce_count++;
629       PetscCallMPI(MPI_Wait(stash->recv_waits + i, &recv_status));
630     } else {
631       PetscCallMPI(MPI_Waitany(2 * stash->nrecvs, stash->recv_waits, &i, &recv_status));
632     }
633     PetscCheck(recv_status.MPI_SOURCE >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Negative MPI source!");
634 
635     /* Now pack the received message into a structure which is usable by others */
636     if (i % 2) {
637       PetscCallMPI(MPI_Get_count(&recv_status, MPIU_SCALAR, nvals));
638       flg_v[2 * recv_status.MPI_SOURCE] = i / 2;
639       *nvals                            = *nvals / bs2;
640     } else {
641       PetscCallMPI(MPI_Get_count(&recv_status, MPIU_INT, nvals));
642       flg_v[2 * recv_status.MPI_SOURCE + 1] = i / 2;
643       *nvals                                = *nvals / 2; /* This message has both row indices and col indices */
644     }
645 
646     /* Check if we have both messages from this proc */
647     i1 = flg_v[2 * recv_status.MPI_SOURCE];
648     i2 = flg_v[2 * recv_status.MPI_SOURCE + 1];
649     if (i1 != -1 && i2 != -1) {
650       *rows = stash->rindices[i2];
651       *cols = *rows + *nvals;
652       *vals = stash->rvalues[i1];
653       *flg  = 1;
654       stash->nprocessed++;
655       match_found = PETSC_TRUE;
656     }
657   }
658   PetscFunctionReturn(0);
659 }
660 
661 #if !defined(PETSC_HAVE_MPIUNI)
662 typedef struct {
663   PetscInt    row;
664   PetscInt    col;
665   PetscScalar vals[1]; /* Actually an array of length bs2 */
666 } MatStashBlock;
667 
668 static PetscErrorCode MatStashSortCompress_Private(MatStash *stash, InsertMode insertmode) {
669   PetscMatStashSpace space;
670   PetscInt           n = stash->n, bs = stash->bs, bs2 = bs * bs, cnt, *row, *col, *perm, rowstart, i;
671   PetscScalar      **valptr;
672 
673   PetscFunctionBegin;
674   PetscCall(PetscMalloc4(n, &row, n, &col, n, &valptr, n, &perm));
675   for (space = stash->space_head, cnt = 0; space; space = space->next) {
676     for (i = 0; i < space->local_used; i++) {
677       row[cnt]    = space->idx[i];
678       col[cnt]    = space->idy[i];
679       valptr[cnt] = &space->val[i * bs2];
680       perm[cnt]   = cnt; /* Will tell us where to find valptr after sorting row[] and col[] */
681       cnt++;
682     }
683   }
684   PetscCheck(cnt == n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "MatStash n %" PetscInt_FMT ", but counted %" PetscInt_FMT " entries", n, cnt);
685   PetscCall(PetscSortIntWithArrayPair(n, row, col, perm));
686   /* Scan through the rows, sorting each one, combining duplicates, and packing send buffers */
687   for (rowstart = 0, cnt = 0, i = 1; i <= n; i++) {
688     if (i == n || row[i] != row[rowstart]) { /* Sort the last row. */
689       PetscInt colstart;
690       PetscCall(PetscSortIntWithArray(i - rowstart, &col[rowstart], &perm[rowstart]));
691       for (colstart = rowstart; colstart < i;) { /* Compress multiple insertions to the same location */
692         PetscInt       j, l;
693         MatStashBlock *block;
694         PetscCall(PetscSegBufferGet(stash->segsendblocks, 1, &block));
695         block->row = row[rowstart];
696         block->col = col[colstart];
697         PetscCall(PetscArraycpy(block->vals, valptr[perm[colstart]], bs2));
698         for (j = colstart + 1; j < i && col[j] == col[colstart]; j++) { /* Add any extra stashed blocks at the same (row,col) */
699           if (insertmode == ADD_VALUES) {
700             for (l = 0; l < bs2; l++) block->vals[l] += valptr[perm[j]][l];
701           } else {
702             PetscCall(PetscArraycpy(block->vals, valptr[perm[j]], bs2));
703           }
704         }
705         colstart = j;
706       }
707       rowstart = i;
708     }
709   }
710   PetscCall(PetscFree4(row, col, valptr, perm));
711   PetscFunctionReturn(0);
712 }
713 
714 static PetscErrorCode MatStashBlockTypeSetUp(MatStash *stash) {
715   PetscFunctionBegin;
716   if (stash->blocktype == MPI_DATATYPE_NULL) {
717     PetscInt     bs2 = PetscSqr(stash->bs);
718     PetscMPIInt  blocklens[2];
719     MPI_Aint     displs[2];
720     MPI_Datatype types[2], stype;
721     /*
722         DummyBlock is a type having standard layout, even when PetscScalar is C++ std::complex.
723        std::complex itself has standard layout, so does DummyBlock, recursively.
724        To be compatible with C++ std::complex, complex implementations on GPUs must also have standard layout,
725        though they can have different alignment, e.g, 16 bytes for double complex, instead of 8 bytes as in GCC stdlibc++.
726        offsetof(type, member) only requires type to have standard layout. Ref. https://en.cppreference.com/w/cpp/types/offsetof.
727 
728        We can test if std::complex has standard layout with the following code:
729        #include <iostream>
730        #include <type_traits>
731        #include <complex>
732        int main() {
733          std::cout << std::boolalpha;
734          std::cout << std::is_standard_layout<std::complex<double> >::value << '\n';
735        }
736        Output: true
737      */
738     struct DummyBlock {
739       PetscInt    row, col;
740       PetscScalar vals;
741     };
742 
743     stash->blocktype_size = offsetof(struct DummyBlock, vals) + bs2 * sizeof(PetscScalar);
744     if (stash->blocktype_size % sizeof(PetscInt)) { /* Implies that PetscInt is larger and does not satisfy alignment without padding */
745       stash->blocktype_size += sizeof(PetscInt) - stash->blocktype_size % sizeof(PetscInt);
746     }
747     PetscCall(PetscSegBufferCreate(stash->blocktype_size, 1, &stash->segsendblocks));
748     PetscCall(PetscSegBufferCreate(stash->blocktype_size, 1, &stash->segrecvblocks));
749     PetscCall(PetscSegBufferCreate(sizeof(MatStashFrame), 1, &stash->segrecvframe));
750     blocklens[0] = 2;
751     blocklens[1] = bs2;
752     displs[0]    = offsetof(struct DummyBlock, row);
753     displs[1]    = offsetof(struct DummyBlock, vals);
754     types[0]     = MPIU_INT;
755     types[1]     = MPIU_SCALAR;
756     PetscCallMPI(MPI_Type_create_struct(2, blocklens, displs, types, &stype));
757     PetscCallMPI(MPI_Type_commit(&stype));
758     PetscCallMPI(MPI_Type_create_resized(stype, 0, stash->blocktype_size, &stash->blocktype));
759     PetscCallMPI(MPI_Type_commit(&stash->blocktype));
760     PetscCallMPI(MPI_Type_free(&stype));
761   }
762   PetscFunctionReturn(0);
763 }
764 
765 /* Callback invoked after target rank has initiatied receive of rendezvous message.
766  * Here we post the main sends.
767  */
768 static PetscErrorCode MatStashBTSSend_Private(MPI_Comm comm, const PetscMPIInt tag[], PetscMPIInt rankid, PetscMPIInt rank, void *sdata, MPI_Request req[], void *ctx) {
769   MatStash       *stash = (MatStash *)ctx;
770   MatStashHeader *hdr   = (MatStashHeader *)sdata;
771 
772   PetscFunctionBegin;
773   PetscCheck(rank == stash->sendranks[rankid], comm, PETSC_ERR_PLIB, "BTS Send rank %d does not match sendranks[%d] %d", rank, rankid, stash->sendranks[rankid]);
774   PetscCallMPI(MPI_Isend(stash->sendframes[rankid].buffer, hdr->count, stash->blocktype, rank, tag[0], comm, &req[0]));
775   stash->sendframes[rankid].count   = hdr->count;
776   stash->sendframes[rankid].pending = 1;
777   PetscFunctionReturn(0);
778 }
779 
780 /*
781     Callback invoked by target after receiving rendezvous message.
782     Here we post the main recvs.
783  */
784 static PetscErrorCode MatStashBTSRecv_Private(MPI_Comm comm, const PetscMPIInt tag[], PetscMPIInt rank, void *rdata, MPI_Request req[], void *ctx) {
785   MatStash       *stash = (MatStash *)ctx;
786   MatStashHeader *hdr   = (MatStashHeader *)rdata;
787   MatStashFrame  *frame;
788 
789   PetscFunctionBegin;
790   PetscCall(PetscSegBufferGet(stash->segrecvframe, 1, &frame));
791   PetscCall(PetscSegBufferGet(stash->segrecvblocks, hdr->count, &frame->buffer));
792   PetscCallMPI(MPI_Irecv(frame->buffer, hdr->count, stash->blocktype, rank, tag[0], comm, &req[0]));
793   frame->count   = hdr->count;
794   frame->pending = 1;
795   PetscFunctionReturn(0);
796 }
797 
798 /*
799  * owners[] contains the ownership ranges; may be indexed by either blocks or scalars
800  */
801 static PetscErrorCode MatStashScatterBegin_BTS(Mat mat, MatStash *stash, PetscInt owners[]) {
802   size_t nblocks;
803   char  *sendblocks;
804 
805   PetscFunctionBegin;
806   if (PetscDefined(USE_DEBUG)) { /* make sure all processors are either in INSERTMODE or ADDMODE */
807     InsertMode addv;
808     PetscCall(MPIU_Allreduce((PetscEnum *)&mat->insertmode, (PetscEnum *)&addv, 1, MPIU_ENUM, MPI_BOR, PetscObjectComm((PetscObject)mat)));
809     PetscCheck(addv != (ADD_VALUES | INSERT_VALUES), PetscObjectComm((PetscObject)mat), PETSC_ERR_ARG_WRONGSTATE, "Some processors inserted others added");
810   }
811 
812   PetscCall(MatStashBlockTypeSetUp(stash));
813   PetscCall(MatStashSortCompress_Private(stash, mat->insertmode));
814   PetscCall(PetscSegBufferGetSize(stash->segsendblocks, &nblocks));
815   PetscCall(PetscSegBufferExtractInPlace(stash->segsendblocks, &sendblocks));
816   if (stash->first_assembly_done) { /* Set up sendhdrs and sendframes for each rank that we sent before */
817     PetscInt i;
818     size_t   b;
819     for (i = 0, b = 0; i < stash->nsendranks; i++) {
820       stash->sendframes[i].buffer = &sendblocks[b * stash->blocktype_size];
821       /* sendhdr is never actually sent, but the count is used by MatStashBTSSend_Private */
822       stash->sendhdr[i].count     = 0; /* Might remain empty (in which case we send a zero-sized message) if no values are communicated to that process */
823       for (; b < nblocks; b++) {
824         MatStashBlock *sendblock_b = (MatStashBlock *)&sendblocks[b * stash->blocktype_size];
825         PetscCheck(sendblock_b->row >= owners[stash->sendranks[i]], stash->comm, PETSC_ERR_ARG_WRONG, "MAT_SUBSET_OFF_PROC_ENTRIES set, but row %" PetscInt_FMT " owned by %d not communicated in initial assembly", sendblock_b->row, stash->sendranks[i]);
826         if (sendblock_b->row >= owners[stash->sendranks[i] + 1]) break;
827         stash->sendhdr[i].count++;
828       }
829     }
830   } else { /* Dynamically count and pack (first time) */
831     PetscInt sendno;
832     size_t   i, rowstart;
833 
834     /* Count number of send ranks and allocate for sends */
835     stash->nsendranks = 0;
836     for (rowstart = 0; rowstart < nblocks;) {
837       PetscInt       owner;
838       MatStashBlock *sendblock_rowstart = (MatStashBlock *)&sendblocks[rowstart * stash->blocktype_size];
839       PetscCall(PetscFindInt(sendblock_rowstart->row, stash->size + 1, owners, &owner));
840       if (owner < 0) owner = -(owner + 2);
841       for (i = rowstart + 1; i < nblocks; i++) { /* Move forward through a run of blocks with the same owner */
842         MatStashBlock *sendblock_i = (MatStashBlock *)&sendblocks[i * stash->blocktype_size];
843         if (sendblock_i->row >= owners[owner + 1]) break;
844       }
845       stash->nsendranks++;
846       rowstart = i;
847     }
848     PetscCall(PetscMalloc3(stash->nsendranks, &stash->sendranks, stash->nsendranks, &stash->sendhdr, stash->nsendranks, &stash->sendframes));
849 
850     /* Set up sendhdrs and sendframes */
851     sendno = 0;
852     for (rowstart = 0; rowstart < nblocks;) {
853       PetscInt       owner;
854       MatStashBlock *sendblock_rowstart = (MatStashBlock *)&sendblocks[rowstart * stash->blocktype_size];
855       PetscCall(PetscFindInt(sendblock_rowstart->row, stash->size + 1, owners, &owner));
856       if (owner < 0) owner = -(owner + 2);
857       stash->sendranks[sendno] = owner;
858       for (i = rowstart + 1; i < nblocks; i++) { /* Move forward through a run of blocks with the same owner */
859         MatStashBlock *sendblock_i = (MatStashBlock *)&sendblocks[i * stash->blocktype_size];
860         if (sendblock_i->row >= owners[owner + 1]) break;
861       }
862       stash->sendframes[sendno].buffer  = sendblock_rowstart;
863       stash->sendframes[sendno].pending = 0;
864       stash->sendhdr[sendno].count      = i - rowstart;
865       sendno++;
866       rowstart = i;
867     }
868     PetscCheck(sendno == stash->nsendranks, stash->comm, PETSC_ERR_PLIB, "BTS counted %d sendranks, but %" PetscInt_FMT " sends", stash->nsendranks, sendno);
869   }
870 
871   /* Encode insertmode on the outgoing messages. If we want to support more than two options, we would need a new
872    * message or a dummy entry of some sort. */
873   if (mat->insertmode == INSERT_VALUES) {
874     size_t i;
875     for (i = 0; i < nblocks; i++) {
876       MatStashBlock *sendblock_i = (MatStashBlock *)&sendblocks[i * stash->blocktype_size];
877       sendblock_i->row           = -(sendblock_i->row + 1);
878     }
879   }
880 
881   if (stash->first_assembly_done) {
882     PetscMPIInt i, tag;
883     PetscCall(PetscCommGetNewTag(stash->comm, &tag));
884     for (i = 0; i < stash->nrecvranks; i++) PetscCall(MatStashBTSRecv_Private(stash->comm, &tag, stash->recvranks[i], &stash->recvhdr[i], &stash->recvreqs[i], stash));
885     for (i = 0; i < stash->nsendranks; i++) PetscCall(MatStashBTSSend_Private(stash->comm, &tag, i, stash->sendranks[i], &stash->sendhdr[i], &stash->sendreqs[i], stash));
886     stash->use_status = PETSC_TRUE; /* Use count from message status. */
887   } else {
888     PetscCall(PetscCommBuildTwoSidedFReq(stash->comm, 1, MPIU_INT, stash->nsendranks, stash->sendranks, (PetscInt *)stash->sendhdr, &stash->nrecvranks, &stash->recvranks, (PetscInt *)&stash->recvhdr, 1, &stash->sendreqs, &stash->recvreqs, MatStashBTSSend_Private, MatStashBTSRecv_Private, stash));
889     PetscCall(PetscMalloc2(stash->nrecvranks, &stash->some_indices, stash->nrecvranks, &stash->some_statuses));
890     stash->use_status = PETSC_FALSE; /* Use count from header instead of from message. */
891   }
892 
893   PetscCall(PetscSegBufferExtractInPlace(stash->segrecvframe, &stash->recvframes));
894   stash->recvframe_active    = NULL;
895   stash->recvframe_i         = 0;
896   stash->some_i              = 0;
897   stash->some_count          = 0;
898   stash->recvcount           = 0;
899   stash->first_assembly_done = mat->assembly_subset; /* See the same logic in VecAssemblyBegin_MPI_BTS */
900   stash->insertmode          = &mat->insertmode;
901   PetscFunctionReturn(0);
902 }
903 
904 static PetscErrorCode MatStashScatterGetMesg_BTS(MatStash *stash, PetscMPIInt *n, PetscInt **row, PetscInt **col, PetscScalar **val, PetscInt *flg) {
905   MatStashBlock *block;
906 
907   PetscFunctionBegin;
908   *flg = 0;
909   while (!stash->recvframe_active || stash->recvframe_i == stash->recvframe_count) {
910     if (stash->some_i == stash->some_count) {
911       if (stash->recvcount == stash->nrecvranks) PetscFunctionReturn(0); /* Done */
912       PetscCallMPI(MPI_Waitsome(stash->nrecvranks, stash->recvreqs, &stash->some_count, stash->some_indices, stash->use_status ? stash->some_statuses : MPI_STATUSES_IGNORE));
913       stash->some_i = 0;
914     }
915     stash->recvframe_active = &stash->recvframes[stash->some_indices[stash->some_i]];
916     stash->recvframe_count  = stash->recvframe_active->count; /* From header; maximum count */
917     if (stash->use_status) {                                  /* Count what was actually sent */
918       PetscCallMPI(MPI_Get_count(&stash->some_statuses[stash->some_i], stash->blocktype, &stash->recvframe_count));
919     }
920     if (stash->recvframe_count > 0) { /* Check for InsertMode consistency */
921       block = (MatStashBlock *)&((char *)stash->recvframe_active->buffer)[0];
922       if (PetscUnlikely(*stash->insertmode == NOT_SET_VALUES)) *stash->insertmode = block->row < 0 ? INSERT_VALUES : ADD_VALUES;
923       PetscCheck(*stash->insertmode != INSERT_VALUES || block->row < 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Assembling INSERT_VALUES, but rank %d requested ADD_VALUES", stash->recvranks[stash->some_indices[stash->some_i]]);
924       PetscCheck(*stash->insertmode != ADD_VALUES || block->row >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Assembling ADD_VALUES, but rank %d requested INSERT_VALUES", stash->recvranks[stash->some_indices[stash->some_i]]);
925     }
926     stash->some_i++;
927     stash->recvcount++;
928     stash->recvframe_i = 0;
929   }
930   *n    = 1;
931   block = (MatStashBlock *)&((char *)stash->recvframe_active->buffer)[stash->recvframe_i * stash->blocktype_size];
932   if (block->row < 0) block->row = -(block->row + 1);
933   *row = &block->row;
934   *col = &block->col;
935   *val = block->vals;
936   stash->recvframe_i++;
937   *flg = 1;
938   PetscFunctionReturn(0);
939 }
940 
941 static PetscErrorCode MatStashScatterEnd_BTS(MatStash *stash) {
942   PetscFunctionBegin;
943   PetscCallMPI(MPI_Waitall(stash->nsendranks, stash->sendreqs, MPI_STATUSES_IGNORE));
944   if (stash->first_assembly_done) { /* Reuse the communication contexts, so consolidate and reset segrecvblocks  */
945     PetscCall(PetscSegBufferExtractInPlace(stash->segrecvblocks, NULL));
946   } else { /* No reuse, so collect everything. */
947     PetscCall(MatStashScatterDestroy_BTS(stash));
948   }
949 
950   /* Now update nmaxold to be app 10% more than max n used, this way the
951      wastage of space is reduced the next time this stash is used.
952      Also update the oldmax, only if it increases */
953   if (stash->n) {
954     PetscInt bs2     = stash->bs * stash->bs;
955     PetscInt oldnmax = ((int)(stash->n * 1.1) + 5) * bs2;
956     if (oldnmax > stash->oldnmax) stash->oldnmax = oldnmax;
957   }
958 
959   stash->nmax       = 0;
960   stash->n          = 0;
961   stash->reallocs   = -1;
962   stash->nprocessed = 0;
963 
964   PetscCall(PetscMatStashSpaceDestroy(&stash->space_head));
965 
966   stash->space = NULL;
967 
968   PetscFunctionReturn(0);
969 }
970 
971 PetscErrorCode MatStashScatterDestroy_BTS(MatStash *stash) {
972   PetscFunctionBegin;
973   PetscCall(PetscSegBufferDestroy(&stash->segsendblocks));
974   PetscCall(PetscSegBufferDestroy(&stash->segrecvframe));
975   stash->recvframes = NULL;
976   PetscCall(PetscSegBufferDestroy(&stash->segrecvblocks));
977   if (stash->blocktype != MPI_DATATYPE_NULL) PetscCallMPI(MPI_Type_free(&stash->blocktype));
978   stash->nsendranks = 0;
979   stash->nrecvranks = 0;
980   PetscCall(PetscFree3(stash->sendranks, stash->sendhdr, stash->sendframes));
981   PetscCall(PetscFree(stash->sendreqs));
982   PetscCall(PetscFree(stash->recvreqs));
983   PetscCall(PetscFree(stash->recvranks));
984   PetscCall(PetscFree(stash->recvhdr));
985   PetscCall(PetscFree2(stash->some_indices, stash->some_statuses));
986   PetscFunctionReturn(0);
987 }
988 #endif
989