xref: /petsc/src/mat/utils/matstash.c (revision d71ae5a4db6382e7f06317b8d368875286fe9008)
1 
2 #include <petsc/private/matimpl.h>
3 
4 #define DEFAULT_STASH_SIZE 10000
5 
6 static PetscErrorCode       MatStashScatterBegin_Ref(Mat, MatStash *, PetscInt *);
7 PETSC_INTERN PetscErrorCode MatStashScatterGetMesg_Ref(MatStash *, PetscMPIInt *, PetscInt **, PetscInt **, PetscScalar **, PetscInt *);
8 PETSC_INTERN PetscErrorCode MatStashScatterEnd_Ref(MatStash *);
9 #if !defined(PETSC_HAVE_MPIUNI)
10 static PetscErrorCode MatStashScatterBegin_BTS(Mat, MatStash *, PetscInt *);
11 static PetscErrorCode MatStashScatterGetMesg_BTS(MatStash *, PetscMPIInt *, PetscInt **, PetscInt **, PetscScalar **, PetscInt *);
12 static PetscErrorCode MatStashScatterEnd_BTS(MatStash *);
13 #endif
14 
15 /*
  MatStashCreate_Private - Creates a stash, currently used for all the parallel
17   matrix implementations. The stash is where elements of a matrix destined
18   to be stored on other processors are kept until matrix assembly is done.
19 
20   This is a simple minded stash. Simply adds entries to end of stash.
21 
22   Input Parameters:
23   comm - communicator, required for scatters.
24   bs   - stash block size. used when stashing blocks of values
25 
26   Output Parameters:
27   stash    - the newly created stash
28 */
PetscErrorCode MatStashCreate_Private(MPI_Comm comm, PetscInt bs, MatStash *stash)
{
  PetscInt  max, *opt, nopt, i;
  PetscBool flg;

  PetscFunctionBegin;
  /* Require 2 tags,get the second using PetscCommGetNewTag() */
  stash->comm = comm;

  PetscCall(PetscCommGetNewTag(stash->comm, &stash->tag1));
  PetscCall(PetscCommGetNewTag(stash->comm, &stash->tag2));
  PetscCallMPI(MPI_Comm_size(stash->comm, &stash->size));
  PetscCallMPI(MPI_Comm_rank(stash->comm, &stash->rank));
  /* flg_v records, per remote rank, which of the (indices,values) message pair has
     arrived; -1 means "not yet received" (consumed by MatStashScatterGetMesg_Ref()) */
  PetscCall(PetscMalloc1(2 * stash->size, &stash->flg_v));
  for (i = 0; i < 2 * stash->size; i++) stash->flg_v[i] = -1;

  /* -matstash_initial_size accepts either a single value for all ranks or one
     value per rank; anything in between uses the per-rank value when available */
  nopt = stash->size;
  PetscCall(PetscMalloc1(nopt, &opt));
  PetscCall(PetscOptionsGetIntArray(NULL, NULL, "-matstash_initial_size", opt, &nopt, &flg));
  if (flg) {
    if (nopt == 1) max = opt[0];
    else if (nopt == stash->size) max = opt[stash->rank];
    else if (stash->rank < nopt) max = opt[stash->rank];
    else max = 0; /* Use default */
    stash->umax = max;
  } else {
    stash->umax = 0; /* 0 => MatStashExpand_Private() falls back to DEFAULT_STASH_SIZE */
  }
  PetscCall(PetscFree(opt));
  if (bs <= 0) bs = 1;

  /* Start empty; actual stash space is allocated lazily by MatStashExpand_Private() */
  stash->bs         = bs;
  stash->nmax       = 0;
  stash->oldnmax    = 0;
  stash->n          = 0;
  stash->reallocs   = -1;
  stash->space_head = NULL;
  stash->space      = NULL;

  stash->send_waits  = NULL;
  stash->recv_waits  = NULL;
  stash->send_status = NULL;
  stash->nsends      = 0;
  stash->nrecvs      = 0;
  stash->svalues     = NULL;
  stash->rvalues     = NULL;
  stash->rindices    = NULL;
  stash->nprocessed  = 0;
  stash->reproduce   = PETSC_FALSE;
  stash->blocktype   = MPI_DATATYPE_NULL;

  PetscCall(PetscOptionsGetBool(NULL, NULL, "-matstash_reproduce", &stash->reproduce, NULL));
#if !defined(PETSC_HAVE_MPIUNI)
  /* Default to the BTS scatter scheme; -matstash_legacy selects the reference one */
  flg = PETSC_FALSE;
  PetscCall(PetscOptionsGetBool(NULL, NULL, "-matstash_legacy", &flg, NULL));
  if (!flg) {
    stash->ScatterBegin   = MatStashScatterBegin_BTS;
    stash->ScatterGetMesg = MatStashScatterGetMesg_BTS;
    stash->ScatterEnd     = MatStashScatterEnd_BTS;
    stash->ScatterDestroy = MatStashScatterDestroy_BTS;
  } else {
#endif
    stash->ScatterBegin   = MatStashScatterBegin_Ref;
    stash->ScatterGetMesg = MatStashScatterGetMesg_Ref;
    stash->ScatterEnd     = MatStashScatterEnd_Ref;
    stash->ScatterDestroy = NULL; /* the Ref scheme keeps no persistent scatter state */
#if !defined(PETSC_HAVE_MPIUNI)
  }
#endif
  PetscFunctionReturn(0);
}
100 
101 /*
102    MatStashDestroy_Private - Destroy the stash
103 */
PetscErrorCode MatStashDestroy_Private(MatStash *stash)
{
  PetscFunctionBegin;
  /* Free the entire linked list of stash space chunks */
  PetscCall(PetscMatStashSpaceDestroy(&stash->space_head));
  /* ScatterDestroy is only non-NULL for the BTS scheme (set in MatStashCreate_Private()) */
  if (stash->ScatterDestroy) PetscCall((*stash->ScatterDestroy)(stash));

  stash->space = NULL;

  PetscCall(PetscFree(stash->flg_v));
  PetscFunctionReturn(0);
}
115 
116 /*
117    MatStashScatterEnd_Private - This is called as the final stage of
118    scatter. The final stages of message passing is done here, and
119    all the memory used for message passing is cleaned up. This
120    routine also resets the stash, and deallocates the memory used
121    for the stash. It also keeps track of the current memory usage
122    so that the same value can be used the next time through.
123 */
PetscErrorCode MatStashScatterEnd_Private(MatStash *stash)
{
  PetscFunctionBegin;
  /* Dispatch to the implementation (BTS or Ref) chosen in MatStashCreate_Private() */
  PetscCall((*stash->ScatterEnd)(stash));
  PetscFunctionReturn(0);
}
130 
131 PETSC_INTERN PetscErrorCode MatStashScatterEnd_Ref(MatStash *stash)
132 {
133   PetscInt    nsends = stash->nsends, bs2, oldnmax, i;
134   MPI_Status *send_status;
135 
136   PetscFunctionBegin;
137   for (i = 0; i < 2 * stash->size; i++) stash->flg_v[i] = -1;
138   /* wait on sends */
139   if (nsends) {
140     PetscCall(PetscMalloc1(2 * nsends, &send_status));
141     PetscCallMPI(MPI_Waitall(2 * nsends, stash->send_waits, send_status));
142     PetscCall(PetscFree(send_status));
143   }
144 
145   /* Now update nmaxold to be app 10% more than max n used, this way the
146      wastage of space is reduced the next time this stash is used.
147      Also update the oldmax, only if it increases */
148   if (stash->n) {
149     bs2     = stash->bs * stash->bs;
150     oldnmax = ((int)(stash->n * 1.1) + 5) * bs2;
151     if (oldnmax > stash->oldnmax) stash->oldnmax = oldnmax;
152   }
153 
154   stash->nmax       = 0;
155   stash->n          = 0;
156   stash->reallocs   = -1;
157   stash->nprocessed = 0;
158 
159   PetscCall(PetscMatStashSpaceDestroy(&stash->space_head));
160 
161   stash->space = NULL;
162 
163   PetscCall(PetscFree(stash->send_waits));
164   PetscCall(PetscFree(stash->recv_waits));
165   PetscCall(PetscFree2(stash->svalues, stash->sindices));
166   PetscCall(PetscFree(stash->rvalues[0]));
167   PetscCall(PetscFree(stash->rvalues));
168   PetscCall(PetscFree(stash->rindices[0]));
169   PetscCall(PetscFree(stash->rindices));
170   PetscFunctionReturn(0);
171 }
172 
173 /*
174    MatStashGetInfo_Private - Gets the relevant statistics of the stash
175 
176    Input Parameters:
177    stash    - the stash
178    nstash   - the size of the stash. Indicates the number of values stored.
179    reallocs - the number of additional mallocs incurred.
180 
181 */
182 PetscErrorCode MatStashGetInfo_Private(MatStash *stash, PetscInt *nstash, PetscInt *reallocs)
183 {
184   PetscInt bs2 = stash->bs * stash->bs;
185 
186   PetscFunctionBegin;
187   if (nstash) *nstash = stash->n * bs2;
188   if (reallocs) {
189     if (stash->reallocs < 0) *reallocs = 0;
190     else *reallocs = stash->reallocs;
191   }
192   PetscFunctionReturn(0);
193 }
194 
195 /*
196    MatStashSetInitialSize_Private - Sets the initial size of the stash
197 
198    Input Parameters:
199    stash  - the stash
200    max    - the value that is used as the max size of the stash.
201             this value is used while allocating memory.
202 */
PetscErrorCode MatStashSetInitialSize_Private(MatStash *stash, PetscInt max)
{
  PetscFunctionBegin;
  /* umax is consulted by MatStashExpand_Private() when sizing the first allocation */
  stash->umax = max;
  PetscFunctionReturn(0);
}
209 
210 /* MatStashExpand_Private - Expand the stash. This function is called
211    when the space in the stash is not sufficient to add the new values
212    being inserted into the stash.
213 
214    Input Parameters:
215    stash - the stash
216    incr  - the minimum increase requested
217 
218    Note:
219    This routine doubles the currently used memory.
220  */
221 static PetscErrorCode MatStashExpand_Private(MatStash *stash, PetscInt incr)
222 {
223   PetscInt newnmax, bs2 = stash->bs * stash->bs;
224 
225   PetscFunctionBegin;
226   /* allocate a larger stash */
227   if (!stash->oldnmax && !stash->nmax) { /* new stash */
228     if (stash->umax) newnmax = stash->umax / bs2;
229     else newnmax = DEFAULT_STASH_SIZE / bs2;
230   } else if (!stash->nmax) { /* resuing stash */
231     if (stash->umax > stash->oldnmax) newnmax = stash->umax / bs2;
232     else newnmax = stash->oldnmax / bs2;
233   } else newnmax = stash->nmax * 2;
234   if (newnmax < (stash->nmax + incr)) newnmax += 2 * incr;
235 
236   /* Get a MatStashSpace and attach it to stash */
237   PetscCall(PetscMatStashSpaceGet(bs2, newnmax, &stash->space));
238   if (!stash->space_head) { /* new stash or resuing stash->oldnmax */
239     stash->space_head = stash->space;
240   }
241 
242   stash->reallocs++;
243   stash->nmax = newnmax;
244   PetscFunctionReturn(0);
245 }
246 /*
247   MatStashValuesRow_Private - inserts values into the stash. This function
248   expects the values to be roworiented. Multiple columns belong to the same row
249   can be inserted with a single call to this function.
250 
251   Input Parameters:
252   stash  - the stash
  row    - the global row corresponding to the values
254   n      - the number of elements inserted. All elements belong to the above row.
255   idxn   - the global column indices corresponding to each of the values.
256   values - the values inserted
257 */
258 PetscErrorCode MatStashValuesRow_Private(MatStash *stash, PetscInt row, PetscInt n, const PetscInt idxn[], const PetscScalar values[], PetscBool ignorezeroentries)
259 {
260   PetscInt           i, k, cnt = 0;
261   PetscMatStashSpace space = stash->space;
262 
263   PetscFunctionBegin;
264   /* Check and see if we have sufficient memory */
265   if (!space || space->local_remaining < n) PetscCall(MatStashExpand_Private(stash, n));
266   space = stash->space;
267   k     = space->local_used;
268   for (i = 0; i < n; i++) {
269     if (ignorezeroentries && values && values[i] == 0.0) continue;
270     space->idx[k] = row;
271     space->idy[k] = idxn[i];
272     space->val[k] = values ? values[i] : 0.0;
273     k++;
274     cnt++;
275   }
276   stash->n += cnt;
277   space->local_used += cnt;
278   space->local_remaining -= cnt;
279   PetscFunctionReturn(0);
280 }
281 
282 /*
283   MatStashValuesCol_Private - inserts values into the stash. This function
284   expects the values to be columnoriented. Multiple columns belong to the same row
285   can be inserted with a single call to this function.
286 
287   Input Parameters:
288   stash   - the stash
  row     - the global row corresponding to the values
290   n       - the number of elements inserted. All elements belong to the above row.
291   idxn    - the global column indices corresponding to each of the values.
292   values  - the values inserted
  stepval - the consecutive values are separated by a distance of stepval.
294             this happens because the input is columnoriented.
295 */
296 PetscErrorCode MatStashValuesCol_Private(MatStash *stash, PetscInt row, PetscInt n, const PetscInt idxn[], const PetscScalar values[], PetscInt stepval, PetscBool ignorezeroentries)
297 {
298   PetscInt           i, k, cnt = 0;
299   PetscMatStashSpace space = stash->space;
300 
301   PetscFunctionBegin;
302   /* Check and see if we have sufficient memory */
303   if (!space || space->local_remaining < n) PetscCall(MatStashExpand_Private(stash, n));
304   space = stash->space;
305   k     = space->local_used;
306   for (i = 0; i < n; i++) {
307     if (ignorezeroentries && values && values[i * stepval] == 0.0) continue;
308     space->idx[k] = row;
309     space->idy[k] = idxn[i];
310     space->val[k] = values ? values[i * stepval] : 0.0;
311     k++;
312     cnt++;
313   }
314   stash->n += cnt;
315   space->local_used += cnt;
316   space->local_remaining -= cnt;
317   PetscFunctionReturn(0);
318 }
319 
320 /*
321   MatStashValuesRowBlocked_Private - inserts blocks of values into the stash.
322   This function expects the values to be roworiented. Multiple columns belong
323   to the same block-row can be inserted with a single call to this function.
324   This function extracts the sub-block of values based on the dimensions of
325   the original input block, and the row,col values corresponding to the blocks.
326 
327   Input Parameters:
328   stash  - the stash
  row    - the global block-row corresponding to the values
330   n      - the number of elements inserted. All elements belong to the above row.
331   idxn   - the global block-column indices corresponding to each of the blocks of
332            values. Each block is of size bs*bs.
333   values - the values inserted
334   rmax   - the number of block-rows in the original block.
335   cmax   - the number of block-columns on the original block.
336   idx    - the index of the current block-row in the original block.
337 */
338 PetscErrorCode MatStashValuesRowBlocked_Private(MatStash *stash, PetscInt row, PetscInt n, const PetscInt idxn[], const PetscScalar values[], PetscInt rmax, PetscInt cmax, PetscInt idx)
339 {
340   PetscInt           i, j, k, bs2, bs = stash->bs, l;
341   const PetscScalar *vals;
342   PetscScalar       *array;
343   PetscMatStashSpace space = stash->space;
344 
345   PetscFunctionBegin;
346   if (!space || space->local_remaining < n) PetscCall(MatStashExpand_Private(stash, n));
347   space = stash->space;
348   l     = space->local_used;
349   bs2   = bs * bs;
350   for (i = 0; i < n; i++) {
351     space->idx[l] = row;
352     space->idy[l] = idxn[i];
353     /* Now copy over the block of values. Store the values column oriented.
354        This enables inserting multiple blocks belonging to a row with a single
355        funtion call */
356     array = space->val + bs2 * l;
357     vals  = values + idx * bs2 * n + bs * i;
358     for (j = 0; j < bs; j++) {
359       for (k = 0; k < bs; k++) array[k * bs] = values ? vals[k] : 0.0;
360       array++;
361       vals += cmax * bs;
362     }
363     l++;
364   }
365   stash->n += n;
366   space->local_used += n;
367   space->local_remaining -= n;
368   PetscFunctionReturn(0);
369 }
370 
371 /*
372   MatStashValuesColBlocked_Private - inserts blocks of values into the stash.
373   This function expects the values to be roworiented. Multiple columns belong
374   to the same block-row can be inserted with a single call to this function.
375   This function extracts the sub-block of values based on the dimensions of
376   the original input block, and the row,col values corresponding to the blocks.
377 
378   Input Parameters:
379   stash  - the stash
  row    - the global block-row corresponding to the values
381   n      - the number of elements inserted. All elements belong to the above row.
382   idxn   - the global block-column indices corresponding to each of the blocks of
383            values. Each block is of size bs*bs.
384   values - the values inserted
385   rmax   - the number of block-rows in the original block.
386   cmax   - the number of block-columns on the original block.
387   idx    - the index of the current block-row in the original block.
388 */
389 PetscErrorCode MatStashValuesColBlocked_Private(MatStash *stash, PetscInt row, PetscInt n, const PetscInt idxn[], const PetscScalar values[], PetscInt rmax, PetscInt cmax, PetscInt idx)
390 {
391   PetscInt           i, j, k, bs2, bs = stash->bs, l;
392   const PetscScalar *vals;
393   PetscScalar       *array;
394   PetscMatStashSpace space = stash->space;
395 
396   PetscFunctionBegin;
397   if (!space || space->local_remaining < n) PetscCall(MatStashExpand_Private(stash, n));
398   space = stash->space;
399   l     = space->local_used;
400   bs2   = bs * bs;
401   for (i = 0; i < n; i++) {
402     space->idx[l] = row;
403     space->idy[l] = idxn[i];
404     /* Now copy over the block of values. Store the values column oriented.
405      This enables inserting multiple blocks belonging to a row with a single
406      funtion call */
407     array = space->val + bs2 * l;
408     vals  = values + idx * bs2 * n + bs * i;
409     for (j = 0; j < bs; j++) {
410       for (k = 0; k < bs; k++) array[k] = values ? vals[k] : 0.0;
411       array += bs;
412       vals += rmax * bs;
413     }
414     l++;
415   }
416   stash->n += n;
417   space->local_used += n;
418   space->local_remaining -= n;
419   PetscFunctionReturn(0);
420 }
421 /*
422   MatStashScatterBegin_Private - Initiates the transfer of values to the
  correct owners. This function goes through the stash, checks the
424   owners of each stashed value, and sends the values off to the owner
425   processors.
426 
427   Input Parameters:
428   stash  - the stash
429   owners - an array of size 'no-of-procs' which gives the ownership range
430            for each node.
431 
432   Note:
    The 'owners' array in the case of the blocked-stash has the
434   ranges specified blocked global indices, and for the regular stash in
435   the proper global indices.
436 */
PetscErrorCode MatStashScatterBegin_Private(Mat mat, MatStash *stash, PetscInt *owners)
{
  PetscFunctionBegin;
  /* Dispatch to the implementation (BTS or Ref) chosen in MatStashCreate_Private() */
  PetscCall((*stash->ScatterBegin)(mat, stash, owners));
  PetscFunctionReturn(0);
}
443 
static PetscErrorCode MatStashScatterBegin_Ref(Mat mat, MatStash *stash, PetscInt *owners)
{
  PetscInt          *owner, *startv, *starti, tag1 = stash->tag1, tag2 = stash->tag2, bs2;
  PetscInt           size = stash->size, nsends;
  PetscInt           count, *sindices, **rindices, i, j, idx, lastidx, l;
  PetscScalar      **rvalues, *svalues;
  MPI_Comm           comm = stash->comm;
  MPI_Request       *send_waits, *recv_waits, *recv_waits1, *recv_waits2;
  PetscMPIInt       *sizes, *nlengths, nreceives;
  PetscInt          *sp_idx, *sp_idy;
  PetscScalar       *sp_val;
  PetscMatStashSpace space, space_next;

  PetscFunctionBegin;
  { /* make sure all processors are either in INSERTMODE or ADDMODE */
    InsertMode addv;
    PetscCall(MPIU_Allreduce((PetscEnum *)&mat->insertmode, (PetscEnum *)&addv, 1, MPIU_ENUM, MPI_BOR, PetscObjectComm((PetscObject)mat)));
    PetscCheck(addv != (ADD_VALUES | INSERT_VALUES), PetscObjectComm((PetscObject)mat), PETSC_ERR_ARG_WRONGSTATE, "Some processors inserted others added");
    mat->insertmode = addv; /* in case this processor had no cache */
  }

  bs2 = stash->bs * stash->bs;

  /*  first count number of contributors to each processor */
  PetscCall(PetscCalloc1(size, &nlengths));
  PetscCall(PetscMalloc1(stash->n + 1, &owner));

  /* Walk every stash chunk assigning each entry to its owning rank.
     j is deliberately NOT reset between entries: stashed row indices are
     usually locally sorted, so the ownership search resumes where it left off */
  i = j   = 0;
  lastidx = -1;
  space   = stash->space_head;
  while (space) {
    space_next = space->next;
    sp_idx     = space->idx;
    for (l = 0; l < space->local_used; l++) {
      /* if indices are NOT locally sorted, need to start search at the beginning */
      if (lastidx > (idx = sp_idx[l])) j = 0;
      lastidx = idx;
      for (; j < size; j++) {
        if (idx >= owners[j] && idx < owners[j + 1]) {
          nlengths[j]++;
          owner[i] = j;
          break;
        }
      }
      i++;
    }
    space = space_next;
  }

  /* Now check what procs get messages - and compute nsends. */
  PetscCall(PetscCalloc1(size, &sizes));
  for (i = 0, nsends = 0; i < size; i++) {
    if (nlengths[i]) {
      sizes[i] = 1;
      nsends++;
    }
  }

  {
    PetscMPIInt *onodes, *olengths;
    /* Determine the number of messages to expect, their lengths, from from-ids */
    PetscCall(PetscGatherNumberOfMessages(comm, sizes, nlengths, &nreceives));
    PetscCall(PetscGatherMessageLengths(comm, nsends, nreceives, nlengths, &onodes, &olengths));
    /* since clubbing row,col - lengths are multiplied by 2 */
    for (i = 0; i < nreceives; i++) olengths[i] *= 2;
    PetscCall(PetscPostIrecvInt(comm, tag1, nreceives, onodes, olengths, &rindices, &recv_waits1));
    /* values are size 'bs2' lengths (and remove earlier factor 2) */
    for (i = 0; i < nreceives; i++) olengths[i] = olengths[i] * bs2 / 2;
    PetscCall(PetscPostIrecvScalar(comm, tag2, nreceives, onodes, olengths, &rvalues, &recv_waits2));
    PetscCall(PetscFree(onodes));
    PetscCall(PetscFree(olengths));
  }

  /* do sends:
      1) starts[i] gives the starting index in svalues for stuff going to
         the ith processor
  */
  PetscCall(PetscMalloc2(bs2 * stash->n, &svalues, 2 * (stash->n + 1), &sindices));
  PetscCall(PetscMalloc1(2 * nsends, &send_waits));
  PetscCall(PetscMalloc2(size, &startv, size, &starti));
  /* use 2 sends the first with all_a, the next with all_i and all_j */
  startv[0] = 0;
  starti[0] = 0;
  for (i = 1; i < size; i++) {
    startv[i] = startv[i - 1] + nlengths[i - 1];
    starti[i] = starti[i - 1] + 2 * nlengths[i - 1];
  }

  /* Pack the send buffers: values into svalues, then row indices followed by
     column indices (offset by nlengths[j]) into sindices, per destination rank */
  i     = 0;
  space = stash->space_head;
  while (space) {
    space_next = space->next;
    sp_idx     = space->idx;
    sp_idy     = space->idy;
    sp_val     = space->val;
    for (l = 0; l < space->local_used; l++) {
      j = owner[i];
      if (bs2 == 1) {
        svalues[startv[j]] = sp_val[l];
      } else {
        PetscInt     k;
        PetscScalar *buf1, *buf2;
        buf1 = svalues + bs2 * startv[j];
        buf2 = space->val + bs2 * l;
        for (k = 0; k < bs2; k++) buf1[k] = buf2[k];
      }
      sindices[starti[j]]               = sp_idx[l];
      sindices[starti[j] + nlengths[j]] = sp_idy[l];
      startv[j]++;
      starti[j]++;
      i++;
    }
    space = space_next;
  }
  /* startv was advanced during packing; rebuild the per-rank start offsets */
  startv[0] = 0;
  for (i = 1; i < size; i++) startv[i] = startv[i - 1] + nlengths[i - 1];

  /* Two sends per destination: indices on tag1, values on tag2 */
  for (i = 0, count = 0; i < size; i++) {
    if (sizes[i]) {
      PetscCallMPI(MPI_Isend(sindices + 2 * startv[i], 2 * nlengths[i], MPIU_INT, i, tag1, comm, send_waits + count++));
      PetscCallMPI(MPI_Isend(svalues + bs2 * startv[i], bs2 * nlengths[i], MPIU_SCALAR, i, tag2, comm, send_waits + count++));
    }
  }
#if defined(PETSC_USE_INFO)
  PetscCall(PetscInfo(NULL, "No of messages: %" PetscInt_FMT " \n", nsends));
  for (i = 0; i < size; i++) {
    if (sizes[i]) PetscCall(PetscInfo(NULL, "Mesg_to: %" PetscInt_FMT ": size: %zu bytes\n", i, (size_t)(nlengths[i] * (bs2 * sizeof(PetscScalar) + 2 * sizeof(PetscInt)))));
  }
#endif
  PetscCall(PetscFree(nlengths));
  PetscCall(PetscFree(owner));
  PetscCall(PetscFree2(startv, starti));
  PetscCall(PetscFree(sizes));

  /* recv_waits need to be contiguous for MatStashScatterGetMesg_Private() */
  PetscCall(PetscMalloc1(2 * nreceives, &recv_waits));

  /* Interleave: even slots hold the index receives, odd slots the value receives */
  for (i = 0; i < nreceives; i++) {
    recv_waits[2 * i]     = recv_waits1[i];
    recv_waits[2 * i + 1] = recv_waits2[i];
  }
  stash->recv_waits = recv_waits;

  PetscCall(PetscFree(recv_waits1));
  PetscCall(PetscFree(recv_waits2));

  /* Stash the buffers/requests; they are released in MatStashScatterEnd_Ref() */
  stash->svalues         = svalues;
  stash->sindices        = sindices;
  stash->rvalues         = rvalues;
  stash->rindices        = rindices;
  stash->send_waits      = send_waits;
  stash->nsends          = nsends;
  stash->nrecvs          = nreceives;
  stash->reproduce_count = 0;
  PetscFunctionReturn(0);
}
600 
601 /*
602    MatStashScatterGetMesg_Private - This function waits on the receives posted
603    in the function MatStashScatterBegin_Private() and returns one message at
604    a time to the calling function. If no messages are left, it indicates this
605    by setting flg = 0, else it sets flg = 1.
606 
607    Input Parameters:
608    stash - the stash
609 
610    Output Parameters:
611    nvals - the number of entries in the current message.
612    rows  - an array of row indices (or blocked indices) corresponding to the values
613    cols  - an array of columnindices (or blocked indices) corresponding to the values
614    vals  - the values
615    flg   - 0 indicates no more message left, and the current call has no values associated.
616            1 indicates that the current call successfully received a message, and the
617              other output parameters nvals,rows,cols,vals are set appropriately.
618 */
PetscErrorCode MatStashScatterGetMesg_Private(MatStash *stash, PetscMPIInt *nvals, PetscInt **rows, PetscInt **cols, PetscScalar **vals, PetscInt *flg)
{
  PetscFunctionBegin;
  /* Dispatch to the implementation (BTS or Ref) chosen in MatStashCreate_Private() */
  PetscCall((*stash->ScatterGetMesg)(stash, nvals, rows, cols, vals, flg));
  PetscFunctionReturn(0);
}
625 
PETSC_INTERN PetscErrorCode MatStashScatterGetMesg_Ref(MatStash *stash, PetscMPIInt *nvals, PetscInt **rows, PetscInt **cols, PetscScalar **vals, PetscInt *flg)
{
  PetscMPIInt i, *flg_v = stash->flg_v, i1, i2;
  PetscInt    bs2;
  MPI_Status  recv_status;
  PetscBool   match_found = PETSC_FALSE;

  PetscFunctionBegin;
  *flg = 0; /* When a message is discovered this is reset to 1 */
  /* Return if no more messages to process */
  if (stash->nprocessed == stash->nrecvs) PetscFunctionReturn(0);

  bs2 = stash->bs * stash->bs;
  /* If a matching pair of receives are found, process them, and return the data to
     the calling function. Until then keep receiving messages.
     recv_waits is interleaved by MatStashScatterBegin_Ref(): even entries are the
     index receives, odd entries the value receives, for the same source rank. */
  while (!match_found) {
    if (stash->reproduce) {
      /* deterministic mode: consume receives in posting order instead of Waitany order */
      i = stash->reproduce_count++;
      PetscCallMPI(MPI_Wait(stash->recv_waits + i, &recv_status));
    } else {
      PetscCallMPI(MPI_Waitany(2 * stash->nrecvs, stash->recv_waits, &i, &recv_status));
    }
    PetscCheck(recv_status.MPI_SOURCE >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Negative MPI source!");

    /* Now pack the received message into a structure which is usable by others */
    if (i % 2) {
      /* odd index: values message; record which rvalues slot came from this source */
      PetscCallMPI(MPI_Get_count(&recv_status, MPIU_SCALAR, nvals));
      flg_v[2 * recv_status.MPI_SOURCE] = i / 2;
      *nvals                            = *nvals / bs2;
    } else {
      /* even index: indices message */
      PetscCallMPI(MPI_Get_count(&recv_status, MPIU_INT, nvals));
      flg_v[2 * recv_status.MPI_SOURCE + 1] = i / 2;
      *nvals                                = *nvals / 2; /* This message has both row indices and col indices */
    }

    /* Check if we have both messages from this proc */
    i1 = flg_v[2 * recv_status.MPI_SOURCE];
    i2 = flg_v[2 * recv_status.MPI_SOURCE + 1];
    if (i1 != -1 && i2 != -1) {
      *rows = stash->rindices[i2];
      *cols = *rows + *nvals; /* column indices follow the row indices in the same buffer */
      *vals = stash->rvalues[i1];
      *flg  = 1;
      stash->nprocessed++;
      match_found = PETSC_TRUE;
    }
  }
  PetscFunctionReturn(0);
}
675 
676 #if !defined(PETSC_HAVE_MPIUNI)
/* One stashed (row,col) entry as sent over the wire by the BTS scheme */
typedef struct {
  PetscInt    row;
  PetscInt    col;
  PetscScalar vals[1]; /* Actually an array of length bs2; storage is allocated with
                          stash->blocktype_size bytes per block (see MatStashBlockTypeSetUp()) */
} MatStashBlock;
682 
/* Gather all stashed entries, sort them by (row,col), merge duplicates according to
   insertmode (sum for ADD_VALUES, last-wins otherwise), and pack the result as
   MatStashBlocks into stash->segsendblocks. */
static PetscErrorCode MatStashSortCompress_Private(MatStash *stash, InsertMode insertmode)
{
  PetscMatStashSpace space;
  PetscInt           n = stash->n, bs = stash->bs, bs2 = bs * bs, cnt, *row, *col, *perm, rowstart, i;
  PetscScalar      **valptr;

  PetscFunctionBegin;
  PetscCall(PetscMalloc4(n, &row, n, &col, n, &valptr, n, &perm));
  /* Flatten the linked list of stash chunks into parallel row/col/value-pointer arrays */
  for (space = stash->space_head, cnt = 0; space; space = space->next) {
    for (i = 0; i < space->local_used; i++) {
      row[cnt]    = space->idx[i];
      col[cnt]    = space->idy[i];
      valptr[cnt] = &space->val[i * bs2];
      perm[cnt]   = cnt; /* Will tell us where to find valptr after sorting row[] and col[] */
      cnt++;
    }
  }
  PetscCheck(cnt == n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "MatStash n %" PetscInt_FMT ", but counted %" PetscInt_FMT " entries", n, cnt);
  PetscCall(PetscSortIntWithArrayPair(n, row, col, perm));
  /* Scan through the rows, sorting each one, combining duplicates, and packing send buffers */
  for (rowstart = 0, cnt = 0, i = 1; i <= n; i++) {
    if (i == n || row[i] != row[rowstart]) { /* Sort the last row. */
      PetscInt colstart;
      PetscCall(PetscSortIntWithArray(i - rowstart, &col[rowstart], &perm[rowstart]));
      for (colstart = rowstart; colstart < i;) { /* Compress multiple insertions to the same location */
        PetscInt       j, l;
        MatStashBlock *block;
        PetscCall(PetscSegBufferGet(stash->segsendblocks, 1, &block));
        block->row = row[rowstart];
        block->col = col[colstart];
        PetscCall(PetscArraycpy(block->vals, valptr[perm[colstart]], bs2));
        for (j = colstart + 1; j < i && col[j] == col[colstart]; j++) { /* Add any extra stashed blocks at the same (row,col) */
          if (insertmode == ADD_VALUES) {
            for (l = 0; l < bs2; l++) block->vals[l] += valptr[perm[j]][l];
          } else {
            /* INSERT_VALUES: later insertion replaces the earlier one */
            PetscCall(PetscArraycpy(block->vals, valptr[perm[j]], bs2));
          }
        }
        colstart = j;
      }
      rowstart = i;
    }
  }
  PetscCall(PetscFree4(row, col, valptr, perm));
  PetscFunctionReturn(0);
}
729 
/* Lazily build the MPI datatype describing one MatStashBlock (row, col, bs2 scalars)
   and create the segmented buffers used by the BTS scheme. Idempotent: does nothing
   once stash->blocktype has been created. */
static PetscErrorCode MatStashBlockTypeSetUp(MatStash *stash)
{
  PetscFunctionBegin;
  if (stash->blocktype == MPI_DATATYPE_NULL) {
    PetscInt     bs2 = PetscSqr(stash->bs);
    PetscMPIInt  blocklens[2];
    MPI_Aint     displs[2];
    MPI_Datatype types[2], stype;
    /*
        DummyBlock is a type having standard layout, even when PetscScalar is C++ std::complex.
       std::complex itself has standard layout, so does DummyBlock, recursively.
       To be compatible with C++ std::complex, complex implementations on GPUs must also have standard layout,
       though they can have different alignment, e.g, 16 bytes for double complex, instead of 8 bytes as in GCC stdlibc++.
       offsetof(type, member) only requires type to have standard layout. Ref. https://en.cppreference.com/w/cpp/types/offsetof.

       We can test if std::complex has standard layout with the following code:
       #include <iostream>
       #include <type_traits>
       #include <complex>
       int main() {
         std::cout << std::boolalpha;
         std::cout << std::is_standard_layout<std::complex<double> >::value << '\n';
       }
       Output: true
     */
    struct DummyBlock {
      PetscInt    row, col;
      PetscScalar vals;
    };

    /* One block = header (row,col) + bs2 scalars; pad so PetscInt stays aligned */
    stash->blocktype_size = offsetof(struct DummyBlock, vals) + bs2 * sizeof(PetscScalar);
    if (stash->blocktype_size % sizeof(PetscInt)) { /* Implies that PetscInt is larger and does not satisfy alignment without padding */
      stash->blocktype_size += sizeof(PetscInt) - stash->blocktype_size % sizeof(PetscInt);
    }
    PetscCall(PetscSegBufferCreate(stash->blocktype_size, 1, &stash->segsendblocks));
    PetscCall(PetscSegBufferCreate(stash->blocktype_size, 1, &stash->segrecvblocks));
    PetscCall(PetscSegBufferCreate(sizeof(MatStashFrame), 1, &stash->segrecvframe));
    /* Struct type: 2 PetscInts (row,col) then bs2 PetscScalars (the values) */
    blocklens[0] = 2;
    blocklens[1] = bs2;
    displs[0]    = offsetof(struct DummyBlock, row);
    displs[1]    = offsetof(struct DummyBlock, vals);
    types[0]     = MPIU_INT;
    types[1]     = MPIU_SCALAR;
    PetscCallMPI(MPI_Type_create_struct(2, blocklens, displs, types, &stype));
    PetscCallMPI(MPI_Type_commit(&stype));
    /* Resize so the extent matches the padded in-memory stride of consecutive blocks */
    PetscCallMPI(MPI_Type_create_resized(stype, 0, stash->blocktype_size, &stash->blocktype));
    PetscCallMPI(MPI_Type_commit(&stash->blocktype));
    PetscCallMPI(MPI_Type_free(&stype));
  }
  PetscFunctionReturn(0);
}
781 
/* Callback invoked after target rank has initiated receive of rendezvous message.
 * Here we post the main sends.
 */
785 static PetscErrorCode MatStashBTSSend_Private(MPI_Comm comm, const PetscMPIInt tag[], PetscMPIInt rankid, PetscMPIInt rank, void *sdata, MPI_Request req[], void *ctx)
786 {
787   MatStash       *stash = (MatStash *)ctx;
788   MatStashHeader *hdr   = (MatStashHeader *)sdata;
789 
790   PetscFunctionBegin;
791   PetscCheck(rank == stash->sendranks[rankid], comm, PETSC_ERR_PLIB, "BTS Send rank %d does not match sendranks[%d] %d", rank, rankid, stash->sendranks[rankid]);
792   PetscCallMPI(MPI_Isend(stash->sendframes[rankid].buffer, hdr->count, stash->blocktype, rank, tag[0], comm, &req[0]));
793   stash->sendframes[rankid].count   = hdr->count;
794   stash->sendframes[rankid].pending = 1;
795   PetscFunctionReturn(0);
796 }
797 
798 /*
799     Callback invoked by target after receiving rendezvous message.
800     Here we post the main recvs.
801  */
802 static PetscErrorCode MatStashBTSRecv_Private(MPI_Comm comm, const PetscMPIInt tag[], PetscMPIInt rank, void *rdata, MPI_Request req[], void *ctx)
803 {
804   MatStash       *stash = (MatStash *)ctx;
805   MatStashHeader *hdr   = (MatStashHeader *)rdata;
806   MatStashFrame  *frame;
807 
808   PetscFunctionBegin;
809   PetscCall(PetscSegBufferGet(stash->segrecvframe, 1, &frame));
810   PetscCall(PetscSegBufferGet(stash->segrecvblocks, hdr->count, &frame->buffer));
811   PetscCallMPI(MPI_Irecv(frame->buffer, hdr->count, stash->blocktype, rank, tag[0], comm, &req[0]));
812   frame->count   = hdr->count;
813   frame->pending = 1;
814   PetscFunctionReturn(0);
815 }
816 
817 /*
818  * owners[] contains the ownership ranges; may be indexed by either blocks or scalars
819  */
static PetscErrorCode MatStashScatterBegin_BTS(Mat mat, MatStash *stash, PetscInt owners[])
{
  size_t nblocks;    /* total number of stashed blocks to send */
  char  *sendblocks; /* packed send buffer; consecutive blocks are stash->blocktype_size bytes apart */

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) { /* make sure all processors are either in INSERTMODE or ADDMODE */
    InsertMode addv;
    PetscCall(MPIU_Allreduce((PetscEnum *)&mat->insertmode, (PetscEnum *)&addv, 1, MPIU_ENUM, MPI_BOR, PetscObjectComm((PetscObject)mat)));
    PetscCheck(addv != (ADD_VALUES | INSERT_VALUES), PetscObjectComm((PetscObject)mat), PETSC_ERR_ARG_WRONGSTATE, "Some processors inserted others added");
  }

  PetscCall(MatStashBlockTypeSetUp(stash));
  PetscCall(MatStashSortCompress_Private(stash, mat->insertmode));
  PetscCall(PetscSegBufferGetSize(stash->segsendblocks, &nblocks));
  PetscCall(PetscSegBufferExtractInPlace(stash->segsendblocks, &sendblocks));
  if (stash->first_assembly_done) { /* Set up sendhdrs and sendframes for each rank that we sent before */
    PetscInt i;
    size_t   b;
    for (i = 0, b = 0; i < stash->nsendranks; i++) {
      stash->sendframes[i].buffer = &sendblocks[b * stash->blocktype_size];
      /* sendhdr is never actually sent, but the count is used by MatStashBTSSend_Private */
      stash->sendhdr[i].count = 0; /* Might remain empty (in which case we send a zero-sized message) if no values are communicated to that process */
      for (; b < nblocks; b++) {
        MatStashBlock *sendblock_b = (MatStashBlock *)&sendblocks[b * stash->blocktype_size];
        PetscCheck(sendblock_b->row >= owners[stash->sendranks[i]], stash->comm, PETSC_ERR_ARG_WRONG, "MAT_SUBSET_OFF_PROC_ENTRIES set, but row %" PetscInt_FMT " owned by %d not communicated in initial assembly", sendblock_b->row, stash->sendranks[i]);
        if (sendblock_b->row >= owners[stash->sendranks[i] + 1]) break;
        stash->sendhdr[i].count++;
      }
    }
  } else { /* Dynamically count and pack (first time) */
    PetscInt sendno;
    size_t   i, rowstart;

    /* Count number of send ranks and allocate for sends */
    stash->nsendranks = 0;
    for (rowstart = 0; rowstart < nblocks;) {
      PetscInt       owner;
      MatStashBlock *sendblock_rowstart = (MatStashBlock *)&sendblocks[rowstart * stash->blocktype_size];
      PetscCall(PetscFindInt(sendblock_rowstart->row, stash->size + 1, owners, &owner));
      if (owner < 0) owner = -(owner + 2); /* map PetscFindInt's negative "not found" encoding back to the owning rank */
      for (i = rowstart + 1; i < nblocks; i++) { /* Move forward through a run of blocks with the same owner */
        MatStashBlock *sendblock_i = (MatStashBlock *)&sendblocks[i * stash->blocktype_size];
        if (sendblock_i->row >= owners[owner + 1]) break;
      }
      stash->nsendranks++;
      rowstart = i;
    }
    PetscCall(PetscMalloc3(stash->nsendranks, &stash->sendranks, stash->nsendranks, &stash->sendhdr, stash->nsendranks, &stash->sendframes));

    /* Set up sendhdrs and sendframes: second pass over the same runs found above */
    sendno = 0;
    for (rowstart = 0; rowstart < nblocks;) {
      PetscInt       owner;
      MatStashBlock *sendblock_rowstart = (MatStashBlock *)&sendblocks[rowstart * stash->blocktype_size];
      PetscCall(PetscFindInt(sendblock_rowstart->row, stash->size + 1, owners, &owner));
      if (owner < 0) owner = -(owner + 2); /* same decoding as in the counting pass */
      stash->sendranks[sendno] = owner;
      for (i = rowstart + 1; i < nblocks; i++) { /* Move forward through a run of blocks with the same owner */
        MatStashBlock *sendblock_i = (MatStashBlock *)&sendblocks[i * stash->blocktype_size];
        if (sendblock_i->row >= owners[owner + 1]) break;
      }
      stash->sendframes[sendno].buffer  = sendblock_rowstart;
      stash->sendframes[sendno].pending = 0;
      stash->sendhdr[sendno].count      = i - rowstart;
      sendno++;
      rowstart = i;
    }
    PetscCheck(sendno == stash->nsendranks, stash->comm, PETSC_ERR_PLIB, "BTS counted %d sendranks, but %" PetscInt_FMT " sends", stash->nsendranks, sendno);
  }

  /* Encode insertmode on the outgoing messages. If we want to support more than two options, we would need a new
   * message or a dummy entry of some sort. */
  if (mat->insertmode == INSERT_VALUES) {
    size_t i;
    for (i = 0; i < nblocks; i++) { /* negative row means INSERT_VALUES; decoded in MatStashScatterGetMesg_BTS */
      MatStashBlock *sendblock_i = (MatStashBlock *)&sendblocks[i * stash->blocktype_size];
      sendblock_i->row           = -(sendblock_i->row + 1);
    }
  }

  if (stash->first_assembly_done) {
    /* Communication contexts from the initial assembly are being reused: post recvs and sends directly */
    PetscMPIInt i, tag;
    PetscCall(PetscCommGetNewTag(stash->comm, &tag));
    for (i = 0; i < stash->nrecvranks; i++) PetscCall(MatStashBTSRecv_Private(stash->comm, &tag, stash->recvranks[i], &stash->recvhdr[i], &stash->recvreqs[i], stash));
    for (i = 0; i < stash->nsendranks; i++) PetscCall(MatStashBTSSend_Private(stash->comm, &tag, i, stash->sendranks[i], &stash->sendhdr[i], &stash->sendreqs[i], stash));
    stash->use_status = PETSC_TRUE; /* Use count from message status. */
  } else {
    /* First assembly: discover the receive ranks via a rendezvous; the callbacks post the actual sends/recvs */
    PetscCall(PetscCommBuildTwoSidedFReq(stash->comm, 1, MPIU_INT, stash->nsendranks, stash->sendranks, (PetscInt *)stash->sendhdr, &stash->nrecvranks, &stash->recvranks, (PetscInt *)&stash->recvhdr, 1, &stash->sendreqs, &stash->recvreqs, MatStashBTSSend_Private, MatStashBTSRecv_Private, stash));
    PetscCall(PetscMalloc2(stash->nrecvranks, &stash->some_indices, stash->nrecvranks, &stash->some_statuses));
    stash->use_status = PETSC_FALSE; /* Use count from header instead of from message. */
  }

  PetscCall(PetscSegBufferExtractInPlace(stash->segrecvframe, &stash->recvframes));
  /* Reset the per-assembly receive/progress state consumed by MatStashScatterGetMesg_BTS */
  stash->recvframe_active    = NULL;
  stash->recvframe_i         = 0;
  stash->some_i              = 0;
  stash->some_count          = 0;
  stash->recvcount           = 0;
  stash->first_assembly_done = mat->assembly_subset; /* See the same logic in VecAssemblyBegin_MPI_BTS */
  stash->insertmode          = &mat->insertmode;
  PetscFunctionReturn(0);
}
923 
/* Hand out one received stashed block per call: on success *flg = 1 and (*row, *col, *val)
   point into the receive buffer; *flg = 0 once every message has been fully consumed. */
static PetscErrorCode MatStashScatterGetMesg_BTS(MatStash *stash, PetscMPIInt *n, PetscInt **row, PetscInt **col, PetscScalar **val, PetscInt *flg)
{
  MatStashBlock *block;

  PetscFunctionBegin;
  *flg = 0;
  /* Advance to a frame with unread blocks, waiting for further messages as needed */
  while (!stash->recvframe_active || stash->recvframe_i == stash->recvframe_count) {
    if (stash->some_i == stash->some_count) {
      if (stash->recvcount == stash->nrecvranks) PetscFunctionReturn(0); /* Done */
      PetscCallMPI(MPI_Waitsome(stash->nrecvranks, stash->recvreqs, &stash->some_count, stash->some_indices, stash->use_status ? stash->some_statuses : MPI_STATUSES_IGNORE));
      stash->some_i = 0;
    }
    stash->recvframe_active = &stash->recvframes[stash->some_indices[stash->some_i]];
    stash->recvframe_count  = stash->recvframe_active->count; /* From header; maximum count */
    if (stash->use_status) {                                  /* Count what was actually sent */
      PetscCallMPI(MPI_Get_count(&stash->some_statuses[stash->some_i], stash->blocktype, &stash->recvframe_count));
    }
    if (stash->recvframe_count > 0) { /* Check for InsertMode consistency */
      block = (MatStashBlock *)&((char *)stash->recvframe_active->buffer)[0];
      /* A negative row encodes INSERT_VALUES; non-negative encodes ADD_VALUES (set in MatStashScatterBegin_BTS) */
      if (PetscUnlikely(*stash->insertmode == NOT_SET_VALUES)) *stash->insertmode = block->row < 0 ? INSERT_VALUES : ADD_VALUES;
      PetscCheck(*stash->insertmode != INSERT_VALUES || block->row < 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Assembling INSERT_VALUES, but rank %d requested ADD_VALUES", stash->recvranks[stash->some_indices[stash->some_i]]);
      PetscCheck(*stash->insertmode != ADD_VALUES || block->row >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Assembling ADD_VALUES, but rank %d requested INSERT_VALUES", stash->recvranks[stash->some_indices[stash->some_i]]);
    }
    stash->some_i++;
    stash->recvcount++;
    stash->recvframe_i = 0;
  }
  *n    = 1; /* exactly one block is returned per call */
  block = (MatStashBlock *)&((char *)stash->recvframe_active->buffer)[stash->recvframe_i * stash->blocktype_size];
  if (block->row < 0) block->row = -(block->row + 1); /* decode the INSERT_VALUES row encoding */
  *row = &block->row;
  *col = &block->col;
  *val = block->vals;
  stash->recvframe_i++;
  *flg = 1;
  PetscFunctionReturn(0);
}
961 
962 static PetscErrorCode MatStashScatterEnd_BTS(MatStash *stash)
963 {
964   PetscFunctionBegin;
965   PetscCallMPI(MPI_Waitall(stash->nsendranks, stash->sendreqs, MPI_STATUSES_IGNORE));
966   if (stash->first_assembly_done) { /* Reuse the communication contexts, so consolidate and reset segrecvblocks  */
967     PetscCall(PetscSegBufferExtractInPlace(stash->segrecvblocks, NULL));
968   } else { /* No reuse, so collect everything. */
969     PetscCall(MatStashScatterDestroy_BTS(stash));
970   }
971 
972   /* Now update nmaxold to be app 10% more than max n used, this way the
973      wastage of space is reduced the next time this stash is used.
974      Also update the oldmax, only if it increases */
975   if (stash->n) {
976     PetscInt bs2     = stash->bs * stash->bs;
977     PetscInt oldnmax = ((int)(stash->n * 1.1) + 5) * bs2;
978     if (oldnmax > stash->oldnmax) stash->oldnmax = oldnmax;
979   }
980 
981   stash->nmax       = 0;
982   stash->n          = 0;
983   stash->reallocs   = -1;
984   stash->nprocessed = 0;
985 
986   PetscCall(PetscMatStashSpaceDestroy(&stash->space_head));
987 
988   stash->space = NULL;
989 
990   PetscFunctionReturn(0);
991 }
992 
/* Free all communication state built for the BTS scatter. Safe to call on a stash
   whose block type was never set up (blocktype is then MPI_DATATYPE_NULL). */
PetscErrorCode MatStashScatterDestroy_BTS(MatStash *stash)
{
  PetscFunctionBegin;
  PetscCall(PetscSegBufferDestroy(&stash->segsendblocks));
  PetscCall(PetscSegBufferDestroy(&stash->segrecvframe));
  stash->recvframes = NULL; /* pointed into segrecvframe, which was just destroyed */
  PetscCall(PetscSegBufferDestroy(&stash->segrecvblocks));
  if (stash->blocktype != MPI_DATATYPE_NULL) PetscCallMPI(MPI_Type_free(&stash->blocktype)); /* only free a committed type */
  stash->nsendranks = 0;
  stash->nrecvranks = 0;
  PetscCall(PetscFree3(stash->sendranks, stash->sendhdr, stash->sendframes));
  PetscCall(PetscFree(stash->sendreqs));
  PetscCall(PetscFree(stash->recvreqs));
  PetscCall(PetscFree(stash->recvranks));
  PetscCall(PetscFree(stash->recvhdr));
  PetscCall(PetscFree2(stash->some_indices, stash->some_statuses));
  PetscFunctionReturn(0);
}
1011 #endif
1012