xref: /petsc/src/mat/utils/matstash.c (revision ebead697dbf761eb322f829370bbe90b3bd93fa3)
1 
2 #include <petsc/private/matimpl.h>
3 
4 #define DEFAULT_STASH_SIZE   10000
5 
6 static PetscErrorCode MatStashScatterBegin_Ref(Mat,MatStash*,PetscInt*);
7 PETSC_INTERN PetscErrorCode MatStashScatterGetMesg_Ref(MatStash*,PetscMPIInt*,PetscInt**,PetscInt**,PetscScalar**,PetscInt*);
8 PETSC_INTERN PetscErrorCode MatStashScatterEnd_Ref(MatStash*);
9 #if !defined(PETSC_HAVE_MPIUNI)
10 static PetscErrorCode MatStashScatterBegin_BTS(Mat,MatStash*,PetscInt*);
11 static PetscErrorCode MatStashScatterGetMesg_BTS(MatStash*,PetscMPIInt*,PetscInt**,PetscInt**,PetscScalar**,PetscInt*);
12 static PetscErrorCode MatStashScatterEnd_BTS(MatStash*);
13 #endif
14 
/*
  MatStashCreate_Private - Creates a stash, currently used by all the parallel
  matrix implementations. The stash is where elements of a matrix destined
  to be stored on other processors are kept until matrix assembly is done.

  This is a simple-minded stash. It simply adds entries to the end of the stash.

  Input Parameters:
  comm - communicator, required for scatters.
  bs   - stash block size. used when stashing blocks of values

  Output Parameter:
  stash    - the newly created stash
*/
PetscErrorCode MatStashCreate_Private(MPI_Comm comm,PetscInt bs,MatStash *stash)
{
  PetscInt       max,*opt,nopt,i;
  PetscBool      flg;

  PetscFunctionBegin;
  /* Require 2 tags,get the second using PetscCommGetNewTag() */
  stash->comm = comm;

  PetscCall(PetscCommGetNewTag(stash->comm,&stash->tag1));
  PetscCall(PetscCommGetNewTag(stash->comm,&stash->tag2));
  PetscCallMPI(MPI_Comm_size(stash->comm,&stash->size));
  PetscCallMPI(MPI_Comm_rank(stash->comm,&stash->rank));
  /* flg_v[] records, per source rank, which receive completed for the index
     message and which for the value message (see MatStashScatterGetMesg_Ref());
     -1 means "not received yet" */
  PetscCall(PetscMalloc1(2*stash->size,&stash->flg_v));
  for (i=0; i<2*stash->size; i++) stash->flg_v[i] = -1;

  /* The user may preset the stash size: a single value used on all ranks,
     or an array with one entry per rank */
  nopt = stash->size;
  PetscCall(PetscMalloc1(nopt,&opt));
  PetscCall(PetscOptionsGetIntArray(NULL,NULL,"-matstash_initial_size",opt,&nopt,&flg));
  if (flg) {
    if (nopt == 1)                max = opt[0];
    else if (nopt == stash->size) max = opt[stash->rank];
    else if (stash->rank < nopt)  max = opt[stash->rank];
    else                          max = 0; /* Use default */
    stash->umax = max;
  } else {
    stash->umax = 0;
  }
  PetscCall(PetscFree(opt));
  /* a non-positive block size means an unblocked (scalar) stash */
  if (bs <= 0) bs = 1;

  stash->bs         = bs;
  stash->nmax       = 0;
  stash->oldnmax    = 0;
  stash->n          = 0;
  stash->reallocs   = -1;
  stash->space_head = NULL;
  stash->space      = NULL;

  stash->send_waits  = NULL;
  stash->recv_waits  = NULL;
  stash->send_status = NULL;
  stash->nsends      = 0;
  stash->nrecvs      = 0;
  stash->svalues     = NULL;
  stash->rvalues     = NULL;
  stash->rindices    = NULL;
  stash->nprocessed  = 0;
  stash->reproduce   = PETSC_FALSE;
  stash->blocktype   = MPI_DATATYPE_NULL;

  PetscCall(PetscOptionsGetBool(NULL,NULL,"-matstash_reproduce",&stash->reproduce,NULL));
#if !defined(PETSC_HAVE_MPIUNI)
  /* Default to the BTS implementation; -matstash_legacy selects the reference one */
  flg  = PETSC_FALSE;
  PetscCall(PetscOptionsGetBool(NULL,NULL,"-matstash_legacy",&flg,NULL));
  if (!flg) {
    stash->ScatterBegin   = MatStashScatterBegin_BTS;
    stash->ScatterGetMesg = MatStashScatterGetMesg_BTS;
    stash->ScatterEnd     = MatStashScatterEnd_BTS;
    stash->ScatterDestroy = MatStashScatterDestroy_BTS;
  } else {
#endif
    stash->ScatterBegin   = MatStashScatterBegin_Ref;
    stash->ScatterGetMesg = MatStashScatterGetMesg_Ref;
    stash->ScatterEnd     = MatStashScatterEnd_Ref;
    stash->ScatterDestroy = NULL;
#if !defined(PETSC_HAVE_MPIUNI)
  }
#endif
  PetscFunctionReturn(0);
}
100 
101 /*
102    MatStashDestroy_Private - Destroy the stash
103 */
104 PetscErrorCode MatStashDestroy_Private(MatStash *stash)
105 {
106   PetscFunctionBegin;
107   PetscCall(PetscMatStashSpaceDestroy(&stash->space_head));
108   if (stash->ScatterDestroy) PetscCall((*stash->ScatterDestroy)(stash));
109 
110   stash->space = NULL;
111 
112   PetscCall(PetscFree(stash->flg_v));
113   PetscFunctionReturn(0);
114 }
115 
/*
   MatStashScatterEnd_Private - This is called as the final stage of the
   scatter. The final stages of message passing are done here, and
   all the memory used for message passing is cleaned up. This
   routine also resets the stash, and deallocates the memory used
   for the stash. It also keeps track of the current memory usage
   so that the same value can be used the next time through.
*/
124 PetscErrorCode MatStashScatterEnd_Private(MatStash *stash)
125 {
126   PetscFunctionBegin;
127   PetscCall((*stash->ScatterEnd)(stash));
128   PetscFunctionReturn(0);
129 }
130 
131 PETSC_INTERN PetscErrorCode MatStashScatterEnd_Ref(MatStash *stash)
132 {
133   PetscInt       nsends=stash->nsends,bs2,oldnmax,i;
134   MPI_Status     *send_status;
135 
136   PetscFunctionBegin;
137   for (i=0; i<2*stash->size; i++) stash->flg_v[i] = -1;
138   /* wait on sends */
139   if (nsends) {
140     PetscCall(PetscMalloc1(2*nsends,&send_status));
141     PetscCallMPI(MPI_Waitall(2*nsends,stash->send_waits,send_status));
142     PetscCall(PetscFree(send_status));
143   }
144 
145   /* Now update nmaxold to be app 10% more than max n used, this way the
146      wastage of space is reduced the next time this stash is used.
147      Also update the oldmax, only if it increases */
148   if (stash->n) {
149     bs2     = stash->bs*stash->bs;
150     oldnmax = ((int)(stash->n * 1.1) + 5)*bs2;
151     if (oldnmax > stash->oldnmax) stash->oldnmax = oldnmax;
152   }
153 
154   stash->nmax       = 0;
155   stash->n          = 0;
156   stash->reallocs   = -1;
157   stash->nprocessed = 0;
158 
159   PetscCall(PetscMatStashSpaceDestroy(&stash->space_head));
160 
161   stash->space = NULL;
162 
163   PetscCall(PetscFree(stash->send_waits));
164   PetscCall(PetscFree(stash->recv_waits));
165   PetscCall(PetscFree2(stash->svalues,stash->sindices));
166   PetscCall(PetscFree(stash->rvalues[0]));
167   PetscCall(PetscFree(stash->rvalues));
168   PetscCall(PetscFree(stash->rindices[0]));
169   PetscCall(PetscFree(stash->rindices));
170   PetscFunctionReturn(0);
171 }
172 
173 /*
174    MatStashGetInfo_Private - Gets the relevant statistics of the stash
175 
176    Input Parameters:
177    stash    - the stash
178    nstash   - the size of the stash. Indicates the number of values stored.
179    reallocs - the number of additional mallocs incurred.
180 
181 */
182 PetscErrorCode MatStashGetInfo_Private(MatStash *stash,PetscInt *nstash,PetscInt *reallocs)
183 {
184   PetscInt bs2 = stash->bs*stash->bs;
185 
186   PetscFunctionBegin;
187   if (nstash) *nstash = stash->n*bs2;
188   if (reallocs) {
189     if (stash->reallocs < 0) *reallocs = 0;
190     else                     *reallocs = stash->reallocs;
191   }
192   PetscFunctionReturn(0);
193 }
194 
195 /*
196    MatStashSetInitialSize_Private - Sets the initial size of the stash
197 
198    Input Parameters:
199    stash  - the stash
200    max    - the value that is used as the max size of the stash.
201             this value is used while allocating memory.
202 */
203 PetscErrorCode MatStashSetInitialSize_Private(MatStash *stash,PetscInt max)
204 {
205   PetscFunctionBegin;
206   stash->umax = max;
207   PetscFunctionReturn(0);
208 }
209 
210 /* MatStashExpand_Private - Expand the stash. This function is called
211    when the space in the stash is not sufficient to add the new values
212    being inserted into the stash.
213 
214    Input Parameters:
215    stash - the stash
216    incr  - the minimum increase requested
217 
218    Notes:
219    This routine doubles the currently used memory.
220  */
221 static PetscErrorCode MatStashExpand_Private(MatStash *stash,PetscInt incr)
222 {
223   PetscInt newnmax,bs2= stash->bs*stash->bs;
224 
225   PetscFunctionBegin;
226   /* allocate a larger stash */
227   if (!stash->oldnmax && !stash->nmax) { /* new stash */
228     if (stash->umax)                  newnmax = stash->umax/bs2;
229     else                              newnmax = DEFAULT_STASH_SIZE/bs2;
230   } else if (!stash->nmax) { /* resuing stash */
231     if (stash->umax > stash->oldnmax) newnmax = stash->umax/bs2;
232     else                              newnmax = stash->oldnmax/bs2;
233   } else                              newnmax = stash->nmax*2;
234   if (newnmax  < (stash->nmax + incr)) newnmax += 2*incr;
235 
236   /* Get a MatStashSpace and attach it to stash */
237   PetscCall(PetscMatStashSpaceGet(bs2,newnmax,&stash->space));
238   if (!stash->space_head) { /* new stash or resuing stash->oldnmax */
239     stash->space_head = stash->space;
240   }
241 
242   stash->reallocs++;
243   stash->nmax = newnmax;
244   PetscFunctionReturn(0);
245 }
/*
  MatStashValuesRow_Private - inserts values into the stash. This function
  expects the values to be row-oriented. Multiple columns belonging to the same
  row can be inserted with a single call to this function.

  Input Parameters:
  stash  - the stash
  row    - the global row corresponding to the values
  n      - the number of elements inserted. All elements belong to the above row.
  idxn   - the global column indices corresponding to each of the values.
  values - the values inserted
*/
258 PetscErrorCode MatStashValuesRow_Private(MatStash *stash,PetscInt row,PetscInt n,const PetscInt idxn[],const PetscScalar values[],PetscBool ignorezeroentries)
259 {
260   PetscInt           i,k,cnt = 0;
261   PetscMatStashSpace space=stash->space;
262 
263   PetscFunctionBegin;
264   /* Check and see if we have sufficient memory */
265   if (!space || space->local_remaining < n) {
266     PetscCall(MatStashExpand_Private(stash,n));
267   }
268   space = stash->space;
269   k     = space->local_used;
270   for (i=0; i<n; i++) {
271     if (ignorezeroentries && values && values[i] == 0.0) continue;
272     space->idx[k] = row;
273     space->idy[k] = idxn[i];
274     space->val[k] = values ? values[i] : 0.0;
275     k++;
276     cnt++;
277   }
278   stash->n               += cnt;
279   space->local_used      += cnt;
280   space->local_remaining -= cnt;
281   PetscFunctionReturn(0);
282 }
283 
/*
  MatStashValuesCol_Private - inserts values into the stash. This function
  expects the values to be column-oriented. Multiple columns belonging to the same
  row can be inserted with a single call to this function.

  Input Parameters:
  stash   - the stash
  row     - the global row corresponding to the values
  n       - the number of elements inserted. All elements belong to the above row.
  idxn    - the global column indices corresponding to each of the values.
  values  - the values inserted
  stepval - consecutive values are separated by a distance of stepval.
            this happens because the input is column-oriented.
*/
298 PetscErrorCode MatStashValuesCol_Private(MatStash *stash,PetscInt row,PetscInt n,const PetscInt idxn[],const PetscScalar values[],PetscInt stepval,PetscBool ignorezeroentries)
299 {
300   PetscInt           i,k,cnt = 0;
301   PetscMatStashSpace space=stash->space;
302 
303   PetscFunctionBegin;
304   /* Check and see if we have sufficient memory */
305   if (!space || space->local_remaining < n) {
306     PetscCall(MatStashExpand_Private(stash,n));
307   }
308   space = stash->space;
309   k     = space->local_used;
310   for (i=0; i<n; i++) {
311     if (ignorezeroentries && values && values[i*stepval] == 0.0) continue;
312     space->idx[k] = row;
313     space->idy[k] = idxn[i];
314     space->val[k] = values ? values[i*stepval] : 0.0;
315     k++;
316     cnt++;
317   }
318   stash->n               += cnt;
319   space->local_used      += cnt;
320   space->local_remaining -= cnt;
321   PetscFunctionReturn(0);
322 }
323 
/*
  MatStashValuesRowBlocked_Private - inserts blocks of values into the stash.
  This function expects the values to be row-oriented. Multiple columns belonging
  to the same block-row can be inserted with a single call to this function.
  This function extracts the sub-block of values based on the dimensions of
  the original input block, and the row,col values corresponding to the blocks.

  Input Parameters:
  stash  - the stash
  row    - the global block-row corresponding to the values
  n      - the number of elements inserted. All elements belong to the above row.
  idxn   - the global block-column indices corresponding to each of the blocks of
           values. Each block is of size bs*bs.
  values - the values inserted
  rmax   - the number of block-rows in the original block.
  cmax   - the number of block-columns in the original block.
  idx    - the index of the current block-row in the original block.
*/
342 PetscErrorCode MatStashValuesRowBlocked_Private(MatStash *stash,PetscInt row,PetscInt n,const PetscInt idxn[],const PetscScalar values[],PetscInt rmax,PetscInt cmax,PetscInt idx)
343 {
344   PetscInt           i,j,k,bs2,bs=stash->bs,l;
345   const PetscScalar  *vals;
346   PetscScalar        *array;
347   PetscMatStashSpace space=stash->space;
348 
349   PetscFunctionBegin;
350   if (!space || space->local_remaining < n) {
351     PetscCall(MatStashExpand_Private(stash,n));
352   }
353   space = stash->space;
354   l     = space->local_used;
355   bs2   = bs*bs;
356   for (i=0; i<n; i++) {
357     space->idx[l] = row;
358     space->idy[l] = idxn[i];
359     /* Now copy over the block of values. Store the values column oriented.
360        This enables inserting multiple blocks belonging to a row with a single
361        funtion call */
362     array = space->val + bs2*l;
363     vals  = values + idx*bs2*n + bs*i;
364     for (j=0; j<bs; j++) {
365       for (k=0; k<bs; k++) array[k*bs] = values ? vals[k] : 0.0;
366       array++;
367       vals += cmax*bs;
368     }
369     l++;
370   }
371   stash->n               += n;
372   space->local_used      += n;
373   space->local_remaining -= n;
374   PetscFunctionReturn(0);
375 }
376 
/*
  MatStashValuesColBlocked_Private - inserts blocks of values into the stash.
  This function expects the values to be column-oriented. Multiple columns belonging
  to the same block-row can be inserted with a single call to this function.
  This function extracts the sub-block of values based on the dimensions of
  the original input block, and the row,col values corresponding to the blocks.

  Input Parameters:
  stash  - the stash
  row    - the global block-row corresponding to the values
  n      - the number of elements inserted. All elements belong to the above row.
  idxn   - the global block-column indices corresponding to each of the blocks of
           values. Each block is of size bs*bs.
  values - the values inserted
  rmax   - the number of block-rows in the original block.
  cmax   - the number of block-columns in the original block.
  idx    - the index of the current block-row in the original block.
*/
395 PetscErrorCode MatStashValuesColBlocked_Private(MatStash *stash,PetscInt row,PetscInt n,const PetscInt idxn[],const PetscScalar values[],PetscInt rmax,PetscInt cmax,PetscInt idx)
396 {
397   PetscInt           i,j,k,bs2,bs=stash->bs,l;
398   const PetscScalar  *vals;
399   PetscScalar        *array;
400   PetscMatStashSpace space=stash->space;
401 
402   PetscFunctionBegin;
403   if (!space || space->local_remaining < n) {
404     PetscCall(MatStashExpand_Private(stash,n));
405   }
406   space = stash->space;
407   l     = space->local_used;
408   bs2   = bs*bs;
409   for (i=0; i<n; i++) {
410     space->idx[l] = row;
411     space->idy[l] = idxn[i];
412     /* Now copy over the block of values. Store the values column oriented.
413      This enables inserting multiple blocks belonging to a row with a single
414      funtion call */
415     array = space->val + bs2*l;
416     vals  = values + idx*bs2*n + bs*i;
417     for (j=0; j<bs; j++) {
418       for (k=0; k<bs; k++) array[k] = values ? vals[k] : 0.0;
419       array += bs;
420       vals  += rmax*bs;
421     }
422     l++;
423   }
424   stash->n               += n;
425   space->local_used      += n;
426   space->local_remaining -= n;
427   PetscFunctionReturn(0);
428 }
/*
  MatStashScatterBegin_Private - Initiates the transfer of values to the
  correct owners. This function goes through the stash, checks the
  owners of each stashed value, and sends the values off to the owner
  processors.

  Input Parameters:
  stash  - the stash
  owners - an array of size 'no-of-procs' which gives the ownership range
           for each node.

  Notes:
    The 'owners' array in the case of the blocked stash has the
  ranges specified in blocked global indices, and for the regular stash in
  the proper global indices.
*/
445 PetscErrorCode MatStashScatterBegin_Private(Mat mat,MatStash *stash,PetscInt *owners)
446 {
447   PetscFunctionBegin;
448   PetscCall((*stash->ScatterBegin)(mat,stash,owners));
449   PetscFunctionReturn(0);
450 }
451 
/*
  MatStashScatterBegin_Ref - Reference implementation of the scatter start-up.
  Determines the owner of every stashed entry, then posts two messages per
  destination rank: a (row,col) index message on tag1 and a values message on
  tag2. Receives are posted here and drained by MatStashScatterGetMesg_Ref().
*/
static PetscErrorCode MatStashScatterBegin_Ref(Mat mat,MatStash *stash,PetscInt *owners)
{
  PetscInt           *owner,*startv,*starti,tag1=stash->tag1,tag2=stash->tag2,bs2;
  PetscInt           size=stash->size,nsends;
  PetscInt           count,*sindices,**rindices,i,j,idx,lastidx,l;
  PetscScalar        **rvalues,*svalues;
  MPI_Comm           comm = stash->comm;
  MPI_Request        *send_waits,*recv_waits,*recv_waits1,*recv_waits2;
  PetscMPIInt        *sizes,*nlengths,nreceives;
  PetscInt           *sp_idx,*sp_idy;
  PetscScalar        *sp_val;
  PetscMatStashSpace space,space_next;

  PetscFunctionBegin;
  {                             /* make sure all processors are either in INSERTMODE or ADDMODE */
    InsertMode addv;
    PetscCall(MPIU_Allreduce((PetscEnum*)&mat->insertmode,(PetscEnum*)&addv,1,MPIU_ENUM,MPI_BOR,PetscObjectComm((PetscObject)mat)));
    PetscCheck(addv != (ADD_VALUES|INSERT_VALUES),PetscObjectComm((PetscObject)mat),PETSC_ERR_ARG_WRONGSTATE,"Some processors inserted others added");
    mat->insertmode = addv; /* in case this processor had no cache */
  }

  bs2 = stash->bs*stash->bs;

  /*  first count number of contributors to each processor */
  PetscCall(PetscCalloc1(size,&nlengths));
  PetscCall(PetscMalloc1(stash->n+1,&owner));

  i       = j    = 0;
  lastidx = -1;
  space   = stash->space_head;
  while (space) {
    space_next = space->next;
    sp_idx     = space->idx;
    for (l=0; l<space->local_used; l++) {
      /* if indices are NOT locally sorted, need to start search at the beginning */
      if (lastidx > (idx = sp_idx[l])) j = 0;
      lastidx = idx;
      /* linear search resumed from the last matched owner range */
      for (; j<size; j++) {
        if (idx >= owners[j] && idx < owners[j+1]) {
          nlengths[j]++; owner[i] = j; break;
        }
      }
      i++;
    }
    space = space_next;
  }

  /* Now check what procs get messages - and compute nsends. */
  PetscCall(PetscCalloc1(size,&sizes));
  for (i=0, nsends=0; i<size; i++) {
    if (nlengths[i]) {
      sizes[i] = 1; nsends++;
    }
  }

  {PetscMPIInt *onodes,*olengths;
   /* Determine the number of messages to expect, their lengths, from from-ids */
   PetscCall(PetscGatherNumberOfMessages(comm,sizes,nlengths,&nreceives));
   PetscCall(PetscGatherMessageLengths(comm,nsends,nreceives,nlengths,&onodes,&olengths));
   /* since clubbing row,col - lengths are multiplied by 2 */
   for (i=0; i<nreceives; i++) olengths[i] *=2;
   PetscCall(PetscPostIrecvInt(comm,tag1,nreceives,onodes,olengths,&rindices,&recv_waits1));
   /* values are size 'bs2' lengths (and remove earlier factor 2 */
   for (i=0; i<nreceives; i++) olengths[i] = olengths[i]*bs2/2;
   PetscCall(PetscPostIrecvScalar(comm,tag2,nreceives,onodes,olengths,&rvalues,&recv_waits2));
   PetscCall(PetscFree(onodes));
   PetscCall(PetscFree(olengths));}

  /* do sends:
      1) starts[i] gives the starting index in svalues for stuff going to
         the ith processor
  */
  PetscCall(PetscMalloc2(bs2*stash->n,&svalues,2*(stash->n+1),&sindices));
  PetscCall(PetscMalloc1(2*nsends,&send_waits));
  PetscCall(PetscMalloc2(size,&startv,size,&starti));
  /* use 2 sends the first with all_a, the next with all_i and all_j */
  startv[0] = 0; starti[0] = 0;
  for (i=1; i<size; i++) {
    startv[i] = startv[i-1] + nlengths[i-1];
    starti[i] = starti[i-1] + 2*nlengths[i-1];
  }

  /* pack the stashed entries into the send buffers, grouped by destination */
  i     = 0;
  space = stash->space_head;
  while (space) {
    space_next = space->next;
    sp_idx     = space->idx;
    sp_idy     = space->idy;
    sp_val     = space->val;
    for (l=0; l<space->local_used; l++) {
      j = owner[i];
      if (bs2 == 1) {
        svalues[startv[j]] = sp_val[l];
      } else {
        PetscInt    k;
        PetscScalar *buf1,*buf2;
        buf1 = svalues+bs2*startv[j];
        buf2 = space->val + bs2*l;
        for (k=0; k<bs2; k++) buf1[k] = buf2[k];
      }
      /* row indices occupy the first half of each rank's index segment, col indices the second */
      sindices[starti[j]]             = sp_idx[l];
      sindices[starti[j]+nlengths[j]] = sp_idy[l];
      startv[j]++;
      starti[j]++;
      i++;
    }
    space = space_next;
  }
  /* startv[] was consumed while packing; rebuild it for posting the sends */
  startv[0] = 0;
  for (i=1; i<size; i++) startv[i] = startv[i-1] + nlengths[i-1];

  for (i=0,count=0; i<size; i++) {
    if (sizes[i]) {
      PetscCallMPI(MPI_Isend(sindices+2*startv[i],2*nlengths[i],MPIU_INT,i,tag1,comm,send_waits+count++));
      PetscCallMPI(MPI_Isend(svalues+bs2*startv[i],bs2*nlengths[i],MPIU_SCALAR,i,tag2,comm,send_waits+count++));
    }
  }
#if defined(PETSC_USE_INFO)
  PetscCall(PetscInfo(NULL,"No of messages: %" PetscInt_FMT " \n",nsends));
  for (i=0; i<size; i++) {
    if (sizes[i]) {
      PetscCall(PetscInfo(NULL,"Mesg_to: %" PetscInt_FMT ": size: %zu bytes\n",i,(size_t)(nlengths[i]*(bs2*sizeof(PetscScalar)+2*sizeof(PetscInt)))));
    }
  }
#endif
  PetscCall(PetscFree(nlengths));
  PetscCall(PetscFree(owner));
  PetscCall(PetscFree2(startv,starti));
  PetscCall(PetscFree(sizes));

  /* recv_waits need to be contiguous for MatStashScatterGetMesg_Private() */
  PetscCall(PetscMalloc1(2*nreceives,&recv_waits));

  /* interleave: even slots are index receives (tag1), odd slots are value receives (tag2) */
  for (i=0; i<nreceives; i++) {
    recv_waits[2*i]   = recv_waits1[i];
    recv_waits[2*i+1] = recv_waits2[i];
  }
  stash->recv_waits = recv_waits;

  PetscCall(PetscFree(recv_waits1));
  PetscCall(PetscFree(recv_waits2));

  stash->svalues         = svalues;
  stash->sindices        = sindices;
  stash->rvalues         = rvalues;
  stash->rindices        = rindices;
  stash->send_waits      = send_waits;
  stash->nsends          = nsends;
  stash->nrecvs          = nreceives;
  stash->reproduce_count = 0;
  PetscFunctionReturn(0);
}
604 
/*
   MatStashScatterGetMesg_Private - This function waits on the receives posted
   in the function MatStashScatterBegin_Private() and returns one message at
   a time to the calling function. If no messages are left, it indicates this
   by setting flg = 0, else it sets flg = 1.

   Input Parameters:
   stash - the stash

   Output Parameters:
   nvals - the number of entries in the current message.
   rows  - an array of row indices (or blocked indices) corresponding to the values
   cols  - an array of column indices (or blocked indices) corresponding to the values
   vals  - the values
   flg   - 0 indicates no more messages left, and the current call has no values associated.
           1 indicates that the current call successfully received a message, and the
             other output parameters nvals,rows,cols,vals are set appropriately.
*/
623 PetscErrorCode MatStashScatterGetMesg_Private(MatStash *stash,PetscMPIInt *nvals,PetscInt **rows,PetscInt **cols,PetscScalar **vals,PetscInt *flg)
624 {
625   PetscFunctionBegin;
626   PetscCall((*stash->ScatterGetMesg)(stash,nvals,rows,cols,vals,flg));
627   PetscFunctionReturn(0);
628 }
629 
/*
  MatStashScatterGetMesg_Ref - Reference implementation: completes receives
  until both the index message and the values message from some source rank
  have arrived, then returns that pair to the caller.
*/
PETSC_INTERN PetscErrorCode MatStashScatterGetMesg_Ref(MatStash *stash,PetscMPIInt *nvals,PetscInt **rows,PetscInt **cols,PetscScalar **vals,PetscInt *flg)
{
  PetscMPIInt    i,*flg_v = stash->flg_v,i1,i2;
  PetscInt       bs2;
  MPI_Status     recv_status;
  PetscBool      match_found = PETSC_FALSE;

  PetscFunctionBegin;
  *flg = 0; /* When a message is discovered this is reset to 1 */
  /* Return if no more messages to process */
  if (stash->nprocessed == stash->nrecvs) PetscFunctionReturn(0);

  bs2 = stash->bs*stash->bs;
  /* If a matching pair of receives are found, process them, and return the data to
     the calling function. Until then keep receiving messages */
  while (!match_found) {
    if (stash->reproduce) {
      /* complete receives in posting order so results are reproducible */
      i    = stash->reproduce_count++;
      PetscCallMPI(MPI_Wait(stash->recv_waits+i,&recv_status));
    } else {
      PetscCallMPI(MPI_Waitany(2*stash->nrecvs,stash->recv_waits,&i,&recv_status));
    }
    PetscCheck(recv_status.MPI_SOURCE >= 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Negative MPI source!");

    /* Now pack the received message into a structure which is usable by others */
    if (i % 2) {
      /* odd request slot: the values message (see the interleaving in MatStashScatterBegin_Ref) */
      PetscCallMPI(MPI_Get_count(&recv_status,MPIU_SCALAR,nvals));
      flg_v[2*recv_status.MPI_SOURCE] = i/2;
      *nvals = *nvals/bs2;
    } else {
      /* even request slot: the (row,col) index message */
      PetscCallMPI(MPI_Get_count(&recv_status,MPIU_INT,nvals));
      flg_v[2*recv_status.MPI_SOURCE+1] = i/2;
      *nvals = *nvals/2; /* This message has both row indices and col indices */
    }

    /* Check if we have both messages from this proc */
    i1 = flg_v[2*recv_status.MPI_SOURCE];
    i2 = flg_v[2*recv_status.MPI_SOURCE+1];
    if (i1 != -1 && i2 != -1) {
      /* rows occupy the first half of the index buffer, cols the second half */
      *rows = stash->rindices[i2];
      *cols = *rows + *nvals;
      *vals = stash->rvalues[i1];
      *flg  = 1;
      stash->nprocessed++;
      match_found = PETSC_TRUE;
    }
  }
  PetscFunctionReturn(0);
}
679 
680 #if !defined(PETSC_HAVE_MPIUNI)
/* One stashed (row,col) entry as sent over the wire by the BTS implementation.
   The layout must match the MPI datatype built in MatStashBlockTypeSetUp(). */
typedef struct {
  PetscInt    row;
  PetscInt    col;
  PetscScalar vals[1];          /* Actually an array of length bs2 */
} MatStashBlock;
686 
/*
  MatStashSortCompress_Private - Sorts the stashed entries by (row,col),
  combines duplicate insertions at the same location (summing for ADD_VALUES,
  keeping the last for insert), and packs the result as MatStashBlock entries
  into stash->segsendblocks.
*/
static PetscErrorCode MatStashSortCompress_Private(MatStash *stash,InsertMode insertmode)
{
  PetscMatStashSpace space;
  PetscInt           n = stash->n,bs = stash->bs,bs2 = bs*bs,cnt,*row,*col,*perm,rowstart,i;
  PetscScalar        **valptr;

  PetscFunctionBegin;
  PetscCall(PetscMalloc4(n,&row,n,&col,n,&valptr,n,&perm));
  /* Flatten the linked list of stash spaces into parallel row/col/valptr arrays */
  for (space=stash->space_head,cnt=0; space; space=space->next) {
    for (i=0; i<space->local_used; i++) {
      row[cnt] = space->idx[i];
      col[cnt] = space->idy[i];
      valptr[cnt] = &space->val[i*bs2];
      perm[cnt] = cnt;          /* Will tell us where to find valptr after sorting row[] and col[] */
      cnt++;
    }
  }
  PetscCheck(cnt == n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"MatStash n %" PetscInt_FMT ", but counted %" PetscInt_FMT " entries",n,cnt);
  PetscCall(PetscSortIntWithArrayPair(n,row,col,perm));
  /* Scan through the rows, sorting each one, combining duplicates, and packing send buffers */
  for (rowstart=0,cnt=0,i=1; i<=n; i++) {
    if (i == n || row[i] != row[rowstart]) {         /* Sort the last row. */
      PetscInt colstart;
      PetscCall(PetscSortIntWithArray(i-rowstart,&col[rowstart],&perm[rowstart]));
      for (colstart=rowstart; colstart<i;) { /* Compress multiple insertions to the same location */
        PetscInt j,l;
        MatStashBlock *block;
        PetscCall(PetscSegBufferGet(stash->segsendblocks,1,&block));
        block->row = row[rowstart];
        block->col = col[colstart];
        PetscCall(PetscArraycpy(block->vals,valptr[perm[colstart]],bs2));
        for (j=colstart+1; j<i && col[j] == col[colstart]; j++) { /* Add any extra stashed blocks at the same (row,col) */
          if (insertmode == ADD_VALUES) {
            for (l=0; l<bs2; l++) block->vals[l] += valptr[perm[j]][l];
          } else {
            /* insertion: the last stashed value at this location wins */
            PetscCall(PetscArraycpy(block->vals,valptr[perm[j]],bs2));
          }
        }
        colstart = j;
      }
      rowstart = i;
    }
  }
  PetscCall(PetscFree4(row,col,valptr,perm));
  PetscFunctionReturn(0);
}
733 
/*
  MatStashBlockTypeSetUp - Lazily builds the MPI datatype describing one
  MatStashBlock (two PetscInt indices followed by bs2 PetscScalars), resized
  so consecutive blocks in a buffer are blocktype_size bytes apart, and
  creates the segmented buffers used by the BTS scatter.
*/
static PetscErrorCode MatStashBlockTypeSetUp(MatStash *stash)
{
  PetscFunctionBegin;
  if (stash->blocktype == MPI_DATATYPE_NULL) {
    PetscInt     bs2 = PetscSqr(stash->bs);
    PetscMPIInt  blocklens[2];
    MPI_Aint     displs[2];
    MPI_Datatype types[2],stype;
    /*
        DummyBlock is a type having standard layout, even when PetscScalar is C++ std::complex.
       std::complex itself has standard layout, so does DummyBlock, recursively.
       To be compatible with C++ std::complex, complex implementations on GPUs must also have standard layout,
       though they can have different alignment, e.g, 16 bytes for double complex, instead of 8 bytes as in GCC stdlibc++.
       offsetof(type, member) only requires type to have standard layout. Ref. https://en.cppreference.com/w/cpp/types/offsetof.

       We can test if std::complex has standard layout with the following code:
       #include <iostream>
       #include <type_traits>
       #include <complex>
       int main() {
         std::cout << std::boolalpha;
         std::cout << std::is_standard_layout<std::complex<double> >::value << '\n';
       }
       Output: true
     */
    struct DummyBlock {PetscInt row,col; PetscScalar vals;};

    /* size of one wire block: header plus bs2 scalars */
    stash->blocktype_size = offsetof(struct DummyBlock,vals) + bs2*sizeof(PetscScalar);
    if (stash->blocktype_size % sizeof(PetscInt)) { /* Implies that PetscInt is larger and does not satisfy alignment without padding */
      stash->blocktype_size += sizeof(PetscInt) - stash->blocktype_size % sizeof(PetscInt);
    }
    PetscCall(PetscSegBufferCreate(stash->blocktype_size,1,&stash->segsendblocks));
    PetscCall(PetscSegBufferCreate(stash->blocktype_size,1,&stash->segrecvblocks));
    PetscCall(PetscSegBufferCreate(sizeof(MatStashFrame),1,&stash->segrecvframe));
    blocklens[0] = 2;
    blocklens[1] = bs2;
    displs[0] = offsetof(struct DummyBlock,row);
    displs[1] = offsetof(struct DummyBlock,vals);
    types[0] = MPIU_INT;
    types[1] = MPIU_SCALAR;
    PetscCallMPI(MPI_Type_create_struct(2,blocklens,displs,types,&stype));
    PetscCallMPI(MPI_Type_commit(&stype));
    /* resize so the extent equals blocktype_size, making arrays of blocks addressable */
    PetscCallMPI(MPI_Type_create_resized(stype,0,stash->blocktype_size,&stash->blocktype));
    PetscCallMPI(MPI_Type_commit(&stash->blocktype));
    PetscCallMPI(MPI_Type_free(&stype));
  }
  PetscFunctionReturn(0);
}
782 
/* Callback invoked after the target rank has initiated the receive of the rendezvous message.
 * Here we post the main sends.
 */
786 static PetscErrorCode MatStashBTSSend_Private(MPI_Comm comm,const PetscMPIInt tag[],PetscMPIInt rankid,PetscMPIInt rank,void *sdata,MPI_Request req[],void *ctx)
787 {
788   MatStash       *stash = (MatStash*)ctx;
789   MatStashHeader *hdr = (MatStashHeader*)sdata;
790 
791   PetscFunctionBegin;
792   PetscCheck(rank == stash->sendranks[rankid],comm,PETSC_ERR_PLIB,"BTS Send rank %d does not match sendranks[%d] %d",rank,rankid,stash->sendranks[rankid]);
793   PetscCallMPI(MPI_Isend(stash->sendframes[rankid].buffer,hdr->count,stash->blocktype,rank,tag[0],comm,&req[0]));
794   stash->sendframes[rankid].count = hdr->count;
795   stash->sendframes[rankid].pending = 1;
796   PetscFunctionReturn(0);
797 }
798 
799 /*
800     Callback invoked by target after receiving rendezvous message.
801     Here we post the main recvs.
802  */
803 static PetscErrorCode MatStashBTSRecv_Private(MPI_Comm comm,const PetscMPIInt tag[],PetscMPIInt rank,void *rdata,MPI_Request req[],void *ctx)
804 {
805   MatStash       *stash = (MatStash*)ctx;
806   MatStashHeader *hdr = (MatStashHeader*)rdata;
807   MatStashFrame  *frame;
808 
809   PetscFunctionBegin;
810   PetscCall(PetscSegBufferGet(stash->segrecvframe,1,&frame));
811   PetscCall(PetscSegBufferGet(stash->segrecvblocks,hdr->count,&frame->buffer));
812   PetscCallMPI(MPI_Irecv(frame->buffer,hdr->count,stash->blocktype,rank,tag[0],comm,&req[0]));
813   frame->count = hdr->count;
814   frame->pending = 1;
815   PetscFunctionReturn(0);
816 }
817 
818 /*
819  * owners[] contains the ownership ranges; may be indexed by either blocks or scalars
820  */
/* Starts the BTS (build-two-sided) scatter of stashed off-process entries.
   Sorts/compresses the stash into runs of blocks grouped by owning rank, builds (or reuses)
   per-rank send headers and frames, encodes the InsertMode in the sign of each row index,
   and posts the rendezvous (first assembly) or the sends/recvs directly (reused assembly). */
static PetscErrorCode MatStashScatterBegin_BTS(Mat mat,MatStash *stash,PetscInt owners[])
{
  size_t         nblocks;
  char           *sendblocks;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) { /* make sure all processors are either in INSERTMODE or ADDMODE */
    InsertMode addv;
    PetscCall(MPIU_Allreduce((PetscEnum*)&mat->insertmode,(PetscEnum*)&addv,1,MPIU_ENUM,MPI_BOR,PetscObjectComm((PetscObject)mat)));
    PetscCheck(addv != (ADD_VALUES|INSERT_VALUES),PetscObjectComm((PetscObject)mat),PETSC_ERR_ARG_WRONGSTATE,"Some processors inserted others added");
  }

  PetscCall(MatStashBlockTypeSetUp(stash));
  /* After sort/compress, sendblocks is a contiguous array of nblocks entries of size
     stash->blocktype_size; presumably sorted by row so runs per owner are contiguous — see
     MatStashSortCompress_Private for the exact ordering */
  PetscCall(MatStashSortCompress_Private(stash,mat->insertmode));
  PetscCall(PetscSegBufferGetSize(stash->segsendblocks,&nblocks));
  PetscCall(PetscSegBufferExtractInPlace(stash->segsendblocks,&sendblocks));
  if (stash->first_assembly_done) { /* Set up sendhdrs and sendframes for each rank that we sent before */
    PetscInt i;
    size_t b;
    /* Reuse path: the set of destination ranks is fixed from the first assembly; walk the sorted
       blocks once, charging each run to the next previously-known rank */
    for (i=0,b=0; i<stash->nsendranks; i++) {
      stash->sendframes[i].buffer = &sendblocks[b*stash->blocktype_size];
      /* sendhdr is never actually sent, but the count is used by MatStashBTSSend_Private */
      stash->sendhdr[i].count = 0; /* Might remain empty (in which case we send a zero-sized message) if no values are communicated to that process */
      for (; b<nblocks; b++) {
        MatStashBlock *sendblock_b = (MatStashBlock*)&sendblocks[b*stash->blocktype_size];
        /* A row below this rank's ownership range means we are sending to a rank not contacted
           in the initial assembly, which MAT_SUBSET_OFF_PROC_ENTRIES forbids */
        PetscCheck(sendblock_b->row >= owners[stash->sendranks[i]],stash->comm,PETSC_ERR_ARG_WRONG,"MAT_SUBSET_OFF_PROC_ENTRIES set, but row %" PetscInt_FMT " owned by %d not communicated in initial assembly",sendblock_b->row,stash->sendranks[i]);
        if (sendblock_b->row >= owners[stash->sendranks[i]+1]) break;
        stash->sendhdr[i].count++;
      }
    }
  } else {                      /* Dynamically count and pack (first time) */
    PetscInt sendno;
    size_t i,rowstart;

    /* Count number of send ranks and allocate for sends */
    stash->nsendranks = 0;
    for (rowstart=0; rowstart<nblocks;) {
      PetscInt owner;
      MatStashBlock *sendblock_rowstart = (MatStashBlock*)&sendblocks[rowstart*stash->blocktype_size];
      /* Binary search of owners[] for the rank owning this row; a negative result encodes the
         insertion point, converted here to the owning rank */
      PetscCall(PetscFindInt(sendblock_rowstart->row,stash->size+1,owners,&owner));
      if (owner < 0) owner = -(owner+2);
      for (i=rowstart+1; i<nblocks; i++) { /* Move forward through a run of blocks with the same owner */
        MatStashBlock *sendblock_i = (MatStashBlock*)&sendblocks[i*stash->blocktype_size];
        if (sendblock_i->row >= owners[owner+1]) break;
      }
      stash->nsendranks++;
      rowstart = i;
    }
    PetscCall(PetscMalloc3(stash->nsendranks,&stash->sendranks,stash->nsendranks,&stash->sendhdr,stash->nsendranks,&stash->sendframes));

    /* Set up sendhdrs and sendframes (second pass over the same runs found above) */
    sendno = 0;
    for (rowstart=0; rowstart<nblocks;) {
      PetscInt owner;
      MatStashBlock *sendblock_rowstart = (MatStashBlock*)&sendblocks[rowstart*stash->blocktype_size];
      PetscCall(PetscFindInt(sendblock_rowstart->row,stash->size+1,owners,&owner));
      if (owner < 0) owner = -(owner+2);
      stash->sendranks[sendno] = owner;
      for (i=rowstart+1; i<nblocks; i++) { /* Move forward through a run of blocks with the same owner */
        MatStashBlock *sendblock_i = (MatStashBlock*)&sendblocks[i*stash->blocktype_size];
        if (sendblock_i->row >= owners[owner+1]) break;
      }
      stash->sendframes[sendno].buffer = sendblock_rowstart;
      stash->sendframes[sendno].pending = 0;
      stash->sendhdr[sendno].count = i - rowstart;
      sendno++;
      rowstart = i;
    }
    PetscCheck(sendno == stash->nsendranks,stash->comm,PETSC_ERR_PLIB,"BTS counted %d sendranks, but %" PetscInt_FMT " sends",stash->nsendranks,sendno);
  }

  /* Encode insertmode on the outgoing messages. If we want to support more than two options, we would need a new
   * message or a dummy entry of some sort. */
  if (mat->insertmode == INSERT_VALUES) {
    size_t i;
    for (i=0; i<nblocks; i++) {
      MatStashBlock *sendblock_i = (MatStashBlock*)&sendblocks[i*stash->blocktype_size];
      /* row r is sent as -(r+1); MatStashScatterGetMesg_BTS decodes it and infers INSERT_VALUES */
      sendblock_i->row = -(sendblock_i->row+1);
    }
  }

  if (stash->first_assembly_done) {
    /* Reuse path: the rank lists are known, so skip the rendezvous and post recvs/sends directly */
    PetscMPIInt i,tag;
    PetscCall(PetscCommGetNewTag(stash->comm,&tag));
    for (i=0; i<stash->nrecvranks; i++) {
      PetscCall(MatStashBTSRecv_Private(stash->comm,&tag,stash->recvranks[i],&stash->recvhdr[i],&stash->recvreqs[i],stash));
    }
    for (i=0; i<stash->nsendranks; i++) {
      PetscCall(MatStashBTSSend_Private(stash->comm,&tag,i,stash->sendranks[i],&stash->sendhdr[i],&stash->sendreqs[i],stash));
    }
    stash->use_status = PETSC_TRUE; /* Use count from message status. */
  } else {
    /* First assembly: discover receive ranks via a two-sided rendezvous; the callbacks post the
       actual data sends/recvs as each header message is matched */
    PetscCall(PetscCommBuildTwoSidedFReq(stash->comm,1,MPIU_INT,stash->nsendranks,stash->sendranks,(PetscInt*)stash->sendhdr,
                                         &stash->nrecvranks,&stash->recvranks,(PetscInt*)&stash->recvhdr,1,&stash->sendreqs,&stash->recvreqs,
                                         MatStashBTSSend_Private,MatStashBTSRecv_Private,stash));
    PetscCall(PetscMalloc2(stash->nrecvranks,&stash->some_indices,stash->nrecvranks,&stash->some_statuses));
    stash->use_status = PETSC_FALSE; /* Use count from header instead of from message. */
  }

  /* Reset the receive-side cursor state consumed by MatStashScatterGetMesg_BTS */
  PetscCall(PetscSegBufferExtractInPlace(stash->segrecvframe,&stash->recvframes));
  stash->recvframe_active     = NULL;
  stash->recvframe_i          = 0;
  stash->some_i               = 0;
  stash->some_count           = 0;
  stash->recvcount            = 0;
  stash->first_assembly_done  = mat->assembly_subset; /* See the same logic in VecAssemblyBegin_MPI_BTS */
  stash->insertmode           = &mat->insertmode;
  PetscFunctionReturn(0);
}
930 
/* Hands back one received stash block per call.
   On success sets *n=1, *flg=1, and points *row/*col/*val into the active receive frame.
   When every message from all nrecvranks has been drained, returns with *flg=0.
   The outer while loop advances to the next frame (waiting on MPI_Waitsome as needed)
   whenever the active frame is exhausted. */
static PetscErrorCode MatStashScatterGetMesg_BTS(MatStash *stash,PetscMPIInt *n,PetscInt **row,PetscInt **col,PetscScalar **val,PetscInt *flg)
{
  MatStashBlock *block;

  PetscFunctionBegin;
  *flg = 0;
  while (!stash->recvframe_active || stash->recvframe_i == stash->recvframe_count) {
    if (stash->some_i == stash->some_count) {
      if (stash->recvcount == stash->nrecvranks) PetscFunctionReturn(0); /* Done */
      /* Wait for at least one more receive to complete; statuses are only needed when the
         actual count comes from the message rather than the header */
      PetscCallMPI(MPI_Waitsome(stash->nrecvranks,stash->recvreqs,&stash->some_count,stash->some_indices,stash->use_status?stash->some_statuses:MPI_STATUSES_IGNORE));
      stash->some_i = 0;
    }
    stash->recvframe_active = &stash->recvframes[stash->some_indices[stash->some_i]];
    stash->recvframe_count = stash->recvframe_active->count; /* From header; maximum count */
    if (stash->use_status) { /* Count what was actually sent */
      PetscCallMPI(MPI_Get_count(&stash->some_statuses[stash->some_i],stash->blocktype,&stash->recvframe_count));
    }
    if (stash->recvframe_count > 0) { /* Check for InsertMode consistency */
      /* A negative first row encodes INSERT_VALUES (set by MatStashScatterBegin_BTS);
         a non-negative row means the sender used ADD_VALUES */
      block = (MatStashBlock*)&((char*)stash->recvframe_active->buffer)[0];
      if (PetscUnlikely(*stash->insertmode == NOT_SET_VALUES)) *stash->insertmode = block->row < 0 ? INSERT_VALUES : ADD_VALUES;
      PetscCheck(*stash->insertmode != INSERT_VALUES || block->row < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Assembling INSERT_VALUES, but rank %d requested ADD_VALUES",stash->recvranks[stash->some_indices[stash->some_i]]);
      PetscCheck(*stash->insertmode != ADD_VALUES || block->row >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Assembling ADD_VALUES, but rank %d requested INSERT_VALUES",stash->recvranks[stash->some_indices[stash->some_i]]);
    }
    stash->some_i++;
    stash->recvcount++;
    stash->recvframe_i = 0;
  }
  *n = 1;
  block = (MatStashBlock*)&((char*)stash->recvframe_active->buffer)[stash->recvframe_i*stash->blocktype_size];
  if (block->row < 0) block->row = -(block->row + 1); /* Undo the INSERT_VALUES sign encoding */
  *row = &block->row;
  *col = &block->col;
  *val = block->vals;
  stash->recvframe_i++;
  *flg = 1;
  PetscFunctionReturn(0);
}
968 
969 static PetscErrorCode MatStashScatterEnd_BTS(MatStash *stash)
970 {
971   PetscFunctionBegin;
972   PetscCallMPI(MPI_Waitall(stash->nsendranks,stash->sendreqs,MPI_STATUSES_IGNORE));
973   if (stash->first_assembly_done) { /* Reuse the communication contexts, so consolidate and reset segrecvblocks  */
974     PetscCall(PetscSegBufferExtractInPlace(stash->segrecvblocks,NULL));
975   } else {                      /* No reuse, so collect everything. */
976     PetscCall(MatStashScatterDestroy_BTS(stash));
977   }
978 
979   /* Now update nmaxold to be app 10% more than max n used, this way the
980      wastage of space is reduced the next time this stash is used.
981      Also update the oldmax, only if it increases */
982   if (stash->n) {
983     PetscInt bs2     = stash->bs*stash->bs;
984     PetscInt oldnmax = ((int)(stash->n * 1.1) + 5)*bs2;
985     if (oldnmax > stash->oldnmax) stash->oldnmax = oldnmax;
986   }
987 
988   stash->nmax       = 0;
989   stash->n          = 0;
990   stash->reallocs   = -1;
991   stash->nprocessed = 0;
992 
993   PetscCall(PetscMatStashSpaceDestroy(&stash->space_head));
994 
995   stash->space = NULL;
996 
997   PetscFunctionReturn(0);
998 }
999 
/* Frees all BTS communication state held by the stash: segmented buffers, the committed MPI
   datatype, and the per-rank send/recv arrays. Non-static so it can also be called from
   MatStashScatterEnd_BTS and from stash teardown elsewhere in the library. */
PetscErrorCode MatStashScatterDestroy_BTS(MatStash *stash)
{
  PetscFunctionBegin;
  PetscCall(PetscSegBufferDestroy(&stash->segsendblocks));
  PetscCall(PetscSegBufferDestroy(&stash->segrecvframe));
  stash->recvframes = NULL; /* recvframes pointed into segrecvframe's storage, just freed above */
  PetscCall(PetscSegBufferDestroy(&stash->segrecvblocks));
  /* blocktype is only created on first use (MatStashBlockTypeSetUp), so guard the free */
  if (stash->blocktype != MPI_DATATYPE_NULL) {
    PetscCallMPI(MPI_Type_free(&stash->blocktype));
  }
  stash->nsendranks = 0;
  stash->nrecvranks = 0;
  /* sendranks/sendhdr/sendframes were allocated together with PetscMalloc3 in ScatterBegin */
  PetscCall(PetscFree3(stash->sendranks,stash->sendhdr,stash->sendframes));
  PetscCall(PetscFree(stash->sendreqs));
  PetscCall(PetscFree(stash->recvreqs));
  PetscCall(PetscFree(stash->recvranks));
  PetscCall(PetscFree(stash->recvhdr));
  /* some_indices/some_statuses were allocated together with PetscMalloc2 in ScatterBegin */
  PetscCall(PetscFree2(stash->some_indices,stash->some_statuses));
  PetscFunctionReturn(0);
}
1020 #endif
1021