xref: /petsc/src/mat/utils/matstash.c (revision 7af1ef243e8a1fccbe456ce54bc4ea54b1f855d8)
1 
2 #include <petsc/private/matimpl.h>
3 
4 #define DEFAULT_STASH_SIZE   10000
5 
6 static PetscErrorCode MatStashScatterBegin_Ref(Mat,MatStash*,PetscInt*);
7 PETSC_INTERN PetscErrorCode MatStashScatterGetMesg_Ref(MatStash*,PetscMPIInt*,PetscInt**,PetscInt**,PetscScalar**,PetscInt*);
8 PETSC_INTERN PetscErrorCode MatStashScatterEnd_Ref(MatStash*);
9 #if !defined(PETSC_HAVE_MPIUNI)
10 static PetscErrorCode MatStashScatterBegin_BTS(Mat,MatStash*,PetscInt*);
11 static PetscErrorCode MatStashScatterGetMesg_BTS(MatStash*,PetscMPIInt*,PetscInt**,PetscInt**,PetscScalar**,PetscInt*);
12 static PetscErrorCode MatStashScatterEnd_BTS(MatStash*);
13 #endif
14 
/*
  MatStashCreate_Private - Creates a stash, currently used for all the parallel
  matrix implementations. The stash is where elements of a matrix destined
  to be stored on other processors are kept until matrix assembly is done.

  This is a simple minded stash. Simply adds entries to end of stash.

  Input Parameters:
  comm - communicator, required for scatters.
  bs   - stash block size. used when stashing blocks of values

  Output Parameters:
  stash    - the newly created stash
*/
PetscErrorCode MatStashCreate_Private(MPI_Comm comm,PetscInt bs,MatStash *stash)
{
  PetscInt       max,*opt,nopt,i;
  PetscBool      flg;

  PetscFunctionBegin;
  /* Require 2 tags, get the second using PetscCommGetNewTag() */
  stash->comm = comm;

  PetscCall(PetscCommGetNewTag(stash->comm,&stash->tag1));
  PetscCall(PetscCommGetNewTag(stash->comm,&stash->tag2));
  PetscCallMPI(MPI_Comm_size(stash->comm,&stash->size));
  PetscCallMPI(MPI_Comm_rank(stash->comm,&stash->rank));
  /* flg_v holds two per-rank "message received" slots used by the Ref scatter; -1 means not yet received */
  PetscCall(PetscMalloc1(2*stash->size,&stash->flg_v));
  for (i=0; i<2*stash->size; i++) stash->flg_v[i] = -1;

  /* -matstash_initial_size accepts one global value, one value per rank, or a prefix of ranks */
  nopt = stash->size;
  PetscCall(PetscMalloc1(nopt,&opt));
  PetscCall(PetscOptionsGetIntArray(NULL,NULL,"-matstash_initial_size",opt,&nopt,&flg));
  if (flg) {
    if (nopt == 1)                max = opt[0];
    else if (nopt == stash->size) max = opt[stash->rank];
    else if (stash->rank < nopt)  max = opt[stash->rank];
    else                          max = 0; /* Use default */
    stash->umax = max;
  } else {
    stash->umax = 0;
  }
  PetscCall(PetscFree(opt));
  if (bs <= 0) bs = 1;

  stash->bs         = bs;
  stash->nmax       = 0;
  stash->oldnmax    = 0;
  stash->n          = 0;
  stash->reallocs   = -1;
  stash->space_head = NULL;
  stash->space      = NULL;

  stash->send_waits  = NULL;
  stash->recv_waits  = NULL;
  stash->send_status = NULL;
  stash->nsends      = 0;
  stash->nrecvs      = 0;
  stash->svalues     = NULL;
  stash->rvalues     = NULL;
  stash->rindices    = NULL;
  stash->nprocessed  = 0;
  stash->reproduce   = PETSC_FALSE;
  stash->blocktype   = MPI_DATATYPE_NULL;

  PetscCall(PetscOptionsGetBool(NULL,NULL,"-matstash_reproduce",&stash->reproduce,NULL));
#if !defined(PETSC_HAVE_MPIUNI)
  /* Default to the BTS scatter implementation; -matstash_legacy selects the reference one */
  flg  = PETSC_FALSE;
  PetscCall(PetscOptionsGetBool(NULL,NULL,"-matstash_legacy",&flg,NULL));
  if (!flg) {
    stash->ScatterBegin   = MatStashScatterBegin_BTS;
    stash->ScatterGetMesg = MatStashScatterGetMesg_BTS;
    stash->ScatterEnd     = MatStashScatterEnd_BTS;
    stash->ScatterDestroy = MatStashScatterDestroy_BTS;
  } else {
#endif
    stash->ScatterBegin   = MatStashScatterBegin_Ref;
    stash->ScatterGetMesg = MatStashScatterGetMesg_Ref;
    stash->ScatterEnd     = MatStashScatterEnd_Ref;
    stash->ScatterDestroy = NULL;
#if !defined(PETSC_HAVE_MPIUNI)
  }
#endif
  PetscFunctionReturn(0);
}
100 
/*
   MatStashDestroy_Private - Destroy the stash: release the stashed-value
   storage, any scatter-implementation private state, and the per-rank flag array.
*/
PetscErrorCode MatStashDestroy_Private(MatStash *stash)
{
  PetscFunctionBegin;
  PetscCall(PetscMatStashSpaceDestroy(&stash->space_head));
  /* ScatterDestroy is set only by the BTS implementation; the Ref path leaves it NULL */
  if (stash->ScatterDestroy) PetscCall((*stash->ScatterDestroy)(stash));

  stash->space = NULL;

  PetscCall(PetscFree(stash->flg_v));
  PetscFunctionReturn(0);
}
115 
116 /*
117    MatStashScatterEnd_Private - This is called as the final stage of
118    scatter. The final stages of message passing is done here, and
119    all the memory used for message passing is cleaned up. This
120    routine also resets the stash, and deallocates the memory used
121    for the stash. It also keeps track of the current memory usage
122    so that the same value can be used the next time through.
123 */
124 PetscErrorCode MatStashScatterEnd_Private(MatStash *stash)
125 {
126   PetscFunctionBegin;
127   PetscCall((*stash->ScatterEnd)(stash));
128   PetscFunctionReturn(0);
129 }
130 
/*
   MatStashScatterEnd_Ref - Reference implementation of the final scatter
   stage: waits for all outstanding sends, records a high-water mark so the
   next assembly can preallocate, and frees all communication buffers.
*/
PETSC_INTERN PetscErrorCode MatStashScatterEnd_Ref(MatStash *stash)
{
  PetscInt       nsends=stash->nsends,bs2,oldnmax,i;
  MPI_Status     *send_status;

  PetscFunctionBegin;
  /* reset the per-rank "message received" flags for the next assembly */
  for (i=0; i<2*stash->size; i++) stash->flg_v[i] = -1;
  /* wait on sends (two requests per destination: indices and values) */
  if (nsends) {
    PetscCall(PetscMalloc1(2*nsends,&send_status));
    PetscCallMPI(MPI_Waitall(2*nsends,stash->send_waits,send_status));
    PetscCall(PetscFree(send_status));
  }

  /* Now update oldnmax to be approximately 10% more than the maximum n used;
     this reduces the wasted space the next time this stash is used.
     Only update oldnmax if it increases. */
  if (stash->n) {
    bs2     = stash->bs*stash->bs;
    oldnmax = ((int)(stash->n * 1.1) + 5)*bs2;
    if (oldnmax > stash->oldnmax) stash->oldnmax = oldnmax;
  }

  stash->nmax       = 0;
  stash->n          = 0;
  stash->reallocs   = -1;
  stash->nprocessed = 0;

  PetscCall(PetscMatStashSpaceDestroy(&stash->space_head));

  stash->space = NULL;

  /* free the buffers allocated by MatStashScatterBegin_Ref(); rvalues/rindices
     are arrays of pointers whose first entry owns the contiguous data */
  PetscCall(PetscFree(stash->send_waits));
  PetscCall(PetscFree(stash->recv_waits));
  PetscCall(PetscFree2(stash->svalues,stash->sindices));
  PetscCall(PetscFree(stash->rvalues[0]));
  PetscCall(PetscFree(stash->rvalues));
  PetscCall(PetscFree(stash->rindices[0]));
  PetscCall(PetscFree(stash->rindices));
  PetscFunctionReturn(0);
}
172 
173 /*
174    MatStashGetInfo_Private - Gets the relavant statistics of the stash
175 
176    Input Parameters:
177    stash    - the stash
178    nstash   - the size of the stash. Indicates the number of values stored.
179    reallocs - the number of additional mallocs incurred.
180 
181 */
182 PetscErrorCode MatStashGetInfo_Private(MatStash *stash,PetscInt *nstash,PetscInt *reallocs)
183 {
184   PetscInt bs2 = stash->bs*stash->bs;
185 
186   PetscFunctionBegin;
187   if (nstash) *nstash = stash->n*bs2;
188   if (reallocs) {
189     if (stash->reallocs < 0) *reallocs = 0;
190     else                     *reallocs = stash->reallocs;
191   }
192   PetscFunctionReturn(0);
193 }
194 
/*
   MatStashSetInitialSize_Private - Sets the initial size of the stash

   Input Parameters:
   stash  - the stash
   max    - the value that is used as the max size of the stash.
            this value is used while allocating memory.
*/
PetscErrorCode MatStashSetInitialSize_Private(MatStash *stash,PetscInt max)
{
  PetscFunctionBegin;
  /* umax is consulted by MatStashExpand_Private() when the stash is (re)allocated */
  stash->umax = max;
  PetscFunctionReturn(0);
}
209 
210 /* MatStashExpand_Private - Expand the stash. This function is called
211    when the space in the stash is not sufficient to add the new values
212    being inserted into the stash.
213 
214    Input Parameters:
215    stash - the stash
216    incr  - the minimum increase requested
217 
218    Notes:
219    This routine doubles the currently used memory.
220  */
221 static PetscErrorCode MatStashExpand_Private(MatStash *stash,PetscInt incr)
222 {
223   PetscInt newnmax,bs2= stash->bs*stash->bs;
224 
225   PetscFunctionBegin;
226   /* allocate a larger stash */
227   if (!stash->oldnmax && !stash->nmax) { /* new stash */
228     if (stash->umax)                  newnmax = stash->umax/bs2;
229     else                              newnmax = DEFAULT_STASH_SIZE/bs2;
230   } else if (!stash->nmax) { /* resuing stash */
231     if (stash->umax > stash->oldnmax) newnmax = stash->umax/bs2;
232     else                              newnmax = stash->oldnmax/bs2;
233   } else                              newnmax = stash->nmax*2;
234   if (newnmax  < (stash->nmax + incr)) newnmax += 2*incr;
235 
236   /* Get a MatStashSpace and attach it to stash */
237   PetscCall(PetscMatStashSpaceGet(bs2,newnmax,&stash->space));
238   if (!stash->space_head) { /* new stash or resuing stash->oldnmax */
239     stash->space_head = stash->space;
240   }
241 
242   stash->reallocs++;
243   stash->nmax = newnmax;
244   PetscFunctionReturn(0);
245 }
246 /*
247   MatStashValuesRow_Private - inserts values into the stash. This function
248   expects the values to be roworiented. Multiple columns belong to the same row
249   can be inserted with a single call to this function.
250 
251   Input Parameters:
252   stash  - the stash
253   row    - the global row correspoiding to the values
254   n      - the number of elements inserted. All elements belong to the above row.
255   idxn   - the global column indices corresponding to each of the values.
256   values - the values inserted
257 */
258 PetscErrorCode MatStashValuesRow_Private(MatStash *stash,PetscInt row,PetscInt n,const PetscInt idxn[],const PetscScalar values[],PetscBool ignorezeroentries)
259 {
260   PetscInt           i,k,cnt = 0;
261   PetscMatStashSpace space=stash->space;
262 
263   PetscFunctionBegin;
264   /* Check and see if we have sufficient memory */
265   if (!space || space->local_remaining < n) {
266     PetscCall(MatStashExpand_Private(stash,n));
267   }
268   space = stash->space;
269   k     = space->local_used;
270   for (i=0; i<n; i++) {
271     if (ignorezeroentries && values && values[i] == 0.0) continue;
272     space->idx[k] = row;
273     space->idy[k] = idxn[i];
274     space->val[k] = values ? values[i] : 0.0;
275     k++;
276     cnt++;
277   }
278   stash->n               += cnt;
279   space->local_used      += cnt;
280   space->local_remaining -= cnt;
281   PetscFunctionReturn(0);
282 }
283 
284 /*
285   MatStashValuesCol_Private - inserts values into the stash. This function
286   expects the values to be columnoriented. Multiple columns belong to the same row
287   can be inserted with a single call to this function.
288 
289   Input Parameters:
290   stash   - the stash
291   row     - the global row correspoiding to the values
292   n       - the number of elements inserted. All elements belong to the above row.
293   idxn    - the global column indices corresponding to each of the values.
294   values  - the values inserted
295   stepval - the consecutive values are sepated by a distance of stepval.
296             this happens because the input is columnoriented.
297 */
298 PetscErrorCode MatStashValuesCol_Private(MatStash *stash,PetscInt row,PetscInt n,const PetscInt idxn[],const PetscScalar values[],PetscInt stepval,PetscBool ignorezeroentries)
299 {
300   PetscInt           i,k,cnt = 0;
301   PetscMatStashSpace space=stash->space;
302 
303   PetscFunctionBegin;
304   /* Check and see if we have sufficient memory */
305   if (!space || space->local_remaining < n) {
306     PetscCall(MatStashExpand_Private(stash,n));
307   }
308   space = stash->space;
309   k     = space->local_used;
310   for (i=0; i<n; i++) {
311     if (ignorezeroentries && values && values[i*stepval] == 0.0) continue;
312     space->idx[k] = row;
313     space->idy[k] = idxn[i];
314     space->val[k] = values ? values[i*stepval] : 0.0;
315     k++;
316     cnt++;
317   }
318   stash->n               += cnt;
319   space->local_used      += cnt;
320   space->local_remaining -= cnt;
321   PetscFunctionReturn(0);
322 }
323 
/*
  MatStashValuesRowBlocked_Private - inserts blocks of values into the stash.
  This function expects the values to be row-oriented. Multiple columns belonging
  to the same block-row can be inserted with a single call to this function.
  This function extracts the sub-block of values based on the dimensions of
  the original input block, and the row,col values corresponding to the blocks.

  Input Parameters:
  stash  - the stash
  row    - the global block-row corresponding to the values
  n      - the number of blocks inserted. All blocks belong to the above block-row.
  idxn   - the global block-column indices corresponding to each of the blocks of
           values. Each block is of size bs*bs.
  values - the values inserted
  rmax   - the number of block-rows in the original block (unused here; kept for
           interface symmetry with MatStashValuesColBlocked_Private())
  cmax   - the number of block-columns in the original block; determines the
           source row stride below
  idx    - the index of the current block-row in the original block.
*/
PetscErrorCode MatStashValuesRowBlocked_Private(MatStash *stash,PetscInt row,PetscInt n,const PetscInt idxn[],const PetscScalar values[],PetscInt rmax,PetscInt cmax,PetscInt idx)
{
  PetscInt           i,j,k,bs2,bs=stash->bs,l;
  const PetscScalar  *vals;
  PetscScalar        *array;
  PetscMatStashSpace space=stash->space;

  PetscFunctionBegin;
  /* make room for n more blocks if the current chunk cannot hold them */
  if (!space || space->local_remaining < n) {
    PetscCall(MatStashExpand_Private(stash,n));
  }
  space = stash->space;
  l     = space->local_used;
  bs2   = bs*bs;
  for (i=0; i<n; i++) {
    space->idx[l] = row;
    space->idy[l] = idxn[i];
    /* Now copy over the block of values. Store the values column oriented.
       This enables inserting multiple blocks belonging to a row with a single
       function call */
    array = space->val + bs2*l;
    /* NOTE(review): when values == NULL the pointer arithmetic below still uses a
       NULL base even though vals is never dereferenced in that case — confirm
       callers never pass NULL, or hoist the check */
    vals  = values + idx*bs2*n + bs*i;
    for (j=0; j<bs; j++) {
      /* array[k*bs]: writing source row j into column slots => transposes to column-major */
      for (k=0; k<bs; k++) array[k*bs] = values ? vals[k] : 0.0;
      array++;
      vals += cmax*bs;
    }
    l++;
  }
  stash->n               += n;
  space->local_used      += n;
  space->local_remaining -= n;
  PetscFunctionReturn(0);
}
376 
/*
  MatStashValuesColBlocked_Private - inserts blocks of values into the stash.
  NOTE(review): this header appears copy-pasted from the row-blocked variant;
  the rmax source stride below suggests the input is column-oriented — confirm.
  Multiple columns belonging to the same block-row can be inserted with a single
  call to this function. This function extracts the sub-block of values based on
  the dimensions of the original input block, and the row,col values
  corresponding to the blocks.

  Input Parameters:
  stash  - the stash
  row    - the global block-row corresponding to the values
  n      - the number of blocks inserted. All blocks belong to the above block-row.
  idxn   - the global block-column indices corresponding to each of the blocks of
           values. Each block is of size bs*bs.
  values - the values inserted
  rmax   - the number of block-rows in the original block; determines the
           source stride below
  cmax   - the number of block-columns in the original block (unused here; kept
           for interface symmetry with MatStashValuesRowBlocked_Private())
  idx    - the index of the current block-row in the original block.
*/
PetscErrorCode MatStashValuesColBlocked_Private(MatStash *stash,PetscInt row,PetscInt n,const PetscInt idxn[],const PetscScalar values[],PetscInt rmax,PetscInt cmax,PetscInt idx)
{
  PetscInt           i,j,k,bs2,bs=stash->bs,l;
  const PetscScalar  *vals;
  PetscScalar        *array;
  PetscMatStashSpace space=stash->space;

  PetscFunctionBegin;
  /* make room for n more blocks if the current chunk cannot hold them */
  if (!space || space->local_remaining < n) {
    PetscCall(MatStashExpand_Private(stash,n));
  }
  space = stash->space;
  l     = space->local_used;
  bs2   = bs*bs;
  for (i=0; i<n; i++) {
    space->idx[l] = row;
    space->idy[l] = idxn[i];
    /* Now copy over the block of values. Store the values column oriented.
     This enables inserting multiple blocks belonging to a row with a single
     function call */
    array = space->val + bs2*l;
    /* NOTE(review): as in the row-blocked variant, this arithmetic uses a NULL
       base when values == NULL — confirm callers never pass NULL */
    vals  = values + idx*bs2*n + bs*i;
    for (j=0; j<bs; j++) {
      /* copy bs contiguous values, then advance to the next source column */
      for (k=0; k<bs; k++) array[k] = values ? vals[k] : 0.0;
      array += bs;
      vals  += rmax*bs;
    }
    l++;
  }
  stash->n               += n;
  space->local_used      += n;
  space->local_remaining -= n;
  PetscFunctionReturn(0);
}
429 /*
430   MatStashScatterBegin_Private - Initiates the transfer of values to the
431   correct owners. This function goes through the stash, and check the
432   owners of each stashed value, and sends the values off to the owner
433   processors.
434 
435   Input Parameters:
436   stash  - the stash
437   owners - an array of size 'no-of-procs' which gives the ownership range
438            for each node.
439 
440   Notes:
441     The 'owners' array in the cased of the blocked-stash has the
442   ranges specified blocked global indices, and for the regular stash in
443   the proper global indices.
444 */
445 PetscErrorCode MatStashScatterBegin_Private(Mat mat,MatStash *stash,PetscInt *owners)
446 {
447   PetscFunctionBegin;
448   PetscCall((*stash->ScatterBegin)(mat,stash,owners));
449   PetscFunctionReturn(0);
450 }
451 
/*
   MatStashScatterBegin_Ref - Reference scatter implementation. For each
   destination rank it posts two messages: one carrying the (row,col) index
   pairs (tag1) and one carrying the scalar values (tag2). Matching receives
   are posted first; MatStashScatterGetMesg_Ref() completes them.
*/
static PetscErrorCode MatStashScatterBegin_Ref(Mat mat,MatStash *stash,PetscInt *owners)
{
  PetscInt           *owner,*startv,*starti,tag1=stash->tag1,tag2=stash->tag2,bs2;
  PetscInt           size=stash->size,nsends;
  PetscInt           count,*sindices,**rindices,i,j,idx,lastidx,l;
  PetscScalar        **rvalues,*svalues;
  MPI_Comm           comm = stash->comm;
  MPI_Request        *send_waits,*recv_waits,*recv_waits1,*recv_waits2;
  PetscMPIInt        *sizes,*nlengths,nreceives;
  PetscInt           *sp_idx,*sp_idy;
  PetscScalar        *sp_val;
  PetscMatStashSpace space,space_next;

  PetscFunctionBegin;
  {                             /* make sure all processors are either in INSERTMODE or ADDMODE */
    InsertMode addv;
    PetscCall(MPIU_Allreduce((PetscEnum*)&mat->insertmode,(PetscEnum*)&addv,1,MPIU_ENUM,MPI_BOR,PetscObjectComm((PetscObject)mat)));
    PetscCheck(addv != (ADD_VALUES|INSERT_VALUES),PetscObjectComm((PetscObject)mat),PETSC_ERR_ARG_WRONGSTATE,"Some processors inserted others added");
    mat->insertmode = addv; /* in case this processor had no cache */
  }

  bs2 = stash->bs*stash->bs;

  /*  first count number of contributors to each processor */
  PetscCall(PetscCalloc1(size,&nlengths));
  PetscCall(PetscMalloc1(stash->n+1,&owner));

  i       = j    = 0;
  lastidx = -1;
  space   = stash->space_head;
  while (space) {
    space_next = space->next;
    sp_idx     = space->idx;
    for (l=0; l<space->local_used; l++) {
      /* if indices are NOT locally sorted, need to start search at the beginning */
      if (lastidx > (idx = sp_idx[l])) j = 0;
      lastidx = idx;
      for (; j<size; j++) {
        if (idx >= owners[j] && idx < owners[j+1]) {
          nlengths[j]++; owner[i] = j; break;
        }
      }
      i++;
    }
    space = space_next;
  }

  /* Now check what procs get messages - and compute nsends. */
  PetscCall(PetscCalloc1(size,&sizes));
  for (i=0, nsends=0; i<size; i++) {
    if (nlengths[i]) {
      sizes[i] = 1; nsends++;
    }
  }

  {PetscMPIInt *onodes,*olengths;
   /* Determine the number of messages to expect, their lengths, from from-ids */
   PetscCall(PetscGatherNumberOfMessages(comm,sizes,nlengths,&nreceives));
   PetscCall(PetscGatherMessageLengths(comm,nsends,nreceives,nlengths,&onodes,&olengths));
   /* since clubbing row,col - lengths are multiplied by 2 */
   for (i=0; i<nreceives; i++) olengths[i] *=2;
   PetscCall(PetscPostIrecvInt(comm,tag1,nreceives,onodes,olengths,&rindices,&recv_waits1));
   /* values are size 'bs2' lengths (and remove earlier factor 2 */
   for (i=0; i<nreceives; i++) olengths[i] = olengths[i]*bs2/2;
   PetscCall(PetscPostIrecvScalar(comm,tag2,nreceives,onodes,olengths,&rvalues,&recv_waits2));
   PetscCall(PetscFree(onodes));
   PetscCall(PetscFree(olengths));}

  /* do sends:
      1) starts[i] gives the starting index in svalues for stuff going to
         the ith processor
  */
  PetscCall(PetscMalloc2(bs2*stash->n,&svalues,2*(stash->n+1),&sindices));
  PetscCall(PetscMalloc1(2*nsends,&send_waits));
  PetscCall(PetscMalloc2(size,&startv,size,&starti));
  /* use 2 sends the first with all_a, the next with all_i and all_j */
  startv[0] = 0; starti[0] = 0;
  for (i=1; i<size; i++) {
    startv[i] = startv[i-1] + nlengths[i-1];
    starti[i] = starti[i-1] + 2*nlengths[i-1];
  }

  /* pack the send buffers: values go to svalues, row indices then column
     indices (offset by nlengths[j]) go to sindices */
  i     = 0;
  space = stash->space_head;
  while (space) {
    space_next = space->next;
    sp_idx     = space->idx;
    sp_idy     = space->idy;
    sp_val     = space->val;
    for (l=0; l<space->local_used; l++) {
      j = owner[i];
      if (bs2 == 1) {
        svalues[startv[j]] = sp_val[l];
      } else {
        PetscInt    k;
        PetscScalar *buf1,*buf2;
        buf1 = svalues+bs2*startv[j];
        buf2 = space->val + bs2*l;
        for (k=0; k<bs2; k++) buf1[k] = buf2[k];
      }
      sindices[starti[j]]             = sp_idx[l];
      sindices[starti[j]+nlengths[j]] = sp_idy[l];
      startv[j]++;
      starti[j]++;
      i++;
    }
    space = space_next;
  }
  /* rebuild startv, which was consumed as a cursor during packing */
  startv[0] = 0;
  for (i=1; i<size; i++) startv[i] = startv[i-1] + nlengths[i-1];

  for (i=0,count=0; i<size; i++) {
    if (sizes[i]) {
      PetscCallMPI(MPI_Isend(sindices+2*startv[i],2*nlengths[i],MPIU_INT,i,tag1,comm,send_waits+count++));
      PetscCallMPI(MPI_Isend(svalues+bs2*startv[i],bs2*nlengths[i],MPIU_SCALAR,i,tag2,comm,send_waits+count++));
    }
  }
#if defined(PETSC_USE_INFO)
  PetscCall(PetscInfo(NULL,"No of messages: %" PetscInt_FMT " \n",nsends));
  for (i=0; i<size; i++) {
    if (sizes[i]) {
      PetscCall(PetscInfo(NULL,"Mesg_to: %" PetscInt_FMT ": size: %zu bytes\n",i,(size_t)(nlengths[i]*(bs2*sizeof(PetscScalar)+2*sizeof(PetscInt)))));
    }
  }
#endif
  PetscCall(PetscFree(nlengths));
  PetscCall(PetscFree(owner));
  PetscCall(PetscFree2(startv,starti));
  PetscCall(PetscFree(sizes));

  /* recv_waits need to be contiguous for MatStashScatterGetMesg_Private() */
  PetscCall(PetscMalloc1(2*nreceives,&recv_waits));

  /* interleave: even slots are index-message requests, odd slots value-message
     requests; MatStashScatterGetMesg_Ref() relies on this layout */
  for (i=0; i<nreceives; i++) {
    recv_waits[2*i]   = recv_waits1[i];
    recv_waits[2*i+1] = recv_waits2[i];
  }
  stash->recv_waits = recv_waits;

  PetscCall(PetscFree(recv_waits1));
  PetscCall(PetscFree(recv_waits2));

  stash->svalues         = svalues;
  stash->sindices        = sindices;
  stash->rvalues         = rvalues;
  stash->rindices        = rindices;
  stash->send_waits      = send_waits;
  stash->nsends          = nsends;
  stash->nrecvs          = nreceives;
  stash->reproduce_count = 0;
  PetscFunctionReturn(0);
}
604 
605 /*
606    MatStashScatterGetMesg_Private - This function waits on the receives posted
607    in the function MatStashScatterBegin_Private() and returns one message at
608    a time to the calling function. If no messages are left, it indicates this
609    by setting flg = 0, else it sets flg = 1.
610 
611    Input Parameters:
612    stash - the stash
613 
614    Output Parameters:
615    nvals - the number of entries in the current message.
616    rows  - an array of row indices (or blocked indices) corresponding to the values
617    cols  - an array of columnindices (or blocked indices) corresponding to the values
618    vals  - the values
619    flg   - 0 indicates no more message left, and the current call has no values associated.
620            1 indicates that the current call successfully received a message, and the
621              other output parameters nvals,rows,cols,vals are set appropriately.
622 */
623 PetscErrorCode MatStashScatterGetMesg_Private(MatStash *stash,PetscMPIInt *nvals,PetscInt **rows,PetscInt **cols,PetscScalar **vals,PetscInt *flg)
624 {
625   PetscFunctionBegin;
626   PetscCall((*stash->ScatterGetMesg)(stash,nvals,rows,cols,vals,flg));
627   PetscFunctionReturn(0);
628 }
629 
/*
   MatStashScatterGetMesg_Ref - Reference implementation: completes receives
   until BOTH the index message (even request slots) and value message (odd
   request slots) from some rank have arrived, then returns them as one
   logical message. flg_v[2*source] records the value-message slot and
   flg_v[2*source+1] the index-message slot; -1 means not yet received.
*/
PETSC_INTERN PetscErrorCode MatStashScatterGetMesg_Ref(MatStash *stash,PetscMPIInt *nvals,PetscInt **rows,PetscInt **cols,PetscScalar **vals,PetscInt *flg)
{
  PetscMPIInt    i,*flg_v = stash->flg_v,i1,i2;
  PetscInt       bs2;
  MPI_Status     recv_status;
  PetscBool      match_found = PETSC_FALSE;

  PetscFunctionBegin;
  *flg = 0; /* When a message is discovered this is reset to 1 */
  /* Return if no more messages to process */
  if (stash->nprocessed == stash->nrecvs) PetscFunctionReturn(0);

  bs2 = stash->bs*stash->bs;
  /* If a matching pair of receives are found, process them, and return the data to
     the calling function. Until then keep receiving messages */
  while (!match_found) {
    if (stash->reproduce) {
      /* complete receives in posting order so results are reproducible */
      i    = stash->reproduce_count++;
      PetscCallMPI(MPI_Wait(stash->recv_waits+i,&recv_status));
    } else {
      PetscCallMPI(MPI_Waitany(2*stash->nrecvs,stash->recv_waits,&i,&recv_status));
    }
    PetscCheck(recv_status.MPI_SOURCE >= 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Negative MPI source!");

    /* Now pack the received message into a structure which is usable by others */
    if (i % 2) {
      /* odd slot: scalar values; MPI count is in scalars, convert to blocks */
      PetscCallMPI(MPI_Get_count(&recv_status,MPIU_SCALAR,nvals));
      flg_v[2*recv_status.MPI_SOURCE] = i/2;
      *nvals = *nvals/bs2;
    } else {
      /* even slot: indices */
      PetscCallMPI(MPI_Get_count(&recv_status,MPIU_INT,nvals));
      flg_v[2*recv_status.MPI_SOURCE+1] = i/2;
      *nvals = *nvals/2; /* This message has both row indices and col indices */
    }

    /* Check if we have both messages from this proc */
    i1 = flg_v[2*recv_status.MPI_SOURCE];
    i2 = flg_v[2*recv_status.MPI_SOURCE+1];
    if (i1 != -1 && i2 != -1) {
      *rows = stash->rindices[i2];
      *cols = *rows + *nvals; /* column indices follow the row indices in the same buffer */
      *vals = stash->rvalues[i1];
      *flg  = 1;
      stash->nprocessed++;
      match_found = PETSC_TRUE;
    }
  }
  PetscFunctionReturn(0);
}
679 
#if !defined(PETSC_HAVE_MPIUNI)
/* Wire format of one stashed block in the BTS scatter: a (row,col) header
   followed by the bs*bs scalar values. Each slot is allocated with room for
   bs2 scalars despite the [1] declaration; see MatStashBlockTypeSetUp(). */
typedef struct {
  PetscInt    row;
  PetscInt    col;
  PetscScalar vals[1];          /* Actually an array of length bs2 */
} MatStashBlock;
686 
/*
   MatStashSortCompress_Private - Flattens the chunked stash, sorts entries by
   (row,col), merges duplicate insertions at the same location (summing for
   ADD_VALUES, last-one-wins otherwise), and packs the result into the BTS
   send segment buffer.
*/
static PetscErrorCode MatStashSortCompress_Private(MatStash *stash,InsertMode insertmode)
{
  PetscMatStashSpace space;
  PetscInt           n = stash->n,bs = stash->bs,bs2 = bs*bs,cnt,*row,*col,*perm,rowstart,i;
  PetscScalar        **valptr;

  PetscFunctionBegin;
  /* Flatten the linked chunks into parallel row/col/value-pointer arrays */
  PetscCall(PetscMalloc4(n,&row,n,&col,n,&valptr,n,&perm));
  for (space=stash->space_head,cnt=0; space; space=space->next) {
    for (i=0; i<space->local_used; i++) {
      row[cnt] = space->idx[i];
      col[cnt] = space->idy[i];
      valptr[cnt] = &space->val[i*bs2];
      perm[cnt] = cnt;          /* Will tell us where to find valptr after sorting row[] and col[] */
      cnt++;
    }
  }
  PetscCheck(cnt == n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"MatStash n %" PetscInt_FMT ", but counted %" PetscInt_FMT " entries",n,cnt);
  PetscCall(PetscSortIntWithArrayPair(n,row,col,perm));
  /* Scan through the rows, sorting each one, combining duplicates, and packing send buffers */
  for (rowstart=0,cnt=0,i=1; i<=n; i++) {
    if (i == n || row[i] != row[rowstart]) {         /* Sort the last row. */
      PetscInt colstart;
      PetscCall(PetscSortIntWithArray(i-rowstart,&col[rowstart],&perm[rowstart]));
      for (colstart=rowstart; colstart<i;) { /* Compress multiple insertions to the same location */
        PetscInt j,l;
        MatStashBlock *block;
        PetscCall(PetscSegBufferGet(stash->segsendblocks,1,&block));
        block->row = row[rowstart];
        block->col = col[colstart];
        PetscCall(PetscArraycpy(block->vals,valptr[perm[colstart]],bs2));
        for (j=colstart+1; j<i && col[j] == col[colstart]; j++) { /* Add any extra stashed blocks at the same (row,col) */
          if (insertmode == ADD_VALUES) {
            for (l=0; l<bs2; l++) block->vals[l] += valptr[perm[j]][l];
          } else {
            /* INSERT semantics: the later insertion wins, so overwrite */
            PetscCall(PetscArraycpy(block->vals,valptr[perm[j]],bs2));
          }
        }
        colstart = j;
      }
      rowstart = i;
    }
  }
  PetscCall(PetscFree4(row,col,valptr,perm));
  PetscFunctionReturn(0);
}
733 
/*
   MatStashBlockTypeSetUp - Lazily builds the MPI datatype that describes one
   MatStashBlock (row, col, bs2 scalar values) and creates the segmented
   buffers used by the BTS scatter. Idempotent: returns immediately once
   stash->blocktype has been created.
*/
static PetscErrorCode MatStashBlockTypeSetUp(MatStash *stash)
{
  PetscFunctionBegin;
  if (stash->blocktype == MPI_DATATYPE_NULL) {
    PetscInt     bs2 = PetscSqr(stash->bs);
    PetscMPIInt  blocklens[2];
    MPI_Aint     displs[2];
    MPI_Datatype types[2],stype;
    /*
        DummyBlock is a type having standard layout, even when PetscScalar is C++ std::complex.
       std::complex itself has standard layout, so does DummyBlock, recursively.
       To be compatible with C++ std::complex, complex implementations on GPUs must also have standard layout,
       though they can have different alignment, e.g, 16 bytes for double complex, instead of 8 bytes as in GCC stdlibc++.
       offsetof(type, member) only requires type to have standard layout. Ref. https://en.cppreference.com/w/cpp/types/offsetof.

       We can test if std::complex has standard layout with the following code:
       #include <iostream>
       #include <type_traits>
       #include <complex>
       int main() {
         std::cout << std::boolalpha;
         std::cout << std::is_standard_layout<std::complex<double> >::value << '\n';
       }
       Output: true
     */
    struct DummyBlock {PetscInt row,col; PetscScalar vals;};

    /* full slot size: header + bs2 scalars, padded up to PetscInt alignment */
    stash->blocktype_size = offsetof(struct DummyBlock,vals) + bs2*sizeof(PetscScalar);
    if (stash->blocktype_size % sizeof(PetscInt)) { /* Implies that PetscInt is larger and does not satisfy alignment without padding */
      stash->blocktype_size += sizeof(PetscInt) - stash->blocktype_size % sizeof(PetscInt);
    }
    PetscCall(PetscSegBufferCreate(stash->blocktype_size,1,&stash->segsendblocks));
    PetscCall(PetscSegBufferCreate(stash->blocktype_size,1,&stash->segrecvblocks));
    PetscCall(PetscSegBufferCreate(sizeof(MatStashFrame),1,&stash->segrecvframe));
    /* two struct members on the wire: {row,col} pair and the bs2 values */
    blocklens[0] = 2;
    blocklens[1] = bs2;
    displs[0] = offsetof(struct DummyBlock,row);
    displs[1] = offsetof(struct DummyBlock,vals);
    types[0] = MPIU_INT;
    types[1] = MPIU_SCALAR;
    PetscCallMPI(MPI_Type_create_struct(2,blocklens,displs,types,&stype));
    PetscCallMPI(MPI_Type_commit(&stype));
    /* resize so the datatype extent matches the padded slot size used in the segment buffers */
    PetscCallMPI(MPI_Type_create_resized(stype,0,stash->blocktype_size,&stash->blocktype));
    PetscCallMPI(MPI_Type_commit(&stash->blocktype));
    PetscCallMPI(MPI_Type_free(&stype));
  }
  PetscFunctionReturn(0);
}
782 
/* Callback invoked after the target rank has initiated its receive of the
 * rendezvous message. Here we post the main sends.
 */
static PetscErrorCode MatStashBTSSend_Private(MPI_Comm comm,const PetscMPIInt tag[],PetscMPIInt rankid,PetscMPIInt rank,void *sdata,MPI_Request req[],void *ctx)
{
  MatStash       *stash = (MatStash*)ctx;
  MatStashHeader *hdr = (MatStashHeader*)sdata;

  PetscFunctionBegin;
  /* sanity check: the rank handed to us must match the one recorded for this slot */
  PetscCheck(rank == stash->sendranks[rankid],comm,PETSC_ERR_PLIB,"BTS Send rank %d does not match sendranks[%d] %d",rank,rankid,stash->sendranks[rankid]);
  PetscCallMPI(MPI_Isend(stash->sendframes[rankid].buffer,hdr->count,stash->blocktype,rank,tag[0],comm,&req[0]));
  stash->sendframes[rankid].count = hdr->count;
  stash->sendframes[rankid].pending = 1;
  PetscFunctionReturn(0);
}
798 
799 /*
800     Callback invoked by target after receiving rendezvous message.
801     Here we post the main recvs.
802  */
803 static PetscErrorCode MatStashBTSRecv_Private(MPI_Comm comm,const PetscMPIInt tag[],PetscMPIInt rank,void *rdata,MPI_Request req[],void *ctx)
804 {
805   MatStash       *stash = (MatStash*)ctx;
806   MatStashHeader *hdr = (MatStashHeader*)rdata;
807   MatStashFrame  *frame;
808 
809   PetscFunctionBegin;
810   PetscCall(PetscSegBufferGet(stash->segrecvframe,1,&frame));
811   PetscCall(PetscSegBufferGet(stash->segrecvblocks,hdr->count,&frame->buffer));
812   PetscCallMPI(MPI_Irecv(frame->buffer,hdr->count,stash->blocktype,rank,tag[0],comm,&req[0]));
813   frame->count = hdr->count;
814   frame->pending = 1;
815   PetscFunctionReturn(0);
816 }
817 
818 /*
819  * owners[] contains the ownership ranges; may be indexed by either blocks or scalars
820  */
821 static PetscErrorCode MatStashScatterBegin_BTS(Mat mat,MatStash *stash,PetscInt owners[])
822 {
823   PetscErrorCode ierr;
824   size_t         nblocks;
825   char           *sendblocks;
826 
827   PetscFunctionBegin;
828   if (PetscDefined(USE_DEBUG)) { /* make sure all processors are either in INSERTMODE or ADDMODE */
829     InsertMode addv;
830     PetscCall(MPIU_Allreduce((PetscEnum*)&mat->insertmode,(PetscEnum*)&addv,1,MPIU_ENUM,MPI_BOR,PetscObjectComm((PetscObject)mat)));
831     PetscCheck(addv != (ADD_VALUES|INSERT_VALUES),PetscObjectComm((PetscObject)mat),PETSC_ERR_ARG_WRONGSTATE,"Some processors inserted others added");
832   }
833 
834   PetscCall(MatStashBlockTypeSetUp(stash));
835   PetscCall(MatStashSortCompress_Private(stash,mat->insertmode));
836   PetscCall(PetscSegBufferGetSize(stash->segsendblocks,&nblocks));
837   PetscCall(PetscSegBufferExtractInPlace(stash->segsendblocks,&sendblocks));
838   if (stash->first_assembly_done) { /* Set up sendhdrs and sendframes for each rank that we sent before */
839     PetscInt i;
840     size_t b;
841     for (i=0,b=0; i<stash->nsendranks; i++) {
842       stash->sendframes[i].buffer = &sendblocks[b*stash->blocktype_size];
843       /* sendhdr is never actually sent, but the count is used by MatStashBTSSend_Private */
844       stash->sendhdr[i].count = 0; /* Might remain empty (in which case we send a zero-sized message) if no values are communicated to that process */
845       for (; b<nblocks; b++) {
846         MatStashBlock *sendblock_b = (MatStashBlock*)&sendblocks[b*stash->blocktype_size];
847         PetscCheckFalse(sendblock_b->row < owners[stash->sendranks[i]],stash->comm,PETSC_ERR_ARG_WRONG,"MAT_SUBSET_OFF_PROC_ENTRIES set, but row %" PetscInt_FMT " owned by %d not communicated in initial assembly",sendblock_b->row,stash->sendranks[i]);
848         if (sendblock_b->row >= owners[stash->sendranks[i]+1]) break;
849         stash->sendhdr[i].count++;
850       }
851     }
852   } else {                      /* Dynamically count and pack (first time) */
853     PetscInt sendno;
854     size_t i,rowstart;
855 
856     /* Count number of send ranks and allocate for sends */
857     stash->nsendranks = 0;
858     for (rowstart=0; rowstart<nblocks;) {
859       PetscInt owner;
860       MatStashBlock *sendblock_rowstart = (MatStashBlock*)&sendblocks[rowstart*stash->blocktype_size];
861       PetscCall(PetscFindInt(sendblock_rowstart->row,stash->size+1,owners,&owner));
862       if (owner < 0) owner = -(owner+2);
863       for (i=rowstart+1; i<nblocks; i++) { /* Move forward through a run of blocks with the same owner */
864         MatStashBlock *sendblock_i = (MatStashBlock*)&sendblocks[i*stash->blocktype_size];
865         if (sendblock_i->row >= owners[owner+1]) break;
866       }
867       stash->nsendranks++;
868       rowstart = i;
869     }
870     PetscCall(PetscMalloc3(stash->nsendranks,&stash->sendranks,stash->nsendranks,&stash->sendhdr,stash->nsendranks,&stash->sendframes));
871 
872     /* Set up sendhdrs and sendframes */
873     sendno = 0;
874     for (rowstart=0; rowstart<nblocks;) {
875       PetscInt owner;
876       MatStashBlock *sendblock_rowstart = (MatStashBlock*)&sendblocks[rowstart*stash->blocktype_size];
877       PetscCall(PetscFindInt(sendblock_rowstart->row,stash->size+1,owners,&owner));
878       if (owner < 0) owner = -(owner+2);
879       stash->sendranks[sendno] = owner;
880       for (i=rowstart+1; i<nblocks; i++) { /* Move forward through a run of blocks with the same owner */
881         MatStashBlock *sendblock_i = (MatStashBlock*)&sendblocks[i*stash->blocktype_size];
882         if (sendblock_i->row >= owners[owner+1]) break;
883       }
884       stash->sendframes[sendno].buffer = sendblock_rowstart;
885       stash->sendframes[sendno].pending = 0;
886       stash->sendhdr[sendno].count = i - rowstart;
887       sendno++;
888       rowstart = i;
889     }
890     PetscCheck(sendno == stash->nsendranks,stash->comm,PETSC_ERR_PLIB,"BTS counted %d sendranks, but %" PetscInt_FMT " sends",stash->nsendranks,sendno);
891   }
892 
893   /* Encode insertmode on the outgoing messages. If we want to support more than two options, we would need a new
894    * message or a dummy entry of some sort. */
895   if (mat->insertmode == INSERT_VALUES) {
896     size_t i;
897     for (i=0; i<nblocks; i++) {
898       MatStashBlock *sendblock_i = (MatStashBlock*)&sendblocks[i*stash->blocktype_size];
899       sendblock_i->row = -(sendblock_i->row+1);
900     }
901   }
902 
903   if (stash->first_assembly_done) {
904     PetscMPIInt i,tag;
905     PetscCall(PetscCommGetNewTag(stash->comm,&tag));
906     for (i=0; i<stash->nrecvranks; i++) {
907       PetscCall(MatStashBTSRecv_Private(stash->comm,&tag,stash->recvranks[i],&stash->recvhdr[i],&stash->recvreqs[i],stash));
908     }
909     for (i=0; i<stash->nsendranks; i++) {
910       PetscCall(MatStashBTSSend_Private(stash->comm,&tag,i,stash->sendranks[i],&stash->sendhdr[i],&stash->sendreqs[i],stash));
911     }
912     stash->use_status = PETSC_TRUE; /* Use count from message status. */
913   } else {
914     ierr = PetscCommBuildTwoSidedFReq(stash->comm,1,MPIU_INT,stash->nsendranks,stash->sendranks,(PetscInt*)stash->sendhdr,
915                                       &stash->nrecvranks,&stash->recvranks,(PetscInt*)&stash->recvhdr,1,&stash->sendreqs,&stash->recvreqs,
916                                       MatStashBTSSend_Private,MatStashBTSRecv_Private,stash);PetscCall(ierr);
917     PetscCall(PetscMalloc2(stash->nrecvranks,&stash->some_indices,stash->nrecvranks,&stash->some_statuses));
918     stash->use_status = PETSC_FALSE; /* Use count from header instead of from message. */
919   }
920 
921   PetscCall(PetscSegBufferExtractInPlace(stash->segrecvframe,&stash->recvframes));
922   stash->recvframe_active     = NULL;
923   stash->recvframe_i          = 0;
924   stash->some_i               = 0;
925   stash->some_count           = 0;
926   stash->recvcount            = 0;
927   stash->first_assembly_done  = mat->assembly_subset; /* See the same logic in VecAssemblyBegin_MPI_BTS */
928   stash->insertmode           = &mat->insertmode;
929   PetscFunctionReturn(0);
930 }
931 
/* Hand back one stashed block (row, col, values) per call, draining completed receives.
 * Sets *flg = 1 and *n = 1 when a block is returned; *flg = 0 once all receives are consumed.
 * Returned pointers alias the receive buffers and are only valid until the buffers are reset. */
static PetscErrorCode MatStashScatterGetMesg_BTS(MatStash *stash,PetscMPIInt *n,PetscInt **row,PetscInt **col,PetscScalar **val,PetscInt *flg)
{
  MatStashBlock *block;

  PetscFunctionBegin;
  *flg = 0;
  /* Advance to a frame with unconsumed blocks; may need to wait for more receives to complete */
  while (!stash->recvframe_active || stash->recvframe_i == stash->recvframe_count) {
    if (stash->some_i == stash->some_count) {
      if (stash->recvcount == stash->nrecvranks) PetscFunctionReturn(0); /* Done */
      PetscCallMPI(MPI_Waitsome(stash->nrecvranks,stash->recvreqs,&stash->some_count,stash->some_indices,stash->use_status?stash->some_statuses:MPI_STATUSES_IGNORE));
      stash->some_i = 0;
    }
    stash->recvframe_active = &stash->recvframes[stash->some_indices[stash->some_i]];
    stash->recvframe_count = stash->recvframe_active->count; /* From header; maximum count */
    if (stash->use_status) { /* Count what was actually sent */
      PetscCallMPI(MPI_Get_count(&stash->some_statuses[stash->some_i],stash->blocktype,&stash->recvframe_count));
    }
    if (stash->recvframe_count > 0) { /* Check for InsertMode consistency */
      /* The sender encodes INSERT_VALUES as a negated row in the first block (see MatStashScatterBegin_BTS) */
      block = (MatStashBlock*)&((char*)stash->recvframe_active->buffer)[0];
      if (PetscUnlikely(*stash->insertmode == NOT_SET_VALUES)) *stash->insertmode = block->row < 0 ? INSERT_VALUES : ADD_VALUES;
      PetscCheckFalse(*stash->insertmode == INSERT_VALUES && block->row >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Assembling INSERT_VALUES, but rank %d requested ADD_VALUES",stash->recvranks[stash->some_indices[stash->some_i]]);
      PetscCheckFalse(*stash->insertmode == ADD_VALUES && block->row < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Assembling ADD_VALUES, but rank %d requested INSERT_VALUES",stash->recvranks[stash->some_indices[stash->some_i]]);
    }
    stash->some_i++;
    stash->recvcount++;
    stash->recvframe_i = 0; /* Start consuming the new frame from its first block */
  }
  *n = 1;
  block = (MatStashBlock*)&((char*)stash->recvframe_active->buffer)[stash->recvframe_i*stash->blocktype_size];
  if (block->row < 0) block->row = -(block->row + 1); /* Decode the INSERT_VALUES row encoding in place */
  *row = &block->row;
  *col = &block->col;
  *val = block->vals;
  stash->recvframe_i++;
  *flg = 1;
  PetscFunctionReturn(0);
}
969 
970 static PetscErrorCode MatStashScatterEnd_BTS(MatStash *stash)
971 {
972   PetscFunctionBegin;
973   PetscCallMPI(MPI_Waitall(stash->nsendranks,stash->sendreqs,MPI_STATUSES_IGNORE));
974   if (stash->first_assembly_done) { /* Reuse the communication contexts, so consolidate and reset segrecvblocks  */
975     PetscCall(PetscSegBufferExtractInPlace(stash->segrecvblocks,NULL));
976   } else {                      /* No reuse, so collect everything. */
977     PetscCall(MatStashScatterDestroy_BTS(stash));
978   }
979 
980   /* Now update nmaxold to be app 10% more than max n used, this way the
981      wastage of space is reduced the next time this stash is used.
982      Also update the oldmax, only if it increases */
983   if (stash->n) {
984     PetscInt bs2     = stash->bs*stash->bs;
985     PetscInt oldnmax = ((int)(stash->n * 1.1) + 5)*bs2;
986     if (oldnmax > stash->oldnmax) stash->oldnmax = oldnmax;
987   }
988 
989   stash->nmax       = 0;
990   stash->n          = 0;
991   stash->reallocs   = -1;
992   stash->nprocessed = 0;
993 
994   PetscCall(PetscMatStashSpaceDestroy(&stash->space_head));
995 
996   stash->space = NULL;
997 
998   PetscFunctionReturn(0);
999 }
1000 
1001 PetscErrorCode MatStashScatterDestroy_BTS(MatStash *stash)
1002 {
1003   PetscFunctionBegin;
1004   PetscCall(PetscSegBufferDestroy(&stash->segsendblocks));
1005   PetscCall(PetscSegBufferDestroy(&stash->segrecvframe));
1006   stash->recvframes = NULL;
1007   PetscCall(PetscSegBufferDestroy(&stash->segrecvblocks));
1008   if (stash->blocktype != MPI_DATATYPE_NULL) {
1009     PetscCallMPI(MPI_Type_free(&stash->blocktype));
1010   }
1011   stash->nsendranks = 0;
1012   stash->nrecvranks = 0;
1013   PetscCall(PetscFree3(stash->sendranks,stash->sendhdr,stash->sendframes));
1014   PetscCall(PetscFree(stash->sendreqs));
1015   PetscCall(PetscFree(stash->recvreqs));
1016   PetscCall(PetscFree(stash->recvranks));
1017   PetscCall(PetscFree(stash->recvhdr));
1018   PetscCall(PetscFree2(stash->some_indices,stash->some_statuses));
1019   PetscFunctionReturn(0);
1020 }
1021 #endif
1022