xref: /petsc/src/mat/utils/matstash.c (revision 4e278199b78715991f5c71ebbd945c1489263e6c)
1 
2 #include <petsc/private/matimpl.h>
3 
4 #define DEFAULT_STASH_SIZE   10000
5 
6 static PetscErrorCode MatStashScatterBegin_Ref(Mat,MatStash*,PetscInt*);
7 PETSC_INTERN PetscErrorCode MatStashScatterGetMesg_Ref(MatStash*,PetscMPIInt*,PetscInt**,PetscInt**,PetscScalar**,PetscInt*);
8 PETSC_INTERN PetscErrorCode MatStashScatterEnd_Ref(MatStash*);
9 #if !defined(PETSC_HAVE_MPIUNI)
10 static PetscErrorCode MatStashScatterBegin_BTS(Mat,MatStash*,PetscInt*);
11 static PetscErrorCode MatStashScatterGetMesg_BTS(MatStash*,PetscMPIInt*,PetscInt**,PetscInt**,PetscScalar**,PetscInt*);
12 static PetscErrorCode MatStashScatterEnd_BTS(MatStash*);
13 #endif
14 
15 /*
16   MatStashCreate_Private - Creates a stash,currently used for all the parallel
17   matrix implementations. The stash is where elements of a matrix destined
18   to be stored on other processors are kept until matrix assembly is done.
19 
20   This is a simple minded stash. Simply adds entries to end of stash.
21 
22   Input Parameters:
23   comm - communicator, required for scatters.
24   bs   - stash block size. used when stashing blocks of values
25 
26   Output Parameters:
27   stash    - the newly created stash
28 */
29 PetscErrorCode MatStashCreate_Private(MPI_Comm comm,PetscInt bs,MatStash *stash)
30 {
31   PetscErrorCode ierr;
32   PetscInt       max,*opt,nopt,i;
33   PetscBool      flg;
34 
35   PetscFunctionBegin;
36   /* Require 2 tags,get the second using PetscCommGetNewTag() */
37   stash->comm = comm;
38 
39   ierr = PetscCommGetNewTag(stash->comm,&stash->tag1);CHKERRQ(ierr);
40   ierr = PetscCommGetNewTag(stash->comm,&stash->tag2);CHKERRQ(ierr);
41   ierr = MPI_Comm_size(stash->comm,&stash->size);CHKERRMPI(ierr);
42   ierr = MPI_Comm_rank(stash->comm,&stash->rank);CHKERRMPI(ierr);
43   ierr = PetscMalloc1(2*stash->size,&stash->flg_v);CHKERRQ(ierr);
44   for (i=0; i<2*stash->size; i++) stash->flg_v[i] = -1;
45 
46   nopt = stash->size;
47   ierr = PetscMalloc1(nopt,&opt);CHKERRQ(ierr);
48   ierr = PetscOptionsGetIntArray(NULL,NULL,"-matstash_initial_size",opt,&nopt,&flg);CHKERRQ(ierr);
49   if (flg) {
50     if (nopt == 1)                max = opt[0];
51     else if (nopt == stash->size) max = opt[stash->rank];
52     else if (stash->rank < nopt)  max = opt[stash->rank];
53     else                          max = 0; /* Use default */
54     stash->umax = max;
55   } else {
56     stash->umax = 0;
57   }
58   ierr = PetscFree(opt);CHKERRQ(ierr);
59   if (bs <= 0) bs = 1;
60 
61   stash->bs         = bs;
62   stash->nmax       = 0;
63   stash->oldnmax    = 0;
64   stash->n          = 0;
65   stash->reallocs   = -1;
66   stash->space_head = NULL;
67   stash->space      = NULL;
68 
69   stash->send_waits  = NULL;
70   stash->recv_waits  = NULL;
71   stash->send_status = NULL;
72   stash->nsends      = 0;
73   stash->nrecvs      = 0;
74   stash->svalues     = NULL;
75   stash->rvalues     = NULL;
76   stash->rindices    = NULL;
77   stash->nprocessed  = 0;
78   stash->reproduce   = PETSC_FALSE;
79   stash->blocktype   = MPI_DATATYPE_NULL;
80 
81   ierr = PetscOptionsGetBool(NULL,NULL,"-matstash_reproduce",&stash->reproduce,NULL);CHKERRQ(ierr);
82 #if !defined(PETSC_HAVE_MPIUNI)
83   flg  = PETSC_FALSE;
84   ierr = PetscOptionsGetBool(NULL,NULL,"-matstash_legacy",&flg,NULL);CHKERRQ(ierr);
85   if (!flg) {
86     stash->ScatterBegin   = MatStashScatterBegin_BTS;
87     stash->ScatterGetMesg = MatStashScatterGetMesg_BTS;
88     stash->ScatterEnd     = MatStashScatterEnd_BTS;
89     stash->ScatterDestroy = MatStashScatterDestroy_BTS;
90   } else {
91 #endif
92     stash->ScatterBegin   = MatStashScatterBegin_Ref;
93     stash->ScatterGetMesg = MatStashScatterGetMesg_Ref;
94     stash->ScatterEnd     = MatStashScatterEnd_Ref;
95     stash->ScatterDestroy = NULL;
96 #if !defined(PETSC_HAVE_MPIUNI)
97   }
98 #endif
99   PetscFunctionReturn(0);
100 }
101 
102 /*
103    MatStashDestroy_Private - Destroy the stash
104 */
105 PetscErrorCode MatStashDestroy_Private(MatStash *stash)
106 {
107   PetscErrorCode ierr;
108 
109   PetscFunctionBegin;
110   ierr = PetscMatStashSpaceDestroy(&stash->space_head);CHKERRQ(ierr);
111   if (stash->ScatterDestroy) {ierr = (*stash->ScatterDestroy)(stash);CHKERRQ(ierr);}
112 
113   stash->space = NULL;
114 
115   ierr = PetscFree(stash->flg_v);CHKERRQ(ierr);
116   PetscFunctionReturn(0);
117 }
118 
119 /*
120    MatStashScatterEnd_Private - This is called as the final stage of
121    scatter. The final stages of message passing is done here, and
122    all the memory used for message passing is cleaned up. This
123    routine also resets the stash, and deallocates the memory used
124    for the stash. It also keeps track of the current memory usage
125    so that the same value can be used the next time through.
126 */
127 PetscErrorCode MatStashScatterEnd_Private(MatStash *stash)
128 {
129   PetscErrorCode ierr;
130 
131   PetscFunctionBegin;
132   ierr = (*stash->ScatterEnd)(stash);CHKERRQ(ierr);
133   PetscFunctionReturn(0);
134 }
135 
136 PETSC_INTERN PetscErrorCode MatStashScatterEnd_Ref(MatStash *stash)
137 {
138   PetscErrorCode ierr;
139   PetscInt       nsends=stash->nsends,bs2,oldnmax,i;
140   MPI_Status     *send_status;
141 
142   PetscFunctionBegin;
143   for (i=0; i<2*stash->size; i++) stash->flg_v[i] = -1;
144   /* wait on sends */
145   if (nsends) {
146     ierr = PetscMalloc1(2*nsends,&send_status);CHKERRQ(ierr);
147     ierr = MPI_Waitall(2*nsends,stash->send_waits,send_status);CHKERRMPI(ierr);
148     ierr = PetscFree(send_status);CHKERRQ(ierr);
149   }
150 
151   /* Now update nmaxold to be app 10% more than max n used, this way the
152      wastage of space is reduced the next time this stash is used.
153      Also update the oldmax, only if it increases */
154   if (stash->n) {
155     bs2     = stash->bs*stash->bs;
156     oldnmax = ((int)(stash->n * 1.1) + 5)*bs2;
157     if (oldnmax > stash->oldnmax) stash->oldnmax = oldnmax;
158   }
159 
160   stash->nmax       = 0;
161   stash->n          = 0;
162   stash->reallocs   = -1;
163   stash->nprocessed = 0;
164 
165   ierr = PetscMatStashSpaceDestroy(&stash->space_head);CHKERRQ(ierr);
166 
167   stash->space = NULL;
168 
169   ierr = PetscFree(stash->send_waits);CHKERRQ(ierr);
170   ierr = PetscFree(stash->recv_waits);CHKERRQ(ierr);
171   ierr = PetscFree2(stash->svalues,stash->sindices);CHKERRQ(ierr);
172   ierr = PetscFree(stash->rvalues[0]);CHKERRQ(ierr);
173   ierr = PetscFree(stash->rvalues);CHKERRQ(ierr);
174   ierr = PetscFree(stash->rindices[0]);CHKERRQ(ierr);
175   ierr = PetscFree(stash->rindices);CHKERRQ(ierr);
176   PetscFunctionReturn(0);
177 }
178 
179 /*
180    MatStashGetInfo_Private - Gets the relavant statistics of the stash
181 
182    Input Parameters:
183    stash    - the stash
184    nstash   - the size of the stash. Indicates the number of values stored.
185    reallocs - the number of additional mallocs incurred.
186 
187 */
188 PetscErrorCode MatStashGetInfo_Private(MatStash *stash,PetscInt *nstash,PetscInt *reallocs)
189 {
190   PetscInt bs2 = stash->bs*stash->bs;
191 
192   PetscFunctionBegin;
193   if (nstash) *nstash = stash->n*bs2;
194   if (reallocs) {
195     if (stash->reallocs < 0) *reallocs = 0;
196     else                     *reallocs = stash->reallocs;
197   }
198   PetscFunctionReturn(0);
199 }
200 
201 /*
202    MatStashSetInitialSize_Private - Sets the initial size of the stash
203 
204    Input Parameters:
205    stash  - the stash
206    max    - the value that is used as the max size of the stash.
207             this value is used while allocating memory.
208 */
209 PetscErrorCode MatStashSetInitialSize_Private(MatStash *stash,PetscInt max)
210 {
211   PetscFunctionBegin;
212   stash->umax = max;
213   PetscFunctionReturn(0);
214 }
215 
216 /* MatStashExpand_Private - Expand the stash. This function is called
217    when the space in the stash is not sufficient to add the new values
218    being inserted into the stash.
219 
220    Input Parameters:
221    stash - the stash
222    incr  - the minimum increase requested
223 
224    Notes:
225    This routine doubles the currently used memory.
226  */
227 static PetscErrorCode MatStashExpand_Private(MatStash *stash,PetscInt incr)
228 {
229   PetscErrorCode ierr;
230   PetscInt       newnmax,bs2= stash->bs*stash->bs;
231 
232   PetscFunctionBegin;
233   /* allocate a larger stash */
234   if (!stash->oldnmax && !stash->nmax) { /* new stash */
235     if (stash->umax)                  newnmax = stash->umax/bs2;
236     else                              newnmax = DEFAULT_STASH_SIZE/bs2;
237   } else if (!stash->nmax) { /* resuing stash */
238     if (stash->umax > stash->oldnmax) newnmax = stash->umax/bs2;
239     else                              newnmax = stash->oldnmax/bs2;
240   } else                              newnmax = stash->nmax*2;
241   if (newnmax  < (stash->nmax + incr)) newnmax += 2*incr;
242 
243   /* Get a MatStashSpace and attach it to stash */
244   ierr = PetscMatStashSpaceGet(bs2,newnmax,&stash->space);CHKERRQ(ierr);
245   if (!stash->space_head) { /* new stash or resuing stash->oldnmax */
246     stash->space_head = stash->space;
247   }
248 
249   stash->reallocs++;
250   stash->nmax = newnmax;
251   PetscFunctionReturn(0);
252 }
253 /*
254   MatStashValuesRow_Private - inserts values into the stash. This function
255   expects the values to be roworiented. Multiple columns belong to the same row
256   can be inserted with a single call to this function.
257 
258   Input Parameters:
259   stash  - the stash
260   row    - the global row correspoiding to the values
261   n      - the number of elements inserted. All elements belong to the above row.
262   idxn   - the global column indices corresponding to each of the values.
263   values - the values inserted
264 */
265 PetscErrorCode MatStashValuesRow_Private(MatStash *stash,PetscInt row,PetscInt n,const PetscInt idxn[],const PetscScalar values[],PetscBool ignorezeroentries)
266 {
267   PetscErrorCode     ierr;
268   PetscInt           i,k,cnt = 0;
269   PetscMatStashSpace space=stash->space;
270 
271   PetscFunctionBegin;
272   /* Check and see if we have sufficient memory */
273   if (!space || space->local_remaining < n) {
274     ierr = MatStashExpand_Private(stash,n);CHKERRQ(ierr);
275   }
276   space = stash->space;
277   k     = space->local_used;
278   for (i=0; i<n; i++) {
279     if (ignorezeroentries && values && values[i] == 0.0) continue;
280     space->idx[k] = row;
281     space->idy[k] = idxn[i];
282     space->val[k] = values ? values[i] : 0.0;
283     k++;
284     cnt++;
285   }
286   stash->n               += cnt;
287   space->local_used      += cnt;
288   space->local_remaining -= cnt;
289   PetscFunctionReturn(0);
290 }
291 
292 /*
293   MatStashValuesCol_Private - inserts values into the stash. This function
294   expects the values to be columnoriented. Multiple columns belong to the same row
295   can be inserted with a single call to this function.
296 
297   Input Parameters:
298   stash   - the stash
299   row     - the global row correspoiding to the values
300   n       - the number of elements inserted. All elements belong to the above row.
301   idxn    - the global column indices corresponding to each of the values.
302   values  - the values inserted
303   stepval - the consecutive values are sepated by a distance of stepval.
304             this happens because the input is columnoriented.
305 */
306 PetscErrorCode MatStashValuesCol_Private(MatStash *stash,PetscInt row,PetscInt n,const PetscInt idxn[],const PetscScalar values[],PetscInt stepval,PetscBool ignorezeroentries)
307 {
308   PetscErrorCode     ierr;
309   PetscInt           i,k,cnt = 0;
310   PetscMatStashSpace space=stash->space;
311 
312   PetscFunctionBegin;
313   /* Check and see if we have sufficient memory */
314   if (!space || space->local_remaining < n) {
315     ierr = MatStashExpand_Private(stash,n);CHKERRQ(ierr);
316   }
317   space = stash->space;
318   k     = space->local_used;
319   for (i=0; i<n; i++) {
320     if (ignorezeroentries && values && values[i*stepval] == 0.0) continue;
321     space->idx[k] = row;
322     space->idy[k] = idxn[i];
323     space->val[k] = values ? values[i*stepval] : 0.0;
324     k++;
325     cnt++;
326   }
327   stash->n               += cnt;
328   space->local_used      += cnt;
329   space->local_remaining -= cnt;
330   PetscFunctionReturn(0);
331 }
332 
333 /*
334   MatStashValuesRowBlocked_Private - inserts blocks of values into the stash.
335   This function expects the values to be roworiented. Multiple columns belong
336   to the same block-row can be inserted with a single call to this function.
337   This function extracts the sub-block of values based on the dimensions of
338   the original input block, and the row,col values corresponding to the blocks.
339 
340   Input Parameters:
341   stash  - the stash
342   row    - the global block-row correspoiding to the values
343   n      - the number of elements inserted. All elements belong to the above row.
344   idxn   - the global block-column indices corresponding to each of the blocks of
345            values. Each block is of size bs*bs.
346   values - the values inserted
347   rmax   - the number of block-rows in the original block.
348   cmax   - the number of block-columsn on the original block.
349   idx    - the index of the current block-row in the original block.
350 */
351 PetscErrorCode MatStashValuesRowBlocked_Private(MatStash *stash,PetscInt row,PetscInt n,const PetscInt idxn[],const PetscScalar values[],PetscInt rmax,PetscInt cmax,PetscInt idx)
352 {
353   PetscErrorCode     ierr;
354   PetscInt           i,j,k,bs2,bs=stash->bs,l;
355   const PetscScalar  *vals;
356   PetscScalar        *array;
357   PetscMatStashSpace space=stash->space;
358 
359   PetscFunctionBegin;
360   if (!space || space->local_remaining < n) {
361     ierr = MatStashExpand_Private(stash,n);CHKERRQ(ierr);
362   }
363   space = stash->space;
364   l     = space->local_used;
365   bs2   = bs*bs;
366   for (i=0; i<n; i++) {
367     space->idx[l] = row;
368     space->idy[l] = idxn[i];
369     /* Now copy over the block of values. Store the values column oriented.
370        This enables inserting multiple blocks belonging to a row with a single
371        funtion call */
372     array = space->val + bs2*l;
373     vals  = values + idx*bs2*n + bs*i;
374     for (j=0; j<bs; j++) {
375       for (k=0; k<bs; k++) array[k*bs] = values ? vals[k] : 0.0;
376       array++;
377       vals += cmax*bs;
378     }
379     l++;
380   }
381   stash->n               += n;
382   space->local_used      += n;
383   space->local_remaining -= n;
384   PetscFunctionReturn(0);
385 }
386 
387 /*
388   MatStashValuesColBlocked_Private - inserts blocks of values into the stash.
389   This function expects the values to be roworiented. Multiple columns belong
390   to the same block-row can be inserted with a single call to this function.
391   This function extracts the sub-block of values based on the dimensions of
392   the original input block, and the row,col values corresponding to the blocks.
393 
394   Input Parameters:
395   stash  - the stash
396   row    - the global block-row correspoiding to the values
397   n      - the number of elements inserted. All elements belong to the above row.
398   idxn   - the global block-column indices corresponding to each of the blocks of
399            values. Each block is of size bs*bs.
400   values - the values inserted
401   rmax   - the number of block-rows in the original block.
402   cmax   - the number of block-columsn on the original block.
403   idx    - the index of the current block-row in the original block.
404 */
405 PetscErrorCode MatStashValuesColBlocked_Private(MatStash *stash,PetscInt row,PetscInt n,const PetscInt idxn[],const PetscScalar values[],PetscInt rmax,PetscInt cmax,PetscInt idx)
406 {
407   PetscErrorCode     ierr;
408   PetscInt           i,j,k,bs2,bs=stash->bs,l;
409   const PetscScalar  *vals;
410   PetscScalar        *array;
411   PetscMatStashSpace space=stash->space;
412 
413   PetscFunctionBegin;
414   if (!space || space->local_remaining < n) {
415     ierr = MatStashExpand_Private(stash,n);CHKERRQ(ierr);
416   }
417   space = stash->space;
418   l     = space->local_used;
419   bs2   = bs*bs;
420   for (i=0; i<n; i++) {
421     space->idx[l] = row;
422     space->idy[l] = idxn[i];
423     /* Now copy over the block of values. Store the values column oriented.
424      This enables inserting multiple blocks belonging to a row with a single
425      funtion call */
426     array = space->val + bs2*l;
427     vals  = values + idx*bs2*n + bs*i;
428     for (j=0; j<bs; j++) {
429       for (k=0; k<bs; k++) array[k] = values ? vals[k] : 0.0;
430       array += bs;
431       vals  += rmax*bs;
432     }
433     l++;
434   }
435   stash->n               += n;
436   space->local_used      += n;
437   space->local_remaining -= n;
438   PetscFunctionReturn(0);
439 }
440 /*
441   MatStashScatterBegin_Private - Initiates the transfer of values to the
442   correct owners. This function goes through the stash, and check the
443   owners of each stashed value, and sends the values off to the owner
444   processors.
445 
446   Input Parameters:
447   stash  - the stash
448   owners - an array of size 'no-of-procs' which gives the ownership range
449            for each node.
450 
451   Notes:
452     The 'owners' array in the cased of the blocked-stash has the
453   ranges specified blocked global indices, and for the regular stash in
454   the proper global indices.
455 */
456 PetscErrorCode MatStashScatterBegin_Private(Mat mat,MatStash *stash,PetscInt *owners)
457 {
458   PetscErrorCode ierr;
459 
460   PetscFunctionBegin;
461   ierr = (*stash->ScatterBegin)(mat,stash,owners);CHKERRQ(ierr);
462   PetscFunctionReturn(0);
463 }
464 
/* Reference implementation of the scatter-begin stage: counts per-rank
   contributions, posts the receives, packs the send buffers (one indices
   message and one values message per destination), and starts the sends. */
static PetscErrorCode MatStashScatterBegin_Ref(Mat mat,MatStash *stash,PetscInt *owners)
{
  PetscInt           *owner,*startv,*starti,tag1=stash->tag1,tag2=stash->tag2,bs2;
  PetscInt           size=stash->size,nsends;
  PetscErrorCode     ierr;
  PetscInt           count,*sindices,**rindices,i,j,idx,lastidx,l;
  PetscScalar        **rvalues,*svalues;
  MPI_Comm           comm = stash->comm;
  MPI_Request        *send_waits,*recv_waits,*recv_waits1,*recv_waits2;
  PetscMPIInt        *sizes,*nlengths,nreceives;
  PetscInt           *sp_idx,*sp_idy;
  PetscScalar        *sp_val;
  PetscMatStashSpace space,space_next;

  PetscFunctionBegin;
  {                             /* make sure all processors are either in INSERTMODE or ADDMODE */
    InsertMode addv;
    ierr = MPIU_Allreduce((PetscEnum*)&mat->insertmode,(PetscEnum*)&addv,1,MPIU_ENUM,MPI_BOR,PetscObjectComm((PetscObject)mat));CHKERRMPI(ierr);
    if (addv == (ADD_VALUES|INSERT_VALUES)) SETERRQ(PetscObjectComm((PetscObject)mat),PETSC_ERR_ARG_WRONGSTATE,"Some processors inserted others added");
    mat->insertmode = addv; /* in case this processor had no cache */
  }

  bs2 = stash->bs*stash->bs;

  /*  first count number of contributors to each processor */
  ierr = PetscCalloc1(size,&nlengths);CHKERRQ(ierr);
  ierr = PetscMalloc1(stash->n+1,&owner);CHKERRQ(ierr);

  /* Walk every chunk of the stash; owner[i] records the destination rank of
     the i-th stashed entry (in stash order). The inner search over j resumes
     from the previous hit, which is linear-time when indices are sorted. */
  i       = j    = 0;
  lastidx = -1;
  space   = stash->space_head;
  while (space) {
    space_next = space->next;
    sp_idx     = space->idx;
    for (l=0; l<space->local_used; l++) {
      /* if indices are NOT locally sorted, need to start search at the beginning */
      if (lastidx > (idx = sp_idx[l])) j = 0;
      lastidx = idx;
      for (; j<size; j++) {
        if (idx >= owners[j] && idx < owners[j+1]) {
          nlengths[j]++; owner[i] = j; break;
        }
      }
      i++;
    }
    space = space_next;
  }

  /* Now check what procs get messages - and compute nsends. */
  ierr = PetscCalloc1(size,&sizes);CHKERRQ(ierr);
  for (i=0, nsends=0; i<size; i++) {
    if (nlengths[i]) {
      sizes[i] = 1; nsends++;
    }
  }

  {PetscMPIInt *onodes,*olengths;
   /* Determine the number of messages to expect, their lengths, from from-ids */
   ierr = PetscGatherNumberOfMessages(comm,sizes,nlengths,&nreceives);CHKERRQ(ierr);
   ierr = PetscGatherMessageLengths(comm,nsends,nreceives,nlengths,&onodes,&olengths);CHKERRQ(ierr);
   /* since clubbing row,col - lengths are multiplied by 2 */
   for (i=0; i<nreceives; i++) olengths[i] *=2;
   ierr = PetscPostIrecvInt(comm,tag1,nreceives,onodes,olengths,&rindices,&recv_waits1);CHKERRQ(ierr);
   /* values are size 'bs2' lengths (and remove earlier factor 2 */
   for (i=0; i<nreceives; i++) olengths[i] = olengths[i]*bs2/2;
   ierr = PetscPostIrecvScalar(comm,tag2,nreceives,onodes,olengths,&rvalues,&recv_waits2);CHKERRQ(ierr);
   ierr = PetscFree(onodes);CHKERRQ(ierr);
   ierr = PetscFree(olengths);CHKERRQ(ierr);}

  /* do sends:
      1) starts[i] gives the starting index in svalues for stuff going to
         the ith processor
  */
  ierr = PetscMalloc2(bs2*stash->n,&svalues,2*(stash->n+1),&sindices);CHKERRQ(ierr);
  ierr = PetscMalloc1(2*nsends,&send_waits);CHKERRQ(ierr);
  ierr = PetscMalloc2(size,&startv,size,&starti);CHKERRQ(ierr);
  /* use 2 sends the first with all_a, the next with all_i and all_j */
  startv[0] = 0; starti[0] = 0;
  for (i=1; i<size; i++) {
    startv[i] = startv[i-1] + nlengths[i-1];
    starti[i] = starti[i-1] + 2*nlengths[i-1];
  }

  /* Pack: svalues holds bs2 scalars per entry grouped by destination; the
     indices message for rank j holds its rows first, then its columns
     (offset by nlengths[j]). startv/starti are advanced as running cursors. */
  i     = 0;
  space = stash->space_head;
  while (space) {
    space_next = space->next;
    sp_idx     = space->idx;
    sp_idy     = space->idy;
    sp_val     = space->val;
    for (l=0; l<space->local_used; l++) {
      j = owner[i];
      if (bs2 == 1) {
        svalues[startv[j]] = sp_val[l];
      } else {
        PetscInt    k;
        PetscScalar *buf1,*buf2;
        buf1 = svalues+bs2*startv[j];
        buf2 = space->val + bs2*l;
        for (k=0; k<bs2; k++) buf1[k] = buf2[k];
      }
      sindices[starti[j]]             = sp_idx[l];
      sindices[starti[j]+nlengths[j]] = sp_idy[l];
      startv[j]++;
      starti[j]++;
      i++;
    }
    space = space_next;
  }
  /* Rebuild startv (it was consumed as a cursor above) before posting sends */
  startv[0] = 0;
  for (i=1; i<size; i++) startv[i] = startv[i-1] + nlengths[i-1];

  for (i=0,count=0; i<size; i++) {
    if (sizes[i]) {
      ierr = MPI_Isend(sindices+2*startv[i],2*nlengths[i],MPIU_INT,i,tag1,comm,send_waits+count++);CHKERRMPI(ierr);
      ierr = MPI_Isend(svalues+bs2*startv[i],bs2*nlengths[i],MPIU_SCALAR,i,tag2,comm,send_waits+count++);CHKERRMPI(ierr);
    }
  }
#if defined(PETSC_USE_INFO)
  ierr = PetscInfo1(NULL,"No of messages: %d \n",nsends);CHKERRQ(ierr);
  for (i=0; i<size; i++) {
    if (sizes[i]) {
      ierr = PetscInfo2(NULL,"Mesg_to: %d: size: %d bytes\n",i,nlengths[i]*(bs2*sizeof(PetscScalar)+2*sizeof(PetscInt)));CHKERRQ(ierr);
    }
  }
#endif
  ierr = PetscFree(nlengths);CHKERRQ(ierr);
  ierr = PetscFree(owner);CHKERRQ(ierr);
  ierr = PetscFree2(startv,starti);CHKERRQ(ierr);
  ierr = PetscFree(sizes);CHKERRQ(ierr);

  /* recv_waits need to be contiguous for MatStashScatterGetMesg_Private() */
  ierr = PetscMalloc1(2*nreceives,&recv_waits);CHKERRQ(ierr);

  /* Interleave: even slots are index-message requests, odd slots are
     value-message requests for the same source */
  for (i=0; i<nreceives; i++) {
    recv_waits[2*i]   = recv_waits1[i];
    recv_waits[2*i+1] = recv_waits2[i];
  }
  stash->recv_waits = recv_waits;

  ierr = PetscFree(recv_waits1);CHKERRQ(ierr);
  ierr = PetscFree(recv_waits2);CHKERRQ(ierr);

  stash->svalues         = svalues;
  stash->sindices        = sindices;
  stash->rvalues         = rvalues;
  stash->rindices        = rindices;
  stash->send_waits      = send_waits;
  stash->nsends          = nsends;
  stash->nrecvs          = nreceives;
  stash->reproduce_count = 0;
  PetscFunctionReturn(0);
}
618 
619 /*
620    MatStashScatterGetMesg_Private - This function waits on the receives posted
621    in the function MatStashScatterBegin_Private() and returns one message at
622    a time to the calling function. If no messages are left, it indicates this
623    by setting flg = 0, else it sets flg = 1.
624 
625    Input Parameters:
626    stash - the stash
627 
628    Output Parameters:
629    nvals - the number of entries in the current message.
630    rows  - an array of row indices (or blocked indices) corresponding to the values
631    cols  - an array of columnindices (or blocked indices) corresponding to the values
632    vals  - the values
633    flg   - 0 indicates no more message left, and the current call has no values associated.
634            1 indicates that the current call successfully received a message, and the
635              other output parameters nvals,rows,cols,vals are set appropriately.
636 */
637 PetscErrorCode MatStashScatterGetMesg_Private(MatStash *stash,PetscMPIInt *nvals,PetscInt **rows,PetscInt **cols,PetscScalar **vals,PetscInt *flg)
638 {
639   PetscErrorCode ierr;
640 
641   PetscFunctionBegin;
642   ierr = (*stash->ScatterGetMesg)(stash,nvals,rows,cols,vals,flg);CHKERRQ(ierr);
643   PetscFunctionReturn(0);
644 }
645 
/* Reference implementation: keeps completing receives until both the index
   message and the value message from some source rank have arrived, then
   returns that pair as one logical message. Pairing state lives in
   stash->flg_v: slots [2*src] and [2*src+1] hold the receive-slot index of
   the value and index message respectively, or -1 if not yet seen. */
PETSC_INTERN PetscErrorCode MatStashScatterGetMesg_Ref(MatStash *stash,PetscMPIInt *nvals,PetscInt **rows,PetscInt **cols,PetscScalar **vals,PetscInt *flg)
{
  PetscErrorCode ierr;
  PetscMPIInt    i,*flg_v = stash->flg_v,i1,i2;
  PetscInt       bs2;
  MPI_Status     recv_status;
  PetscBool      match_found = PETSC_FALSE;

  PetscFunctionBegin;
  *flg = 0; /* When a message is discovered this is reset to 1 */
  /* Return if no more messages to process */
  if (stash->nprocessed == stash->nrecvs) PetscFunctionReturn(0);

  bs2 = stash->bs*stash->bs;
  /* If a matching pair of receives are found, process them, and return the data to
     the calling function. Until then keep receiving messages */
  while (!match_found) {
    if (stash->reproduce) {
      /* Reproducible mode: complete receives in posted order rather than
         arrival order, so the assembly order is deterministic */
      i    = stash->reproduce_count++;
      ierr = MPI_Wait(stash->recv_waits+i,&recv_status);CHKERRMPI(ierr);
    } else {
      ierr = MPI_Waitany(2*stash->nrecvs,stash->recv_waits,&i,&recv_status);CHKERRMPI(ierr);
    }
    if (recv_status.MPI_SOURCE < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Negative MPI source!");

    /* Now pack the received message into a structure which is usable by others */
    if (i % 2) {
      /* odd slot: values message (see the interleaving in ScatterBegin) */
      ierr = MPI_Get_count(&recv_status,MPIU_SCALAR,nvals);CHKERRMPI(ierr);

      flg_v[2*recv_status.MPI_SOURCE] = i/2;

      *nvals = *nvals/bs2;
    } else {
      /* even slot: indices message */
      ierr = MPI_Get_count(&recv_status,MPIU_INT,nvals);CHKERRMPI(ierr);

      flg_v[2*recv_status.MPI_SOURCE+1] = i/2;

      *nvals = *nvals/2; /* This message has both row indices and col indices */
    }

    /* Check if we have both messages from this proc */
    i1 = flg_v[2*recv_status.MPI_SOURCE];
    i2 = flg_v[2*recv_status.MPI_SOURCE+1];
    if (i1 != -1 && i2 != -1) {
      /* rows and cols share one buffer: rows first, then cols */
      *rows = stash->rindices[i2];
      *cols = *rows + *nvals;
      *vals = stash->rvalues[i1];
      *flg  = 1;
      stash->nprocessed++;
      match_found = PETSC_TRUE;
    }
  }
  PetscFunctionReturn(0);
}
700 
#if !defined(PETSC_HAVE_MPIUNI)
/* Wire format of one stashed block in the BTS scatter: a (row,col) pair
   followed by the block's scalar values. vals is declared with length 1 but
   the true trailing length is bs*bs; see MatStashBlockTypeSetUp(). */
typedef struct {
  PetscInt row;
  PetscInt col;
  PetscScalar vals[1];          /* Actually an array of length bs2 */
} MatStashBlock;
707 
/* Sorts all stashed entries by (row,col), merges duplicates at the same
   location (summing for ADD_VALUES, last-one-wins for INSERT_VALUES), and
   packs the result into stash->segsendblocks as MatStashBlock records. */
static PetscErrorCode MatStashSortCompress_Private(MatStash *stash,InsertMode insertmode)
{
  PetscErrorCode ierr;
  PetscMatStashSpace space;
  PetscInt n = stash->n,bs = stash->bs,bs2 = bs*bs,cnt,*row,*col,*perm,rowstart,i;
  PetscScalar **valptr;

  PetscFunctionBegin;
  /* Flatten the chunked stash into parallel row/col/valptr arrays; values are
     not copied, only pointed to */
  ierr = PetscMalloc4(n,&row,n,&col,n,&valptr,n,&perm);CHKERRQ(ierr);
  for (space=stash->space_head,cnt=0; space; space=space->next) {
    for (i=0; i<space->local_used; i++) {
      row[cnt] = space->idx[i];
      col[cnt] = space->idy[i];
      valptr[cnt] = &space->val[i*bs2];
      perm[cnt] = cnt;          /* Will tell us where to find valptr after sorting row[] and col[] */
      cnt++;
    }
  }
  if (cnt != n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MatStash n %D, but counted %D entries",n,cnt);
  /* Primary sort by row (col and perm follow); columns within a row are
     sorted in the per-row pass below */
  ierr = PetscSortIntWithArrayPair(n,row,col,perm);CHKERRQ(ierr);
  /* Scan through the rows, sorting each one, combining duplicates, and packing send buffers */
  for (rowstart=0,cnt=0,i=1; i<=n; i++) {
    if (i == n || row[i] != row[rowstart]) {         /* Sort the last row. */
      PetscInt colstart;
      ierr = PetscSortIntWithArray(i-rowstart,&col[rowstart],&perm[rowstart]);CHKERRQ(ierr);
      for (colstart=rowstart; colstart<i;) { /* Compress multiple insertions to the same location */
        PetscInt j,l;
        MatStashBlock *block;
        ierr = PetscSegBufferGet(stash->segsendblocks,1,&block);CHKERRQ(ierr);
        block->row = row[rowstart];
        block->col = col[colstart];
        ierr = PetscArraycpy(block->vals,valptr[perm[colstart]],bs2);CHKERRQ(ierr);
        for (j=colstart+1; j<i && col[j] == col[colstart]; j++) { /* Add any extra stashed blocks at the same (row,col) */
          if (insertmode == ADD_VALUES) {
            for (l=0; l<bs2; l++) block->vals[l] += valptr[perm[j]][l];
          } else {
            /* INSERT_VALUES: later insertion overwrites the earlier one */
            ierr = PetscArraycpy(block->vals,valptr[perm[j]],bs2);CHKERRQ(ierr);
          }
        }
        colstart = j;
      }
      rowstart = i;
    }
  }
  ierr = PetscFree4(row,col,valptr,perm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
755 
/* Lazily builds the MPI derived datatype describing one MatStashBlock
   (two PetscInt indices followed by bs*bs scalars) and creates the
   segmented buffers used by the BTS scatter. Idempotent: does nothing if
   stash->blocktype was already built. */
static PetscErrorCode MatStashBlockTypeSetUp(MatStash *stash)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (stash->blocktype == MPI_DATATYPE_NULL) {
    PetscInt     bs2 = PetscSqr(stash->bs);
    PetscMPIInt  blocklens[2];
    MPI_Aint     displs[2];
    MPI_Datatype types[2],stype;
    /* Note that DummyBlock is a type having standard layout, even when PetscScalar is C++ std::complex.
       std::complex itself has standard layout, so does DummyBlock, recursively.
       To be compatiable with C++ std::complex, complex implementations on GPUs must also have standard layout,
       though they can have different alignment, e.g, 16 bytes for double complex, instead of 8 bytes as in GCC stdlibc++.
       offsetof(type, member) only requires type to have standard layout. Ref. https://en.cppreference.com/w/cpp/types/offsetof.

       We can test if std::complex has standard layout with the following code:
       #include <iostream>
       #include <type_traits>
       #include <complex>
       int main() {
         std::cout << std::boolalpha;
         std::cout << std::is_standard_layout<std::complex<double> >::value << '\n';
       }
       Output: true
     */
    struct DummyBlock {PetscInt row,col; PetscScalar vals;};

    /* One wire record = header + bs2 scalars, padded up so consecutive
       records keep PetscInt alignment inside the segmented buffers */
    stash->blocktype_size = offsetof(struct DummyBlock,vals) + bs2*sizeof(PetscScalar);
    if (stash->blocktype_size % sizeof(PetscInt)) { /* Implies that PetscInt is larger and does not satisfy alignment without padding */
      stash->blocktype_size += sizeof(PetscInt) - stash->blocktype_size % sizeof(PetscInt);
    }
    ierr = PetscSegBufferCreate(stash->blocktype_size,1,&stash->segsendblocks);CHKERRQ(ierr);
    ierr = PetscSegBufferCreate(stash->blocktype_size,1,&stash->segrecvblocks);CHKERRQ(ierr);
    ierr = PetscSegBufferCreate(sizeof(MatStashFrame),1,&stash->segrecvframe);CHKERRQ(ierr);
    /* Struct type: member 0 = the two indices, member 1 = the bs2 values */
    blocklens[0] = 2;
    blocklens[1] = bs2;
    displs[0] = offsetof(struct DummyBlock,row);
    displs[1] = offsetof(struct DummyBlock,vals);
    types[0] = MPIU_INT;
    types[1] = MPIU_SCALAR;
    ierr = MPI_Type_create_struct(2,blocklens,displs,types,&stype);CHKERRMPI(ierr);
    ierr = MPI_Type_commit(&stype);CHKERRMPI(ierr);
    /* Resize so the extent matches the padded record size used in the buffers */
    ierr = MPI_Type_create_resized(stype,0,stash->blocktype_size,&stash->blocktype);CHKERRMPI(ierr);
    ierr = MPI_Type_commit(&stash->blocktype);CHKERRMPI(ierr);
    ierr = MPI_Type_free(&stype);CHKERRMPI(ierr);
  }
  PetscFunctionReturn(0);
}
805 
806 /* Callback invoked after target rank has initiatied receive of rendezvous message.
807  * Here we post the main sends.
808  */
809 static PetscErrorCode MatStashBTSSend_Private(MPI_Comm comm,const PetscMPIInt tag[],PetscMPIInt rankid,PetscMPIInt rank,void *sdata,MPI_Request req[],void *ctx)
810 {
811   MatStash *stash = (MatStash*)ctx;
812   MatStashHeader *hdr = (MatStashHeader*)sdata;
813   PetscErrorCode ierr;
814 
815   PetscFunctionBegin;
816   if (rank != stash->sendranks[rankid]) SETERRQ3(comm,PETSC_ERR_PLIB,"BTS Send rank %d does not match sendranks[%d] %d",rank,rankid,stash->sendranks[rankid]);
817   ierr = MPI_Isend(stash->sendframes[rankid].buffer,hdr->count,stash->blocktype,rank,tag[0],comm,&req[0]);CHKERRMPI(ierr);
818   stash->sendframes[rankid].count = hdr->count;
819   stash->sendframes[rankid].pending = 1;
820   PetscFunctionReturn(0);
821 }
822 
823 /* Callback invoked by target after receiving rendezvous message.
824  * Here we post the main recvs.
825  */
826 static PetscErrorCode MatStashBTSRecv_Private(MPI_Comm comm,const PetscMPIInt tag[],PetscMPIInt rank,void *rdata,MPI_Request req[],void *ctx)
827 {
828   MatStash *stash = (MatStash*)ctx;
829   MatStashHeader *hdr = (MatStashHeader*)rdata;
830   MatStashFrame *frame;
831   PetscErrorCode ierr;
832 
833   PetscFunctionBegin;
834   ierr = PetscSegBufferGet(stash->segrecvframe,1,&frame);CHKERRQ(ierr);
835   ierr = PetscSegBufferGet(stash->segrecvblocks,hdr->count,&frame->buffer);CHKERRQ(ierr);
836   ierr = MPI_Irecv(frame->buffer,hdr->count,stash->blocktype,rank,tag[0],comm,&req[0]);CHKERRMPI(ierr);
837   frame->count = hdr->count;
838   frame->pending = 1;
839   PetscFunctionReturn(0);
840 }
841 
842 /*
843  * owners[] contains the ownership ranges; may be indexed by either blocks or scalars
844  */
845 static PetscErrorCode MatStashScatterBegin_BTS(Mat mat,MatStash *stash,PetscInt owners[])
846 {
847   PetscErrorCode ierr;
848   size_t nblocks;
849   char *sendblocks;
850 
851   PetscFunctionBegin;
852   if (PetscDefined(USE_DEBUG)) { /* make sure all processors are either in INSERTMODE or ADDMODE */
853     InsertMode addv;
854     ierr = MPIU_Allreduce((PetscEnum*)&mat->insertmode,(PetscEnum*)&addv,1,MPIU_ENUM,MPI_BOR,PetscObjectComm((PetscObject)mat));CHKERRMPI(ierr);
855     if (addv == (ADD_VALUES|INSERT_VALUES)) SETERRQ(PetscObjectComm((PetscObject)mat),PETSC_ERR_ARG_WRONGSTATE,"Some processors inserted others added");
856   }
857 
858   ierr = MatStashBlockTypeSetUp(stash);CHKERRQ(ierr);
859   ierr = MatStashSortCompress_Private(stash,mat->insertmode);CHKERRQ(ierr);
860   ierr = PetscSegBufferGetSize(stash->segsendblocks,&nblocks);CHKERRQ(ierr);
861   ierr = PetscSegBufferExtractInPlace(stash->segsendblocks,&sendblocks);CHKERRQ(ierr);
862   if (stash->first_assembly_done) { /* Set up sendhdrs and sendframes for each rank that we sent before */
863     PetscInt i;
864     size_t b;
865     for (i=0,b=0; i<stash->nsendranks; i++) {
866       stash->sendframes[i].buffer = &sendblocks[b*stash->blocktype_size];
867       /* sendhdr is never actually sent, but the count is used by MatStashBTSSend_Private */
868       stash->sendhdr[i].count = 0; /* Might remain empty (in which case we send a zero-sized message) if no values are communicated to that process */
869       for (; b<nblocks; b++) {
870         MatStashBlock *sendblock_b = (MatStashBlock*)&sendblocks[b*stash->blocktype_size];
871         if (PetscUnlikely(sendblock_b->row < owners[stash->sendranks[i]])) SETERRQ2(stash->comm,PETSC_ERR_ARG_WRONG,"MAT_SUBSET_OFF_PROC_ENTRIES set, but row %D owned by %d not communicated in initial assembly",sendblock_b->row,stash->sendranks[i]);
872         if (sendblock_b->row >= owners[stash->sendranks[i]+1]) break;
873         stash->sendhdr[i].count++;
874       }
875     }
876   } else {                      /* Dynamically count and pack (first time) */
877     PetscInt sendno;
878     size_t i,rowstart;
879 
880     /* Count number of send ranks and allocate for sends */
881     stash->nsendranks = 0;
882     for (rowstart=0; rowstart<nblocks;) {
883       PetscInt owner;
884       MatStashBlock *sendblock_rowstart = (MatStashBlock*)&sendblocks[rowstart*stash->blocktype_size];
885       ierr = PetscFindInt(sendblock_rowstart->row,stash->size+1,owners,&owner);CHKERRQ(ierr);
886       if (owner < 0) owner = -(owner+2);
887       for (i=rowstart+1; i<nblocks; i++) { /* Move forward through a run of blocks with the same owner */
888         MatStashBlock *sendblock_i = (MatStashBlock*)&sendblocks[i*stash->blocktype_size];
889         if (sendblock_i->row >= owners[owner+1]) break;
890       }
891       stash->nsendranks++;
892       rowstart = i;
893     }
894     ierr = PetscMalloc3(stash->nsendranks,&stash->sendranks,stash->nsendranks,&stash->sendhdr,stash->nsendranks,&stash->sendframes);CHKERRQ(ierr);
895 
896     /* Set up sendhdrs and sendframes */
897     sendno = 0;
898     for (rowstart=0; rowstart<nblocks;) {
899       PetscInt owner;
900       MatStashBlock *sendblock_rowstart = (MatStashBlock*)&sendblocks[rowstart*stash->blocktype_size];
901       ierr = PetscFindInt(sendblock_rowstart->row,stash->size+1,owners,&owner);CHKERRQ(ierr);
902       if (owner < 0) owner = -(owner+2);
903       stash->sendranks[sendno] = owner;
904       for (i=rowstart+1; i<nblocks; i++) { /* Move forward through a run of blocks with the same owner */
905         MatStashBlock *sendblock_i = (MatStashBlock*)&sendblocks[i*stash->blocktype_size];
906         if (sendblock_i->row >= owners[owner+1]) break;
907       }
908       stash->sendframes[sendno].buffer = sendblock_rowstart;
909       stash->sendframes[sendno].pending = 0;
910       stash->sendhdr[sendno].count = i - rowstart;
911       sendno++;
912       rowstart = i;
913     }
914     if (sendno != stash->nsendranks) SETERRQ2(stash->comm,PETSC_ERR_PLIB,"BTS counted %D sendranks, but %D sends",stash->nsendranks,sendno);
915   }
916 
917   /* Encode insertmode on the outgoing messages. If we want to support more than two options, we would need a new
918    * message or a dummy entry of some sort. */
919   if (mat->insertmode == INSERT_VALUES) {
920     size_t i;
921     for (i=0; i<nblocks; i++) {
922       MatStashBlock *sendblock_i = (MatStashBlock*)&sendblocks[i*stash->blocktype_size];
923       sendblock_i->row = -(sendblock_i->row+1);
924     }
925   }
926 
927   if (stash->first_assembly_done) {
928     PetscMPIInt i,tag;
929     ierr = PetscCommGetNewTag(stash->comm,&tag);CHKERRQ(ierr);
930     for (i=0; i<stash->nrecvranks; i++) {
931       ierr = MatStashBTSRecv_Private(stash->comm,&tag,stash->recvranks[i],&stash->recvhdr[i],&stash->recvreqs[i],stash);CHKERRQ(ierr);
932     }
933     for (i=0; i<stash->nsendranks; i++) {
934       ierr = MatStashBTSSend_Private(stash->comm,&tag,i,stash->sendranks[i],&stash->sendhdr[i],&stash->sendreqs[i],stash);CHKERRQ(ierr);
935     }
936     stash->use_status = PETSC_TRUE; /* Use count from message status. */
937   } else {
938     ierr = PetscCommBuildTwoSidedFReq(stash->comm,1,MPIU_INT,stash->nsendranks,stash->sendranks,(PetscInt*)stash->sendhdr,
939                                       &stash->nrecvranks,&stash->recvranks,(PetscInt*)&stash->recvhdr,1,&stash->sendreqs,&stash->recvreqs,
940                                       MatStashBTSSend_Private,MatStashBTSRecv_Private,stash);CHKERRQ(ierr);
941     ierr = PetscMalloc2(stash->nrecvranks,&stash->some_indices,stash->nrecvranks,&stash->some_statuses);CHKERRQ(ierr);
942     stash->use_status = PETSC_FALSE; /* Use count from header instead of from message. */
943   }
944 
945   ierr = PetscSegBufferExtractInPlace(stash->segrecvframe,&stash->recvframes);CHKERRQ(ierr);
946   stash->recvframe_active     = NULL;
947   stash->recvframe_i          = 0;
948   stash->some_i               = 0;
949   stash->some_count           = 0;
950   stash->recvcount            = 0;
951   stash->first_assembly_done  = mat->assembly_subset; /* See the same logic in VecAssemblyBegin_MPI_BTS */
952   stash->insertmode           = &mat->insertmode;
953   PetscFunctionReturn(0);
954 }
955 
/* MatStashScatterGetMesg_BTS - hand back the next received stashed entry, one block at a time.

   Sets *flg = 1 and points (*row,*col,*val) into the current receive frame when a block is
   available; sets *flg = 0 once messages from all nrecvranks have been consumed. *n is set
   to 1 whenever a block is returned. Completes outstanding receives with MPI_Waitsome() as
   the already-arrived frames are exhausted. */
static PetscErrorCode MatStashScatterGetMesg_BTS(MatStash *stash,PetscMPIInt *n,PetscInt **row,PetscInt **col,PetscScalar **val,PetscInt *flg)
{
  PetscErrorCode ierr;
  MatStashBlock *block;

  PetscFunctionBegin;
  *flg = 0;
  /* Advance to a frame that still has unconsumed blocks, waiting on more receives as needed */
  while (!stash->recvframe_active || stash->recvframe_i == stash->recvframe_count) {
    if (stash->some_i == stash->some_count) {
      if (stash->recvcount == stash->nrecvranks) PetscFunctionReturn(0); /* Done */
      ierr = MPI_Waitsome(stash->nrecvranks,stash->recvreqs,&stash->some_count,stash->some_indices,stash->use_status?stash->some_statuses:MPI_STATUSES_IGNORE);CHKERRMPI(ierr);
      stash->some_i = 0;
    }
    stash->recvframe_active = &stash->recvframes[stash->some_indices[stash->some_i]];
    stash->recvframe_count = stash->recvframe_active->count; /* From header; maximum count */
    if (stash->use_status) { /* Count what was actually sent */
      ierr = MPI_Get_count(&stash->some_statuses[stash->some_i],stash->blocktype,&stash->recvframe_count);CHKERRMPI(ierr);
    }
    if (stash->recvframe_count > 0) { /* Check for InsertMode consistency */
      block = (MatStashBlock*)&((char*)stash->recvframe_active->buffer)[0];
      /* A negative row encodes INSERT_VALUES; see the encoding in MatStashScatterBegin_BTS() */
      if (PetscUnlikely(*stash->insertmode == NOT_SET_VALUES)) *stash->insertmode = block->row < 0 ? INSERT_VALUES : ADD_VALUES;
      if (PetscUnlikely(*stash->insertmode == INSERT_VALUES && block->row >= 0)) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Assembling INSERT_VALUES, but rank %d requested ADD_VALUES",stash->recvranks[stash->some_indices[stash->some_i]]);
      if (PetscUnlikely(*stash->insertmode == ADD_VALUES && block->row < 0)) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Assembling ADD_VALUES, but rank %d requested INSERT_VALUES",stash->recvranks[stash->some_indices[stash->some_i]]);
    }
    stash->some_i++;
    stash->recvcount++;
    stash->recvframe_i = 0;
  }
  *n = 1;
  block = (MatStashBlock*)&((char*)stash->recvframe_active->buffer)[stash->recvframe_i*stash->blocktype_size];
  if (block->row < 0) block->row = -(block->row + 1); /* Decode the negative-row INSERT_VALUES encoding */
  *row = &block->row;
  *col = &block->col;
  *val = block->vals;
  stash->recvframe_i++;
  *flg = 1;
  PetscFunctionReturn(0);
}
994 
995 static PetscErrorCode MatStashScatterEnd_BTS(MatStash *stash)
996 {
997   PetscErrorCode ierr;
998 
999   PetscFunctionBegin;
1000   ierr = MPI_Waitall(stash->nsendranks,stash->sendreqs,MPI_STATUSES_IGNORE);CHKERRMPI(ierr);
1001   if (stash->first_assembly_done) { /* Reuse the communication contexts, so consolidate and reset segrecvblocks  */
1002     void *dummy;
1003     ierr = PetscSegBufferExtractInPlace(stash->segrecvblocks,&dummy);CHKERRQ(ierr);
1004   } else {                      /* No reuse, so collect everything. */
1005     ierr = MatStashScatterDestroy_BTS(stash);CHKERRQ(ierr);
1006   }
1007 
1008   /* Now update nmaxold to be app 10% more than max n used, this way the
1009      wastage of space is reduced the next time this stash is used.
1010      Also update the oldmax, only if it increases */
1011   if (stash->n) {
1012     PetscInt bs2     = stash->bs*stash->bs;
1013     PetscInt oldnmax = ((int)(stash->n * 1.1) + 5)*bs2;
1014     if (oldnmax > stash->oldnmax) stash->oldnmax = oldnmax;
1015   }
1016 
1017   stash->nmax       = 0;
1018   stash->n          = 0;
1019   stash->reallocs   = -1;
1020   stash->nprocessed = 0;
1021 
1022   ierr = PetscMatStashSpaceDestroy(&stash->space_head);CHKERRQ(ierr);
1023 
1024   stash->space = NULL;
1025 
1026   PetscFunctionReturn(0);
1027 }
1028 
1029 PetscErrorCode MatStashScatterDestroy_BTS(MatStash *stash)
1030 {
1031   PetscErrorCode ierr;
1032 
1033   PetscFunctionBegin;
1034   ierr = PetscSegBufferDestroy(&stash->segsendblocks);CHKERRQ(ierr);
1035   ierr = PetscSegBufferDestroy(&stash->segrecvframe);CHKERRQ(ierr);
1036   stash->recvframes = NULL;
1037   ierr = PetscSegBufferDestroy(&stash->segrecvblocks);CHKERRQ(ierr);
1038   if (stash->blocktype != MPI_DATATYPE_NULL) {
1039     ierr = MPI_Type_free(&stash->blocktype);CHKERRMPI(ierr);
1040   }
1041   stash->nsendranks = 0;
1042   stash->nrecvranks = 0;
1043   ierr = PetscFree3(stash->sendranks,stash->sendhdr,stash->sendframes);CHKERRQ(ierr);
1044   ierr = PetscFree(stash->sendreqs);CHKERRQ(ierr);
1045   ierr = PetscFree(stash->recvreqs);CHKERRQ(ierr);
1046   ierr = PetscFree(stash->recvranks);CHKERRQ(ierr);
1047   ierr = PetscFree(stash->recvhdr);CHKERRQ(ierr);
1048   ierr = PetscFree2(stash->some_indices,stash->some_statuses);CHKERRQ(ierr);
1049   PetscFunctionReturn(0);
1050 }
1051 #endif
1052