xref: /petsc/src/mat/utils/matstash.c (revision a6404fbfb1cfbf30d2bac9856cef3bf7411483d5)
1 
2 #include <petsc/private/matimpl.h>
3 
4 #define DEFAULT_STASH_SIZE   10000
5 
6 static PetscErrorCode MatStashScatterBegin_Ref(Mat,MatStash*,PetscInt*);
7 static PetscErrorCode MatStashScatterGetMesg_Ref(MatStash*,PetscMPIInt*,PetscInt**,PetscInt**,PetscScalar**,PetscInt*);
8 static PetscErrorCode MatStashScatterEnd_Ref(MatStash*);
9 static PetscErrorCode MatStashScatterBegin_BTS(Mat,MatStash*,PetscInt*);
10 static PetscErrorCode MatStashScatterGetMesg_BTS(MatStash*,PetscMPIInt*,PetscInt**,PetscInt**,PetscScalar**,PetscInt*);
11 static PetscErrorCode MatStashScatterEnd_BTS(MatStash*);
12 static PetscErrorCode MatStashScatterDestroy_BTS(MatStash*);
13 
14 /*
15   MatStashCreate_Private - Creates a stash,currently used for all the parallel
16   matrix implementations. The stash is where elements of a matrix destined
17   to be stored on other processors are kept until matrix assembly is done.
18 
19   This is a simple minded stash. Simply adds entries to end of stash.
20 
21   Input Parameters:
22   comm - communicator, required for scatters.
23   bs   - stash block size. used when stashing blocks of values
24 
25   Output Parameters:
26   stash    - the newly created stash
27 */
28 #undef __FUNCT__
29 #define __FUNCT__ "MatStashCreate_Private"
30 PetscErrorCode MatStashCreate_Private(MPI_Comm comm,PetscInt bs,MatStash *stash)
31 {
32   PetscErrorCode ierr;
33   PetscInt       max,*opt,nopt,i;
34   PetscBool      flg;
35 
36   PetscFunctionBegin;
37   /* Require 2 tags,get the second using PetscCommGetNewTag() */
38   stash->comm = comm;
39 
40   ierr = PetscCommGetNewTag(stash->comm,&stash->tag1);CHKERRQ(ierr);
41   ierr = PetscCommGetNewTag(stash->comm,&stash->tag2);CHKERRQ(ierr);
42   ierr = MPI_Comm_size(stash->comm,&stash->size);CHKERRQ(ierr);
43   ierr = MPI_Comm_rank(stash->comm,&stash->rank);CHKERRQ(ierr);
44   ierr = PetscMalloc1(2*stash->size,&stash->flg_v);CHKERRQ(ierr);
45   for (i=0; i<2*stash->size; i++) stash->flg_v[i] = -1;
46 
47 
48   nopt = stash->size;
49   ierr = PetscMalloc1(nopt,&opt);CHKERRQ(ierr);
50   ierr = PetscOptionsGetIntArray(NULL,"-matstash_initial_size",opt,&nopt,&flg);CHKERRQ(ierr);
51   if (flg) {
52     if (nopt == 1)                max = opt[0];
53     else if (nopt == stash->size) max = opt[stash->rank];
54     else if (stash->rank < nopt)  max = opt[stash->rank];
55     else                          max = 0; /* Use default */
56     stash->umax = max;
57   } else {
58     stash->umax = 0;
59   }
60   ierr = PetscFree(opt);CHKERRQ(ierr);
61   if (bs <= 0) bs = 1;
62 
63   stash->bs         = bs;
64   stash->nmax       = 0;
65   stash->oldnmax    = 0;
66   stash->n          = 0;
67   stash->reallocs   = -1;
68   stash->space_head = 0;
69   stash->space      = 0;
70 
71   stash->send_waits  = 0;
72   stash->recv_waits  = 0;
73   stash->send_status = 0;
74   stash->nsends      = 0;
75   stash->nrecvs      = 0;
76   stash->svalues     = 0;
77   stash->rvalues     = 0;
78   stash->rindices    = 0;
79   stash->nprocessed  = 0;
80   stash->reproduce   = PETSC_FALSE;
81   stash->blocktype   = MPI_DATATYPE_NULL;
82 
83   ierr = PetscOptionsGetBool(NULL,"-matstash_reproduce",&stash->reproduce,NULL);CHKERRQ(ierr);
84   ierr = PetscOptionsGetBool(NULL,"-matstash_bts",&flg,NULL);CHKERRQ(ierr);
85   if (flg) {
86     stash->ScatterBegin   = MatStashScatterBegin_BTS;
87     stash->ScatterGetMesg = MatStashScatterGetMesg_BTS;
88     stash->ScatterEnd     = MatStashScatterEnd_BTS;
89     stash->ScatterDestroy = MatStashScatterDestroy_BTS;
90   } else {
91     stash->ScatterBegin   = MatStashScatterBegin_Ref;
92     stash->ScatterGetMesg = MatStashScatterGetMesg_Ref;
93     stash->ScatterEnd     = MatStashScatterEnd_Ref;
94     stash->ScatterDestroy = NULL;
95   }
96   PetscFunctionReturn(0);
97 }
98 
99 /*
100    MatStashDestroy_Private - Destroy the stash
101 */
102 #undef __FUNCT__
103 #define __FUNCT__ "MatStashDestroy_Private"
104 PetscErrorCode MatStashDestroy_Private(MatStash *stash)
105 {
106   PetscErrorCode ierr;
107 
108   PetscFunctionBegin;
109   ierr = PetscMatStashSpaceDestroy(&stash->space_head);CHKERRQ(ierr);
110   if (stash->ScatterDestroy) {ierr = (*stash->ScatterDestroy)(stash);CHKERRQ(ierr);}
111 
112   stash->space = 0;
113 
114   ierr = PetscFree(stash->flg_v);CHKERRQ(ierr);
115   PetscFunctionReturn(0);
116 }
117 
118 /*
119    MatStashScatterEnd_Private - This is called as the final stage of
120    scatter. The final stages of message passing is done here, and
121    all the memory used for message passing is cleaned up. This
122    routine also resets the stash, and deallocates the memory used
123    for the stash. It also keeps track of the current memory usage
124    so that the same value can be used the next time through.
125 */
126 #undef __FUNCT__
127 #define __FUNCT__ "MatStashScatterEnd_Private"
128 PetscErrorCode MatStashScatterEnd_Private(MatStash *stash)
129 {
130   PetscErrorCode ierr;
131 
132   PetscFunctionBegin;
133   ierr = (*stash->ScatterEnd)(stash);CHKERRQ(ierr);
134   PetscFunctionReturn(0);
135 }
136 
137 #undef __FUNCT__
138 #define __FUNCT__ "MatStashScatterEnd_Ref"
139 static PetscErrorCode MatStashScatterEnd_Ref(MatStash *stash)
140 {
141   PetscErrorCode ierr;
142   PetscInt       nsends=stash->nsends,bs2,oldnmax,i;
143   MPI_Status     *send_status;
144 
145   PetscFunctionBegin;
146   for (i=0; i<2*stash->size; i++) stash->flg_v[i] = -1;
147   /* wait on sends */
148   if (nsends) {
149     ierr = PetscMalloc1(2*nsends,&send_status);CHKERRQ(ierr);
150     ierr = MPI_Waitall(2*nsends,stash->send_waits,send_status);CHKERRQ(ierr);
151     ierr = PetscFree(send_status);CHKERRQ(ierr);
152   }
153 
154   /* Now update nmaxold to be app 10% more than max n used, this way the
155      wastage of space is reduced the next time this stash is used.
156      Also update the oldmax, only if it increases */
157   if (stash->n) {
158     bs2     = stash->bs*stash->bs;
159     oldnmax = ((int)(stash->n * 1.1) + 5)*bs2;
160     if (oldnmax > stash->oldnmax) stash->oldnmax = oldnmax;
161   }
162 
163   stash->nmax       = 0;
164   stash->n          = 0;
165   stash->reallocs   = -1;
166   stash->nprocessed = 0;
167 
168   ierr = PetscMatStashSpaceDestroy(&stash->space_head);CHKERRQ(ierr);
169 
170   stash->space = 0;
171 
172   ierr = PetscFree(stash->send_waits);CHKERRQ(ierr);
173   ierr = PetscFree(stash->recv_waits);CHKERRQ(ierr);
174   ierr = PetscFree2(stash->svalues,stash->sindices);CHKERRQ(ierr);
175   ierr = PetscFree(stash->rvalues[0]);CHKERRQ(ierr);
176   ierr = PetscFree(stash->rvalues);CHKERRQ(ierr);
177   ierr = PetscFree(stash->rindices[0]);CHKERRQ(ierr);
178   ierr = PetscFree(stash->rindices);CHKERRQ(ierr);
179   PetscFunctionReturn(0);
180 }
181 
182 /*
183    MatStashGetInfo_Private - Gets the relavant statistics of the stash
184 
185    Input Parameters:
186    stash    - the stash
187    nstash   - the size of the stash. Indicates the number of values stored.
188    reallocs - the number of additional mallocs incurred.
189 
190 */
191 #undef __FUNCT__
192 #define __FUNCT__ "MatStashGetInfo_Private"
193 PetscErrorCode MatStashGetInfo_Private(MatStash *stash,PetscInt *nstash,PetscInt *reallocs)
194 {
195   PetscInt bs2 = stash->bs*stash->bs;
196 
197   PetscFunctionBegin;
198   if (nstash) *nstash = stash->n*bs2;
199   if (reallocs) {
200     if (stash->reallocs < 0) *reallocs = 0;
201     else                     *reallocs = stash->reallocs;
202   }
203   PetscFunctionReturn(0);
204 }
205 
206 /*
207    MatStashSetInitialSize_Private - Sets the initial size of the stash
208 
209    Input Parameters:
210    stash  - the stash
211    max    - the value that is used as the max size of the stash.
212             this value is used while allocating memory.
213 */
214 #undef __FUNCT__
215 #define __FUNCT__ "MatStashSetInitialSize_Private"
216 PetscErrorCode MatStashSetInitialSize_Private(MatStash *stash,PetscInt max)
217 {
218   PetscFunctionBegin;
219   stash->umax = max;
220   PetscFunctionReturn(0);
221 }
222 
223 /* MatStashExpand_Private - Expand the stash. This function is called
224    when the space in the stash is not sufficient to add the new values
225    being inserted into the stash.
226 
227    Input Parameters:
228    stash - the stash
229    incr  - the minimum increase requested
230 
231    Notes:
232    This routine doubles the currently used memory.
233  */
234 #undef __FUNCT__
235 #define __FUNCT__ "MatStashExpand_Private"
236 static PetscErrorCode MatStashExpand_Private(MatStash *stash,PetscInt incr)
237 {
238   PetscErrorCode ierr;
239   PetscInt       newnmax,bs2= stash->bs*stash->bs;
240 
241   PetscFunctionBegin;
242   /* allocate a larger stash */
243   if (!stash->oldnmax && !stash->nmax) { /* new stash */
244     if (stash->umax)                  newnmax = stash->umax/bs2;
245     else                              newnmax = DEFAULT_STASH_SIZE/bs2;
246   } else if (!stash->nmax) { /* resuing stash */
247     if (stash->umax > stash->oldnmax) newnmax = stash->umax/bs2;
248     else                              newnmax = stash->oldnmax/bs2;
249   } else                              newnmax = stash->nmax*2;
250   if (newnmax  < (stash->nmax + incr)) newnmax += 2*incr;
251 
252   /* Get a MatStashSpace and attach it to stash */
253   ierr = PetscMatStashSpaceGet(bs2,newnmax,&stash->space);CHKERRQ(ierr);
254   if (!stash->space_head) { /* new stash or resuing stash->oldnmax */
255     stash->space_head = stash->space;
256   }
257 
258   stash->reallocs++;
259   stash->nmax = newnmax;
260   PetscFunctionReturn(0);
261 }
262 /*
263   MatStashValuesRow_Private - inserts values into the stash. This function
264   expects the values to be roworiented. Multiple columns belong to the same row
265   can be inserted with a single call to this function.
266 
267   Input Parameters:
268   stash  - the stash
269   row    - the global row correspoiding to the values
270   n      - the number of elements inserted. All elements belong to the above row.
271   idxn   - the global column indices corresponding to each of the values.
272   values - the values inserted
273 */
274 #undef __FUNCT__
275 #define __FUNCT__ "MatStashValuesRow_Private"
276 PetscErrorCode MatStashValuesRow_Private(MatStash *stash,PetscInt row,PetscInt n,const PetscInt idxn[],const PetscScalar values[],PetscBool ignorezeroentries)
277 {
278   PetscErrorCode     ierr;
279   PetscInt           i,k,cnt = 0;
280   PetscMatStashSpace space=stash->space;
281 
282   PetscFunctionBegin;
283   /* Check and see if we have sufficient memory */
284   if (!space || space->local_remaining < n) {
285     ierr = MatStashExpand_Private(stash,n);CHKERRQ(ierr);
286   }
287   space = stash->space;
288   k     = space->local_used;
289   for (i=0; i<n; i++) {
290     if (ignorezeroentries && (values[i] == 0.0)) continue;
291     space->idx[k] = row;
292     space->idy[k] = idxn[i];
293     space->val[k] = values[i];
294     k++;
295     cnt++;
296   }
297   stash->n               += cnt;
298   space->local_used      += cnt;
299   space->local_remaining -= cnt;
300   PetscFunctionReturn(0);
301 }
302 
303 /*
304   MatStashValuesCol_Private - inserts values into the stash. This function
305   expects the values to be columnoriented. Multiple columns belong to the same row
306   can be inserted with a single call to this function.
307 
308   Input Parameters:
309   stash   - the stash
310   row     - the global row correspoiding to the values
311   n       - the number of elements inserted. All elements belong to the above row.
312   idxn    - the global column indices corresponding to each of the values.
313   values  - the values inserted
314   stepval - the consecutive values are sepated by a distance of stepval.
315             this happens because the input is columnoriented.
316 */
317 #undef __FUNCT__
318 #define __FUNCT__ "MatStashValuesCol_Private"
319 PetscErrorCode MatStashValuesCol_Private(MatStash *stash,PetscInt row,PetscInt n,const PetscInt idxn[],const PetscScalar values[],PetscInt stepval,PetscBool ignorezeroentries)
320 {
321   PetscErrorCode     ierr;
322   PetscInt           i,k,cnt = 0;
323   PetscMatStashSpace space=stash->space;
324 
325   PetscFunctionBegin;
326   /* Check and see if we have sufficient memory */
327   if (!space || space->local_remaining < n) {
328     ierr = MatStashExpand_Private(stash,n);CHKERRQ(ierr);
329   }
330   space = stash->space;
331   k     = space->local_used;
332   for (i=0; i<n; i++) {
333     if (ignorezeroentries && (values[i*stepval] == 0.0)) continue;
334     space->idx[k] = row;
335     space->idy[k] = idxn[i];
336     space->val[k] = values[i*stepval];
337     k++;
338     cnt++;
339   }
340   stash->n               += cnt;
341   space->local_used      += cnt;
342   space->local_remaining -= cnt;
343   PetscFunctionReturn(0);
344 }
345 
346 /*
347   MatStashValuesRowBlocked_Private - inserts blocks of values into the stash.
348   This function expects the values to be roworiented. Multiple columns belong
349   to the same block-row can be inserted with a single call to this function.
350   This function extracts the sub-block of values based on the dimensions of
351   the original input block, and the row,col values corresponding to the blocks.
352 
353   Input Parameters:
354   stash  - the stash
355   row    - the global block-row correspoiding to the values
356   n      - the number of elements inserted. All elements belong to the above row.
357   idxn   - the global block-column indices corresponding to each of the blocks of
358            values. Each block is of size bs*bs.
359   values - the values inserted
360   rmax   - the number of block-rows in the original block.
361   cmax   - the number of block-columsn on the original block.
362   idx    - the index of the current block-row in the original block.
363 */
364 #undef __FUNCT__
365 #define __FUNCT__ "MatStashValuesRowBlocked_Private"
366 PetscErrorCode MatStashValuesRowBlocked_Private(MatStash *stash,PetscInt row,PetscInt n,const PetscInt idxn[],const PetscScalar values[],PetscInt rmax,PetscInt cmax,PetscInt idx)
367 {
368   PetscErrorCode     ierr;
369   PetscInt           i,j,k,bs2,bs=stash->bs,l;
370   const PetscScalar  *vals;
371   PetscScalar        *array;
372   PetscMatStashSpace space=stash->space;
373 
374   PetscFunctionBegin;
375   if (!space || space->local_remaining < n) {
376     ierr = MatStashExpand_Private(stash,n);CHKERRQ(ierr);
377   }
378   space = stash->space;
379   l     = space->local_used;
380   bs2   = bs*bs;
381   for (i=0; i<n; i++) {
382     space->idx[l] = row;
383     space->idy[l] = idxn[i];
384     /* Now copy over the block of values. Store the values column oriented.
385        This enables inserting multiple blocks belonging to a row with a single
386        funtion call */
387     array = space->val + bs2*l;
388     vals  = values + idx*bs2*n + bs*i;
389     for (j=0; j<bs; j++) {
390       for (k=0; k<bs; k++) array[k*bs] = vals[k];
391       array++;
392       vals += cmax*bs;
393     }
394     l++;
395   }
396   stash->n               += n;
397   space->local_used      += n;
398   space->local_remaining -= n;
399   PetscFunctionReturn(0);
400 }
401 
402 /*
403   MatStashValuesColBlocked_Private - inserts blocks of values into the stash.
404   This function expects the values to be roworiented. Multiple columns belong
405   to the same block-row can be inserted with a single call to this function.
406   This function extracts the sub-block of values based on the dimensions of
407   the original input block, and the row,col values corresponding to the blocks.
408 
409   Input Parameters:
410   stash  - the stash
411   row    - the global block-row correspoiding to the values
412   n      - the number of elements inserted. All elements belong to the above row.
413   idxn   - the global block-column indices corresponding to each of the blocks of
414            values. Each block is of size bs*bs.
415   values - the values inserted
416   rmax   - the number of block-rows in the original block.
417   cmax   - the number of block-columsn on the original block.
418   idx    - the index of the current block-row in the original block.
419 */
420 #undef __FUNCT__
421 #define __FUNCT__ "MatStashValuesColBlocked_Private"
422 PetscErrorCode MatStashValuesColBlocked_Private(MatStash *stash,PetscInt row,PetscInt n,const PetscInt idxn[],const PetscScalar values[],PetscInt rmax,PetscInt cmax,PetscInt idx)
423 {
424   PetscErrorCode     ierr;
425   PetscInt           i,j,k,bs2,bs=stash->bs,l;
426   const PetscScalar  *vals;
427   PetscScalar        *array;
428   PetscMatStashSpace space=stash->space;
429 
430   PetscFunctionBegin;
431   if (!space || space->local_remaining < n) {
432     ierr = MatStashExpand_Private(stash,n);CHKERRQ(ierr);
433   }
434   space = stash->space;
435   l     = space->local_used;
436   bs2   = bs*bs;
437   for (i=0; i<n; i++) {
438     space->idx[l] = row;
439     space->idy[l] = idxn[i];
440     /* Now copy over the block of values. Store the values column oriented.
441      This enables inserting multiple blocks belonging to a row with a single
442      funtion call */
443     array = space->val + bs2*l;
444     vals  = values + idx*bs2*n + bs*i;
445     for (j=0; j<bs; j++) {
446       for (k=0; k<bs; k++) array[k] = vals[k];
447       array += bs;
448       vals  += rmax*bs;
449     }
450     l++;
451   }
452   stash->n               += n;
453   space->local_used      += n;
454   space->local_remaining -= n;
455   PetscFunctionReturn(0);
456 }
457 /*
458   MatStashScatterBegin_Private - Initiates the transfer of values to the
459   correct owners. This function goes through the stash, and check the
460   owners of each stashed value, and sends the values off to the owner
461   processors.
462 
463   Input Parameters:
464   stash  - the stash
465   owners - an array of size 'no-of-procs' which gives the ownership range
466            for each node.
467 
468   Notes: The 'owners' array in the cased of the blocked-stash has the
469   ranges specified blocked global indices, and for the regular stash in
470   the proper global indices.
471 */
472 #undef __FUNCT__
473 #define __FUNCT__ "MatStashScatterBegin_Private"
474 PetscErrorCode MatStashScatterBegin_Private(Mat mat,MatStash *stash,PetscInt *owners)
475 {
476   PetscErrorCode ierr;
477 
478   PetscFunctionBegin;
479   ierr = (*stash->ScatterBegin)(mat,stash,owners);CHKERRQ(ierr);
480   PetscFunctionReturn(0);
481 }
482 
483 #undef __FUNCT__
484 #define __FUNCT__ "MatStashScatterBegin_Ref"
485 static PetscErrorCode MatStashScatterBegin_Ref(Mat mat,MatStash *stash,PetscInt *owners)
486 {
487   PetscInt           *owner,*startv,*starti,tag1=stash->tag1,tag2=stash->tag2,bs2;
488   PetscInt           size=stash->size,nsends;
489   PetscErrorCode     ierr;
490   PetscInt           count,*sindices,**rindices,i,j,idx,lastidx,l;
491   PetscScalar        **rvalues,*svalues;
492   MPI_Comm           comm = stash->comm;
493   MPI_Request        *send_waits,*recv_waits,*recv_waits1,*recv_waits2;
494   PetscMPIInt        *sizes,*nlengths,nreceives;
495   PetscInt           *sp_idx,*sp_idy;
496   PetscScalar        *sp_val;
497   PetscMatStashSpace space,space_next;
498 
499   PetscFunctionBegin;
500   {                             /* make sure all processors are either in INSERTMODE or ADDMODE */
501     InsertMode addv;
502     ierr = MPI_Allreduce((PetscEnum*)&mat->insertmode,(PetscEnum*)&addv,1,MPIU_ENUM,MPI_BOR,PetscObjectComm((PetscObject)mat));CHKERRQ(ierr);
503     if (addv == (ADD_VALUES|INSERT_VALUES)) SETERRQ(PetscObjectComm((PetscObject)mat),PETSC_ERR_ARG_WRONGSTATE,"Some processors inserted others added");
504     mat->insertmode = addv; /* in case this processor had no cache */
505   }
506 
507   bs2 = stash->bs*stash->bs;
508 
509   /*  first count number of contributors to each processor */
510   ierr = PetscCalloc1(size,&sizes);CHKERRQ(ierr);
511   ierr = PetscCalloc1(size,&nlengths);CHKERRQ(ierr);
512   ierr = PetscMalloc1(stash->n+1,&owner);CHKERRQ(ierr);
513 
514   i       = j    = 0;
515   lastidx = -1;
516   space   = stash->space_head;
517   while (space != NULL) {
518     space_next = space->next;
519     sp_idx     = space->idx;
520     for (l=0; l<space->local_used; l++) {
521       /* if indices are NOT locally sorted, need to start search at the beginning */
522       if (lastidx > (idx = sp_idx[l])) j = 0;
523       lastidx = idx;
524       for (; j<size; j++) {
525         if (idx >= owners[j] && idx < owners[j+1]) {
526           nlengths[j]++; owner[i] = j; break;
527         }
528       }
529       i++;
530     }
531     space = space_next;
532   }
533   /* Now check what procs get messages - and compute nsends. */
534   for (i=0, nsends=0; i<size; i++) {
535     if (nlengths[i]) {
536       sizes[i] = 1; nsends++;
537     }
538   }
539 
540   {PetscMPIInt *onodes,*olengths;
541    /* Determine the number of messages to expect, their lengths, from from-ids */
542    ierr = PetscGatherNumberOfMessages(comm,sizes,nlengths,&nreceives);CHKERRQ(ierr);
543    ierr = PetscGatherMessageLengths(comm,nsends,nreceives,nlengths,&onodes,&olengths);CHKERRQ(ierr);
544    /* since clubbing row,col - lengths are multiplied by 2 */
545    for (i=0; i<nreceives; i++) olengths[i] *=2;
546    ierr = PetscPostIrecvInt(comm,tag1,nreceives,onodes,olengths,&rindices,&recv_waits1);CHKERRQ(ierr);
547    /* values are size 'bs2' lengths (and remove earlier factor 2 */
548    for (i=0; i<nreceives; i++) olengths[i] = olengths[i]*bs2/2;
549    ierr = PetscPostIrecvScalar(comm,tag2,nreceives,onodes,olengths,&rvalues,&recv_waits2);CHKERRQ(ierr);
550    ierr = PetscFree(onodes);CHKERRQ(ierr);
551    ierr = PetscFree(olengths);CHKERRQ(ierr);}
552 
553   /* do sends:
554       1) starts[i] gives the starting index in svalues for stuff going to
555          the ith processor
556   */
557   ierr = PetscMalloc2(bs2*stash->n,&svalues,2*(stash->n+1),&sindices);CHKERRQ(ierr);
558   ierr = PetscMalloc1(2*nsends,&send_waits);CHKERRQ(ierr);
559   ierr = PetscMalloc2(size,&startv,size,&starti);CHKERRQ(ierr);
560   /* use 2 sends the first with all_a, the next with all_i and all_j */
561   startv[0] = 0; starti[0] = 0;
562   for (i=1; i<size; i++) {
563     startv[i] = startv[i-1] + nlengths[i-1];
564     starti[i] = starti[i-1] + 2*nlengths[i-1];
565   }
566 
567   i     = 0;
568   space = stash->space_head;
569   while (space != NULL) {
570     space_next = space->next;
571     sp_idx     = space->idx;
572     sp_idy     = space->idy;
573     sp_val     = space->val;
574     for (l=0; l<space->local_used; l++) {
575       j = owner[i];
576       if (bs2 == 1) {
577         svalues[startv[j]] = sp_val[l];
578       } else {
579         PetscInt    k;
580         PetscScalar *buf1,*buf2;
581         buf1 = svalues+bs2*startv[j];
582         buf2 = space->val + bs2*l;
583         for (k=0; k<bs2; k++) buf1[k] = buf2[k];
584       }
585       sindices[starti[j]]             = sp_idx[l];
586       sindices[starti[j]+nlengths[j]] = sp_idy[l];
587       startv[j]++;
588       starti[j]++;
589       i++;
590     }
591     space = space_next;
592   }
593   startv[0] = 0;
594   for (i=1; i<size; i++) startv[i] = startv[i-1] + nlengths[i-1];
595 
596   for (i=0,count=0; i<size; i++) {
597     if (sizes[i]) {
598       ierr = MPI_Isend(sindices+2*startv[i],2*nlengths[i],MPIU_INT,i,tag1,comm,send_waits+count++);CHKERRQ(ierr);
599       ierr = MPI_Isend(svalues+bs2*startv[i],bs2*nlengths[i],MPIU_SCALAR,i,tag2,comm,send_waits+count++);CHKERRQ(ierr);
600     }
601   }
602 #if defined(PETSC_USE_INFO)
603   ierr = PetscInfo1(NULL,"No of messages: %d \n",nsends);CHKERRQ(ierr);
604   for (i=0; i<size; i++) {
605     if (sizes[i]) {
606       ierr = PetscInfo2(NULL,"Mesg_to: %d: size: %d bytes\n",i,nlengths[i]*(bs2*sizeof(PetscScalar)+2*sizeof(PetscInt)));CHKERRQ(ierr);
607     }
608   }
609 #endif
610   ierr = PetscFree(nlengths);CHKERRQ(ierr);
611   ierr = PetscFree(owner);CHKERRQ(ierr);
612   ierr = PetscFree2(startv,starti);CHKERRQ(ierr);
613   ierr = PetscFree(sizes);CHKERRQ(ierr);
614 
615   /* recv_waits need to be contiguous for MatStashScatterGetMesg_Private() */
616   ierr = PetscMalloc1(2*nreceives,&recv_waits);CHKERRQ(ierr);
617 
618   for (i=0; i<nreceives; i++) {
619     recv_waits[2*i]   = recv_waits1[i];
620     recv_waits[2*i+1] = recv_waits2[i];
621   }
622   stash->recv_waits = recv_waits;
623 
624   ierr = PetscFree(recv_waits1);CHKERRQ(ierr);
625   ierr = PetscFree(recv_waits2);CHKERRQ(ierr);
626 
627   stash->svalues         = svalues;
628   stash->sindices        = sindices;
629   stash->rvalues         = rvalues;
630   stash->rindices        = rindices;
631   stash->send_waits      = send_waits;
632   stash->nsends          = nsends;
633   stash->nrecvs          = nreceives;
634   stash->reproduce_count = 0;
635   PetscFunctionReturn(0);
636 }
637 
638 /*
639    MatStashScatterGetMesg_Private - This function waits on the receives posted
640    in the function MatStashScatterBegin_Private() and returns one message at
641    a time to the calling function. If no messages are left, it indicates this
642    by setting flg = 0, else it sets flg = 1.
643 
644    Input Parameters:
645    stash - the stash
646 
647    Output Parameters:
648    nvals - the number of entries in the current message.
649    rows  - an array of row indices (or blocked indices) corresponding to the values
650    cols  - an array of columnindices (or blocked indices) corresponding to the values
651    vals  - the values
652    flg   - 0 indicates no more message left, and the current call has no values associated.
653            1 indicates that the current call successfully received a message, and the
654              other output parameters nvals,rows,cols,vals are set appropriately.
655 */
656 #undef __FUNCT__
657 #define __FUNCT__ "MatStashScatterGetMesg_Private"
658 PetscErrorCode MatStashScatterGetMesg_Private(MatStash *stash,PetscMPIInt *nvals,PetscInt **rows,PetscInt **cols,PetscScalar **vals,PetscInt *flg)
659 {
660   PetscErrorCode ierr;
661 
662   PetscFunctionBegin;
663   ierr = (*stash->ScatterGetMesg)(stash,nvals,rows,cols,vals,flg);CHKERRQ(ierr);
664   PetscFunctionReturn(0);
665 }
666 
667 #undef __FUNCT__
668 #define __FUNCT__ "MatStashScatterGetMesg_Ref"
669 static PetscErrorCode MatStashScatterGetMesg_Ref(MatStash *stash,PetscMPIInt *nvals,PetscInt **rows,PetscInt **cols,PetscScalar **vals,PetscInt *flg)
670 {
671   PetscErrorCode ierr;
672   PetscMPIInt    i,*flg_v = stash->flg_v,i1,i2;
673   PetscInt       bs2;
674   MPI_Status     recv_status;
675   PetscBool      match_found = PETSC_FALSE;
676 
677   PetscFunctionBegin;
678   *flg = 0; /* When a message is discovered this is reset to 1 */
679   /* Return if no more messages to process */
680   if (stash->nprocessed == stash->nrecvs) PetscFunctionReturn(0);
681 
682   bs2 = stash->bs*stash->bs;
683   /* If a matching pair of receives are found, process them, and return the data to
684      the calling function. Until then keep receiving messages */
685   while (!match_found) {
686     if (stash->reproduce) {
687       i    = stash->reproduce_count++;
688       ierr = MPI_Wait(stash->recv_waits+i,&recv_status);CHKERRQ(ierr);
689     } else {
690       ierr = MPI_Waitany(2*stash->nrecvs,stash->recv_waits,&i,&recv_status);CHKERRQ(ierr);
691     }
692     if (recv_status.MPI_SOURCE < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Negative MPI source!");
693 
694     /* Now pack the received message into a structure which is usable by others */
695     if (i % 2) {
696       ierr = MPI_Get_count(&recv_status,MPIU_SCALAR,nvals);CHKERRQ(ierr);
697 
698       flg_v[2*recv_status.MPI_SOURCE] = i/2;
699 
700       *nvals = *nvals/bs2;
701     } else {
702       ierr = MPI_Get_count(&recv_status,MPIU_INT,nvals);CHKERRQ(ierr);
703 
704       flg_v[2*recv_status.MPI_SOURCE+1] = i/2;
705 
706       *nvals = *nvals/2; /* This message has both row indices and col indices */
707     }
708 
709     /* Check if we have both messages from this proc */
710     i1 = flg_v[2*recv_status.MPI_SOURCE];
711     i2 = flg_v[2*recv_status.MPI_SOURCE+1];
712     if (i1 != -1 && i2 != -1) {
713       *rows = stash->rindices[i2];
714       *cols = *rows + *nvals;
715       *vals = stash->rvalues[i1];
716       *flg  = 1;
717       stash->nprocessed++;
718       match_found = PETSC_TRUE;
719     }
720   }
721   PetscFunctionReturn(0);
722 }
723 
724 typedef struct {
725   PetscInt row;
726   PetscInt col;
727   PetscScalar vals[1];          /* Actually an array of length bs2 */
728 } MatStashBlock;
729 
730 #undef __FUNCT__
731 #define __FUNCT__ "MatStashSortCompress_Private"
732 static PetscErrorCode MatStashSortCompress_Private(MatStash *stash,InsertMode insertmode)
733 {
734   PetscErrorCode ierr;
735   PetscMatStashSpace space;
736   PetscInt n = stash->n,bs = stash->bs,bs2 = bs*bs,cnt,*row,*col,*perm,rowstart,i;
737   PetscScalar **valptr;
738 
739   PetscFunctionBegin;
740   ierr = PetscMalloc4(n,&row,n,&col,n,&valptr,n,&perm);CHKERRQ(ierr);
741   for (space=stash->space_head,cnt=0; space; space=space->next) {
742     for (i=0; i<space->local_used; i++) {
743       row[cnt] = space->idx[i];
744       col[cnt] = space->idy[i];
745       valptr[cnt] = &space->val[i*bs2];
746       perm[cnt] = cnt;          /* Will tell us where to find valptr after sorting row[] and col[] */
747       cnt++;
748     }
749   }
750   if (cnt != n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MatStash n %D, but counted %D entries",n,cnt);
751   ierr = PetscSortIntWithArrayPair(n,row,col,perm);CHKERRQ(ierr);
752   /* Scan through the rows, sorting each one, combining duplicates, and packing send buffers */
753   for (rowstart=0,cnt=0,i=1; i<=n; i++) {
754     if (i == n || row[i] != row[rowstart]) {         /* Sort the last row. */
755       PetscInt colstart;
756       ierr = PetscSortIntWithArray(i-rowstart,&col[rowstart],&perm[rowstart]);CHKERRQ(ierr);
757       for (colstart=rowstart; colstart<i; ) { /* Compress multiple insertions to the same location */
758         PetscInt j,l;
759         MatStashBlock *block;
760         ierr = PetscSegBufferGet(stash->segsendblocks,1,&block);CHKERRQ(ierr);
761         block->row = row[rowstart];
762         block->col = col[colstart];
763         ierr = PetscMemcpy(block->vals,valptr[perm[colstart]],bs2*sizeof(block->vals[0]));CHKERRQ(ierr);
764         for (j=colstart+1; j<i && col[j] == col[colstart]; j++) { /* Add any extra stashed blocks at the same (row,col) */
765           if (insertmode == ADD_VALUES) {
766             for (l=0; l<bs2; l++) block->vals[l] += valptr[perm[j]][l];
767           } else {
768             ierr = PetscMemcpy(block->vals,valptr[perm[j]],bs2*sizeof(block->vals[0]));CHKERRQ(ierr);
769           }
770         }
771         colstart = j;
772       }
773       rowstart = i;
774     }
775   }
776   ierr = PetscFree4(row,col,valptr,perm);CHKERRQ(ierr);
777   PetscFunctionReturn(0);
778 }
779 
780 #undef __FUNCT__
781 #define __FUNCT__ "MatStashBlockTypeSetUp"
782 static PetscErrorCode MatStashBlockTypeSetUp(MatStash *stash)
783 {
784   PetscErrorCode ierr;
785 
786   PetscFunctionBegin;
787   if (stash->blocktype == MPI_DATATYPE_NULL) {
788     PetscInt     bs2 = PetscSqr(stash->bs);
789     PetscMPIInt  blocklens[2];
790     MPI_Aint     displs[2];
791     MPI_Datatype types[2],stype;
792     /* C++ std::complex is not my favorite datatype.  Since it is not POD, we cannot use offsetof to find the offset of
793      * vals.  But the layout is actually guaranteed by the standard, so we do a little dance here with struct
794      * DummyBlock, substituting PetscReal for PetscComplex so that we can determine the offset.
795      */
796     struct DummyBlock {PetscInt row,col; PetscReal vals;};
797 
798     stash->blocktype_size = offsetof(struct DummyBlock,vals) + bs2*sizeof(PetscScalar);
799     if (stash->blocktype_size % sizeof(PetscInt)) { /* Implies that PetscInt is larger and does not satisfy alignment without padding */
800       stash->blocktype_size += sizeof(PetscInt) - stash->blocktype_size % sizeof(PetscInt);
801     }
802     ierr = PetscSegBufferCreate(stash->blocktype_size,1,&stash->segsendblocks);CHKERRQ(ierr);
803     ierr = PetscSegBufferCreate(stash->blocktype_size,1,&stash->segrecvblocks);CHKERRQ(ierr);
804     ierr = PetscSegBufferCreate(sizeof(MatStashFrame),1,&stash->segrecvframe);CHKERRQ(ierr);
805     blocklens[0] = 2;
806     blocklens[1] = bs2;
807     displs[0] = offsetof(struct DummyBlock,row);
808     displs[1] = offsetof(struct DummyBlock,vals);
809     types[0] = MPIU_INT;
810     types[1] = MPIU_SCALAR;
811     ierr = MPI_Type_create_struct(2,blocklens,displs,types,&stype);CHKERRQ(ierr);
812     ierr = MPI_Type_commit(&stype);CHKERRQ(ierr);
813     ierr = MPI_Type_create_resized(stype,0,stash->blocktype_size,&stash->blocktype);CHKERRQ(ierr); /* MPI-2 */
814     ierr = MPI_Type_commit(&stash->blocktype);CHKERRQ(ierr);
815     ierr = MPI_Type_free(&stype);CHKERRQ(ierr);
816   }
817   PetscFunctionReturn(0);
818 }
819 
820 #undef __FUNCT__
821 #define __FUNCT__ "MatStashBTSSend_Private"
822 /* Callback invoked after target rank has initiatied receive of rendezvous message.
823  * Here we post the main sends.
824  */
825 static PetscErrorCode MatStashBTSSend_Private(MPI_Comm comm,const PetscMPIInt tag[],PetscMPIInt rankid,PetscMPIInt rank,void *sdata,MPI_Request req[],void *ctx)
826 {
827   MatStash *stash = (MatStash*)ctx;
828   MatStashHeader *hdr = (MatStashHeader*)sdata;
829   PetscErrorCode ierr;
830 
831   PetscFunctionBegin;
832   if (rank != stash->sendranks[rankid]) SETERRQ3(comm,PETSC_ERR_PLIB,"BTS Send rank %d does not match sendranks[%d] %d",rank,rankid,stash->sendranks[rankid]);
833   ierr = MPI_Isend(stash->sendframes[rankid].buffer,hdr->count,stash->blocktype,rank,tag[0],comm,&req[0]);CHKERRQ(ierr);
834   stash->sendframes[rankid].count = hdr->count;
835   stash->sendframes[rankid].pending = 1;
836   PetscFunctionReturn(0);
837 }
838 
839 #undef __FUNCT__
840 #define __FUNCT__ "MatStashBTSRecv_Private"
841 /* Callback invoked by target after receiving rendezvous message.
842  * Here we post the main recvs.
843  */
844 static PetscErrorCode MatStashBTSRecv_Private(MPI_Comm comm,const PetscMPIInt tag[],PetscMPIInt rank,void *rdata,MPI_Request req[],void *ctx)
845 {
846   MatStash *stash = (MatStash*)ctx;
847   MatStashHeader *hdr = (MatStashHeader*)rdata;
848   MatStashFrame *frame;
849   PetscErrorCode ierr;
850 
851   PetscFunctionBegin;
852   ierr = PetscSegBufferGet(stash->segrecvframe,1,&frame);CHKERRQ(ierr);
853   ierr = PetscSegBufferGet(stash->segrecvblocks,hdr->count,&frame->buffer);CHKERRQ(ierr);
854   ierr = MPI_Irecv(frame->buffer,hdr->count,stash->blocktype,rank,tag[0],comm,&req[0]);CHKERRQ(ierr);
855   frame->count = hdr->count;
856   frame->pending = 1;
857   PetscFunctionReturn(0);
858 }
859 
860 #undef __FUNCT__
861 #define __FUNCT__ "MatStashScatterBegin_BTS"
862 /*
863  * owners[] contains the ownership ranges; may be indexed by either blocks or scalars
864  */
865 static PetscErrorCode MatStashScatterBegin_BTS(Mat mat,MatStash *stash,PetscInt owners[])
866 {
867   PetscErrorCode ierr;
868   size_t nblocks;
869   char *sendblocks;
870 
871   PetscFunctionBegin;
872 #if defined(PETSC_USE_DEBUG)
873   {                             /* make sure all processors are either in INSERTMODE or ADDMODE */
874     InsertMode addv;
875     ierr = MPI_Allreduce((PetscEnum*)&mat->insertmode,(PetscEnum*)&addv,1,MPIU_ENUM,MPI_BOR,PetscObjectComm((PetscObject)mat));CHKERRQ(ierr);
876     if (addv == (ADD_VALUES|INSERT_VALUES)) SETERRQ(PetscObjectComm((PetscObject)mat),PETSC_ERR_ARG_WRONGSTATE,"Some processors inserted others added");
877   }
878 #endif
879 
880   if (stash->subset_off_proc && !mat->subsetoffprocentries) { /* We won't use the old scatter context. */
881     ierr = MatStashScatterDestroy_BTS(stash);CHKERRQ(ierr);
882   }
883 
884   ierr = MatStashBlockTypeSetUp(stash);CHKERRQ(ierr);
885   ierr = MatStashSortCompress_Private(stash,mat->insertmode);CHKERRQ(ierr);
886   ierr = PetscSegBufferGetSize(stash->segsendblocks,&nblocks);CHKERRQ(ierr);
887   ierr = PetscSegBufferExtractInPlace(stash->segsendblocks,&sendblocks);CHKERRQ(ierr);
888   if (stash->subset_off_proc && mat->subsetoffprocentries) { /* Set up sendhdrs and sendframes for each rank that we sent before */
889     PetscInt i;
890     size_t b;
891     for (i=0,b=0; i<stash->nsendranks; i++) {
892       stash->sendframes[i].buffer = &sendblocks[b*stash->blocktype_size];
893       /* sendhdr is never actually sent, but the count is used by MatStashBTSSend_Private */
894       stash->sendhdr[i].count = 0; /* Might remain empty (in which case we send a zero-sized message) if no values are communicated to that process */
895       for ( ; b<nblocks; b++) {
896         MatStashBlock *sendblock_b = (MatStashBlock*)&sendblocks[b*stash->blocktype_size];
897         if (PetscUnlikely(sendblock_b->row < owners[stash->sendranks[i]])) SETERRQ2(stash->comm,PETSC_ERR_ARG_WRONG,"MAT_SUBSET_OFF_PROC_ENTRIES set, but row %D owned by %d not communicated in initial assembly",sendblock_b->row,stash->sendranks[i]);
898         if (sendblock_b->row >= owners[stash->sendranks[i]+1]) break;
899         stash->sendhdr[i].count++;
900       }
901     }
902   } else {                      /* Dynamically count and pack (first time) */
903     PetscInt sendno;
904     size_t i,rowstart;
905 
906     /* Count number of send ranks and allocate for sends */
907     stash->nsendranks = 0;
908     for (rowstart=0; rowstart<nblocks; ) {
909       PetscInt owner;
910       MatStashBlock *sendblock_rowstart = (MatStashBlock*)&sendblocks[rowstart*stash->blocktype_size];
911       ierr = PetscFindInt(sendblock_rowstart->row,stash->size+1,owners,&owner);CHKERRQ(ierr);
912       if (owner < 0) owner = -(owner+2);
913       for (i=rowstart+1; i<nblocks; i++) { /* Move forward through a run of blocks with the same owner */
914         MatStashBlock *sendblock_i = (MatStashBlock*)&sendblocks[i*stash->blocktype_size];
915         if (sendblock_i->row >= owners[owner+1]) break;
916       }
917       stash->nsendranks++;
918       rowstart = i;
919     }
920     ierr = PetscMalloc3(stash->nsendranks,&stash->sendranks,stash->nsendranks,&stash->sendhdr,stash->nsendranks,&stash->sendframes);CHKERRQ(ierr);
921 
922     /* Set up sendhdrs and sendframes */
923     sendno = 0;
924     for (rowstart=0; rowstart<nblocks; ) {
925       PetscInt owner;
926       MatStashBlock *sendblock_rowstart = (MatStashBlock*)&sendblocks[rowstart*stash->blocktype_size];
927       ierr = PetscFindInt(sendblock_rowstart->row,stash->size+1,owners,&owner);CHKERRQ(ierr);
928       if (owner < 0) owner = -(owner+2);
929       stash->sendranks[sendno] = owner;
930       for (i=rowstart+1; i<nblocks; i++) { /* Move forward through a run of blocks with the same owner */
931         MatStashBlock *sendblock_i = (MatStashBlock*)&sendblocks[i*stash->blocktype_size];
932         if (sendblock_i->row >= owners[owner+1]) break;
933       }
934       stash->sendframes[sendno].buffer = sendblock_rowstart;
935       stash->sendframes[sendno].pending = 0;
936       stash->sendhdr[sendno].count = i - rowstart;
937       sendno++;
938       rowstart = i;
939     }
940     if (sendno != stash->nsendranks) SETERRQ2(stash->comm,PETSC_ERR_PLIB,"BTS counted %D sendranks, but %D sends",stash->nsendranks,sendno);
941   }
942 
943   /* Encode insertmode on the outgoing messages. If we want to support more than two options, we would need a new
944    * message or a dummy entry of some sort. */
945   if (mat->insertmode == INSERT_VALUES) {
946     size_t i;
947     for (i=0; i<nblocks; i++) {
948       MatStashBlock *sendblock_i = (MatStashBlock*)&sendblocks[i*stash->blocktype_size];
949       sendblock_i->row = -(sendblock_i->row+1);
950     }
951   }
952 
953   if (stash->subset_off_proc && mat->subsetoffprocentries) {
954     PetscMPIInt i,tag;
955     ierr = PetscCommGetNewTag(stash->comm,&tag);CHKERRQ(ierr);
956     for (i=0; i<stash->nrecvranks; i++) {
957       ierr = MatStashBTSRecv_Private(stash->comm,&tag,stash->recvranks[i],&stash->recvhdr[i],&stash->recvreqs[i],stash);CHKERRQ(ierr);
958     }
959     for (i=0; i<stash->nsendranks; i++) {
960       ierr = MatStashBTSSend_Private(stash->comm,&tag,i,stash->sendranks[i],&stash->sendhdr[i],&stash->sendreqs[i],stash);CHKERRQ(ierr);
961     }
962     stash->use_status = PETSC_TRUE; /* Use count from message status. */
963   } else {
964     ierr = PetscCommBuildTwoSidedFReq(stash->comm,1,MPIU_INT,stash->nsendranks,stash->sendranks,(PetscInt*)stash->sendhdr,
965                                       &stash->nrecvranks,&stash->recvranks,(PetscInt*)&stash->recvhdr,1,&stash->sendreqs,&stash->recvreqs,
966                                       MatStashBTSSend_Private,MatStashBTSRecv_Private,stash);CHKERRQ(ierr);
967     ierr = PetscMalloc2(stash->nrecvranks,&stash->some_indices,stash->nrecvranks,&stash->some_statuses);CHKERRQ(ierr);
968     stash->use_status = PETSC_FALSE; /* Use count from header instead of from message. */
969   }
970 
971   ierr = PetscSegBufferExtractInPlace(stash->segrecvframe,&stash->recvframes);CHKERRQ(ierr);
972   stash->recvframe_active = NULL;
973   stash->recvframe_i      = 0;
974   stash->some_i           = 0;
975   stash->some_count       = 0;
976   stash->recvcount        = 0;
977   stash->subset_off_proc  = mat->subsetoffprocentries;
978   stash->insertmode       = &mat->insertmode;
979   PetscFunctionReturn(0);
980 }
981 
982 #undef __FUNCT__
983 #define __FUNCT__ "MatStashScatterGetMesg_BTS"
984 static PetscErrorCode MatStashScatterGetMesg_BTS(MatStash *stash,PetscMPIInt *n,PetscInt **row,PetscInt **col,PetscScalar **val,PetscInt *flg)
985 {
986   PetscErrorCode ierr;
987   MatStashBlock *block;
988 
989   PetscFunctionBegin;
990   *flg = 0;
991   while (!stash->recvframe_active || stash->recvframe_i == stash->recvframe_count) {
992     if (stash->some_i == stash->some_count) {
993       if (stash->recvcount == stash->nrecvranks) PetscFunctionReturn(0); /* Done */
994       ierr = MPI_Waitsome(stash->nrecvranks,stash->recvreqs,&stash->some_count,stash->some_indices,stash->use_status?stash->some_statuses:MPI_STATUSES_IGNORE);CHKERRQ(ierr);
995       stash->some_i = 0;
996     }
997     stash->recvframe_active = &stash->recvframes[stash->some_indices[stash->some_i]];
998     stash->recvframe_count = stash->recvframe_active->count; /* From header; maximum count */
999     if (stash->use_status) { /* Count what was actually sent */
1000       ierr = MPI_Get_count(&stash->some_statuses[stash->some_i],stash->blocktype,&stash->recvframe_count);CHKERRQ(ierr);
1001     }
1002     if (stash->recvframe_count > 0) { /* Check for InsertMode consistency */
1003       block = (MatStashBlock*)&((char*)stash->recvframe_active->buffer)[0];
1004       if (PetscUnlikely(*stash->insertmode == NOT_SET_VALUES)) *stash->insertmode = block->row < 0 ? INSERT_VALUES : ADD_VALUES;
1005       if (PetscUnlikely(*stash->insertmode == INSERT_VALUES && block->row >= 0)) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Assembling INSERT_VALUES, but rank %d requested ADD_VALUES",stash->recvranks[stash->some_indices[stash->some_i]]);
1006       if (PetscUnlikely(*stash->insertmode == ADD_VALUES && block->row < 0)) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Assembling ADD_VALUES, but rank %d requested INSERT_VALUES",stash->recvranks[stash->some_indices[stash->some_i]]);
1007     }
1008     stash->some_i++;
1009     stash->recvcount++;
1010     stash->recvframe_i = 0;
1011   }
1012   *n = 1;
1013   block = (MatStashBlock*)&((char*)stash->recvframe_active->buffer)[stash->recvframe_i*stash->blocktype_size];
1014   if (block->row < 0) block->row = -(block->row + 1);
1015   *row = &block->row;
1016   *col = &block->col;
1017   *val = block->vals;
1018   stash->recvframe_i++;
1019   *flg = 1;
1020   PetscFunctionReturn(0);
1021 }
1022 
1023 #undef __FUNCT__
1024 #define __FUNCT__ "MatStashScatterEnd_BTS"
1025 static PetscErrorCode MatStashScatterEnd_BTS(MatStash *stash)
1026 {
1027   PetscErrorCode ierr;
1028 
1029   PetscFunctionBegin;
1030   ierr = MPI_Waitall(stash->nsendranks,stash->sendreqs,MPI_STATUSES_IGNORE);CHKERRQ(ierr);
1031   if (stash->subset_off_proc) { /* Reuse the communication contexts, so consolidate and reset segrecvblocks  */
1032     void *dummy;
1033     ierr = PetscSegBufferExtractInPlace(stash->segrecvblocks,&dummy);CHKERRQ(ierr);
1034   } else {                      /* No reuse, so collect everything. */
1035     ierr = MatStashScatterDestroy_BTS(stash);CHKERRQ(ierr);
1036   }
1037 
1038   /* Now update nmaxold to be app 10% more than max n used, this way the
1039      wastage of space is reduced the next time this stash is used.
1040      Also update the oldmax, only if it increases */
1041   if (stash->n) {
1042     PetscInt bs2     = stash->bs*stash->bs;
1043     PetscInt oldnmax = ((int)(stash->n * 1.1) + 5)*bs2;
1044     if (oldnmax > stash->oldnmax) stash->oldnmax = oldnmax;
1045   }
1046 
1047   stash->nmax       = 0;
1048   stash->n          = 0;
1049   stash->reallocs   = -1;
1050   stash->nprocessed = 0;
1051 
1052   ierr = PetscMatStashSpaceDestroy(&stash->space_head);CHKERRQ(ierr);
1053 
1054   stash->space = 0;
1055 
1056   PetscFunctionReturn(0);
1057 }
1058 
1059 #undef __FUNCT__
1060 #define __FUNCT__ "MatStashScatterDestroy_BTS"
1061 static PetscErrorCode MatStashScatterDestroy_BTS(MatStash *stash)
1062 {
1063   PetscErrorCode ierr;
1064 
1065   PetscFunctionBegin;
1066   ierr = PetscSegBufferDestroy(&stash->segsendblocks);CHKERRQ(ierr);
1067   ierr = PetscSegBufferDestroy(&stash->segrecvframe);CHKERRQ(ierr);
1068   stash->recvframes = NULL;
1069   ierr = PetscSegBufferDestroy(&stash->segrecvblocks);CHKERRQ(ierr);
1070   if (stash->blocktype != MPI_DATATYPE_NULL) {
1071     ierr = MPI_Type_free(&stash->blocktype);CHKERRQ(ierr);
1072   }
1073   stash->nsendranks = 0;
1074   stash->nrecvranks = 0;
1075   ierr = PetscFree3(stash->sendranks,stash->sendhdr,stash->sendframes);CHKERRQ(ierr);
1076   ierr = PetscFree(stash->sendreqs);CHKERRQ(ierr);
1077   ierr = PetscFree(stash->recvreqs);CHKERRQ(ierr);
1078   ierr = PetscFree(stash->recvranks);CHKERRQ(ierr);
1079   ierr = PetscFree(stash->recvhdr);CHKERRQ(ierr);
1080   ierr = PetscFree2(stash->some_indices,stash->some_statuses);CHKERRQ(ierr);
1081   PetscFunctionReturn(0);
1082 }
1083