xref: /petsc/src/mat/utils/matstash.c (revision ffa8c5705e8ab2cf85ee1d14dbe507a6e2eb5283)
1 
2 #include <petsc/private/matimpl.h>
3 
4 #define DEFAULT_STASH_SIZE   10000
5 
6 static PetscErrorCode MatStashScatterBegin_Ref(Mat,MatStash*,PetscInt*);
7 PETSC_INTERN PetscErrorCode MatStashScatterGetMesg_Ref(MatStash*,PetscMPIInt*,PetscInt**,PetscInt**,PetscScalar**,PetscInt*);
8 PETSC_INTERN PetscErrorCode MatStashScatterEnd_Ref(MatStash*);
9 #if !defined(PETSC_HAVE_MPIUNI)
10 static PetscErrorCode MatStashScatterBegin_BTS(Mat,MatStash*,PetscInt*);
11 static PetscErrorCode MatStashScatterGetMesg_BTS(MatStash*,PetscMPIInt*,PetscInt**,PetscInt**,PetscScalar**,PetscInt*);
12 static PetscErrorCode MatStashScatterEnd_BTS(MatStash*);
13 #endif
14 
15 /*
16   MatStashCreate_Private - Creates a stash,currently used for all the parallel
17   matrix implementations. The stash is where elements of a matrix destined
18   to be stored on other processors are kept until matrix assembly is done.
19 
20   This is a simple minded stash. Simply adds entries to end of stash.
21 
22   Input Parameters:
23   comm - communicator, required for scatters.
24   bs   - stash block size. used when stashing blocks of values
25 
26   Output Parameters:
27   stash    - the newly created stash
28 */
PetscErrorCode MatStashCreate_Private(MPI_Comm comm,PetscInt bs,MatStash *stash)
{
  PetscInt       max,*opt,nopt,i;
  PetscBool      flg;

  PetscFunctionBegin;
  /* Require 2 tags,get the second using PetscCommGetNewTag() */
  stash->comm = comm;

  PetscCall(PetscCommGetNewTag(stash->comm,&stash->tag1));
  PetscCall(PetscCommGetNewTag(stash->comm,&stash->tag2));
  PetscCallMPI(MPI_Comm_size(stash->comm,&stash->size));
  PetscCallMPI(MPI_Comm_rank(stash->comm,&stash->rank));
  /* flg_v records, per source rank, which of the two receives (values, indices) have
     arrived; -1 means "not yet received" (used by MatStashScatterGetMesg_Ref()) */
  PetscCall(PetscMalloc1(2*stash->size,&stash->flg_v));
  for (i=0; i<2*stash->size; i++) stash->flg_v[i] = -1;

  /* -matstash_initial_size accepts a single global value, one value per rank,
     or a partial list (ranks beyond the list fall back to the default) */
  nopt = stash->size;
  PetscCall(PetscMalloc1(nopt,&opt));
  PetscCall(PetscOptionsGetIntArray(NULL,NULL,"-matstash_initial_size",opt,&nopt,&flg));
  if (flg) {
    if (nopt == 1)                max = opt[0];
    else if (nopt == stash->size) max = opt[stash->rank];
    else if (stash->rank < nopt)  max = opt[stash->rank];
    else                          max = 0; /* Use default */
    stash->umax = max;
  } else {
    stash->umax = 0;
  }
  PetscCall(PetscFree(opt));
  /* bs <= 0 means "no blocking"; treat as block size 1 */
  if (bs <= 0) bs = 1;

  stash->bs         = bs;
  stash->nmax       = 0;
  stash->oldnmax    = 0;
  stash->n          = 0;
  stash->reallocs   = -1;
  stash->space_head = NULL;
  stash->space      = NULL;

  stash->send_waits  = NULL;
  stash->recv_waits  = NULL;
  stash->send_status = NULL;
  stash->nsends      = 0;
  stash->nrecvs      = 0;
  stash->svalues     = NULL;
  stash->rvalues     = NULL;
  stash->rindices    = NULL;
  stash->nprocessed  = 0;
  stash->reproduce   = PETSC_FALSE;
  stash->blocktype   = MPI_DATATYPE_NULL;

  PetscCall(PetscOptionsGetBool(NULL,NULL,"-matstash_reproduce",&stash->reproduce,NULL));
#if !defined(PETSC_HAVE_MPIUNI)
  /* Default to the BTS implementation; -matstash_legacy selects the Ref (two-message) one */
  flg  = PETSC_FALSE;
  PetscCall(PetscOptionsGetBool(NULL,NULL,"-matstash_legacy",&flg,NULL));
  if (!flg) {
    stash->ScatterBegin   = MatStashScatterBegin_BTS;
    stash->ScatterGetMesg = MatStashScatterGetMesg_BTS;
    stash->ScatterEnd     = MatStashScatterEnd_BTS;
    stash->ScatterDestroy = MatStashScatterDestroy_BTS;
  } else {
#endif
    stash->ScatterBegin   = MatStashScatterBegin_Ref;
    stash->ScatterGetMesg = MatStashScatterGetMesg_Ref;
    stash->ScatterEnd     = MatStashScatterEnd_Ref;
    stash->ScatterDestroy = NULL; /* the Ref path has no per-implementation state to destroy */
#if !defined(PETSC_HAVE_MPIUNI)
  }
#endif
  PetscFunctionReturn(0);
}
100 
101 /*
102    MatStashDestroy_Private - Destroy the stash
103 */
104 PetscErrorCode MatStashDestroy_Private(MatStash *stash)
105 {
106   PetscFunctionBegin;
107   PetscCall(PetscMatStashSpaceDestroy(&stash->space_head));
108   if (stash->ScatterDestroy) PetscCall((*stash->ScatterDestroy)(stash));
109 
110   stash->space = NULL;
111 
112   PetscCall(PetscFree(stash->flg_v));
113   PetscFunctionReturn(0);
114 }
115 
116 /*
117    MatStashScatterEnd_Private - This is called as the final stage of
118    scatter. The final stages of message passing is done here, and
119    all the memory used for message passing is cleaned up. This
120    routine also resets the stash, and deallocates the memory used
121    for the stash. It also keeps track of the current memory usage
122    so that the same value can be used the next time through.
123 */
PetscErrorCode MatStashScatterEnd_Private(MatStash *stash)
{
  PetscFunctionBegin;
  /* Dispatch to the active implementation (BTS or Ref), chosen in MatStashCreate_Private() */
  PetscCall((*stash->ScatterEnd)(stash));
  PetscFunctionReturn(0);
}
130 
131 PETSC_INTERN PetscErrorCode MatStashScatterEnd_Ref(MatStash *stash)
132 {
133   PetscInt       nsends=stash->nsends,bs2,oldnmax,i;
134   MPI_Status     *send_status;
135 
136   PetscFunctionBegin;
137   for (i=0; i<2*stash->size; i++) stash->flg_v[i] = -1;
138   /* wait on sends */
139   if (nsends) {
140     PetscCall(PetscMalloc1(2*nsends,&send_status));
141     PetscCallMPI(MPI_Waitall(2*nsends,stash->send_waits,send_status));
142     PetscCall(PetscFree(send_status));
143   }
144 
145   /* Now update nmaxold to be app 10% more than max n used, this way the
146      wastage of space is reduced the next time this stash is used.
147      Also update the oldmax, only if it increases */
148   if (stash->n) {
149     bs2     = stash->bs*stash->bs;
150     oldnmax = ((int)(stash->n * 1.1) + 5)*bs2;
151     if (oldnmax > stash->oldnmax) stash->oldnmax = oldnmax;
152   }
153 
154   stash->nmax       = 0;
155   stash->n          = 0;
156   stash->reallocs   = -1;
157   stash->nprocessed = 0;
158 
159   PetscCall(PetscMatStashSpaceDestroy(&stash->space_head));
160 
161   stash->space = NULL;
162 
163   PetscCall(PetscFree(stash->send_waits));
164   PetscCall(PetscFree(stash->recv_waits));
165   PetscCall(PetscFree2(stash->svalues,stash->sindices));
166   PetscCall(PetscFree(stash->rvalues[0]));
167   PetscCall(PetscFree(stash->rvalues));
168   PetscCall(PetscFree(stash->rindices[0]));
169   PetscCall(PetscFree(stash->rindices));
170   PetscFunctionReturn(0);
171 }
172 
173 /*
174    MatStashGetInfo_Private - Gets the relavant statistics of the stash
175 
176    Input Parameters:
177    stash    - the stash
178    nstash   - the size of the stash. Indicates the number of values stored.
179    reallocs - the number of additional mallocs incurred.
180 
181 */
182 PetscErrorCode MatStashGetInfo_Private(MatStash *stash,PetscInt *nstash,PetscInt *reallocs)
183 {
184   PetscInt bs2 = stash->bs*stash->bs;
185 
186   PetscFunctionBegin;
187   if (nstash) *nstash = stash->n*bs2;
188   if (reallocs) {
189     if (stash->reallocs < 0) *reallocs = 0;
190     else                     *reallocs = stash->reallocs;
191   }
192   PetscFunctionReturn(0);
193 }
194 
195 /*
196    MatStashSetInitialSize_Private - Sets the initial size of the stash
197 
198    Input Parameters:
199    stash  - the stash
200    max    - the value that is used as the max size of the stash.
201             this value is used while allocating memory.
202 */
PetscErrorCode MatStashSetInitialSize_Private(MatStash *stash,PetscInt max)
{
  PetscFunctionBegin;
  /* umax is consulted by MatStashExpand_Private() when sizing a fresh stash */
  stash->umax = max;
  PetscFunctionReturn(0);
}
209 
210 /* MatStashExpand_Private - Expand the stash. This function is called
211    when the space in the stash is not sufficient to add the new values
212    being inserted into the stash.
213 
214    Input Parameters:
215    stash - the stash
216    incr  - the minimum increase requested
217 
218    Notes:
219    This routine doubles the currently used memory.
220  */
static PetscErrorCode MatStashExpand_Private(MatStash *stash,PetscInt incr)
{
  PetscInt       newnmax,bs2= stash->bs*stash->bs;

  PetscFunctionBegin;
  /* allocate a larger stash */
  if (!stash->oldnmax && !stash->nmax) { /* new stash: use user-set size or the compiled-in default */
    if (stash->umax)                  newnmax = stash->umax/bs2;
    else                              newnmax = DEFAULT_STASH_SIZE/bs2;
  } else if (!stash->nmax) { /* reusing stash: start from the high-water mark of the previous use */
    if (stash->umax > stash->oldnmax) newnmax = stash->umax/bs2;
    else                              newnmax = stash->oldnmax/bs2;
  } else                              newnmax = stash->nmax*2; /* grow geometrically */
  /* make sure at least the requested increment fits */
  if (newnmax  < (stash->nmax + incr)) newnmax += 2*incr;

  /* Get a MatStashSpace and attach it to stash */
  PetscCall(PetscMatStashSpaceGet(bs2,newnmax,&stash->space));
  if (!stash->space_head) { /* new stash or reusing stash->oldnmax */
    stash->space_head = stash->space;
  }

  stash->reallocs++;
  stash->nmax = newnmax;
  PetscFunctionReturn(0);
}
246 /*
247   MatStashValuesRow_Private - inserts values into the stash. This function
248   expects the values to be roworiented. Multiple columns belong to the same row
249   can be inserted with a single call to this function.
250 
251   Input Parameters:
252   stash  - the stash
253   row    - the global row correspoiding to the values
254   n      - the number of elements inserted. All elements belong to the above row.
255   idxn   - the global column indices corresponding to each of the values.
256   values - the values inserted
257 */
258 PetscErrorCode MatStashValuesRow_Private(MatStash *stash,PetscInt row,PetscInt n,const PetscInt idxn[],const PetscScalar values[],PetscBool ignorezeroentries)
259 {
260   PetscInt           i,k,cnt = 0;
261   PetscMatStashSpace space=stash->space;
262 
263   PetscFunctionBegin;
264   /* Check and see if we have sufficient memory */
265   if (!space || space->local_remaining < n) {
266     PetscCall(MatStashExpand_Private(stash,n));
267   }
268   space = stash->space;
269   k     = space->local_used;
270   for (i=0; i<n; i++) {
271     if (ignorezeroentries && values && values[i] == 0.0) continue;
272     space->idx[k] = row;
273     space->idy[k] = idxn[i];
274     space->val[k] = values ? values[i] : 0.0;
275     k++;
276     cnt++;
277   }
278   stash->n               += cnt;
279   space->local_used      += cnt;
280   space->local_remaining -= cnt;
281   PetscFunctionReturn(0);
282 }
283 
284 /*
285   MatStashValuesCol_Private - inserts values into the stash. This function
286   expects the values to be columnoriented. Multiple columns belong to the same row
287   can be inserted with a single call to this function.
288 
289   Input Parameters:
290   stash   - the stash
291   row     - the global row correspoiding to the values
292   n       - the number of elements inserted. All elements belong to the above row.
293   idxn    - the global column indices corresponding to each of the values.
294   values  - the values inserted
295   stepval - the consecutive values are sepated by a distance of stepval.
296             this happens because the input is columnoriented.
297 */
298 PetscErrorCode MatStashValuesCol_Private(MatStash *stash,PetscInt row,PetscInt n,const PetscInt idxn[],const PetscScalar values[],PetscInt stepval,PetscBool ignorezeroentries)
299 {
300   PetscInt           i,k,cnt = 0;
301   PetscMatStashSpace space=stash->space;
302 
303   PetscFunctionBegin;
304   /* Check and see if we have sufficient memory */
305   if (!space || space->local_remaining < n) {
306     PetscCall(MatStashExpand_Private(stash,n));
307   }
308   space = stash->space;
309   k     = space->local_used;
310   for (i=0; i<n; i++) {
311     if (ignorezeroentries && values && values[i*stepval] == 0.0) continue;
312     space->idx[k] = row;
313     space->idy[k] = idxn[i];
314     space->val[k] = values ? values[i*stepval] : 0.0;
315     k++;
316     cnt++;
317   }
318   stash->n               += cnt;
319   space->local_used      += cnt;
320   space->local_remaining -= cnt;
321   PetscFunctionReturn(0);
322 }
323 
324 /*
325   MatStashValuesRowBlocked_Private - inserts blocks of values into the stash.
326   This function expects the values to be roworiented. Multiple columns belong
327   to the same block-row can be inserted with a single call to this function.
328   This function extracts the sub-block of values based on the dimensions of
329   the original input block, and the row,col values corresponding to the blocks.
330 
331   Input Parameters:
332   stash  - the stash
333   row    - the global block-row correspoiding to the values
334   n      - the number of elements inserted. All elements belong to the above row.
335   idxn   - the global block-column indices corresponding to each of the blocks of
336            values. Each block is of size bs*bs.
337   values - the values inserted
338   rmax   - the number of block-rows in the original block.
339   cmax   - the number of block-columns on the original block.
340   idx    - the index of the current block-row in the original block.
341 */
PetscErrorCode MatStashValuesRowBlocked_Private(MatStash *stash,PetscInt row,PetscInt n,const PetscInt idxn[],const PetscScalar values[],PetscInt rmax,PetscInt cmax,PetscInt idx)
{
  PetscInt           i,j,k,bs2,bs=stash->bs,l;
  const PetscScalar  *vals;
  PetscScalar        *array;
  PetscMatStashSpace space=stash->space;

  PetscFunctionBegin;
  /* Grow the stash if the current segment cannot hold n more blocks */
  if (!space || space->local_remaining < n) {
    PetscCall(MatStashExpand_Private(stash,n));
  }
  space = stash->space;
  l     = space->local_used;
  bs2   = bs*bs;
  for (i=0; i<n; i++) {
    space->idx[l] = row;
    space->idy[l] = idxn[i];
    /* Now copy over the block of values. Store the values column oriented.
       This enables inserting multiple blocks belonging to a row with a single
       function call */
    array = space->val + bs2*l;
    /* source block i of block-row idx within the row-oriented input of cmax block-columns */
    vals  = values + idx*bs2*n + bs*i;
    for (j=0; j<bs; j++) {
      /* array[k*bs] transposes: source row j becomes destination column j; NULL values stash zeros */
      for (k=0; k<bs; k++) array[k*bs] = values ? vals[k] : 0.0;
      array++;
      vals += cmax*bs; /* advance to the next source row of this block */
    }
    l++;
  }
  stash->n               += n;
  space->local_used      += n;
  space->local_remaining -= n;
  PetscFunctionReturn(0);
}
376 
377 /*
378   MatStashValuesColBlocked_Private - inserts blocks of values into the stash.
379   This function expects the values to be roworiented. Multiple columns belong
380   to the same block-row can be inserted with a single call to this function.
381   This function extracts the sub-block of values based on the dimensions of
382   the original input block, and the row,col values corresponding to the blocks.
383 
384   Input Parameters:
385   stash  - the stash
386   row    - the global block-row correspoiding to the values
387   n      - the number of elements inserted. All elements belong to the above row.
388   idxn   - the global block-column indices corresponding to each of the blocks of
389            values. Each block is of size bs*bs.
390   values - the values inserted
391   rmax   - the number of block-rows in the original block.
392   cmax   - the number of block-columns on the original block.
393   idx    - the index of the current block-row in the original block.
394 */
PetscErrorCode MatStashValuesColBlocked_Private(MatStash *stash,PetscInt row,PetscInt n,const PetscInt idxn[],const PetscScalar values[],PetscInt rmax,PetscInt cmax,PetscInt idx)
{
  PetscInt           i,j,k,bs2,bs=stash->bs,l;
  const PetscScalar  *vals;
  PetscScalar        *array;
  PetscMatStashSpace space=stash->space;

  PetscFunctionBegin;
  /* Grow the stash if the current segment cannot hold n more blocks */
  if (!space || space->local_remaining < n) {
    PetscCall(MatStashExpand_Private(stash,n));
  }
  space = stash->space;
  l     = space->local_used;
  bs2   = bs*bs;
  for (i=0; i<n; i++) {
    space->idx[l] = row;
    space->idy[l] = idxn[i];
    /* Now copy over the block of values. Store the values column oriented.
     This enables inserting multiple blocks belonging to a row with a single
     function call */
    array = space->val + bs2*l;
    /* source block i of block-row idx; input here is already column oriented,
       so each source column of length bs is copied contiguously */
    vals  = values + idx*bs2*n + bs*i;
    for (j=0; j<bs; j++) {
      for (k=0; k<bs; k++) array[k] = values ? vals[k] : 0.0;
      array += bs;
      vals  += rmax*bs; /* advance to the next source column of this block */
    }
    l++;
  }
  stash->n               += n;
  space->local_used      += n;
  space->local_remaining -= n;
  PetscFunctionReturn(0);
}
429 /*
430   MatStashScatterBegin_Private - Initiates the transfer of values to the
431   correct owners. This function goes through the stash, and check the
432   owners of each stashed value, and sends the values off to the owner
433   processors.
434 
435   Input Parameters:
436   stash  - the stash
437   owners - an array of size 'no-of-procs' which gives the ownership range
438            for each node.
439 
440   Notes:
441     The 'owners' array in the cased of the blocked-stash has the
442   ranges specified blocked global indices, and for the regular stash in
443   the proper global indices.
444 */
PetscErrorCode MatStashScatterBegin_Private(Mat mat,MatStash *stash,PetscInt *owners)
{
  PetscFunctionBegin;
  /* Dispatch to the active implementation (BTS or Ref), chosen in MatStashCreate_Private() */
  PetscCall((*stash->ScatterBegin)(mat,stash,owners));
  PetscFunctionReturn(0);
}
451 
static PetscErrorCode MatStashScatterBegin_Ref(Mat mat,MatStash *stash,PetscInt *owners)
{
  PetscInt           *owner,*startv,*starti,tag1=stash->tag1,tag2=stash->tag2,bs2;
  PetscInt           size=stash->size,nsends;
  PetscInt           count,*sindices,**rindices,i,j,idx,lastidx,l;
  PetscScalar        **rvalues,*svalues;
  MPI_Comm           comm = stash->comm;
  MPI_Request        *send_waits,*recv_waits,*recv_waits1,*recv_waits2;
  PetscMPIInt        *sizes,*nlengths,nreceives;
  PetscInt           *sp_idx,*sp_idy;
  PetscScalar        *sp_val;
  PetscMatStashSpace space,space_next;

  PetscFunctionBegin;
  {                             /* make sure all processors are either in INSERTMODE or ADDMODE */
    InsertMode addv;
    PetscCallMPI(MPIU_Allreduce((PetscEnum*)&mat->insertmode,(PetscEnum*)&addv,1,MPIU_ENUM,MPI_BOR,PetscObjectComm((PetscObject)mat)));
    PetscCheckFalse(addv == (ADD_VALUES|INSERT_VALUES),PetscObjectComm((PetscObject)mat),PETSC_ERR_ARG_WRONGSTATE,"Some processors inserted others added");
    mat->insertmode = addv; /* in case this processor had no cache */
  }

  bs2 = stash->bs*stash->bs;

  /*  first count number of contributors to each processor */
  PetscCall(PetscCalloc1(size,&nlengths));
  PetscCall(PetscMalloc1(stash->n+1,&owner));

  i       = j    = 0;
  lastidx = -1;
  space   = stash->space_head;
  /* Walk every stashed entry and find its owner rank by scanning the ownership
     ranges; j is carried across iterations since entries are often locally sorted */
  while (space) {
    space_next = space->next;
    sp_idx     = space->idx;
    for (l=0; l<space->local_used; l++) {
      /* if indices are NOT locally sorted, need to start search at the beginning */
      if (lastidx > (idx = sp_idx[l])) j = 0;
      lastidx = idx;
      for (; j<size; j++) {
        if (idx >= owners[j] && idx < owners[j+1]) {
          nlengths[j]++; owner[i] = j; break;
        }
      }
      i++;
    }
    space = space_next;
  }

  /* Now check what procs get messages - and compute nsends. */
  PetscCall(PetscCalloc1(size,&sizes));
  for (i=0, nsends=0; i<size; i++) {
    if (nlengths[i]) {
      sizes[i] = 1; nsends++;
    }
  }

  {PetscMPIInt *onodes,*olengths;
   /* Determine the number of messages to expect, their lengths, from from-ids */
   PetscCall(PetscGatherNumberOfMessages(comm,sizes,nlengths,&nreceives));
   PetscCall(PetscGatherMessageLengths(comm,nsends,nreceives,nlengths,&onodes,&olengths));
   /* since clubbing row,col - lengths are multiplied by 2 */
   for (i=0; i<nreceives; i++) olengths[i] *=2;
   PetscCall(PetscPostIrecvInt(comm,tag1,nreceives,onodes,olengths,&rindices,&recv_waits1));
   /* values are size 'bs2' lengths (and remove earlier factor 2 */
   for (i=0; i<nreceives; i++) olengths[i] = olengths[i]*bs2/2;
   PetscCall(PetscPostIrecvScalar(comm,tag2,nreceives,onodes,olengths,&rvalues,&recv_waits2));
   PetscCall(PetscFree(onodes));
   PetscCall(PetscFree(olengths));}

  /* do sends:
      1) starts[i] gives the starting index in svalues for stuff going to
         the ith processor
  */
  PetscCall(PetscMalloc2(bs2*stash->n,&svalues,2*(stash->n+1),&sindices));
  PetscCall(PetscMalloc1(2*nsends,&send_waits));
  PetscCall(PetscMalloc2(size,&startv,size,&starti));
  /* use 2 sends the first with all_a, the next with all_i and all_j */
  startv[0] = 0; starti[0] = 0;
  for (i=1; i<size; i++) {
    startv[i] = startv[i-1] + nlengths[i-1];
    starti[i] = starti[i-1] + 2*nlengths[i-1];
  }

  /* Pack the stashed entries into the send buffers; each rank's chunk of
     sindices holds all rows followed by all columns (offset by nlengths[j]) */
  i     = 0;
  space = stash->space_head;
  while (space) {
    space_next = space->next;
    sp_idx     = space->idx;
    sp_idy     = space->idy;
    sp_val     = space->val;
    for (l=0; l<space->local_used; l++) {
      j = owner[i];
      if (bs2 == 1) {
        svalues[startv[j]] = sp_val[l];
      } else {
        PetscInt    k;
        PetscScalar *buf1,*buf2;
        buf1 = svalues+bs2*startv[j];
        buf2 = space->val + bs2*l;
        for (k=0; k<bs2; k++) buf1[k] = buf2[k];
      }
      sindices[starti[j]]             = sp_idx[l];
      sindices[starti[j]+nlengths[j]] = sp_idy[l];
      startv[j]++;
      starti[j]++;
      i++;
    }
    space = space_next;
  }
  /* startv was consumed as a cursor during packing; rebuild the chunk offsets */
  startv[0] = 0;
  for (i=1; i<size; i++) startv[i] = startv[i-1] + nlengths[i-1];

  for (i=0,count=0; i<size; i++) {
    if (sizes[i]) {
      PetscCallMPI(MPI_Isend(sindices+2*startv[i],2*nlengths[i],MPIU_INT,i,tag1,comm,send_waits+count++));
      PetscCallMPI(MPI_Isend(svalues+bs2*startv[i],bs2*nlengths[i],MPIU_SCALAR,i,tag2,comm,send_waits+count++));
    }
  }
#if defined(PETSC_USE_INFO)
  PetscCall(PetscInfo(NULL,"No of messages: %" PetscInt_FMT " \n",nsends));
  for (i=0; i<size; i++) {
    if (sizes[i]) {
      PetscCall(PetscInfo(NULL,"Mesg_to: %" PetscInt_FMT ": size: %zu bytes\n",i,(size_t)(nlengths[i]*(bs2*sizeof(PetscScalar)+2*sizeof(PetscInt)))));
    }
  }
#endif
  PetscCall(PetscFree(nlengths));
  PetscCall(PetscFree(owner));
  PetscCall(PetscFree2(startv,starti));
  PetscCall(PetscFree(sizes));

  /* recv_waits need to be contiguous for MatStashScatterGetMesg_Private() */
  PetscCall(PetscMalloc1(2*nreceives,&recv_waits));

  /* Interleave index/value requests so request 2*i, 2*i+1 belong to message i */
  for (i=0; i<nreceives; i++) {
    recv_waits[2*i]   = recv_waits1[i];
    recv_waits[2*i+1] = recv_waits2[i];
  }
  stash->recv_waits = recv_waits;

  PetscCall(PetscFree(recv_waits1));
  PetscCall(PetscFree(recv_waits2));

  stash->svalues         = svalues;
  stash->sindices        = sindices;
  stash->rvalues         = rvalues;
  stash->rindices        = rindices;
  stash->send_waits      = send_waits;
  stash->nsends          = nsends;
  stash->nrecvs          = nreceives;
  stash->reproduce_count = 0;
  PetscFunctionReturn(0);
}
604 
605 /*
606    MatStashScatterGetMesg_Private - This function waits on the receives posted
607    in the function MatStashScatterBegin_Private() and returns one message at
608    a time to the calling function. If no messages are left, it indicates this
609    by setting flg = 0, else it sets flg = 1.
610 
611    Input Parameters:
612    stash - the stash
613 
614    Output Parameters:
615    nvals - the number of entries in the current message.
616    rows  - an array of row indices (or blocked indices) corresponding to the values
617    cols  - an array of columnindices (or blocked indices) corresponding to the values
618    vals  - the values
619    flg   - 0 indicates no more message left, and the current call has no values associated.
620            1 indicates that the current call successfully received a message, and the
621              other output parameters nvals,rows,cols,vals are set appropriately.
622 */
PetscErrorCode MatStashScatterGetMesg_Private(MatStash *stash,PetscMPIInt *nvals,PetscInt **rows,PetscInt **cols,PetscScalar **vals,PetscInt *flg)
{
  PetscFunctionBegin;
  /* Dispatch to the active implementation (BTS or Ref), chosen in MatStashCreate_Private() */
  PetscCall((*stash->ScatterGetMesg)(stash,nvals,rows,cols,vals,flg));
  PetscFunctionReturn(0);
}
629 
PETSC_INTERN PetscErrorCode MatStashScatterGetMesg_Ref(MatStash *stash,PetscMPIInt *nvals,PetscInt **rows,PetscInt **cols,PetscScalar **vals,PetscInt *flg)
{
  PetscMPIInt    i,*flg_v = stash->flg_v,i1,i2;
  PetscInt       bs2;
  MPI_Status     recv_status;
  PetscBool      match_found = PETSC_FALSE;

  PetscFunctionBegin;
  *flg = 0; /* When a message is discovered this is reset to 1 */
  /* Return if no more messages to process */
  if (stash->nprocessed == stash->nrecvs) PetscFunctionReturn(0);

  bs2 = stash->bs*stash->bs;
  /* If a matching pair of receives are found, process them, and return the data to
     the calling function. Until then keep receiving messages */
  while (!match_found) {
    if (stash->reproduce) {
      /* -matstash_reproduce: complete receives in posting order for deterministic results */
      i    = stash->reproduce_count++;
      PetscCallMPI(MPI_Wait(stash->recv_waits+i,&recv_status));
    } else {
      PetscCallMPI(MPI_Waitany(2*stash->nrecvs,stash->recv_waits,&i,&recv_status));
    }
    PetscCheckFalse(recv_status.MPI_SOURCE < 0,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Negative MPI source!");

    /* Now pack the received message into a structure which is usable by others */
    /* Requests were interleaved in MatStashScatterBegin_Ref(): odd i = values (tag2),
       even i = indices (tag1); i/2 is the message slot in rvalues/rindices */
    if (i % 2) {
      PetscCallMPI(MPI_Get_count(&recv_status,MPIU_SCALAR,nvals));

      flg_v[2*recv_status.MPI_SOURCE] = i/2;

      *nvals = *nvals/bs2;
    } else {
      PetscCallMPI(MPI_Get_count(&recv_status,MPIU_INT,nvals));

      flg_v[2*recv_status.MPI_SOURCE+1] = i/2;

      *nvals = *nvals/2; /* This message has both row indices and col indices */
    }

    /* Check if we have both messages from this proc */
    i1 = flg_v[2*recv_status.MPI_SOURCE];
    i2 = flg_v[2*recv_status.MPI_SOURCE+1];
    if (i1 != -1 && i2 != -1) {
      *rows = stash->rindices[i2];
      *cols = *rows + *nvals; /* columns were packed right after the rows */
      *vals = stash->rvalues[i1];
      *flg  = 1;
      stash->nprocessed++;
      match_found = PETSC_TRUE;
    }
  }
  PetscFunctionReturn(0);
}
683 
684 #if !defined(PETSC_HAVE_MPIUNI)
/* One stashed block as sent over the wire by the BTS implementation */
typedef struct {
  PetscInt row;                 /* global (block-)row index */
  PetscInt col;                 /* global (block-)column index */
  PetscScalar vals[1];          /* Actually an array of length bs2 */
} MatStashBlock;
690 
/* Sort all stashed entries by (row,col), merge duplicates (add or overwrite per
   insertmode), and pack the result as MatStashBlocks into stash->segsendblocks. */
static PetscErrorCode MatStashSortCompress_Private(MatStash *stash,InsertMode insertmode)
{
  PetscMatStashSpace space;
  PetscInt n = stash->n,bs = stash->bs,bs2 = bs*bs,cnt,*row,*col,*perm,rowstart,i;
  PetscScalar **valptr;

  PetscFunctionBegin;
  /* Flatten the linked list of stash-space segments into parallel arrays */
  PetscCall(PetscMalloc4(n,&row,n,&col,n,&valptr,n,&perm));
  for (space=stash->space_head,cnt=0; space; space=space->next) {
    for (i=0; i<space->local_used; i++) {
      row[cnt] = space->idx[i];
      col[cnt] = space->idy[i];
      valptr[cnt] = &space->val[i*bs2];
      perm[cnt] = cnt;          /* Will tell us where to find valptr after sorting row[] and col[] */
      cnt++;
    }
  }
  PetscCheckFalse(cnt != n,PETSC_COMM_SELF,PETSC_ERR_PLIB,"MatStash n %" PetscInt_FMT ", but counted %" PetscInt_FMT " entries",n,cnt);
  PetscCall(PetscSortIntWithArrayPair(n,row,col,perm));
  /* Scan through the rows, sorting each one, combining duplicates, and packing send buffers */
  for (rowstart=0,cnt=0,i=1; i<=n; i++) {
    if (i == n || row[i] != row[rowstart]) {         /* Sort the last row. */
      PetscInt colstart;
      PetscCall(PetscSortIntWithArray(i-rowstart,&col[rowstart],&perm[rowstart]));
      for (colstart=rowstart; colstart<i;) { /* Compress multiple insertions to the same location */
        PetscInt j,l;
        MatStashBlock *block;
        PetscCall(PetscSegBufferGet(stash->segsendblocks,1,&block));
        block->row = row[rowstart];
        block->col = col[colstart];
        PetscCall(PetscArraycpy(block->vals,valptr[perm[colstart]],bs2));
        for (j=colstart+1; j<i && col[j] == col[colstart]; j++) { /* Add any extra stashed blocks at the same (row,col) */
          if (insertmode == ADD_VALUES) {
            for (l=0; l<bs2; l++) block->vals[l] += valptr[perm[j]][l];
          } else {
            /* INSERT_VALUES: the last insertion wins, so simply overwrite */
            PetscCall(PetscArraycpy(block->vals,valptr[perm[j]],bs2));
          }
        }
        colstart = j;
      }
      rowstart = i;
    }
  }
  PetscCall(PetscFree4(row,col,valptr,perm));
  PetscFunctionReturn(0);
}
737 
/* Lazily build the MPI datatype describing one MatStashBlock (two PetscInts
   followed by bs2 PetscScalars) and create the BTS send/recv segment buffers. */
static PetscErrorCode MatStashBlockTypeSetUp(MatStash *stash)
{
  PetscFunctionBegin;
  if (stash->blocktype == MPI_DATATYPE_NULL) {
    PetscInt     bs2 = PetscSqr(stash->bs);
    PetscMPIInt  blocklens[2];
    MPI_Aint     displs[2];
    MPI_Datatype types[2],stype;
    /* Note that DummyBlock is a type having standard layout, even when PetscScalar is C++ std::complex.
       std::complex itself has standard layout, so does DummyBlock, recursively.
       To be compatible with C++ std::complex, complex implementations on GPUs must also have standard layout,
       though they can have different alignment, e.g, 16 bytes for double complex, instead of 8 bytes as in GCC stdlibc++.
       offsetof(type, member) only requires type to have standard layout. Ref. https://en.cppreference.com/w/cpp/types/offsetof.

       We can test if std::complex has standard layout with the following code:
       #include <iostream>
       #include <type_traits>
       #include <complex>
       int main() {
         std::cout << std::boolalpha;
         std::cout << std::is_standard_layout<std::complex<double> >::value << '\n';
       }
       Output: true
     */
    struct DummyBlock {PetscInt row,col; PetscScalar vals;};

    /* True wire size of one block: header plus bs2 scalars (DummyBlock has only one) */
    stash->blocktype_size = offsetof(struct DummyBlock,vals) + bs2*sizeof(PetscScalar);
    if (stash->blocktype_size % sizeof(PetscInt)) { /* Implies that PetscInt is larger and does not satisfy alignment without padding */
      stash->blocktype_size += sizeof(PetscInt) - stash->blocktype_size % sizeof(PetscInt);
    }
    PetscCall(PetscSegBufferCreate(stash->blocktype_size,1,&stash->segsendblocks));
    PetscCall(PetscSegBufferCreate(stash->blocktype_size,1,&stash->segrecvblocks));
    PetscCall(PetscSegBufferCreate(sizeof(MatStashFrame),1,&stash->segrecvframe));
    blocklens[0] = 2;
    blocklens[1] = bs2;
    displs[0] = offsetof(struct DummyBlock,row);
    displs[1] = offsetof(struct DummyBlock,vals);
    types[0] = MPIU_INT;
    types[1] = MPIU_SCALAR;
    PetscCallMPI(MPI_Type_create_struct(2,blocklens,displs,types,&stype));
    PetscCallMPI(MPI_Type_commit(&stype));
    /* Resize so the extent matches blocktype_size, making arrays of blocks contiguous */
    PetscCallMPI(MPI_Type_create_resized(stype,0,stash->blocktype_size,&stash->blocktype));
    PetscCallMPI(MPI_Type_commit(&stash->blocktype));
    PetscCallMPI(MPI_Type_free(&stype));
  }
  PetscFunctionReturn(0);
}
785 
786 /* Callback invoked after target rank has initiatied receive of rendezvous message.
787  * Here we post the main sends.
788  */
static PetscErrorCode MatStashBTSSend_Private(MPI_Comm comm,const PetscMPIInt tag[],PetscMPIInt rankid,PetscMPIInt rank,void *sdata,MPI_Request req[],void *ctx)
{
  MatStash *stash = (MatStash*)ctx;
  MatStashHeader *hdr = (MatStashHeader*)sdata;

  PetscFunctionBegin;
  /* Sanity check that the callback's rank matches our recorded destination */
  PetscCheckFalse(rank != stash->sendranks[rankid],comm,PETSC_ERR_PLIB,"BTS Send rank %d does not match sendranks[%d] %d",rank,rankid,stash->sendranks[rankid]);
  /* Send hdr->count blocks using the resized MatStashBlock datatype */
  PetscCallMPI(MPI_Isend(stash->sendframes[rankid].buffer,hdr->count,stash->blocktype,rank,tag[0],comm,&req[0]));
  stash->sendframes[rankid].count = hdr->count;
  stash->sendframes[rankid].pending = 1;
  PetscFunctionReturn(0);
}
801 
802 /* Callback invoked by target after receiving rendezvous message.
803  * Here we post the main recvs.
804  */
805 static PetscErrorCode MatStashBTSRecv_Private(MPI_Comm comm,const PetscMPIInt tag[],PetscMPIInt rank,void *rdata,MPI_Request req[],void *ctx)
806 {
807   MatStash *stash = (MatStash*)ctx;
808   MatStashHeader *hdr = (MatStashHeader*)rdata;
809   MatStashFrame *frame;
810 
811   PetscFunctionBegin;
812   PetscCall(PetscSegBufferGet(stash->segrecvframe,1,&frame));
813   PetscCall(PetscSegBufferGet(stash->segrecvblocks,hdr->count,&frame->buffer));
814   PetscCallMPI(MPI_Irecv(frame->buffer,hdr->count,stash->blocktype,rank,tag[0],comm,&req[0]));
815   frame->count = hdr->count;
816   frame->pending = 1;
817   PetscFunctionReturn(0);
818 }
819 
820 /*
821  * owners[] contains the ownership ranges; may be indexed by either blocks or scalars
822  */
823 static PetscErrorCode MatStashScatterBegin_BTS(Mat mat,MatStash *stash,PetscInt owners[])
824 {
825   PetscErrorCode ierr;
826   size_t nblocks;
827   char *sendblocks;
828 
829   PetscFunctionBegin;
830   if (PetscDefined(USE_DEBUG)) { /* make sure all processors are either in INSERTMODE or ADDMODE */
831     InsertMode addv;
832     PetscCallMPI(MPIU_Allreduce((PetscEnum*)&mat->insertmode,(PetscEnum*)&addv,1,MPIU_ENUM,MPI_BOR,PetscObjectComm((PetscObject)mat)));
833     PetscCheckFalse(addv == (ADD_VALUES|INSERT_VALUES),PetscObjectComm((PetscObject)mat),PETSC_ERR_ARG_WRONGSTATE,"Some processors inserted others added");
834   }
835 
836   PetscCall(MatStashBlockTypeSetUp(stash));
837   PetscCall(MatStashSortCompress_Private(stash,mat->insertmode));
838   PetscCall(PetscSegBufferGetSize(stash->segsendblocks,&nblocks));
839   PetscCall(PetscSegBufferExtractInPlace(stash->segsendblocks,&sendblocks));
840   if (stash->first_assembly_done) { /* Set up sendhdrs and sendframes for each rank that we sent before */
841     PetscInt i;
842     size_t b;
843     for (i=0,b=0; i<stash->nsendranks; i++) {
844       stash->sendframes[i].buffer = &sendblocks[b*stash->blocktype_size];
845       /* sendhdr is never actually sent, but the count is used by MatStashBTSSend_Private */
846       stash->sendhdr[i].count = 0; /* Might remain empty (in which case we send a zero-sized message) if no values are communicated to that process */
847       for (; b<nblocks; b++) {
848         MatStashBlock *sendblock_b = (MatStashBlock*)&sendblocks[b*stash->blocktype_size];
849         PetscCheckFalse(sendblock_b->row < owners[stash->sendranks[i]],stash->comm,PETSC_ERR_ARG_WRONG,"MAT_SUBSET_OFF_PROC_ENTRIES set, but row %" PetscInt_FMT " owned by %d not communicated in initial assembly",sendblock_b->row,stash->sendranks[i]);
850         if (sendblock_b->row >= owners[stash->sendranks[i]+1]) break;
851         stash->sendhdr[i].count++;
852       }
853     }
854   } else {                      /* Dynamically count and pack (first time) */
855     PetscInt sendno;
856     size_t i,rowstart;
857 
858     /* Count number of send ranks and allocate for sends */
859     stash->nsendranks = 0;
860     for (rowstart=0; rowstart<nblocks;) {
861       PetscInt owner;
862       MatStashBlock *sendblock_rowstart = (MatStashBlock*)&sendblocks[rowstart*stash->blocktype_size];
863       PetscCall(PetscFindInt(sendblock_rowstart->row,stash->size+1,owners,&owner));
864       if (owner < 0) owner = -(owner+2);
865       for (i=rowstart+1; i<nblocks; i++) { /* Move forward through a run of blocks with the same owner */
866         MatStashBlock *sendblock_i = (MatStashBlock*)&sendblocks[i*stash->blocktype_size];
867         if (sendblock_i->row >= owners[owner+1]) break;
868       }
869       stash->nsendranks++;
870       rowstart = i;
871     }
872     PetscCall(PetscMalloc3(stash->nsendranks,&stash->sendranks,stash->nsendranks,&stash->sendhdr,stash->nsendranks,&stash->sendframes));
873 
874     /* Set up sendhdrs and sendframes */
875     sendno = 0;
876     for (rowstart=0; rowstart<nblocks;) {
877       PetscInt owner;
878       MatStashBlock *sendblock_rowstart = (MatStashBlock*)&sendblocks[rowstart*stash->blocktype_size];
879       PetscCall(PetscFindInt(sendblock_rowstart->row,stash->size+1,owners,&owner));
880       if (owner < 0) owner = -(owner+2);
881       stash->sendranks[sendno] = owner;
882       for (i=rowstart+1; i<nblocks; i++) { /* Move forward through a run of blocks with the same owner */
883         MatStashBlock *sendblock_i = (MatStashBlock*)&sendblocks[i*stash->blocktype_size];
884         if (sendblock_i->row >= owners[owner+1]) break;
885       }
886       stash->sendframes[sendno].buffer = sendblock_rowstart;
887       stash->sendframes[sendno].pending = 0;
888       stash->sendhdr[sendno].count = i - rowstart;
889       sendno++;
890       rowstart = i;
891     }
892     PetscCheckFalse(sendno != stash->nsendranks,stash->comm,PETSC_ERR_PLIB,"BTS counted %d sendranks, but %" PetscInt_FMT " sends",stash->nsendranks,sendno);
893   }
894 
895   /* Encode insertmode on the outgoing messages. If we want to support more than two options, we would need a new
896    * message or a dummy entry of some sort. */
897   if (mat->insertmode == INSERT_VALUES) {
898     size_t i;
899     for (i=0; i<nblocks; i++) {
900       MatStashBlock *sendblock_i = (MatStashBlock*)&sendblocks[i*stash->blocktype_size];
901       sendblock_i->row = -(sendblock_i->row+1);
902     }
903   }
904 
905   if (stash->first_assembly_done) {
906     PetscMPIInt i,tag;
907     PetscCall(PetscCommGetNewTag(stash->comm,&tag));
908     for (i=0; i<stash->nrecvranks; i++) {
909       PetscCall(MatStashBTSRecv_Private(stash->comm,&tag,stash->recvranks[i],&stash->recvhdr[i],&stash->recvreqs[i],stash));
910     }
911     for (i=0; i<stash->nsendranks; i++) {
912       PetscCall(MatStashBTSSend_Private(stash->comm,&tag,i,stash->sendranks[i],&stash->sendhdr[i],&stash->sendreqs[i],stash));
913     }
914     stash->use_status = PETSC_TRUE; /* Use count from message status. */
915   } else {
916     ierr = PetscCommBuildTwoSidedFReq(stash->comm,1,MPIU_INT,stash->nsendranks,stash->sendranks,(PetscInt*)stash->sendhdr,
917                                       &stash->nrecvranks,&stash->recvranks,(PetscInt*)&stash->recvhdr,1,&stash->sendreqs,&stash->recvreqs,
918                                       MatStashBTSSend_Private,MatStashBTSRecv_Private,stash);PetscCall(ierr);
919     PetscCall(PetscMalloc2(stash->nrecvranks,&stash->some_indices,stash->nrecvranks,&stash->some_statuses));
920     stash->use_status = PETSC_FALSE; /* Use count from header instead of from message. */
921   }
922 
923   PetscCall(PetscSegBufferExtractInPlace(stash->segrecvframe,&stash->recvframes));
924   stash->recvframe_active     = NULL;
925   stash->recvframe_i          = 0;
926   stash->some_i               = 0;
927   stash->some_count           = 0;
928   stash->recvcount            = 0;
929   stash->first_assembly_done  = mat->assembly_subset; /* See the same logic in VecAssemblyBegin_MPI_BTS */
930   stash->insertmode           = &mat->insertmode;
931   PetscFunctionReturn(0);
932 }
933 
/* Yields the stashed entries one block at a time, resuming across calls via state stored in
 * the stash (recvframe_active, recvframe_i, some_i, recvcount). On return, *flg is 1 and
 * row/col/val point into the active receive frame's buffer; *flg is 0 when all messages from
 * all receive ranks have been consumed. */
static PetscErrorCode MatStashScatterGetMesg_BTS(MatStash *stash,PetscMPIInt *n,PetscInt **row,PetscInt **col,PetscScalar **val,PetscInt *flg)
{
  MatStashBlock *block;

  PetscFunctionBegin;
  *flg = 0;
  /* Advance to the next frame that still has unread blocks; frames become available as their
     receives complete. Zero-count frames are skipped by looping again. */
  while (!stash->recvframe_active || stash->recvframe_i == stash->recvframe_count) {
    if (stash->some_i == stash->some_count) {
      if (stash->recvcount == stash->nrecvranks) PetscFunctionReturn(0); /* Done */
      PetscCallMPI(MPI_Waitsome(stash->nrecvranks,stash->recvreqs,&stash->some_count,stash->some_indices,stash->use_status?stash->some_statuses:MPI_STATUSES_IGNORE));
      stash->some_i = 0;
    }
    stash->recvframe_active = &stash->recvframes[stash->some_indices[stash->some_i]];
    stash->recvframe_count = stash->recvframe_active->count; /* From header; maximum count */
    if (stash->use_status) { /* Count what was actually sent */
      PetscCallMPI(MPI_Get_count(&stash->some_statuses[stash->some_i],stash->blocktype,&stash->recvframe_count));
    }
    if (stash->recvframe_count > 0) { /* Check for InsertMode consistency */
      /* A negative row in the first block encodes INSERT_VALUES (set by MatStashScatterBegin_BTS) */
      block = (MatStashBlock*)&((char*)stash->recvframe_active->buffer)[0];
      if (PetscUnlikely(*stash->insertmode == NOT_SET_VALUES)) *stash->insertmode = block->row < 0 ? INSERT_VALUES : ADD_VALUES;
      PetscCheckFalse(*stash->insertmode == INSERT_VALUES && block->row >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Assembling INSERT_VALUES, but rank %d requested ADD_VALUES",stash->recvranks[stash->some_indices[stash->some_i]]);
      PetscCheckFalse(*stash->insertmode == ADD_VALUES && block->row < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Assembling ADD_VALUES, but rank %d requested INSERT_VALUES",stash->recvranks[stash->some_indices[stash->some_i]]);
    }
    stash->some_i++;
    stash->recvcount++;
    stash->recvframe_i = 0;
  }
  /* Hand back one block from the active frame */
  *n = 1;
  block = (MatStashBlock*)&((char*)stash->recvframe_active->buffer)[stash->recvframe_i*stash->blocktype_size];
  if (block->row < 0) block->row = -(block->row + 1); /* decode the INSERT_VALUES row encoding */
  *row = &block->row;
  *col = &block->col;
  *val = block->vals;
  stash->recvframe_i++;
  *flg = 1;
  PetscFunctionReturn(0);
}
971 
972 static PetscErrorCode MatStashScatterEnd_BTS(MatStash *stash)
973 {
974   PetscFunctionBegin;
975   PetscCallMPI(MPI_Waitall(stash->nsendranks,stash->sendreqs,MPI_STATUSES_IGNORE));
976   if (stash->first_assembly_done) { /* Reuse the communication contexts, so consolidate and reset segrecvblocks  */
977     void *dummy;
978     PetscCall(PetscSegBufferExtractInPlace(stash->segrecvblocks,&dummy));
979   } else {                      /* No reuse, so collect everything. */
980     PetscCall(MatStashScatterDestroy_BTS(stash));
981   }
982 
983   /* Now update nmaxold to be app 10% more than max n used, this way the
984      wastage of space is reduced the next time this stash is used.
985      Also update the oldmax, only if it increases */
986   if (stash->n) {
987     PetscInt bs2     = stash->bs*stash->bs;
988     PetscInt oldnmax = ((int)(stash->n * 1.1) + 5)*bs2;
989     if (oldnmax > stash->oldnmax) stash->oldnmax = oldnmax;
990   }
991 
992   stash->nmax       = 0;
993   stash->n          = 0;
994   stash->reallocs   = -1;
995   stash->nprocessed = 0;
996 
997   PetscCall(PetscMatStashSpaceDestroy(&stash->space_head));
998 
999   stash->space = NULL;
1000 
1001   PetscFunctionReturn(0);
1002 }
1003 
/* Releases all BTS communication state: segment buffers, the MPI derived datatype, and the
 * per-rank send/recv arrays allocated in MatStashScatterBegin_BTS. Safe to call when the
 * arrays were never allocated (PetscFree of NULL is a no-op). */
PetscErrorCode MatStashScatterDestroy_BTS(MatStash *stash)
{
  PetscFunctionBegin;
  PetscCall(PetscSegBufferDestroy(&stash->segsendblocks));
  /* recvframes points into segrecvframe's storage, so clear it when that buffer goes away */
  PetscCall(PetscSegBufferDestroy(&stash->segrecvframe));
  stash->recvframes = NULL;
  PetscCall(PetscSegBufferDestroy(&stash->segrecvblocks));
  if (stash->blocktype != MPI_DATATYPE_NULL) {
    PetscCallMPI(MPI_Type_free(&stash->blocktype)); /* MPI_Type_free resets the handle to MPI_DATATYPE_NULL */
  }
  stash->nsendranks = 0;
  stash->nrecvranks = 0;
  PetscCall(PetscFree3(stash->sendranks,stash->sendhdr,stash->sendframes));
  PetscCall(PetscFree(stash->sendreqs));
  PetscCall(PetscFree(stash->recvreqs));
  PetscCall(PetscFree(stash->recvranks));
  PetscCall(PetscFree(stash->recvhdr));
  PetscCall(PetscFree2(stash->some_indices,stash->some_statuses));
  PetscFunctionReturn(0);
}
1024 #endif
1025