xref: /petsc/src/vec/is/sf/impls/basic/sfpack.h (revision 71438e86cf8dea9f708c08733b37fef8eb68dc06)
140e23c03SJunchao Zhang #if !defined(__SFPACK_H)
240e23c03SJunchao Zhang #define __SFPACK_H
340e23c03SJunchao Zhang 
4cd620004SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfbasic.h>
57fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_CUDA)
6*71438e86SJunchao Zhang   #include <petsccublas.h>
7*71438e86SJunchao Zhang   typedef cudaStream_t  cupmStream_t;
8*71438e86SJunchao Zhang   typedef cudaEvent_t   cupmEvent_t;
97fd2d3dbSJunchao Zhang #endif
107fd2d3dbSJunchao Zhang 
117fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_HIP)
12*71438e86SJunchao Zhang   #include <petschipblas.h>
13*71438e86SJunchao Zhang   typedef hipStream_t   cupmStream_t;
14*71438e86SJunchao Zhang   typedef hipEvent_t    cupmEvent_t;
157fd2d3dbSJunchao Zhang #endif
16cd620004SJunchao Zhang 
17874d28e3SJunchao Zhang /* In terms of function overloading, long long int is a different type than int64_t, which PetscInt might be defined to.
18874d28e3SJunchao Zhang    We perfer long long int over PetscInt (int64_t), since CUDA atomics are built around (unsigned) long long int.
19874d28e3SJunchao Zhang  */
20874d28e3SJunchao Zhang typedef long long int          llint;
21874d28e3SJunchao Zhang typedef unsigned long long int ullint;
22874d28e3SJunchao Zhang 
23cd620004SJunchao Zhang /* We separate SF communications for SFBasic and SFNeighbor in two parts: local (self,intra-rank) and remote (inter-rank) */
24cd620004SJunchao Zhang typedef enum {PETSCSF_LOCAL=0, PETSCSF_REMOTE} PetscSFScope;
2540e23c03SJunchao Zhang 
26fcc7397dSJunchao Zhang /* Optimizations in packing & unpacking for destination ranks.
2740e23c03SJunchao Zhang 
28fcc7397dSJunchao Zhang   Suppose there are m indices stored in idx[], and two addresses u, p. We want to do packing:
29fcc7397dSJunchao Zhang      p[i] = u[idx[i]], for i in [0,m)
3040e23c03SJunchao Zhang 
31fcc7397dSJunchao Zhang   Indices are associated with n ranks and each rank's indices are stored consecutively in idx[].
32fcc7397dSJunchao Zhang   We go through indices for each rank and see if they are indices of a 3D submatrix of size [dx,dy,dz] in
33fcc7397dSJunchao Zhang   a parent matrix of size [X,Y,Z], with the submatrix's first index being <start>.
34cd620004SJunchao Zhang 
35fcc7397dSJunchao Zhang   E.g., for indices 1,2,3, 6,7,8, 11,12,13, the submatrix size is [3,3,1] with start=1, and the parent matrix's size
36fcc7397dSJunchao Zhang   is [5,3,1]. For simplicity, if any destination rank does not have this pattern, we give up the optimization.
37fcc7397dSJunchao Zhang 
38fcc7397dSJunchao Zhang   Note before using this per-rank optimization, one should check leafcontig[], rootcontig[], which say
39fcc7397dSJunchao Zhang   indices in whole are contiguous, and therefore much more useful than this one when true.
4040e23c03SJunchao Zhang  */
4140e23c03SJunchao Zhang struct _n_PetscSFPackOpt {
42fcc7397dSJunchao Zhang   PetscInt       *array;      /* [7*n+2] Memory pool for other fields in this struct. Used to easily copy this struct to GPU */
43b23bfdefSJunchao Zhang   PetscInt       n;           /* Number of destination ranks */
44fcc7397dSJunchao Zhang   PetscInt       *offset;     /* [n+1] Offsets of indices for each rank. offset[0]=0, offset[i+1]=offset[i]+dx[i]*dy[i]*dz[i] */
45fcc7397dSJunchao Zhang   PetscInt       *start;      /* [n] First index */
46fcc7397dSJunchao Zhang   PetscInt       *dx,*dy,*dz; /* [n] Lengths of the submatrix in X, Y, Z dimension. */
47fcc7397dSJunchao Zhang   PetscInt       *X,*Y;       /* [n] Lengths of the outer matrix in X, Y. We do not care Z. */
4840e23c03SJunchao Zhang };
4940e23c03SJunchao Zhang 
50eb02082bSJunchao Zhang /* An abstract class that defines a communication link, which includes how to pack/unpack data and send/recv buffers
5140e23c03SJunchao Zhang  */
52fcc7397dSJunchao Zhang struct _n_PetscSFLink {
53*71438e86SJunchao Zhang   PetscErrorCode (*Memcpy)              (PetscSFLink,PetscMemType,void*,PetscMemType,const void*,size_t); /* Async device memcopy might use stream in the link */
54*71438e86SJunchao Zhang   PetscErrorCode (*PrePack)             (PetscSF,PetscSFLink,PetscSFDirection);
55*71438e86SJunchao Zhang   PetscErrorCode (*PostUnpack)          (PetscSF,PetscSFLink,PetscSFDirection);
56*71438e86SJunchao Zhang   PetscErrorCode (*StartCommunication)  (PetscSF,PetscSFLink,PetscSFDirection);
57*71438e86SJunchao Zhang   PetscErrorCode (*FinishCommunication) (PetscSF,PetscSFLink,PetscSFDirection);
58*71438e86SJunchao Zhang   PetscErrorCode (*SyncDevice)          (PetscSFLink);
59*71438e86SJunchao Zhang   PetscErrorCode (*SyncStream)          (PetscSFLink);
60*71438e86SJunchao Zhang   PetscErrorCode (*Destroy)             (PetscSF,PetscSFLink);
61*71438e86SJunchao Zhang 
62*71438e86SJunchao Zhang   PetscErrorCode (*BuildDependenceBegin)(PetscSF,PetscSFLink,PetscSFDirection);
63*71438e86SJunchao Zhang   PetscErrorCode (*BuildDependenceEnd)  (PetscSF,PetscSFLink,PetscSFDirection);
6420c24465SJunchao Zhang 
65fcc7397dSJunchao Zhang   PetscErrorCode (*h_Pack)            (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*);
66fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndInsert) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
67fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndAdd)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
68fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndMin)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
69fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndMax)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
70fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndMinloc) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
71fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndMaxloc) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
72fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndMult)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
73fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndLAND)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
74fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndBAND)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
75fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndLOR)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
76fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndBOR)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
77fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndLXOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
78fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndBXOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
79fcc7397dSJunchao Zhang   PetscErrorCode (*h_FetchAndAdd)     (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,      void*);
80fcc7397dSJunchao Zhang 
81fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndInsert)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
82fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndAdd)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
83fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndMin)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
84fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndMax)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
85fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndMinloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
86fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndMaxloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
87fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndMult)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
88fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndLAND)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
89fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndBAND)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
90fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndLOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
91fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndBOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
92fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndLXOR)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
93fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndBXOR)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
94fcc7397dSJunchao Zhang 
95fcc7397dSJunchao Zhang   PetscErrorCode (*h_FetchAndAddLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*);
96cd620004SJunchao Zhang 
97cd620004SJunchao Zhang   PetscBool      deviceinited;        /* Are device related fields initialized? */
987fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
99eb02082bSJunchao Zhang   /* These fields are lazily initialized in a sense that only when device pointers are passed to an SF, the SF
1007fd2d3dbSJunchao Zhang      will set them, otherwise it just leaves them alone. Packing routines using regular ops when there are no data race chances.
101eb02082bSJunchao Zhang   */
102fcc7397dSJunchao Zhang   PetscErrorCode (*d_Pack)            (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*);
103fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndInsert) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
104fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndAdd)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
105fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndMin)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
106fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndMax)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
107fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndMinloc) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
108fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndMaxloc) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
109fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndMult)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
110fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndLAND)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
111fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndBAND)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
112fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndLOR)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
113fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndBOR)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
114fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndLXOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
115fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndBXOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
116fcc7397dSJunchao Zhang   PetscErrorCode (*d_FetchAndAdd)     (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,      void*);
117eb02082bSJunchao Zhang 
118fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndInsert)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
119fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndAdd)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
120fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndMin)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
121fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndMax)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
122fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndMinloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
123fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndMaxloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
124fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndMult)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
125fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndLAND)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
126fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndBAND)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
127fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndLOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
128fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndBOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
129fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndLXOR)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
130fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndBXOR)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
131fcc7397dSJunchao Zhang   PetscErrorCode (*d_FetchAndAddLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*);
132eb02082bSJunchao Zhang 
133eb02082bSJunchao Zhang   /* Packing routines using atomics when there are data race chances */
134fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndInsert)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
135fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndAdd)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
136fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndMin)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
137fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndMax)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
138fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndMinloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
139fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndMaxloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
140fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndMult)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
141fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndLAND)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
142fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndBAND)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
143fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndLOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
144fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndBOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
145fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndLXOR)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
146fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndBXOR)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
147fcc7397dSJunchao Zhang   PetscErrorCode (*da_FetchAndAdd)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,      void*);
148cd620004SJunchao Zhang 
149fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndInsert)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
150fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndAdd)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
151fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndMin)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
152fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndMax)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
153fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndMinloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
154fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndMaxloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
155fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndMult)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
156fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndLAND)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
157fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndBAND)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
158fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndLOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
159fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndBOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
160fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndLXOR)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
161fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndBXOR)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
162fcc7397dSJunchao Zhang   PetscErrorCode (*da_FetchAndAddLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*);
163*71438e86SJunchao Zhang  #if defined (PETSC_HAVE_CUDA) || defined(PETSC_HAVE_HIP)
164e315309dSJunchao Zhang   PetscInt       maxResidentThreadsPerGPU;   /* It is a copy from SF for convenience */
165*71438e86SJunchao Zhang   cupmStream_t   stream;                     /* stream on which input/output root/leafdata is computed on (default is PetscDefaultCudaStream) */
166eb02082bSJunchao Zhang  #endif
1677fd2d3dbSJunchao Zhang #endif
168eb02082bSJunchao Zhang   PetscMPIInt  tag;                          /* Each link has a tag so we can perform multiple SF ops at the same time */
169cd620004SJunchao Zhang   MPI_Datatype unit;                         /* The MPI datatype this PetscSFLink is built for */
170eb02082bSJunchao Zhang   MPI_Datatype basicunit;                    /* unit is made of MPI builtin dataype basicunit */
171e07844bfSJunchao Zhang   PetscBool    isbuiltin;                    /* Is unit an MPI/PETSc builtin datatype? If it is true, then bs=1 and basicunit is equivalent to unit */
172eb02082bSJunchao Zhang   size_t       unitbytes;                    /* Number of bytes in a unit */
173eb02082bSJunchao Zhang   PetscInt     bs;                           /* Number of basic units in a unit */
174cd620004SJunchao Zhang   const void   *rootdata,*leafdata;          /* rootdata and leafdata the link is working on. They are used as keys for pending links. */
175cd620004SJunchao Zhang   PetscMemType rootmtype,leafmtype;          /* root/leafdata's memory type */
176cd620004SJunchao Zhang 
177cd620004SJunchao Zhang   /* For local and remote communication */
178cd620004SJunchao Zhang   PetscMemType rootmtype_mpi,leafmtype_mpi;  /* Mtypes of buffers passed to MPI. If use_gpu_aware_mpi, they are same as root/leafmtype. Otherwise they are PETSC_MEMTYPE_HOST */
179cd620004SJunchao Zhang   PetscBool    rootdirect[2],leafdirect[2];  /* Can root/leafdata be directly passed to SF (i.e., without buffering). In layout of [PETSCSF_LOCAL/REMOTE]. See more in PetscSFLinkCreate() */
180cd620004SJunchao Zhang   PetscInt     rootdirect_mpi,leafdirect_mpi;/* Can root/leafdata for remote be directly passed to MPI? 1: yes, 0: no. See more in PetscSFLinkCreate() */
181cd620004SJunchao Zhang   const void   *rootdatadirect[2][2];        /* The root/leafdata used to init root/leaf requests, in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE]. */
182cd620004SJunchao Zhang   const void   *leafdatadirect[2][2];        /* ... We need them to look up links when root/leafdirect_mpi are true */
183*71438e86SJunchao Zhang   char         *rootbuf[2][2];               /* Buffers for packed roots, in layout of [PETSCSF_LOCAL/REMOTE][PETSC_MEMTYPE]. PETSCSF_LOCAL does not need MPI, .. */
184*71438e86SJunchao Zhang                                              /* .. but in case rootmtype is different from leafmtype, we still need to pack local roots and then copy them to memory of leafmtype */
185cd620004SJunchao Zhang   char         *rootbuf_alloc[2][2];         /* Log memory allocated by petsc. We need it since rootbuf[][] may point to rootdata given by user */
186cd620004SJunchao Zhang   char         *leafbuf[2][2];               /* Buffers for packed leaves, in layout of [PETSCSF_LOCAL/REMOTE][PETSC_MEMTYPE] */
187cd620004SJunchao Zhang   char         *leafbuf_alloc[2][2];
188cd620004SJunchao Zhang   MPI_Request  *rootreqs[2][2][2];           /* Root requests in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][rootdirect_mpi] */
189cd620004SJunchao Zhang   MPI_Request  *leafreqs[2][2][2];           /* Leaf requests in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][leafdirect_mpi] */
190cd620004SJunchao Zhang   PetscBool    rootreqsinited[2][2][2];      /* Are root requests initialized? Also in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][rootdirect_mpi]*/
191cd620004SJunchao Zhang   PetscBool    leafreqsinited[2][2][2];      /* Are leaf requests initialized? Also in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][leafdirect_mpi]*/
192cd620004SJunchao Zhang   MPI_Request  *reqs;                        /* An array of length (nrootreqs+nleafreqs)*8. Pointers in rootreqs[][][] and leafreqs[][][] point here */
193cd620004SJunchao Zhang   PetscSFLink  next;
194*71438e86SJunchao Zhang 
195*71438e86SJunchao Zhang   PetscBool    use_nvshmem;                  /* Does this link use nvshem (vs. MPI) for communication? */
196*71438e86SJunchao Zhang #if defined(PETSC_HAVE_NVSHMEM)
197*71438e86SJunchao Zhang   cupmEvent_t  dataReady;                    /* Events to mark readiness of root/leafdata */
198*71438e86SJunchao Zhang   cupmEvent_t  endRemoteComm;              /* Events to mark end of local/remote communication */
199*71438e86SJunchao Zhang   cupmStream_t remoteCommStream;             /* Streams for remote (i.e., inter-rank) communication */
200*71438e86SJunchao Zhang 
201*71438e86SJunchao Zhang   /* The buffers are allocated in device symmetric heap. Their length is the maximal length over all ranks in the comm, and therefore is the same. */
202*71438e86SJunchao Zhang   uint64_t     *rootSendSig,*rootRecvSig;    /* [max{niranks-ndiranks}], signals used when rootbuf works as send/recv buf */
203*71438e86SJunchao Zhang   uint64_t     *leafSendSig,*leafRecvSig;    /* [max{nranks-ndranks}], signals used when leafbuf works as send/recv buf */
204*71438e86SJunchao Zhang #endif
20540e23c03SJunchao Zhang };
20640e23c03SJunchao Zhang 
207cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF,MPI_Datatype,const void*,const void*);
208b7c0d12aSJunchao Zhang 
209cd620004SJunchao Zhang /* Create/setup/retrieve/destroy a link */
210cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkCreate(PetscSF,MPI_Datatype,PetscMemType,const void*,PetscMemType,const void*,MPI_Op,PetscSFOperation,PetscSFLink*);
211cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_Host(PetscSF,PetscSFLink,MPI_Datatype);
212cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetInUse(PetscSF,MPI_Datatype,const void*,const void*,PetscCopyMode,PetscSFLink*);
213cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkReclaim(PetscSF,PetscSFLink*);
214*71438e86SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkDestroy(PetscSF,PetscSFLink);
215cd620004SJunchao Zhang 
216cd620004SJunchao Zhang /* Get pack/unpack function pointers from a link */
217fcc7397dSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkGetPack(PetscSFLink link,PetscMemType mtype,PetscErrorCode (**Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*))
218eb02082bSJunchao Zhang {
219eb02082bSJunchao Zhang   PetscFunctionBegin;
220*71438e86SJunchao Zhang   if (PetscMemTypeHost(mtype)) *Pack = link->h_Pack;
2217fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
222cd620004SJunchao Zhang   else *Pack = link->d_Pack;
223eb02082bSJunchao Zhang #endif
224eb02082bSJunchao Zhang   PetscFunctionReturn(0);
225eb02082bSJunchao Zhang }
2267fd2d3dbSJunchao Zhang 
227fcc7397dSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*));
228fcc7397dSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetFetchAndOp (PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**FetchAndOp) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*));
229fcc7397dSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*));
230fcc7397dSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*));
231cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF,PetscSFLink,PetscSFDirection,void**,void**,MPI_Request**,MPI_Request**);
232b7c0d12aSJunchao Zhang 
233cd620004SJunchao Zhang /* Do Pack/Unpack/Fetch/Scatter with the link */
234cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkPackRootData  (PetscSF,PetscSFLink,PetscSFScope,const void*);
235cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkPackLeafData  (PetscSF,PetscSFLink,PetscSFScope,const void*);
236cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkUnpackRootData(PetscSF,PetscSFLink,PetscSFScope,void*,MPI_Op);
237cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF,PetscSFLink,PetscSFScope,void*,MPI_Op);
238*71438e86SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkFetchAndOpRemote (PetscSF,PetscSFLink,void*,MPI_Op);
239cd620004SJunchao Zhang 
240*71438e86SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkScatterLocal(PetscSF,PetscSFLink,PetscSFDirection,void*,void*,MPI_Op);
241cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF,PetscSFLink,void*,const void*,void*,MPI_Op);
242cd620004SJunchao Zhang 
2437fd2d3dbSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFSetUpPackFields(PetscSF);
2447fd2d3dbSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFResetPackFields(PetscSF);
245*71438e86SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkCreate_MPI(PetscSF,MPI_Datatype,PetscMemType,const void*,PetscMemType,const void*,MPI_Op,PetscSFOperation,PetscSFLink*);
2467fd2d3dbSJunchao Zhang 
24720c24465SJunchao Zhang #if defined(PETSC_HAVE_CUDA)
248*71438e86SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_CUDA(PetscSF,PetscSFLink,MPI_Datatype);
24920c24465SJunchao Zhang #endif
25020c24465SJunchao Zhang 
25159af0bd3SScott Kruger #if defined(PETSC_HAVE_HIP)
252*71438e86SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_HIP(PetscSF,PetscSFLink,MPI_Datatype);
25359af0bd3SScott Kruger #endif
25459af0bd3SScott Kruger 
25520c24465SJunchao Zhang #if defined(PETSC_HAVE_KOKKOS)
25620c24465SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_Kokkos(PetscSF,PetscSFLink,MPI_Datatype);
25720c24465SJunchao Zhang #endif
25820c24465SJunchao Zhang 
259*71438e86SJunchao Zhang #if defined(PETSC_HAVE_NVSHMEM)
260*71438e86SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkCreate_NVSHMEM(PetscSF,MPI_Datatype,PetscMemType,const void*,PetscMemType,const void*,MPI_Op,PetscSFOperation,PetscSFLink*);
261*71438e86SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkNvshmemCheck(PetscSF,PetscMemType,const void*,PetscMemType,const void*,PetscBool*);
262*71438e86SJunchao Zhang #endif
263*71438e86SJunchao Zhang 
264*71438e86SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkStartCommunication(PetscSF sf,PetscSFLink link,PetscSFDirection direction)
265*71438e86SJunchao Zhang {
266*71438e86SJunchao Zhang   PetscFunctionBegin;
267*71438e86SJunchao Zhang   if (link->StartCommunication) {PetscErrorCode ierr = (*link->StartCommunication)(sf,link,direction);CHKERRQ(ierr);}
268*71438e86SJunchao Zhang   PetscFunctionReturn(0);
269*71438e86SJunchao Zhang }
270*71438e86SJunchao Zhang 
271*71438e86SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkFinishCommunication(PetscSF sf,PetscSFLink link,PetscSFDirection direction)
272*71438e86SJunchao Zhang {
273*71438e86SJunchao Zhang   PetscFunctionBegin;
274*71438e86SJunchao Zhang   if (link->FinishCommunication) {PetscErrorCode ierr=(*link->FinishCommunication)(sf,link,direction);CHKERRQ(ierr);}
275*71438e86SJunchao Zhang   PetscFunctionReturn(0);
276*71438e86SJunchao Zhang }
277*71438e86SJunchao Zhang 
2787fd2d3dbSJunchao Zhang /* A set of helper routines for Pack/Unpack/Scatter on GPUs */
279*71438e86SJunchao Zhang #if defined(PETSC_HAVE_CUDA) || defined(PETSC_HAVE_HIP)
2807fd2d3dbSJunchao Zhang /* PetscSFLinkCopyXxxxBufferInCaseNotUseGpuAwareMPI routines are simple: if not use_gpu_aware_mpi, we need
2817fd2d3dbSJunchao Zhang    to copy the buffer from GPU to CPU before MPI calls, and from CPU to GPU after MPI calls.
2827fd2d3dbSJunchao Zhang */
2837fd2d3dbSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(PetscSF sf,PetscSFLink link,PetscBool device2host)
2847fd2d3dbSJunchao Zhang {
2857fd2d3dbSJunchao Zhang   PetscErrorCode ierr;
2867fd2d3dbSJunchao Zhang   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
2877fd2d3dbSJunchao Zhang 
2887fd2d3dbSJunchao Zhang   PetscFunctionBegin;
289*71438e86SJunchao Zhang   /* rootdata is on device but we use regular MPI for communication */
290*71438e86SJunchao Zhang   if (PetscMemTypeDevice(link->rootmtype) && PetscMemTypeHost(link->rootmtype_mpi) && bas->rootbuflen[PETSCSF_REMOTE]) {
2917fd2d3dbSJunchao Zhang     void  *h_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
2927fd2d3dbSJunchao Zhang     void  *d_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE];
2937fd2d3dbSJunchao Zhang     size_t count = bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes;
2947fd2d3dbSJunchao Zhang     if (device2host) {
29520c24465SJunchao Zhang       ierr = (*link->Memcpy)(link,PETSC_MEMTYPE_HOST,h_buf,PETSC_MEMTYPE_DEVICE,d_buf,count);CHKERRQ(ierr);
2967fd2d3dbSJunchao Zhang       ierr = PetscLogGpuToCpu(count);CHKERRQ(ierr);
2977fd2d3dbSJunchao Zhang     } else {
29820c24465SJunchao Zhang       ierr = (*link->Memcpy)(link,PETSC_MEMTYPE_DEVICE,d_buf,PETSC_MEMTYPE_HOST,h_buf,count);CHKERRQ(ierr);
2997fd2d3dbSJunchao Zhang       ierr = PetscLogCpuToGpu(count);CHKERRQ(ierr);
3007fd2d3dbSJunchao Zhang     }
3017fd2d3dbSJunchao Zhang   }
3027fd2d3dbSJunchao Zhang   PetscFunctionReturn(0);
3037fd2d3dbSJunchao Zhang }
3047fd2d3dbSJunchao Zhang 
3057fd2d3dbSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(PetscSF sf,PetscSFLink link,PetscBool device2host)
3067fd2d3dbSJunchao Zhang {
3077fd2d3dbSJunchao Zhang   PetscErrorCode ierr;
3087fd2d3dbSJunchao Zhang 
3097fd2d3dbSJunchao Zhang   PetscFunctionBegin;
310*71438e86SJunchao Zhang   if (PetscMemTypeDevice(link->leafmtype) && PetscMemTypeHost(link->leafmtype_mpi) && sf->leafbuflen[PETSCSF_REMOTE]) {
3117fd2d3dbSJunchao Zhang     void  *h_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
3127fd2d3dbSJunchao Zhang     void  *d_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE];
3137fd2d3dbSJunchao Zhang     size_t count = sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes;
3147fd2d3dbSJunchao Zhang     if (device2host) {
31520c24465SJunchao Zhang       ierr = (*link->Memcpy)(link,PETSC_MEMTYPE_HOST,h_buf,PETSC_MEMTYPE_DEVICE,d_buf,count);CHKERRQ(ierr);
3167fd2d3dbSJunchao Zhang       ierr = PetscLogGpuToCpu(count);CHKERRQ(ierr);
3177fd2d3dbSJunchao Zhang     } else {
31820c24465SJunchao Zhang       ierr = (*link->Memcpy)(link,PETSC_MEMTYPE_DEVICE,d_buf,PETSC_MEMTYPE_HOST,h_buf,count);CHKERRQ(ierr);
3197fd2d3dbSJunchao Zhang       ierr = PetscLogCpuToGpu(count);CHKERRQ(ierr);
3207fd2d3dbSJunchao Zhang     }
3217fd2d3dbSJunchao Zhang   }
3227fd2d3dbSJunchao Zhang   PetscFunctionReturn(0);
3237fd2d3dbSJunchao Zhang }
3247fd2d3dbSJunchao Zhang 
325*71438e86SJunchao Zhang /* Make sure root/leafbuf for the remote is ready for MPI */
326*71438e86SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamBeforeCallMPI(PetscSF sf,PetscSFLink link,PetscSFDirection direction)
327*71438e86SJunchao Zhang {
328*71438e86SJunchao Zhang   PetscErrorCode ierr;
329*71438e86SJunchao Zhang   PetscSF_Basic  *bas;
330*71438e86SJunchao Zhang   PetscInt       buflen;
331*71438e86SJunchao Zhang   PetscMemType   mtype;
332*71438e86SJunchao Zhang 
333*71438e86SJunchao Zhang   PetscFunctionBegin;
334*71438e86SJunchao Zhang   if (direction == PETSCSF_ROOT2LEAF) {
335*71438e86SJunchao Zhang     bas    = (PetscSF_Basic*)sf->data;
336*71438e86SJunchao Zhang     mtype  = link->rootmtype;
337*71438e86SJunchao Zhang     buflen = bas->rootbuflen[PETSCSF_REMOTE];
338*71438e86SJunchao Zhang   } else {
339*71438e86SJunchao Zhang     mtype  = link->leafmtype;
340*71438e86SJunchao Zhang     buflen = sf->leafbuflen[PETSCSF_REMOTE];
341*71438e86SJunchao Zhang   }
342*71438e86SJunchao Zhang 
343*71438e86SJunchao Zhang   if (PetscMemTypeDevice(mtype) && buflen) {
344*71438e86SJunchao Zhang     ierr = (*link->SyncStream)(link);CHKERRQ(ierr);
345*71438e86SJunchao Zhang   }
346*71438e86SJunchao Zhang   PetscFunctionReturn(0);
347*71438e86SJunchao Zhang }
3487fd2d3dbSJunchao Zhang #else /* Host only */
3497fd2d3dbSJunchao Zhang   #define PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(a,b,c)                  0
3507fd2d3dbSJunchao Zhang   #define PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(a,b,c)                  0
351*71438e86SJunchao Zhang   #define PetscSFLinkSyncStreamBeforeCallMPI(a,b,c)                                0
3527fd2d3dbSJunchao Zhang #endif
353cd620004SJunchao Zhang 
354cd620004SJunchao Zhang /* Get root indices used for pack/unpack
355cd620004SJunchao Zhang 
356cd620004SJunchao Zhang Input arguments:
357cd620004SJunchao Zhang   +sf    - StarForest
358cd620004SJunchao Zhang   .link  - The link, which provides the stream for the async memcpy (In SF, we make all GPU operations asynchronous to avoid unexpected pipeline stalls)
359cd620004SJunchao Zhang   .mtype - In what type of memory? (PETSC_MEMTYPE_DEVICE or PETSC_MEMTYPE_HOST)
360*71438e86SJunchao Zhang   -scope - Which part of the indices? (PETSCSF_LOCAL or PETSCSF_REMOTE)
361cd620004SJunchao Zhang 
362cd620004SJunchao Zhang  Output arguments:
363cd620004SJunchao Zhang   +count   - Count of indices
364cd620004SJunchao Zhang   .start   - The first index (only useful when indices is NULL)
365*71438e86SJunchao Zhang   .opt     - Packing optimizations
366*71438e86SJunchao Zhang   -indices - Indices of roots for pack/unpack. NULL means indices are contiguous
367cd620004SJunchao Zhang  */
368fcc7397dSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkGetRootPackOptAndIndices(PetscSF sf,PetscSFLink link,PetscMemType mtype,PetscSFScope scope,PetscInt *count,PetscInt *start,PetscSFPackOpt *opt,const PetscInt **indices)
369b7c0d12aSJunchao Zhang {
370cd620004SJunchao Zhang   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
371cd620004SJunchao Zhang   PetscInt       offset;
372b7c0d12aSJunchao Zhang 
373b7c0d12aSJunchao Zhang   PetscFunctionBegin;
374fcc7397dSJunchao Zhang   *count   = bas->rootbuflen[scope];
375fcc7397dSJunchao Zhang   *start   = bas->rootstart[scope];
376fcc7397dSJunchao Zhang   *opt     = NULL;
377fcc7397dSJunchao Zhang   *indices = NULL;
378fcc7397dSJunchao Zhang 
379fcc7397dSJunchao Zhang   /* We have these rules:
380fcc7397dSJunchao Zhang     1) opt == NULL && indices == NULL ==> indices are contiguous.
381fcc7397dSJunchao Zhang     2) opt != NULL ==> indices are in 3D but not contiguous. On host, indices != NULL since indices are already available and we do not
382fcc7397dSJunchao Zhang        want to enforce all operations to use opt; but on device, indices = NULL since we do not want to copy indices to device.
383fcc7397dSJunchao Zhang   */
384fcc7397dSJunchao Zhang   if (!bas->rootcontig[scope]) {
385cd620004SJunchao Zhang     offset = (scope == PETSCSF_LOCAL)? 0 : bas->ioffset[bas->ndiranks];
386*71438e86SJunchao Zhang     if (PetscMemTypeHost(mtype)) {*opt = bas->rootpackopt[scope]; *indices = bas->irootloc + offset;}
387cd620004SJunchao Zhang     else {
388fcc7397dSJunchao Zhang       PetscErrorCode ierr;
389fcc7397dSJunchao Zhang       size_t         size;
390fcc7397dSJunchao Zhang       if (bas->rootpackopt[scope]) {
391fcc7397dSJunchao Zhang         if (!bas->rootpackopt_d[scope]) {
392fcc7397dSJunchao Zhang           ierr = PetscMalloc1(1,&bas->rootpackopt_d[scope]);CHKERRQ(ierr);
393fcc7397dSJunchao Zhang           ierr = PetscArraycpy(bas->rootpackopt_d[scope],bas->rootpackopt[scope],1);CHKERRQ(ierr); /* Make pointers in bas->rootpackopt_d[] still work on host */
394fcc7397dSJunchao Zhang           size = (bas->rootpackopt[scope]->n*7+2)*sizeof(PetscInt); /* See comments at struct _n_PetscSFPackOpt*/
39520c24465SJunchao Zhang           ierr = PetscSFMalloc(sf,PETSC_MEMTYPE_DEVICE,size,(void **)&bas->rootpackopt_d[scope]->array);CHKERRQ(ierr);
39620c24465SJunchao Zhang           ierr = (*link->Memcpy)(link,PETSC_MEMTYPE_DEVICE,bas->rootpackopt_d[scope]->array,PETSC_MEMTYPE_HOST,bas->rootpackopt[scope]->array,size);CHKERRQ(ierr);
397fcc7397dSJunchao Zhang         }
398fcc7397dSJunchao Zhang         *opt = bas->rootpackopt_d[scope];
399fcc7397dSJunchao Zhang       } else { /* On device, we only provide indices when there is no optimization. We're reluctant to copy indices to device. */
400fcc7397dSJunchao Zhang         if (!bas->irootloc_d[scope]) {
401fcc7397dSJunchao Zhang           size = bas->rootbuflen[scope]*sizeof(PetscInt);
40220c24465SJunchao Zhang           ierr = PetscSFMalloc(sf,PETSC_MEMTYPE_DEVICE,size,(void **)&bas->irootloc_d[scope]);CHKERRQ(ierr);
40320c24465SJunchao Zhang           ierr = (*link->Memcpy)(link,PETSC_MEMTYPE_DEVICE,bas->irootloc_d[scope],PETSC_MEMTYPE_HOST,bas->irootloc+offset,size);CHKERRQ(ierr);
404b7c0d12aSJunchao Zhang         }
405cd620004SJunchao Zhang         *indices = bas->irootloc_d[scope];
406cd620004SJunchao Zhang       }
407cd620004SJunchao Zhang     }
408cd620004SJunchao Zhang   }
409b7c0d12aSJunchao Zhang   PetscFunctionReturn(0);
410b7c0d12aSJunchao Zhang }
411b7c0d12aSJunchao Zhang 
412cd620004SJunchao Zhang /* Get leaf indices used for pack/unpack
413cd620004SJunchao Zhang 
414fcc7397dSJunchao Zhang   See also PetscSFLinkGetRootPackOptAndIndices()
415cd620004SJunchao Zhang  */
416fcc7397dSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkGetLeafPackOptAndIndices(PetscSF sf,PetscSFLink link,PetscMemType mtype,PetscSFScope scope,PetscInt *count,PetscInt *start,PetscSFPackOpt *opt,const PetscInt **indices)
417cd620004SJunchao Zhang {
418cd620004SJunchao Zhang   PetscInt   offset;
419cd620004SJunchao Zhang 
420cd620004SJunchao Zhang   PetscFunctionBegin;
421fcc7397dSJunchao Zhang   *count   = sf->leafbuflen[scope];
422fcc7397dSJunchao Zhang   *start   = sf->leafstart[scope];
423fcc7397dSJunchao Zhang   *opt     = NULL;
424fcc7397dSJunchao Zhang   *indices = NULL;
425fcc7397dSJunchao Zhang   if (!sf->leafcontig[scope]) {
426cd620004SJunchao Zhang     offset = (scope == PETSCSF_LOCAL)? 0 : sf->roffset[sf->ndranks];
427*71438e86SJunchao Zhang     if (PetscMemTypeHost(mtype)) {*opt = sf->leafpackopt[scope]; *indices = sf->rmine + offset;}
428cd620004SJunchao Zhang     else {
429fcc7397dSJunchao Zhang       PetscErrorCode ierr;
430fcc7397dSJunchao Zhang       size_t         size;
431fcc7397dSJunchao Zhang       if (sf->leafpackopt[scope]) {
432fcc7397dSJunchao Zhang         if (!sf->leafpackopt_d[scope]) {
433fcc7397dSJunchao Zhang           ierr = PetscMalloc1(1,&sf->leafpackopt_d[scope]);CHKERRQ(ierr);
434fcc7397dSJunchao Zhang           ierr = PetscArraycpy(sf->leafpackopt_d[scope],sf->leafpackopt[scope],1);CHKERRQ(ierr);
435fcc7397dSJunchao Zhang           size = (sf->leafpackopt[scope]->n*7+2)*sizeof(PetscInt); /* See comments at struct _n_PetscSFPackOpt*/
43620c24465SJunchao Zhang           ierr = PetscSFMalloc(sf,PETSC_MEMTYPE_DEVICE,size,(void **)&sf->leafpackopt_d[scope]->array);CHKERRQ(ierr); /* Change ->array to a device pointer */
43720c24465SJunchao Zhang           ierr = (*link->Memcpy)(link,PETSC_MEMTYPE_DEVICE,sf->leafpackopt_d[scope]->array,PETSC_MEMTYPE_HOST,sf->leafpackopt[scope]->array,size);CHKERRQ(ierr);
438fcc7397dSJunchao Zhang         }
439fcc7397dSJunchao Zhang         *opt = sf->leafpackopt_d[scope];
440fcc7397dSJunchao Zhang       } else {
441fcc7397dSJunchao Zhang         if (!sf->rmine_d[scope]) {
442fcc7397dSJunchao Zhang           size = sf->leafbuflen[scope]*sizeof(PetscInt);
44320c24465SJunchao Zhang           ierr = PetscSFMalloc(sf,PETSC_MEMTYPE_DEVICE,size,(void **)&sf->rmine_d[scope]);CHKERRQ(ierr);
44420c24465SJunchao Zhang           ierr = (*link->Memcpy)(link,PETSC_MEMTYPE_DEVICE,sf->rmine_d[scope],PETSC_MEMTYPE_HOST,sf->rmine+offset,size);CHKERRQ(ierr);
445cd620004SJunchao Zhang         }
446cd620004SJunchao Zhang         *indices = sf->rmine_d[scope];
447cd620004SJunchao Zhang       }
448cd620004SJunchao Zhang     }
449cd620004SJunchao Zhang   }
450cd620004SJunchao Zhang   PetscFunctionReturn(0);
451cd620004SJunchao Zhang }
45240e23c03SJunchao Zhang #endif
453