xref: /petsc/src/vec/is/sf/impls/basic/sfpack.h (revision 20c244659fd9c3c7aed709f621eec617a23dc3d1)
140e23c03SJunchao Zhang #if !defined(__SFPACK_H)
240e23c03SJunchao Zhang #define __SFPACK_H
340e23c03SJunchao Zhang 
4cd620004SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfbasic.h>
57fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_CUDA)
67fd2d3dbSJunchao Zhang   #include <cuda_runtime.h> /* For cudaStream_t */
77fd2d3dbSJunchao Zhang   #include <petsccublas.h>  /* For CHKERRCUDA */
87fd2d3dbSJunchao Zhang #endif
97fd2d3dbSJunchao Zhang 
107fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_HIP)
117fd2d3dbSJunchao Zhang   #include <hip/hip_runtime.h>  /* For hipStream_t */
127fd2d3dbSJunchao Zhang #endif
13cd620004SJunchao Zhang 
/* SF communication for SFBasic and SFNeighbor is split in two parts:
   local (self, intra-rank) and remote (inter-rank). */
typedef enum {
  PETSCSF_LOCAL = 0,
  PETSCSF_REMOTE
} PetscSFScope;
1640e23c03SJunchao Zhang 
17fcc7397dSJunchao Zhang /* Optimizations in packing & unpacking for destination ranks.
1840e23c03SJunchao Zhang 
19fcc7397dSJunchao Zhang   Suppose there are m indices stored in idx[], and two addresses u, p. We want to do packing:
20fcc7397dSJunchao Zhang      p[i] = u[idx[i]], for i in [0,m)
2140e23c03SJunchao Zhang 
22fcc7397dSJunchao Zhang   Indices are associated with n ranks and each rank's indices are stored consecutively in idx[].
23fcc7397dSJunchao Zhang   We go through indices for each rank and see if they are indices of a 3D submatrix of size [dx,dy,dz] in
24fcc7397dSJunchao Zhang   a parent matrix of size [X,Y,Z], with the submatrix's first index being <start>.
25cd620004SJunchao Zhang 
26fcc7397dSJunchao Zhang   E.g., for indices 1,2,3, 6,7,8, 11,12,13, the submatrix size is [3,3,1] with start=1, and the parent matrix's size
27fcc7397dSJunchao Zhang   is [5,3,1]. For simplicity, if any destination rank does not have this pattern, we give up the optimization.
28fcc7397dSJunchao Zhang 
29fcc7397dSJunchao Zhang   Note before using this per-rank optimization, one should check leafcontig[], rootcontig[], which say
30fcc7397dSJunchao Zhang   indices in whole are contiguous, and therefore much more useful than this one when true.
3140e23c03SJunchao Zhang  */
3240e23c03SJunchao Zhang struct _n_PetscSFPackOpt {
33fcc7397dSJunchao Zhang   PetscInt       *array;      /* [7*n+2] Memory pool for other fields in this struct. Used to easily copy this struct to GPU */
34b23bfdefSJunchao Zhang   PetscInt       n;           /* Number of destination ranks */
35fcc7397dSJunchao Zhang   PetscInt       *offset;     /* [n+1] Offsets of indices for each rank. offset[0]=0, offset[i+1]=offset[i]+dx[i]*dy[i]*dz[i] */
36fcc7397dSJunchao Zhang   PetscInt       *start;      /* [n] First index */
37fcc7397dSJunchao Zhang   PetscInt       *dx,*dy,*dz; /* [n] Lengths of the submatrix in X, Y, Z dimension. */
38fcc7397dSJunchao Zhang   PetscInt       *X,*Y;       /* [n] Lengths of the outer matrix in X, Y. We do not care Z. */
3940e23c03SJunchao Zhang };
4040e23c03SJunchao Zhang 
41eb02082bSJunchao Zhang /* An abstract class that defines a communication link, which includes how to pack/unpack data and send/recv buffers
4240e23c03SJunchao Zhang  */
43fcc7397dSJunchao Zhang struct _n_PetscSFLink {
44*20c24465SJunchao Zhang   PetscErrorCode (*Memcpy)            (PetscSFLink,PetscMemType,void*,PetscMemType,const void*,size_t); /* Asynchronous copy might use stream in the link */
45*20c24465SJunchao Zhang 
46fcc7397dSJunchao Zhang   PetscErrorCode (*h_Pack)            (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*);
47fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndInsert) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
48fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndAdd)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
49fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndMin)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
50fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndMax)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
51fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndMinloc) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
52fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndMaxloc) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
53fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndMult)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
54fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndLAND)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
55fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndBAND)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
56fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndLOR)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
57fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndBOR)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
58fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndLXOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
59fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndBXOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
60fcc7397dSJunchao Zhang   PetscErrorCode (*h_FetchAndAdd)     (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,      void*);
61fcc7397dSJunchao Zhang 
62fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndInsert)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
63fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndAdd)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
64fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndMin)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
65fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndMax)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
66fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndMinloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
67fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndMaxloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
68fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndMult)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
69fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndLAND)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
70fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndBAND)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
71fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndLOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
72fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndBOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
73fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndLXOR)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
74fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndBXOR)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
75fcc7397dSJunchao Zhang 
76fcc7397dSJunchao Zhang   PetscErrorCode (*h_FetchAndAddLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*);
77cd620004SJunchao Zhang 
78cd620004SJunchao Zhang   PetscBool      deviceinited;        /* Are device related fields initialized? */
797fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
80eb02082bSJunchao Zhang   /* These fields are lazily initialized in a sense that only when device pointers are passed to an SF, the SF
817fd2d3dbSJunchao Zhang      will set them, otherwise it just leaves them alone. Packing routines using regular ops when there are no data race chances.
82eb02082bSJunchao Zhang   */
83*20c24465SJunchao Zhang   PetscErrorCode (*d_SyncDevice)      (PetscSFLink);
84*20c24465SJunchao Zhang   PetscErrorCode (*d_SyncStream)      (PetscSFLink);
85*20c24465SJunchao Zhang 
86fcc7397dSJunchao Zhang   PetscErrorCode (*d_Pack)            (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*);
87fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndInsert) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
88fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndAdd)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
89fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndMin)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
90fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndMax)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
91fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndMinloc) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
92fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndMaxloc) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
93fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndMult)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
94fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndLAND)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
95fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndBAND)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
96fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndLOR)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
97fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndBOR)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
98fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndLXOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
99fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndBXOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
100fcc7397dSJunchao Zhang   PetscErrorCode (*d_FetchAndAdd)     (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,      void*);
101eb02082bSJunchao Zhang 
102fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndInsert)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
103fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndAdd)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
104fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndMin)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
105fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndMax)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
106fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndMinloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
107fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndMaxloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
108fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndMult)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
109fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndLAND)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
110fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndBAND)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
111fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndLOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
112fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndBOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
113fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndLXOR)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
114fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndBXOR)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
115fcc7397dSJunchao Zhang   PetscErrorCode (*d_FetchAndAddLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*);
116eb02082bSJunchao Zhang 
117eb02082bSJunchao Zhang   /* Packing routines using atomics when there are data race chances */
118fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndInsert)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
119fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndAdd)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
120fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndMin)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
121fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndMax)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
122fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndMinloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
123fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndMaxloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
124fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndMult)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
125fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndLAND)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
126fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndBAND)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
127fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndLOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
128fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndBOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
129fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndLXOR)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
130fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndBXOR)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
131fcc7397dSJunchao Zhang   PetscErrorCode (*da_FetchAndAdd)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,      void*);
132cd620004SJunchao Zhang 
133fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndInsert)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
134fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndAdd)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
135fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndMin)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
136fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndMax)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
137fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndMinloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
138fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndMaxloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
139fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndMult)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
140fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndLAND)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
141fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndBAND)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
142fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndLOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
143fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndBOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
144fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndLXOR)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
145fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndBXOR)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
146fcc7397dSJunchao Zhang   PetscErrorCode (*da_FetchAndAddLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*);
1477fd2d3dbSJunchao Zhang #if defined (PETSC_HAVE_CUDA)
148e315309dSJunchao Zhang   PetscInt       maxResidentThreadsPerGPU;   /* It is a copy from SF for convenience */
149eb02082bSJunchao Zhang   cudaStream_t   stream;                     /* Stream to launch pack/unapck kernels if not using the default stream */
1507fd2d3dbSJunchao Zhang #elif defined (PETSC_HAVE_HIP)
1517fd2d3dbSJunchao Zhang   hipStream_t    stream;
152eb02082bSJunchao Zhang #endif
1537fd2d3dbSJunchao Zhang 
1547fd2d3dbSJunchao Zhang   PetscErrorCode (*Destroy)(PetscSFLink);    /* Device specific destroy function */
1557fd2d3dbSJunchao Zhang   void           *sptr;
1567fd2d3dbSJunchao Zhang #endif
1577fd2d3dbSJunchao Zhang 
158eb02082bSJunchao Zhang   PetscMPIInt  tag;                          /* Each link has a tag so we can perform multiple SF ops at the same time */
159cd620004SJunchao Zhang   MPI_Datatype unit;                         /* The MPI datatype this PetscSFLink is built for */
160eb02082bSJunchao Zhang   MPI_Datatype basicunit;                    /* unit is made of MPI builtin dataype basicunit */
161e07844bfSJunchao Zhang   PetscBool    isbuiltin;                    /* Is unit an MPI/PETSc builtin datatype? If it is true, then bs=1 and basicunit is equivalent to unit */
162eb02082bSJunchao Zhang   size_t       unitbytes;                    /* Number of bytes in a unit */
163eb02082bSJunchao Zhang   PetscInt     bs;                           /* Number of basic units in a unit */
164cd620004SJunchao Zhang   const void   *rootdata,*leafdata;          /* rootdata and leafdata the link is working on. They are used as keys for pending links. */
165cd620004SJunchao Zhang   PetscMemType rootmtype,leafmtype;          /* root/leafdata's memory type */
166cd620004SJunchao Zhang 
167cd620004SJunchao Zhang   /* For local and remote communication */
168cd620004SJunchao Zhang   PetscMemType rootmtype_mpi,leafmtype_mpi;  /* Mtypes of buffers passed to MPI. If use_gpu_aware_mpi, they are same as root/leafmtype. Otherwise they are PETSC_MEMTYPE_HOST */
169cd620004SJunchao Zhang   PetscBool    rootdirect[2],leafdirect[2];  /* Can root/leafdata be directly passed to SF (i.e., without buffering). In layout of [PETSCSF_LOCAL/REMOTE]. See more in PetscSFLinkCreate() */
170cd620004SJunchao Zhang   PetscInt     rootdirect_mpi,leafdirect_mpi;/* Can root/leafdata for remote be directly passed to MPI? 1: yes, 0: no. See more in PetscSFLinkCreate() */
171cd620004SJunchao Zhang   const void   *rootdatadirect[2][2];        /* The root/leafdata used to init root/leaf requests, in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE]. */
172cd620004SJunchao Zhang   const void   *leafdatadirect[2][2];        /* ... We need them to look up links when root/leafdirect_mpi are true */
173cd620004SJunchao Zhang   char         *rootbuf[2][2];               /* Buffers for packed roots, in layout of [PETSCSF_LOCAL/REMOTE][PETSC_MEMTYPE] */
174cd620004SJunchao Zhang   char         *rootbuf_alloc[2][2];         /* Log memory allocated by petsc. We need it since rootbuf[][] may point to rootdata given by user */
175cd620004SJunchao Zhang   char         *leafbuf[2][2];               /* Buffers for packed leaves, in layout of [PETSCSF_LOCAL/REMOTE][PETSC_MEMTYPE] */
176cd620004SJunchao Zhang   char         *leafbuf_alloc[2][2];
177cd620004SJunchao Zhang   MPI_Request  *rootreqs[2][2][2];           /* Root requests in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][rootdirect_mpi] */
178cd620004SJunchao Zhang   MPI_Request  *leafreqs[2][2][2];           /* Leaf requests in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][leafdirect_mpi] */
179cd620004SJunchao Zhang   PetscBool    rootreqsinited[2][2][2];      /* Are root requests initialized? Also in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][rootdirect_mpi]*/
180cd620004SJunchao Zhang   PetscBool    leafreqsinited[2][2][2];      /* Are leaf requests initialized? Also in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][leafdirect_mpi]*/
181cd620004SJunchao Zhang   MPI_Request  *reqs;                        /* An array of length (nrootreqs+nleafreqs)*8. Pointers in rootreqs[][][] and leafreqs[][][] point here */
182cd620004SJunchao Zhang   PetscSFLink  next;
18340e23c03SJunchao Zhang };
18440e23c03SJunchao Zhang 
185cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF,MPI_Datatype,const void*,const void*);
186b7c0d12aSJunchao Zhang 
187cd620004SJunchao Zhang /* Create/setup/retrieve/destroy a link */
188cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkCreate(PetscSF,MPI_Datatype,PetscMemType,const void*,PetscMemType,const void*,MPI_Op,PetscSFOperation,PetscSFLink*);
189cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_Host(PetscSF,PetscSFLink,MPI_Datatype);
190cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetInUse(PetscSF,MPI_Datatype,const void*,const void*,PetscCopyMode,PetscSFLink*);
191cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkReclaim(PetscSF,PetscSFLink*);
192cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkDestroy(PetscSF,PetscSFLink*);
193cd620004SJunchao Zhang 
194cd620004SJunchao Zhang /* Get pack/unpack function pointers from a link */
195fcc7397dSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkGetPack(PetscSFLink link,PetscMemType mtype,PetscErrorCode (**Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*))
196eb02082bSJunchao Zhang {
197eb02082bSJunchao Zhang   PetscFunctionBegin;
198eb02082bSJunchao Zhang   if (mtype == PETSC_MEMTYPE_HOST) *Pack = link->h_Pack;
1997fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
200cd620004SJunchao Zhang   else *Pack = link->d_Pack;
201eb02082bSJunchao Zhang #endif
202eb02082bSJunchao Zhang   PetscFunctionReturn(0);
203eb02082bSJunchao Zhang }
2047fd2d3dbSJunchao Zhang 
2057fd2d3dbSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkMPIWaitall(PetscSF sf,PetscSFLink link,PetscSFDirection direction)
2067fd2d3dbSJunchao Zhang {
2077fd2d3dbSJunchao Zhang   PetscErrorCode       ierr;
2087fd2d3dbSJunchao Zhang   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
2097fd2d3dbSJunchao Zhang   const PetscMemType   rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi;
2107fd2d3dbSJunchao Zhang   const PetscInt       rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi;
2117fd2d3dbSJunchao Zhang 
2127fd2d3dbSJunchao Zhang   PetscFunctionBegin;
2137fd2d3dbSJunchao Zhang   ierr = MPI_Waitall(bas->nrootreqs,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi],MPI_STATUSES_IGNORE);CHKERRQ(ierr);
2147fd2d3dbSJunchao Zhang   ierr = MPI_Waitall(sf->nleafreqs, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi],MPI_STATUSES_IGNORE);CHKERRQ(ierr);
2157fd2d3dbSJunchao Zhang   PetscFunctionReturn(0);
2167fd2d3dbSJunchao Zhang }
2177fd2d3dbSJunchao Zhang 
218fcc7397dSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*));
219fcc7397dSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetFetchAndOp (PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**FetchAndOp) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*));
220fcc7397dSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*));
221fcc7397dSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*));
222cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF,PetscSFLink,PetscSFDirection,void**,void**,MPI_Request**,MPI_Request**);
223b7c0d12aSJunchao Zhang 
224cd620004SJunchao Zhang /* Do Pack/Unpack/Fetch/Scatter with the link */
225cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkPackRootData  (PetscSF,PetscSFLink,PetscSFScope,const void*);
226cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkPackLeafData  (PetscSF,PetscSFLink,PetscSFScope,const void*);
227cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkUnpackRootData(PetscSF,PetscSFLink,PetscSFScope,void*,MPI_Op);
228cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF,PetscSFLink,PetscSFScope,void*,MPI_Op);
229cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkFetchRootData (PetscSF,PetscSFLink,PetscSFScope,void*,MPI_Op);
230cd620004SJunchao Zhang 
231cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkBcastAndOpLocal(PetscSF,PetscSFLink,const void*,void*,MPI_Op);
232cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkReduceLocal(PetscSF,PetscSFLink,const void*,void*,MPI_Op);
233cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF,PetscSFLink,void*,const void*,void*,MPI_Op);
234cd620004SJunchao Zhang 
2357fd2d3dbSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFSetUpPackFields(PetscSF);
2367fd2d3dbSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFResetPackFields(PetscSF);
2377fd2d3dbSJunchao Zhang 
238*20c24465SJunchao Zhang #if defined(PETSC_HAVE_CUDA)
239*20c24465SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_Cuda(PetscSF,PetscSFLink,MPI_Datatype);
240*20c24465SJunchao Zhang #endif
241*20c24465SJunchao Zhang 
242*20c24465SJunchao Zhang #if defined(PETSC_HAVE_KOKKOS)
243*20c24465SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_Kokkos(PetscSF,PetscSFLink,MPI_Datatype);
244*20c24465SJunchao Zhang #endif
245*20c24465SJunchao Zhang 
2467fd2d3dbSJunchao Zhang /* A set of helper routines for Pack/Unpack/Scatter on GPUs */
2477fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
2487fd2d3dbSJunchao Zhang /* If SF does not know which stream root/leafdata is being computed on, it has to sync the device to
2497fd2d3dbSJunchao Zhang    make sure the data is ready for packing.
2507fd2d3dbSJunchao Zhang  */
2517fd2d3dbSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncDeviceBeforePackData(PetscSF sf,PetscSFLink link)
2527fd2d3dbSJunchao Zhang {
2537fd2d3dbSJunchao Zhang   PetscErrorCode ierr;
2547fd2d3dbSJunchao Zhang   PetscFunctionBegin;
2557fd2d3dbSJunchao Zhang   if (sf->use_default_stream) PetscFunctionReturn(0);
256*20c24465SJunchao Zhang   if (link->rootmtype == PETSC_MEMTYPE_DEVICE || link->leafmtype == PETSC_MEMTYPE_DEVICE) {ierr = (*link->d_SyncDevice)(link);CHKERRQ(ierr);}
2577fd2d3dbSJunchao Zhang   PetscFunctionReturn(0);
2587fd2d3dbSJunchao Zhang }
2597fd2d3dbSJunchao Zhang 
2607fd2d3dbSJunchao Zhang /* PetscSFLinkSyncStreamAfterPackXxxData routines make sure root/leafbuf for the remote is ready for MPI */
2617fd2d3dbSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterPackRootData(PetscSF sf,PetscSFLink link)
2627fd2d3dbSJunchao Zhang {
2637fd2d3dbSJunchao Zhang   PetscErrorCode ierr;
2647fd2d3dbSJunchao Zhang   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
2657fd2d3dbSJunchao Zhang 
2667fd2d3dbSJunchao Zhang   PetscFunctionBegin;
2677fd2d3dbSJunchao Zhang   /* Do nothing if we use stream aware mpi || has nothing for remote */
2687fd2d3dbSJunchao Zhang   if (sf->use_stream_aware_mpi || link->rootmtype != PETSC_MEMTYPE_DEVICE || !bas->rootbuflen[PETSCSF_REMOTE]) PetscFunctionReturn(0);
2697fd2d3dbSJunchao Zhang   /* If we called a packing kernel || we async-copied rootdata from device to host || No cudaDeviceSynchronize was called (since default stream is assumed) */
270*20c24465SJunchao Zhang   if (!link->rootdirect[PETSCSF_REMOTE] || !sf->use_gpu_aware_mpi || sf->use_default_stream) {ierr = (*link->d_SyncStream)(link);CHKERRQ(ierr);}
2717fd2d3dbSJunchao Zhang   PetscFunctionReturn(0);
2727fd2d3dbSJunchao Zhang }
2737fd2d3dbSJunchao Zhang 
2747fd2d3dbSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterPackLeafData(PetscSF sf,PetscSFLink link)
2757fd2d3dbSJunchao Zhang {
2767fd2d3dbSJunchao Zhang   PetscErrorCode ierr;
2777fd2d3dbSJunchao Zhang   PetscFunctionBegin;
2787fd2d3dbSJunchao Zhang   /* See comments above */
2797fd2d3dbSJunchao Zhang   if (sf->use_stream_aware_mpi || link->leafmtype != PETSC_MEMTYPE_DEVICE || !sf->leafbuflen[PETSCSF_REMOTE]) PetscFunctionReturn(0);
280*20c24465SJunchao Zhang   if (!link->leafdirect[PETSCSF_REMOTE] || !sf->use_gpu_aware_mpi || sf->use_default_stream) {ierr = (*link->d_SyncStream)(link);CHKERRQ(ierr);}
2817fd2d3dbSJunchao Zhang   PetscFunctionReturn(0);
2827fd2d3dbSJunchao Zhang }
2837fd2d3dbSJunchao Zhang 
2847fd2d3dbSJunchao Zhang /* PetscSFLinkSyncStreamAfterUnpackXxx routines make sure root/leafdata (local & remote) is ready to use for SF callers, when SF
2857fd2d3dbSJunchao Zhang    does not know which stream the callers will use.
2867fd2d3dbSJunchao Zhang */
2877fd2d3dbSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterUnpackRootData(PetscSF sf,PetscSFLink link)
2887fd2d3dbSJunchao Zhang {
2897fd2d3dbSJunchao Zhang   PetscErrorCode ierr;
2907fd2d3dbSJunchao Zhang   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
2917fd2d3dbSJunchao Zhang   PetscBool      host2host = (link->rootmtype == PETSC_MEMTYPE_HOST) && (link->leafmtype == PETSC_MEMTYPE_HOST) ? PETSC_TRUE : PETSC_FALSE;
2927fd2d3dbSJunchao Zhang 
2937fd2d3dbSJunchao Zhang   PetscFunctionBegin;
2947fd2d3dbSJunchao Zhang   /* Do nothing if host2host OR we are allowed to asynchronously put rootdata on device through the default stream */
2957fd2d3dbSJunchao Zhang   if (host2host || (link->rootmtype == PETSC_MEMTYPE_DEVICE && sf->use_default_stream)) PetscFunctionReturn(0);
2967fd2d3dbSJunchao Zhang 
2977fd2d3dbSJunchao Zhang   /* If rootmtype is HOST or DEVICE:
2987fd2d3dbSJunchao Zhang      If we have data from local, then we called a scatter kernel (on link->stream), then we must sync it;
2997fd2d3dbSJunchao Zhang      If we have data from remote && no rootdirect(i.e., we called an unpack kernel), then we must also sycn it (if rootdirect,
3007fd2d3dbSJunchao Zhang      i.e., no unpack kernel after MPI, MPI guarentees rootbuf is ready to use so that we do not need the sync).
3017fd2d3dbSJunchao Zhang 
3027fd2d3dbSJunchao Zhang      Note a tricky case is when leafmtype=DEVICE, rootmtype=HOST on uni-processor, we must sync the stream otherwise
3037fd2d3dbSJunchao Zhang      CPU thread might use the yet-to-be-updated rootdata pending in the stream.
3047fd2d3dbSJunchao Zhang    */
305*20c24465SJunchao Zhang   if (bas->rootbuflen[PETSCSF_LOCAL] || (bas->rootbuflen[PETSCSF_REMOTE] && !link->rootdirect[PETSCSF_REMOTE])) {ierr = (*link->d_SyncStream)(link);CHKERRQ(ierr);}
3067fd2d3dbSJunchao Zhang   PetscFunctionReturn(0);
3077fd2d3dbSJunchao Zhang }
3087fd2d3dbSJunchao Zhang 
3097fd2d3dbSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterUnpackLeafData(PetscSF sf,PetscSFLink link)
3107fd2d3dbSJunchao Zhang {
3117fd2d3dbSJunchao Zhang   PetscErrorCode ierr;
3127fd2d3dbSJunchao Zhang   PetscBool      host2host = (link->rootmtype == PETSC_MEMTYPE_HOST) && (link->leafmtype == PETSC_MEMTYPE_HOST) ? PETSC_TRUE : PETSC_FALSE;
3137fd2d3dbSJunchao Zhang 
3147fd2d3dbSJunchao Zhang   PetscFunctionBegin;
3157fd2d3dbSJunchao Zhang   /* See comments in PetscSFLinkSyncStreamAfterUnpackRootData*/
3167fd2d3dbSJunchao Zhang   if (host2host || (link->leafmtype == PETSC_MEMTYPE_DEVICE && sf->use_default_stream)) PetscFunctionReturn(0);
317*20c24465SJunchao Zhang   if (sf->leafbuflen[PETSCSF_LOCAL] || (sf->leafbuflen[PETSCSF_REMOTE] && !link->leafdirect[PETSCSF_REMOTE])) {ierr = (*link->d_SyncStream)(link);CHKERRQ(ierr);}
3187fd2d3dbSJunchao Zhang   PetscFunctionReturn(0);
3197fd2d3dbSJunchao Zhang }
3207fd2d3dbSJunchao Zhang 
/* PetscSFLinkCopyXxxxBufferInCaseNotUseGpuAwareMPI routines are simple: if use_gpu_aware_mpi is not set, we need
   to copy the buffer from GPU to CPU before MPI calls, and from CPU to GPU after MPI calls.
*/
3247fd2d3dbSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(PetscSF sf,PetscSFLink link,PetscBool device2host)
3257fd2d3dbSJunchao Zhang {
3267fd2d3dbSJunchao Zhang   PetscErrorCode ierr;
3277fd2d3dbSJunchao Zhang   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
3287fd2d3dbSJunchao Zhang 
3297fd2d3dbSJunchao Zhang   PetscFunctionBegin;
3307fd2d3dbSJunchao Zhang   if (link->rootmtype == PETSC_MEMTYPE_DEVICE && (link->rootmtype_mpi != link->rootmtype) && bas->rootbuflen[PETSCSF_REMOTE]) {
3317fd2d3dbSJunchao Zhang     void  *h_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
3327fd2d3dbSJunchao Zhang     void  *d_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE];
3337fd2d3dbSJunchao Zhang     size_t count = bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes;
3347fd2d3dbSJunchao Zhang     if (device2host) {
335*20c24465SJunchao Zhang       ierr = (*link->Memcpy)(link,PETSC_MEMTYPE_HOST,h_buf,PETSC_MEMTYPE_DEVICE,d_buf,count);CHKERRQ(ierr);
3367fd2d3dbSJunchao Zhang       ierr = PetscLogGpuToCpu(count);CHKERRQ(ierr);
3377fd2d3dbSJunchao Zhang     } else {
338*20c24465SJunchao Zhang       ierr = (*link->Memcpy)(link,PETSC_MEMTYPE_DEVICE,d_buf,PETSC_MEMTYPE_HOST,h_buf,count);CHKERRQ(ierr);
3397fd2d3dbSJunchao Zhang       ierr = PetscLogCpuToGpu(count);CHKERRQ(ierr);
3407fd2d3dbSJunchao Zhang     }
3417fd2d3dbSJunchao Zhang   }
3427fd2d3dbSJunchao Zhang   PetscFunctionReturn(0);
3437fd2d3dbSJunchao Zhang }
3447fd2d3dbSJunchao Zhang 
3457fd2d3dbSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(PetscSF sf,PetscSFLink link,PetscBool device2host)
3467fd2d3dbSJunchao Zhang {
3477fd2d3dbSJunchao Zhang   PetscErrorCode ierr;
3487fd2d3dbSJunchao Zhang 
3497fd2d3dbSJunchao Zhang   PetscFunctionBegin;
3507fd2d3dbSJunchao Zhang   if (link->leafmtype == PETSC_MEMTYPE_DEVICE && (link->leafmtype_mpi != link->leafmtype) && sf->leafbuflen[PETSCSF_REMOTE]) {
3517fd2d3dbSJunchao Zhang     void  *h_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
3527fd2d3dbSJunchao Zhang     void  *d_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE];
3537fd2d3dbSJunchao Zhang     size_t count = sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes;
3547fd2d3dbSJunchao Zhang     if (device2host) {
355*20c24465SJunchao Zhang       ierr = (*link->Memcpy)(link,PETSC_MEMTYPE_HOST,h_buf,PETSC_MEMTYPE_DEVICE,d_buf,count);CHKERRQ(ierr);
3567fd2d3dbSJunchao Zhang       ierr = PetscLogGpuToCpu(count);CHKERRQ(ierr);
3577fd2d3dbSJunchao Zhang     } else {
358*20c24465SJunchao Zhang       ierr = (*link->Memcpy)(link,PETSC_MEMTYPE_DEVICE,d_buf,PETSC_MEMTYPE_HOST,h_buf,count);CHKERRQ(ierr);
3597fd2d3dbSJunchao Zhang       ierr = PetscLogCpuToGpu(count);CHKERRQ(ierr);
3607fd2d3dbSJunchao Zhang     }
3617fd2d3dbSJunchao Zhang   }
3627fd2d3dbSJunchao Zhang   PetscFunctionReturn(0);
3637fd2d3dbSJunchao Zhang }
3647fd2d3dbSJunchao Zhang 
3657fd2d3dbSJunchao Zhang #else /* Host only */
/* Host-only stubs: each routine expands to 0 (success) and ignores its arguments */
#define PetscSFLinkSyncDeviceBeforePackData(sf,link)                             0
#define PetscSFLinkSyncStreamAfterPackRootData(sf,link)                          0
#define PetscSFLinkSyncStreamAfterPackLeafData(sf,link)                          0
#define PetscSFLinkSyncStreamAfterUnpackRootData(sf,link)                        0
#define PetscSFLinkSyncStreamAfterUnpackLeafData(sf,link)                        0
#define PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,device2host)    0
#define PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,device2host)    0
3737fd2d3dbSJunchao Zhang #endif
374cd620004SJunchao Zhang 
/* Get root indices used for pack/unpack

Input arguments:
  +sf    - StarForest
  .link  - The link, which provides the stream for the async memcpy (In SF, we make all GPU operations asynchronous to avoid unexpected pipeline stalls)
  .mtype - In what type of memory? (PETSC_MEMTYPE_DEVICE or PETSC_MEMTYPE_HOST)
  -scope - Which part of the indices? (PETSCSF_LOCAL or PETSCSF_REMOTE)

 Output arguments:
  +count   - Count of indices
  .start   - The first index (only useful when indices is NULL)
  .opt     - Pack optimization for the indices. NULL when indices are contiguous or when there is no optimization
  -indices - indices of roots for pack/unpack. NULL means indices are contiguous, or (on device) that opt must be used instead
 */
388fcc7397dSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkGetRootPackOptAndIndices(PetscSF sf,PetscSFLink link,PetscMemType mtype,PetscSFScope scope,PetscInt *count,PetscInt *start,PetscSFPackOpt *opt,const PetscInt **indices)
389b7c0d12aSJunchao Zhang {
390cd620004SJunchao Zhang   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
391cd620004SJunchao Zhang   PetscInt       offset;
392b7c0d12aSJunchao Zhang 
393b7c0d12aSJunchao Zhang   PetscFunctionBegin;
394fcc7397dSJunchao Zhang   *count   = bas->rootbuflen[scope];
395fcc7397dSJunchao Zhang   *start   = bas->rootstart[scope];
396fcc7397dSJunchao Zhang   *opt     = NULL;
397fcc7397dSJunchao Zhang   *indices = NULL;
398fcc7397dSJunchao Zhang 
399fcc7397dSJunchao Zhang   /* We have these rules:
400fcc7397dSJunchao Zhang     1) opt == NULL && indices == NULL ==> indices are contiguous.
401fcc7397dSJunchao Zhang     2) opt != NULL ==> indices are in 3D but not contiguous. On host, indices != NULL since indices are already available and we do not
402fcc7397dSJunchao Zhang        want to enforce all operations to use opt; but on device, indices = NULL since we do not want to copy indices to device.
403fcc7397dSJunchao Zhang   */
404fcc7397dSJunchao Zhang   if (!bas->rootcontig[scope]) {
405cd620004SJunchao Zhang     offset = (scope == PETSCSF_LOCAL)? 0 : bas->ioffset[bas->ndiranks];
406fcc7397dSJunchao Zhang     if (mtype == PETSC_MEMTYPE_HOST) {*opt = bas->rootpackopt[scope]; *indices = bas->irootloc + offset;}
4077fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
408cd620004SJunchao Zhang     else {
409fcc7397dSJunchao Zhang       PetscErrorCode ierr;
410fcc7397dSJunchao Zhang       size_t         size;
411fcc7397dSJunchao Zhang       if (bas->rootpackopt[scope]) {
412fcc7397dSJunchao Zhang         if (!bas->rootpackopt_d[scope]) {
413fcc7397dSJunchao Zhang           ierr = PetscMalloc1(1,&bas->rootpackopt_d[scope]);CHKERRQ(ierr);
414fcc7397dSJunchao Zhang           ierr = PetscArraycpy(bas->rootpackopt_d[scope],bas->rootpackopt[scope],1);CHKERRQ(ierr); /* Make pointers in bas->rootpackopt_d[] still work on host */
415fcc7397dSJunchao Zhang           size = (bas->rootpackopt[scope]->n*7+2)*sizeof(PetscInt); /* See comments at struct _n_PetscSFPackOpt*/
416*20c24465SJunchao Zhang           ierr = PetscSFMalloc(sf,PETSC_MEMTYPE_DEVICE,size,(void **)&bas->rootpackopt_d[scope]->array);CHKERRQ(ierr);
417*20c24465SJunchao Zhang           ierr = (*link->Memcpy)(link,PETSC_MEMTYPE_DEVICE,bas->rootpackopt_d[scope]->array,PETSC_MEMTYPE_HOST,bas->rootpackopt[scope]->array,size);CHKERRQ(ierr);
418fcc7397dSJunchao Zhang         }
419fcc7397dSJunchao Zhang         *opt = bas->rootpackopt_d[scope];
420fcc7397dSJunchao Zhang       } else { /* On device, we only provide indices when there is no optimization. We're reluctant to copy indices to device. */
421fcc7397dSJunchao Zhang         if (!bas->irootloc_d[scope]) {
422fcc7397dSJunchao Zhang           size = bas->rootbuflen[scope]*sizeof(PetscInt);
423*20c24465SJunchao Zhang           ierr = PetscSFMalloc(sf,PETSC_MEMTYPE_DEVICE,size,(void **)&bas->irootloc_d[scope]);CHKERRQ(ierr);
424*20c24465SJunchao Zhang           ierr = (*link->Memcpy)(link,PETSC_MEMTYPE_DEVICE,bas->irootloc_d[scope],PETSC_MEMTYPE_HOST,bas->irootloc+offset,size);CHKERRQ(ierr);
425b7c0d12aSJunchao Zhang         }
426cd620004SJunchao Zhang         *indices = bas->irootloc_d[scope];
427cd620004SJunchao Zhang       }
428cd620004SJunchao Zhang     }
429fcc7397dSJunchao Zhang #endif
430cd620004SJunchao Zhang   }
431b7c0d12aSJunchao Zhang   PetscFunctionReturn(0);
432b7c0d12aSJunchao Zhang }
433b7c0d12aSJunchao Zhang 
/* Get leaf indices used for pack/unpack

  See also PetscSFLinkGetRootPackOptAndIndices()
 */
438fcc7397dSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkGetLeafPackOptAndIndices(PetscSF sf,PetscSFLink link,PetscMemType mtype,PetscSFScope scope,PetscInt *count,PetscInt *start,PetscSFPackOpt *opt,const PetscInt **indices)
439cd620004SJunchao Zhang {
440cd620004SJunchao Zhang   PetscInt   offset;
441cd620004SJunchao Zhang 
442cd620004SJunchao Zhang   PetscFunctionBegin;
443fcc7397dSJunchao Zhang   *count   = sf->leafbuflen[scope];
444fcc7397dSJunchao Zhang   *start   = sf->leafstart[scope];
445fcc7397dSJunchao Zhang   *opt     = NULL;
446fcc7397dSJunchao Zhang   *indices = NULL;
447fcc7397dSJunchao Zhang   if (!sf->leafcontig[scope]) {
448cd620004SJunchao Zhang     offset = (scope == PETSCSF_LOCAL)? 0 : sf->roffset[sf->ndranks];
449fcc7397dSJunchao Zhang     if (mtype == PETSC_MEMTYPE_HOST) {*opt = sf->leafpackopt[scope]; *indices = sf->rmine + offset;}
4507fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
451cd620004SJunchao Zhang     else {
452fcc7397dSJunchao Zhang       PetscErrorCode ierr;
453fcc7397dSJunchao Zhang       size_t         size;
454fcc7397dSJunchao Zhang       if (sf->leafpackopt[scope]) {
455fcc7397dSJunchao Zhang         if (!sf->leafpackopt_d[scope]) {
456fcc7397dSJunchao Zhang           ierr = PetscMalloc1(1,&sf->leafpackopt_d[scope]);CHKERRQ(ierr);
457fcc7397dSJunchao Zhang           ierr = PetscArraycpy(sf->leafpackopt_d[scope],sf->leafpackopt[scope],1);CHKERRQ(ierr);
458fcc7397dSJunchao Zhang           size = (sf->leafpackopt[scope]->n*7+2)*sizeof(PetscInt); /* See comments at struct _n_PetscSFPackOpt*/
459*20c24465SJunchao Zhang           ierr = PetscSFMalloc(sf,PETSC_MEMTYPE_DEVICE,size,(void **)&sf->leafpackopt_d[scope]->array);CHKERRQ(ierr); /* Change ->array to a device pointer */
460*20c24465SJunchao Zhang           ierr = (*link->Memcpy)(link,PETSC_MEMTYPE_DEVICE,sf->leafpackopt_d[scope]->array,PETSC_MEMTYPE_HOST,sf->leafpackopt[scope]->array,size);CHKERRQ(ierr);
461fcc7397dSJunchao Zhang         }
462fcc7397dSJunchao Zhang         *opt = sf->leafpackopt_d[scope];
463fcc7397dSJunchao Zhang       } else {
464fcc7397dSJunchao Zhang         if (!sf->rmine_d[scope]) {
465fcc7397dSJunchao Zhang           size = sf->leafbuflen[scope]*sizeof(PetscInt);
466*20c24465SJunchao Zhang           ierr = PetscSFMalloc(sf,PETSC_MEMTYPE_DEVICE,size,(void **)&sf->rmine_d[scope]);CHKERRQ(ierr);
467*20c24465SJunchao Zhang           ierr = (*link->Memcpy)(link,PETSC_MEMTYPE_DEVICE,sf->rmine_d[scope],PETSC_MEMTYPE_HOST,sf->rmine+offset,size);CHKERRQ(ierr);
468cd620004SJunchao Zhang         }
469cd620004SJunchao Zhang         *indices = sf->rmine_d[scope];
470cd620004SJunchao Zhang       }
471cd620004SJunchao Zhang     }
472fcc7397dSJunchao Zhang #endif
473cd620004SJunchao Zhang   }
474cd620004SJunchao Zhang   PetscFunctionReturn(0);
475cd620004SJunchao Zhang }
47640e23c03SJunchao Zhang #endif
477