xref: /petsc/src/vec/is/sf/impls/basic/sfpack.h (revision 7fd2d3dbf8105d3f2e002a5ca11f019cd0ad7420)
140e23c03SJunchao Zhang #if !defined(__SFPACK_H)
240e23c03SJunchao Zhang #define __SFPACK_H
340e23c03SJunchao Zhang 
4cd620004SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfbasic.h>
5*7fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_CUDA)
6*7fd2d3dbSJunchao Zhang   #include <cuda_runtime.h> /* For cudaStream_t */
7*7fd2d3dbSJunchao Zhang   #include <petsccublas.h>  /* For CHKERRCUDA */
8*7fd2d3dbSJunchao Zhang #endif
9*7fd2d3dbSJunchao Zhang 
10*7fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_HIP)
11*7fd2d3dbSJunchao Zhang   #include <hip/hip_runtime.h>  /* For hipStream_t */
12*7fd2d3dbSJunchao Zhang #endif
13cd620004SJunchao Zhang 
/* SFBasic and SFNeighbor split each SF communication into two scopes:
   local (self, intra-rank) and remote (inter-rank). */
typedef enum {
  PETSCSF_LOCAL = 0,
  PETSCSF_REMOTE
} PetscSFScope;
1640e23c03SJunchao Zhang 
17fcc7397dSJunchao Zhang /* Optimizations in packing & unpacking for destination ranks.
1840e23c03SJunchao Zhang 
19fcc7397dSJunchao Zhang   Suppose there are m indices stored in idx[], and two addresses u, p. We want to do packing:
20fcc7397dSJunchao Zhang      p[i] = u[idx[i]], for i in [0,m)
2140e23c03SJunchao Zhang 
22fcc7397dSJunchao Zhang   Indices are associated with n ranks and each rank's indices are stored consecutively in idx[].
23fcc7397dSJunchao Zhang   We go through indices for each rank and see if they are indices of a 3D submatrix of size [dx,dy,dz] in
24fcc7397dSJunchao Zhang   a parent matrix of size [X,Y,Z], with the submatrix's first index being <start>.
25cd620004SJunchao Zhang 
26fcc7397dSJunchao Zhang   E.g., for indices 1,2,3, 6,7,8, 11,12,13, the submatrix size is [3,3,1] with start=1, and the parent matrix's size
27fcc7397dSJunchao Zhang   is [5,3,1]. For simplicity, if any destination rank does not have this pattern, we give up the optimization.
28fcc7397dSJunchao Zhang 
29fcc7397dSJunchao Zhang   Note before using this per-rank optimization, one should check leafcontig[], rootcontig[], which say
30fcc7397dSJunchao Zhang   indices in whole are contiguous, and therefore much more useful than this one when true.
3140e23c03SJunchao Zhang  */
3240e23c03SJunchao Zhang struct _n_PetscSFPackOpt {
33fcc7397dSJunchao Zhang   PetscInt       *array;      /* [7*n+2] Memory pool for other fields in this struct. Used to easily copy this struct to GPU */
34b23bfdefSJunchao Zhang   PetscInt       n;           /* Number of destination ranks */
35fcc7397dSJunchao Zhang   PetscInt       *offset;     /* [n+1] Offsets of indices for each rank. offset[0]=0, offset[i+1]=offset[i]+dx[i]*dy[i]*dz[i] */
36fcc7397dSJunchao Zhang   PetscInt       *start;      /* [n] First index */
37fcc7397dSJunchao Zhang   PetscInt       *dx,*dy,*dz; /* [n] Lengths of the submatrix in X, Y, Z dimension. */
38fcc7397dSJunchao Zhang   PetscInt       *X,*Y;       /* [n] Lengths of the outer matrix in X, Y. We do not care Z. */
3940e23c03SJunchao Zhang };
4040e23c03SJunchao Zhang 
41eb02082bSJunchao Zhang /* An abstract class that defines a communication link, which includes how to pack/unpack data and send/recv buffers
4240e23c03SJunchao Zhang  */
43fcc7397dSJunchao Zhang struct _n_PetscSFLink {
44fcc7397dSJunchao Zhang   PetscErrorCode (*h_Pack)            (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*);
45fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndInsert) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
46fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndAdd)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
47fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndMin)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
48fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndMax)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
49fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndMinloc) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
50fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndMaxloc) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
51fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndMult)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
52fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndLAND)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
53fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndBAND)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
54fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndLOR)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
55fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndBOR)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
56fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndLXOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
57fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndBXOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
58fcc7397dSJunchao Zhang   PetscErrorCode (*h_FetchAndAdd)     (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,      void*);
59fcc7397dSJunchao Zhang 
60fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndInsert)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
61fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndAdd)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
62fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndMin)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
63fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndMax)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
64fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndMinloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
65fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndMaxloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
66fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndMult)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
67fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndLAND)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
68fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndBAND)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
69fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndLOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
70fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndBOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
71fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndLXOR)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
72fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndBXOR)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
73fcc7397dSJunchao Zhang 
74fcc7397dSJunchao Zhang   PetscErrorCode (*h_FetchAndAddLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*);
75cd620004SJunchao Zhang 
76cd620004SJunchao Zhang   PetscBool      deviceinited;        /* Are device related fields initialized? */
77*7fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
78eb02082bSJunchao Zhang   /* These fields are lazily initialized in a sense that only when device pointers are passed to an SF, the SF
79*7fd2d3dbSJunchao Zhang      will set them, otherwise it just leaves them alone. Packing routines using regular ops when there are no data race chances.
80eb02082bSJunchao Zhang   */
81fcc7397dSJunchao Zhang   PetscErrorCode (*d_Pack)            (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*);
82fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndInsert) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
83fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndAdd)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
84fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndMin)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
85fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndMax)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
86fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndMinloc) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
87fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndMaxloc) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
88fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndMult)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
89fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndLAND)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
90fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndBAND)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
91fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndLOR)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
92fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndBOR)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
93fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndLXOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
94fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndBXOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
95fcc7397dSJunchao Zhang   PetscErrorCode (*d_FetchAndAdd)     (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,      void*);
96eb02082bSJunchao Zhang 
97fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndInsert)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
98fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndAdd)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
99fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndMin)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
100fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndMax)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
101fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndMinloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
102fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndMaxloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
103fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndMult)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
104fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndLAND)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
105fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndBAND)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
106fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndLOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
107fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndBOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
108fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndLXOR)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
109fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndBXOR)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
110fcc7397dSJunchao Zhang   PetscErrorCode (*d_FetchAndAddLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*);
111eb02082bSJunchao Zhang 
112eb02082bSJunchao Zhang   /* Packing routines using atomics when there are data race chances */
113fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndInsert)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
114fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndAdd)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
115fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndMin)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
116fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndMax)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
117fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndMinloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
118fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndMaxloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
119fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndMult)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
120fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndLAND)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
121fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndBAND)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
122fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndLOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
123fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndBOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
124fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndLXOR)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
125fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndBXOR)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
126fcc7397dSJunchao Zhang   PetscErrorCode (*da_FetchAndAdd)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,      void*);
127cd620004SJunchao Zhang 
128fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndInsert)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
129fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndAdd)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
130fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndMin)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
131fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndMax)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
132fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndMinloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
133fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndMaxloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
134fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndMult)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
135fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndLAND)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
136fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndBAND)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
137fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndLOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
138fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndBOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
139fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndLXOR)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
140fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndBXOR)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
141fcc7397dSJunchao Zhang   PetscErrorCode (*da_FetchAndAddLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*);
142*7fd2d3dbSJunchao Zhang #if defined (PETSC_HAVE_CUDA)
143e315309dSJunchao Zhang   PetscInt       maxResidentThreadsPerGPU;   /* It is a copy from SF for convenience */
144eb02082bSJunchao Zhang   cudaStream_t   stream;                     /* Stream to launch pack/unapck kernels if not using the default stream */
145*7fd2d3dbSJunchao Zhang #elif defined (PETSC_HAVE_HIP)
146*7fd2d3dbSJunchao Zhang   hipStream_t    stream;
147eb02082bSJunchao Zhang #endif
148*7fd2d3dbSJunchao Zhang 
149*7fd2d3dbSJunchao Zhang   PetscErrorCode (*Destroy)(PetscSFLink);    /* Device specific destroy function */
150*7fd2d3dbSJunchao Zhang   void           *sptr;
151*7fd2d3dbSJunchao Zhang #endif
152*7fd2d3dbSJunchao Zhang 
153eb02082bSJunchao Zhang   PetscMPIInt  tag;                          /* Each link has a tag so we can perform multiple SF ops at the same time */
154cd620004SJunchao Zhang   MPI_Datatype unit;                         /* The MPI datatype this PetscSFLink is built for */
155eb02082bSJunchao Zhang   MPI_Datatype basicunit;                    /* unit is made of MPI builtin dataype basicunit */
156e07844bfSJunchao Zhang   PetscBool    isbuiltin;                    /* Is unit an MPI/PETSc builtin datatype? If it is true, then bs=1 and basicunit is equivalent to unit */
157eb02082bSJunchao Zhang   size_t       unitbytes;                    /* Number of bytes in a unit */
158eb02082bSJunchao Zhang   PetscInt     bs;                           /* Number of basic units in a unit */
159cd620004SJunchao Zhang   const void   *rootdata,*leafdata;          /* rootdata and leafdata the link is working on. They are used as keys for pending links. */
160cd620004SJunchao Zhang   PetscMemType rootmtype,leafmtype;          /* root/leafdata's memory type */
161cd620004SJunchao Zhang 
162cd620004SJunchao Zhang   /* For local and remote communication */
163cd620004SJunchao Zhang   PetscMemType rootmtype_mpi,leafmtype_mpi;  /* Mtypes of buffers passed to MPI. If use_gpu_aware_mpi, they are same as root/leafmtype. Otherwise they are PETSC_MEMTYPE_HOST */
164cd620004SJunchao Zhang   PetscBool    rootdirect[2],leafdirect[2];  /* Can root/leafdata be directly passed to SF (i.e., without buffering). In layout of [PETSCSF_LOCAL/REMOTE]. See more in PetscSFLinkCreate() */
165cd620004SJunchao Zhang   PetscInt     rootdirect_mpi,leafdirect_mpi;/* Can root/leafdata for remote be directly passed to MPI? 1: yes, 0: no. See more in PetscSFLinkCreate() */
166cd620004SJunchao Zhang   const void   *rootdatadirect[2][2];        /* The root/leafdata used to init root/leaf requests, in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE]. */
167cd620004SJunchao Zhang   const void   *leafdatadirect[2][2];        /* ... We need them to look up links when root/leafdirect_mpi are true */
168cd620004SJunchao Zhang   char         *rootbuf[2][2];               /* Buffers for packed roots, in layout of [PETSCSF_LOCAL/REMOTE][PETSC_MEMTYPE] */
169cd620004SJunchao Zhang   char         *rootbuf_alloc[2][2];         /* Log memory allocated by petsc. We need it since rootbuf[][] may point to rootdata given by user */
170cd620004SJunchao Zhang   char         *leafbuf[2][2];               /* Buffers for packed leaves, in layout of [PETSCSF_LOCAL/REMOTE][PETSC_MEMTYPE] */
171cd620004SJunchao Zhang   char         *leafbuf_alloc[2][2];
172cd620004SJunchao Zhang   MPI_Request  *rootreqs[2][2][2];           /* Root requests in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][rootdirect_mpi] */
173cd620004SJunchao Zhang   MPI_Request  *leafreqs[2][2][2];           /* Leaf requests in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][leafdirect_mpi] */
174cd620004SJunchao Zhang   PetscBool    rootreqsinited[2][2][2];      /* Are root requests initialized? Also in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][rootdirect_mpi]*/
175cd620004SJunchao Zhang   PetscBool    leafreqsinited[2][2][2];      /* Are leaf requests initialized? Also in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][leafdirect_mpi]*/
176cd620004SJunchao Zhang   MPI_Request  *reqs;                        /* An array of length (nrootreqs+nleafreqs)*8. Pointers in rootreqs[][][] and leafreqs[][][] point here */
177cd620004SJunchao Zhang   PetscSFLink  next;
17840e23c03SJunchao Zhang };
17940e23c03SJunchao Zhang 
180cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF,MPI_Datatype,const void*,const void*);
181b7c0d12aSJunchao Zhang 
182cd620004SJunchao Zhang /* Create/setup/retrieve/destroy a link */
183cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkCreate(PetscSF,MPI_Datatype,PetscMemType,const void*,PetscMemType,const void*,MPI_Op,PetscSFOperation,PetscSFLink*);
184cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_Host(PetscSF,PetscSFLink,MPI_Datatype);
185cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetInUse(PetscSF,MPI_Datatype,const void*,const void*,PetscCopyMode,PetscSFLink*);
186cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkReclaim(PetscSF,PetscSFLink*);
187cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkDestroy(PetscSF,PetscSFLink*);
188cd620004SJunchao Zhang 
189cd620004SJunchao Zhang /* Get pack/unpack function pointers from a link */
190fcc7397dSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkGetPack(PetscSFLink link,PetscMemType mtype,PetscErrorCode (**Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*))
191eb02082bSJunchao Zhang {
192eb02082bSJunchao Zhang   PetscFunctionBegin;
193eb02082bSJunchao Zhang   if (mtype == PETSC_MEMTYPE_HOST) *Pack = link->h_Pack;
194*7fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
195cd620004SJunchao Zhang   else *Pack = link->d_Pack;
196eb02082bSJunchao Zhang #endif
197eb02082bSJunchao Zhang   PetscFunctionReturn(0);
198eb02082bSJunchao Zhang }
199*7fd2d3dbSJunchao Zhang 
200*7fd2d3dbSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkMPIWaitall(PetscSF sf,PetscSFLink link,PetscSFDirection direction)
201*7fd2d3dbSJunchao Zhang {
202*7fd2d3dbSJunchao Zhang   PetscErrorCode       ierr;
203*7fd2d3dbSJunchao Zhang   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
204*7fd2d3dbSJunchao Zhang   const PetscMemType   rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi;
205*7fd2d3dbSJunchao Zhang   const PetscInt       rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi;
206*7fd2d3dbSJunchao Zhang 
207*7fd2d3dbSJunchao Zhang   PetscFunctionBegin;
208*7fd2d3dbSJunchao Zhang   ierr = MPI_Waitall(bas->nrootreqs,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi],MPI_STATUSES_IGNORE);CHKERRQ(ierr);
209*7fd2d3dbSJunchao Zhang   ierr = MPI_Waitall(sf->nleafreqs, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi],MPI_STATUSES_IGNORE);CHKERRQ(ierr);
210*7fd2d3dbSJunchao Zhang   PetscFunctionReturn(0);
211*7fd2d3dbSJunchao Zhang }
212*7fd2d3dbSJunchao Zhang 
213fcc7397dSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*));
214fcc7397dSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetFetchAndOp (PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**FetchAndOp) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*));
215fcc7397dSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*));
216fcc7397dSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*));
217cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF,PetscSFLink,PetscSFDirection,void**,void**,MPI_Request**,MPI_Request**);
218b7c0d12aSJunchao Zhang 
219cd620004SJunchao Zhang /* Do Pack/Unpack/Fetch/Scatter with the link */
220cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkPackRootData  (PetscSF,PetscSFLink,PetscSFScope,const void*);
221cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkPackLeafData  (PetscSF,PetscSFLink,PetscSFScope,const void*);
222cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkUnpackRootData(PetscSF,PetscSFLink,PetscSFScope,void*,MPI_Op);
223cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF,PetscSFLink,PetscSFScope,void*,MPI_Op);
224cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkFetchRootData (PetscSF,PetscSFLink,PetscSFScope,void*,MPI_Op);
225cd620004SJunchao Zhang 
226cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkBcastAndOpLocal(PetscSF,PetscSFLink,const void*,void*,MPI_Op);
227cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkReduceLocal(PetscSF,PetscSFLink,const void*,void*,MPI_Op);
228cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF,PetscSFLink,void*,const void*,void*,MPI_Op);
229cd620004SJunchao Zhang 
230*7fd2d3dbSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFSetUpPackFields(PetscSF);
231*7fd2d3dbSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFResetPackFields(PetscSF);
232*7fd2d3dbSJunchao Zhang 
233*7fd2d3dbSJunchao Zhang /* A set of helper routines for Pack/Unpack/Scatter on GPUs */
234*7fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
235*7fd2d3dbSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_Device(PetscSF,PetscSFLink,MPI_Datatype);
236*7fd2d3dbSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkSyncDevice(PetscSF,PetscSFLink);
237*7fd2d3dbSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkSyncStream(PetscSF,PetscSFLink);
238*7fd2d3dbSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkMemcpy(PetscSF,PetscSFLink,PetscMemType,void*,PetscMemType,const void*,size_t);
239*7fd2d3dbSJunchao Zhang 
240*7fd2d3dbSJunchao Zhang /* If SF does not know which stream root/leafdata is being computed on, it has to sync the device to
241*7fd2d3dbSJunchao Zhang    make sure the data is ready for packing.
242*7fd2d3dbSJunchao Zhang  */
243*7fd2d3dbSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncDeviceBeforePackData(PetscSF sf,PetscSFLink link)
244*7fd2d3dbSJunchao Zhang {
245*7fd2d3dbSJunchao Zhang   PetscErrorCode ierr;
246*7fd2d3dbSJunchao Zhang   PetscFunctionBegin;
247*7fd2d3dbSJunchao Zhang   if (sf->use_default_stream) PetscFunctionReturn(0);
248*7fd2d3dbSJunchao Zhang   if (link->rootmtype == PETSC_MEMTYPE_DEVICE || link->leafmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscSFLinkSyncDevice(sf,link);CHKERRQ(ierr);}
249*7fd2d3dbSJunchao Zhang   PetscFunctionReturn(0);
250*7fd2d3dbSJunchao Zhang }
251*7fd2d3dbSJunchao Zhang 
252*7fd2d3dbSJunchao Zhang /* PetscSFLinkSyncStreamAfterPackXxxData routines make sure root/leafbuf for the remote is ready for MPI */
253*7fd2d3dbSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterPackRootData(PetscSF sf,PetscSFLink link)
254*7fd2d3dbSJunchao Zhang {
255*7fd2d3dbSJunchao Zhang   PetscErrorCode ierr;
256*7fd2d3dbSJunchao Zhang   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
257*7fd2d3dbSJunchao Zhang 
258*7fd2d3dbSJunchao Zhang   PetscFunctionBegin;
259*7fd2d3dbSJunchao Zhang   /* Do nothing if we use stream aware mpi || has nothing for remote */
260*7fd2d3dbSJunchao Zhang   if (sf->use_stream_aware_mpi || link->rootmtype != PETSC_MEMTYPE_DEVICE || !bas->rootbuflen[PETSCSF_REMOTE]) PetscFunctionReturn(0);
261*7fd2d3dbSJunchao Zhang   /* If we called a packing kernel || we async-copied rootdata from device to host || No cudaDeviceSynchronize was called (since default stream is assumed) */
262*7fd2d3dbSJunchao Zhang   if (!link->rootdirect[PETSCSF_REMOTE] || !sf->use_gpu_aware_mpi || sf->use_default_stream) {ierr = PetscSFLinkSyncStream(sf,link);CHKERRQ(ierr);}
263*7fd2d3dbSJunchao Zhang   PetscFunctionReturn(0);
264*7fd2d3dbSJunchao Zhang }
265*7fd2d3dbSJunchao Zhang 
266*7fd2d3dbSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterPackLeafData(PetscSF sf,PetscSFLink link)
267*7fd2d3dbSJunchao Zhang {
268*7fd2d3dbSJunchao Zhang   PetscErrorCode ierr;
269*7fd2d3dbSJunchao Zhang   PetscFunctionBegin;
270*7fd2d3dbSJunchao Zhang   /* See comments above */
271*7fd2d3dbSJunchao Zhang   if (sf->use_stream_aware_mpi || link->leafmtype != PETSC_MEMTYPE_DEVICE || !sf->leafbuflen[PETSCSF_REMOTE]) PetscFunctionReturn(0);
272*7fd2d3dbSJunchao Zhang   if (!link->leafdirect[PETSCSF_REMOTE] || !sf->use_gpu_aware_mpi || sf->use_default_stream) {ierr = PetscSFLinkSyncStream(sf,link);CHKERRQ(ierr);}
273*7fd2d3dbSJunchao Zhang   PetscFunctionReturn(0);
274*7fd2d3dbSJunchao Zhang }
275*7fd2d3dbSJunchao Zhang 
276*7fd2d3dbSJunchao Zhang /* PetscSFLinkSyncStreamAfterUnpackXxx routines make sure root/leafdata (local & remote) is ready to use for SF callers, when SF
277*7fd2d3dbSJunchao Zhang    does not know which stream the callers will use.
278*7fd2d3dbSJunchao Zhang */
279*7fd2d3dbSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterUnpackRootData(PetscSF sf,PetscSFLink link)
280*7fd2d3dbSJunchao Zhang {
281*7fd2d3dbSJunchao Zhang   PetscErrorCode ierr;
282*7fd2d3dbSJunchao Zhang   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
283*7fd2d3dbSJunchao Zhang   PetscBool      host2host = (link->rootmtype == PETSC_MEMTYPE_HOST) && (link->leafmtype == PETSC_MEMTYPE_HOST) ? PETSC_TRUE : PETSC_FALSE;
284*7fd2d3dbSJunchao Zhang 
285*7fd2d3dbSJunchao Zhang   PetscFunctionBegin;
286*7fd2d3dbSJunchao Zhang   /* Do nothing if host2host OR we are allowed to asynchronously put rootdata on device through the default stream */
287*7fd2d3dbSJunchao Zhang   if (host2host || (link->rootmtype == PETSC_MEMTYPE_DEVICE && sf->use_default_stream)) PetscFunctionReturn(0);
288*7fd2d3dbSJunchao Zhang 
289*7fd2d3dbSJunchao Zhang   /* If rootmtype is HOST or DEVICE:
290*7fd2d3dbSJunchao Zhang      If we have data from local, then we called a scatter kernel (on link->stream), then we must sync it;
291*7fd2d3dbSJunchao Zhang      If we have data from remote && no rootdirect(i.e., we called an unpack kernel), then we must also sycn it (if rootdirect,
292*7fd2d3dbSJunchao Zhang      i.e., no unpack kernel after MPI, MPI guarentees rootbuf is ready to use so that we do not need the sync).
293*7fd2d3dbSJunchao Zhang 
294*7fd2d3dbSJunchao Zhang      Note a tricky case is when leafmtype=DEVICE, rootmtype=HOST on uni-processor, we must sync the stream otherwise
295*7fd2d3dbSJunchao Zhang      CPU thread might use the yet-to-be-updated rootdata pending in the stream.
296*7fd2d3dbSJunchao Zhang    */
297*7fd2d3dbSJunchao Zhang   if (bas->rootbuflen[PETSCSF_LOCAL] || (bas->rootbuflen[PETSCSF_REMOTE] && !link->rootdirect[PETSCSF_REMOTE])) {ierr = PetscSFLinkSyncStream(sf,link);CHKERRQ(ierr);}
298*7fd2d3dbSJunchao Zhang   PetscFunctionReturn(0);
299*7fd2d3dbSJunchao Zhang }
300*7fd2d3dbSJunchao Zhang 
301*7fd2d3dbSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterUnpackLeafData(PetscSF sf,PetscSFLink link)
302*7fd2d3dbSJunchao Zhang {
303*7fd2d3dbSJunchao Zhang   PetscErrorCode ierr;
304*7fd2d3dbSJunchao Zhang   PetscBool      host2host = (link->rootmtype == PETSC_MEMTYPE_HOST) && (link->leafmtype == PETSC_MEMTYPE_HOST) ? PETSC_TRUE : PETSC_FALSE;
305*7fd2d3dbSJunchao Zhang 
306*7fd2d3dbSJunchao Zhang   PetscFunctionBegin;
307*7fd2d3dbSJunchao Zhang   /* See comments in PetscSFLinkSyncStreamAfterUnpackRootData*/
308*7fd2d3dbSJunchao Zhang   if (host2host || (link->leafmtype == PETSC_MEMTYPE_DEVICE && sf->use_default_stream)) PetscFunctionReturn(0);
309*7fd2d3dbSJunchao Zhang   if (sf->leafbuflen[PETSCSF_LOCAL] || (sf->leafbuflen[PETSCSF_REMOTE] && !link->leafdirect[PETSCSF_REMOTE])) {ierr = PetscSFLinkSyncStream(sf,link);CHKERRQ(ierr);}
310*7fd2d3dbSJunchao Zhang   PetscFunctionReturn(0);
311*7fd2d3dbSJunchao Zhang }
312*7fd2d3dbSJunchao Zhang 
313*7fd2d3dbSJunchao Zhang /* PetscSFLinkCopyXxxxBufferInCaseNotUseGpuAwareMPI routines are simple: if not use_gpu_aware_mpi, we need
314*7fd2d3dbSJunchao Zhang    to copy the buffer from GPU to CPU before MPI calls, and from CPU to GPU after MPI calls.
315*7fd2d3dbSJunchao Zhang */
316*7fd2d3dbSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(PetscSF sf,PetscSFLink link,PetscBool device2host)
317*7fd2d3dbSJunchao Zhang {
318*7fd2d3dbSJunchao Zhang   PetscErrorCode ierr;
319*7fd2d3dbSJunchao Zhang   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
320*7fd2d3dbSJunchao Zhang 
321*7fd2d3dbSJunchao Zhang   PetscFunctionBegin;
322*7fd2d3dbSJunchao Zhang   if (link->rootmtype == PETSC_MEMTYPE_DEVICE && (link->rootmtype_mpi != link->rootmtype) && bas->rootbuflen[PETSCSF_REMOTE]) {
323*7fd2d3dbSJunchao Zhang     void  *h_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
324*7fd2d3dbSJunchao Zhang     void  *d_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE];
325*7fd2d3dbSJunchao Zhang     size_t count = bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes;
326*7fd2d3dbSJunchao Zhang     if (device2host) {
327*7fd2d3dbSJunchao Zhang       ierr = PetscSFLinkMemcpy(sf,link,PETSC_MEMTYPE_HOST,h_buf,PETSC_MEMTYPE_DEVICE,d_buf,count);CHKERRQ(ierr);
328*7fd2d3dbSJunchao Zhang       ierr = PetscLogGpuToCpu(count);CHKERRQ(ierr);
329*7fd2d3dbSJunchao Zhang     } else {
330*7fd2d3dbSJunchao Zhang       ierr = PetscSFLinkMemcpy(sf,link,PETSC_MEMTYPE_DEVICE,d_buf,PETSC_MEMTYPE_HOST,h_buf,count);CHKERRQ(ierr);
331*7fd2d3dbSJunchao Zhang       ierr = PetscLogCpuToGpu(count);CHKERRQ(ierr);
332*7fd2d3dbSJunchao Zhang     }
333*7fd2d3dbSJunchao Zhang   }
334*7fd2d3dbSJunchao Zhang   PetscFunctionReturn(0);
335*7fd2d3dbSJunchao Zhang }
336*7fd2d3dbSJunchao Zhang 
337*7fd2d3dbSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(PetscSF sf,PetscSFLink link,PetscBool device2host)
338*7fd2d3dbSJunchao Zhang {
339*7fd2d3dbSJunchao Zhang   PetscErrorCode ierr;
340*7fd2d3dbSJunchao Zhang 
341*7fd2d3dbSJunchao Zhang   PetscFunctionBegin;
342*7fd2d3dbSJunchao Zhang   if (link->leafmtype == PETSC_MEMTYPE_DEVICE && (link->leafmtype_mpi != link->leafmtype) && sf->leafbuflen[PETSCSF_REMOTE]) {
343*7fd2d3dbSJunchao Zhang     void  *h_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
344*7fd2d3dbSJunchao Zhang     void  *d_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE];
345*7fd2d3dbSJunchao Zhang     size_t count = sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes;
346*7fd2d3dbSJunchao Zhang     if (device2host) {
347*7fd2d3dbSJunchao Zhang       ierr = PetscSFLinkMemcpy(sf,link,PETSC_MEMTYPE_HOST,h_buf,PETSC_MEMTYPE_DEVICE,d_buf,count);CHKERRQ(ierr);
348*7fd2d3dbSJunchao Zhang       ierr = PetscLogGpuToCpu(count);CHKERRQ(ierr);
349*7fd2d3dbSJunchao Zhang     } else {
350*7fd2d3dbSJunchao Zhang       ierr = PetscSFLinkMemcpy(sf,link,PETSC_MEMTYPE_DEVICE,d_buf,PETSC_MEMTYPE_HOST,h_buf,count);CHKERRQ(ierr);
351*7fd2d3dbSJunchao Zhang       ierr = PetscLogCpuToGpu(count);CHKERRQ(ierr);
352*7fd2d3dbSJunchao Zhang     }
353*7fd2d3dbSJunchao Zhang   }
354*7fd2d3dbSJunchao Zhang   PetscFunctionReturn(0);
355*7fd2d3dbSJunchao Zhang }
356*7fd2d3dbSJunchao Zhang 
357*7fd2d3dbSJunchao Zhang #else /* Host only */
/* Host-only stand-ins for the device synchronization/copy helpers: each one expands to the constant 0, the same value
   the real routines hand to PetscFunctionReturn(). The macro arguments are not evaluated.
*/
#define PetscSFLinkSyncDeviceBeforePackData(sf,link)                                0
#define PetscSFLinkSyncStreamAfterPackRootData(sf,link)                             0
#define PetscSFLinkSyncStreamAfterPackLeafData(sf,link)                             0
#define PetscSFLinkSyncStreamAfterUnpackRootData(sf,link)                           0
#define PetscSFLinkSyncStreamAfterUnpackLeafData(sf,link)                           0
#define PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,device2host)       0
#define PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,device2host)       0
365*7fd2d3dbSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkMemcpy(PetscSF sf,PetscSFLink link,PetscMemType dstmtype,void* dst,PetscMemType srcmtype,const void*src,size_t n)
366*7fd2d3dbSJunchao Zhang {
367*7fd2d3dbSJunchao Zhang   PetscFunctionBegin;
368*7fd2d3dbSJunchao Zhang   if (n) {PetscErrorCode ierr = PetscMemcpy(dst,src,n);CHKERRQ(ierr);}
369*7fd2d3dbSJunchao Zhang   PetscFunctionReturn(0);
370*7fd2d3dbSJunchao Zhang }
371*7fd2d3dbSJunchao Zhang #endif
372cd620004SJunchao Zhang 
373cd620004SJunchao Zhang /* Get root indices used for pack/unpack
374cd620004SJunchao Zhang 
375cd620004SJunchao Zhang Input arguments:
376cd620004SJunchao Zhang   +sf    - StarForest
377cd620004SJunchao Zhang   .link  - The link, which provides the stream for the async memcpy (In SF, we make all GPU operations asynchronous to avoid unexpected pipeline stalls)
378cd620004SJunchao Zhang   .scope - Which part of the indices? (PETSCSF_LOCAL or PETSCSF_REMOTE)
379cd620004SJunchao Zhang   .mtype - In what type of memory? (PETSC_MEMTYPE_DEVICE or PETSC_MEMTYPE_HOST)
380cd620004SJunchao Zhang 
381cd620004SJunchao Zhang  Output arguments:
382cd620004SJunchao Zhang   +count   - Count of indices
383cd620004SJunchao Zhang   .start   - The first index (only useful when indices is NULL)
384cd620004SJunchao Zhang   -indices - indices of roots for pack/unpack. NULL means indices are contiguous
385cd620004SJunchao Zhang  */
386fcc7397dSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkGetRootPackOptAndIndices(PetscSF sf,PetscSFLink link,PetscMemType mtype,PetscSFScope scope,PetscInt *count,PetscInt *start,PetscSFPackOpt *opt,const PetscInt **indices)
387b7c0d12aSJunchao Zhang {
388cd620004SJunchao Zhang   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
389cd620004SJunchao Zhang   PetscInt       offset;
390b7c0d12aSJunchao Zhang 
391b7c0d12aSJunchao Zhang   PetscFunctionBegin;
392fcc7397dSJunchao Zhang   *count   = bas->rootbuflen[scope];
393fcc7397dSJunchao Zhang   *start   = bas->rootstart[scope];
394fcc7397dSJunchao Zhang   *opt     = NULL;
395fcc7397dSJunchao Zhang   *indices = NULL;
396fcc7397dSJunchao Zhang 
397fcc7397dSJunchao Zhang   /* We have these rules:
398fcc7397dSJunchao Zhang     1) opt == NULL && indices == NULL ==> indices are contiguous.
399fcc7397dSJunchao Zhang     2) opt != NULL ==> indices are in 3D but not contiguous. On host, indices != NULL since indices are already available and we do not
400fcc7397dSJunchao Zhang        want to enforce all operations to use opt; but on device, indices = NULL since we do not want to copy indices to device.
401fcc7397dSJunchao Zhang   */
402fcc7397dSJunchao Zhang   if (!bas->rootcontig[scope]) {
403cd620004SJunchao Zhang     offset = (scope == PETSCSF_LOCAL)? 0 : bas->ioffset[bas->ndiranks];
404fcc7397dSJunchao Zhang     if (mtype == PETSC_MEMTYPE_HOST) {*opt = bas->rootpackopt[scope]; *indices = bas->irootloc + offset;}
405*7fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
406cd620004SJunchao Zhang     else {
407fcc7397dSJunchao Zhang       PetscErrorCode ierr;
408fcc7397dSJunchao Zhang       size_t         size;
409fcc7397dSJunchao Zhang       if (bas->rootpackopt[scope]) {
410fcc7397dSJunchao Zhang         if (!bas->rootpackopt_d[scope]) {
411fcc7397dSJunchao Zhang           ierr = PetscMalloc1(1,&bas->rootpackopt_d[scope]);CHKERRQ(ierr);
412fcc7397dSJunchao Zhang           ierr = PetscArraycpy(bas->rootpackopt_d[scope],bas->rootpackopt[scope],1);CHKERRQ(ierr); /* Make pointers in bas->rootpackopt_d[] still work on host */
413fcc7397dSJunchao Zhang           size = (bas->rootpackopt[scope]->n*7+2)*sizeof(PetscInt); /* See comments at struct _n_PetscSFPackOpt*/
414*7fd2d3dbSJunchao Zhang           ierr = PetscSFMalloc(PETSC_MEMTYPE_DEVICE,size,(void **)&bas->rootpackopt_d[scope]->array);CHKERRQ(ierr);
415*7fd2d3dbSJunchao Zhang           ierr = PetscSFLinkMemcpy(sf,link,PETSC_MEMTYPE_DEVICE,bas->rootpackopt_d[scope]->array,PETSC_MEMTYPE_HOST,bas->rootpackopt[scope]->array,size);CHKERRQ(ierr);
416fcc7397dSJunchao Zhang         }
417fcc7397dSJunchao Zhang         *opt = bas->rootpackopt_d[scope];
418fcc7397dSJunchao Zhang       } else { /* On device, we only provide indices when there is no optimization. We're reluctant to copy indices to device. */
419fcc7397dSJunchao Zhang         if (!bas->irootloc_d[scope]) {
420fcc7397dSJunchao Zhang           size = bas->rootbuflen[scope]*sizeof(PetscInt);
421*7fd2d3dbSJunchao Zhang           ierr = PetscSFMalloc(PETSC_MEMTYPE_DEVICE,size,(void **)&bas->irootloc_d[scope]);CHKERRQ(ierr);
422*7fd2d3dbSJunchao Zhang           ierr = PetscSFLinkMemcpy(sf,link,PETSC_MEMTYPE_DEVICE,bas->irootloc_d[scope],PETSC_MEMTYPE_HOST,bas->irootloc+offset,size);CHKERRQ(ierr);
423b7c0d12aSJunchao Zhang         }
424cd620004SJunchao Zhang         *indices = bas->irootloc_d[scope];
425cd620004SJunchao Zhang       }
426cd620004SJunchao Zhang     }
427fcc7397dSJunchao Zhang #endif
428cd620004SJunchao Zhang   }
429b7c0d12aSJunchao Zhang   PetscFunctionReturn(0);
430b7c0d12aSJunchao Zhang }
431b7c0d12aSJunchao Zhang 
432cd620004SJunchao Zhang /* Get leaf indices used for pack/unpack
433cd620004SJunchao Zhang 
434fcc7397dSJunchao Zhang   See also PetscSFLinkGetRootPackOptAndIndices()
435cd620004SJunchao Zhang  */
436fcc7397dSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkGetLeafPackOptAndIndices(PetscSF sf,PetscSFLink link,PetscMemType mtype,PetscSFScope scope,PetscInt *count,PetscInt *start,PetscSFPackOpt *opt,const PetscInt **indices)
437cd620004SJunchao Zhang {
438cd620004SJunchao Zhang   PetscInt   offset;
439cd620004SJunchao Zhang 
440cd620004SJunchao Zhang   PetscFunctionBegin;
441fcc7397dSJunchao Zhang   *count   = sf->leafbuflen[scope];
442fcc7397dSJunchao Zhang   *start   = sf->leafstart[scope];
443fcc7397dSJunchao Zhang   *opt     = NULL;
444fcc7397dSJunchao Zhang   *indices = NULL;
445fcc7397dSJunchao Zhang   if (!sf->leafcontig[scope]) {
446cd620004SJunchao Zhang     offset = (scope == PETSCSF_LOCAL)? 0 : sf->roffset[sf->ndranks];
447fcc7397dSJunchao Zhang     if (mtype == PETSC_MEMTYPE_HOST) {*opt = sf->leafpackopt[scope]; *indices = sf->rmine + offset;}
448*7fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
449cd620004SJunchao Zhang     else {
450fcc7397dSJunchao Zhang       PetscErrorCode ierr;
451fcc7397dSJunchao Zhang       size_t         size;
452fcc7397dSJunchao Zhang       if (sf->leafpackopt[scope]) {
453fcc7397dSJunchao Zhang         if (!sf->leafpackopt_d[scope]) {
454fcc7397dSJunchao Zhang           ierr = PetscMalloc1(1,&sf->leafpackopt_d[scope]);CHKERRQ(ierr);
455fcc7397dSJunchao Zhang           ierr = PetscArraycpy(sf->leafpackopt_d[scope],sf->leafpackopt[scope],1);CHKERRQ(ierr);
456fcc7397dSJunchao Zhang           size = (sf->leafpackopt[scope]->n*7+2)*sizeof(PetscInt); /* See comments at struct _n_PetscSFPackOpt*/
457*7fd2d3dbSJunchao Zhang           ierr = PetscSFMalloc(PETSC_MEMTYPE_DEVICE,size,(void **)&sf->leafpackopt_d[scope]->array);CHKERRQ(ierr); /* Change ->array to a device pointer */
458*7fd2d3dbSJunchao Zhang           ierr = PetscSFLinkMemcpy(sf,link,PETSC_MEMTYPE_DEVICE,sf->leafpackopt_d[scope]->array,PETSC_MEMTYPE_HOST,sf->leafpackopt[scope]->array,size);CHKERRQ(ierr);
459fcc7397dSJunchao Zhang         }
460fcc7397dSJunchao Zhang         *opt = sf->leafpackopt_d[scope];
461fcc7397dSJunchao Zhang       } else {
462fcc7397dSJunchao Zhang         if (!sf->rmine_d[scope]) {
463fcc7397dSJunchao Zhang           size = sf->leafbuflen[scope]*sizeof(PetscInt);
464*7fd2d3dbSJunchao Zhang           ierr = PetscSFMalloc(PETSC_MEMTYPE_DEVICE,size,(void **)&sf->rmine_d[scope]);CHKERRQ(ierr);
465*7fd2d3dbSJunchao Zhang           ierr = PetscSFLinkMemcpy(sf,link,PETSC_MEMTYPE_DEVICE,sf->rmine_d[scope],PETSC_MEMTYPE_HOST,sf->rmine+offset,size);CHKERRQ(ierr);
466cd620004SJunchao Zhang         }
467cd620004SJunchao Zhang         *indices = sf->rmine_d[scope];
468cd620004SJunchao Zhang       }
469cd620004SJunchao Zhang     }
470fcc7397dSJunchao Zhang #endif
471cd620004SJunchao Zhang   }
472cd620004SJunchao Zhang   PetscFunctionReturn(0);
473cd620004SJunchao Zhang }
47440e23c03SJunchao Zhang #endif
475