xref: /petsc/src/vec/is/sf/impls/basic/sfpack.h (revision 59af0bd3658d6c64d35e37f76ad6a8a026fa611f)
140e23c03SJunchao Zhang #if !defined(__SFPACK_H)
240e23c03SJunchao Zhang #define __SFPACK_H
340e23c03SJunchao Zhang 
4cd620004SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfbasic.h>
57fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_CUDA)
67fd2d3dbSJunchao Zhang   #include <cuda_runtime.h> /* For cudaStream_t */
77fd2d3dbSJunchao Zhang   #include <petsccublas.h>  /* For CHKERRCUDA */
87fd2d3dbSJunchao Zhang #endif
97fd2d3dbSJunchao Zhang 
107fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_HIP)
117fd2d3dbSJunchao Zhang   #include <hip/hip_runtime.h>  /* For hipStream_t */
12*59af0bd3SScott Kruger   #include <petschipblas.h>  /* For CHKERRHIP */
137fd2d3dbSJunchao Zhang #endif
14cd620004SJunchao Zhang 
15874d28e3SJunchao Zhang /* In terms of function overloading, long long int is a different type than int64_t, which PetscInt might be defined to.
16874d28e3SJunchao Zhang    We perfer long long int over PetscInt (int64_t), since CUDA atomics are built around (unsigned) long long int.
17874d28e3SJunchao Zhang  */
18874d28e3SJunchao Zhang typedef long long int          llint;
19874d28e3SJunchao Zhang typedef unsigned long long int ullint;
20874d28e3SJunchao Zhang 
21cd620004SJunchao Zhang /* We separate SF communications for SFBasic and SFNeighbor in two parts: local (self,intra-rank) and remote (inter-rank) */
22cd620004SJunchao Zhang typedef enum {PETSCSF_LOCAL=0, PETSCSF_REMOTE} PetscSFScope;
2340e23c03SJunchao Zhang 
24fcc7397dSJunchao Zhang /* Optimizations in packing & unpacking for destination ranks.
2540e23c03SJunchao Zhang 
26fcc7397dSJunchao Zhang   Suppose there are m indices stored in idx[], and two addresses u, p. We want to do packing:
27fcc7397dSJunchao Zhang      p[i] = u[idx[i]], for i in [0,m)
2840e23c03SJunchao Zhang 
29fcc7397dSJunchao Zhang   Indices are associated with n ranks and each rank's indices are stored consecutively in idx[].
30fcc7397dSJunchao Zhang   We go through indices for each rank and see if they are indices of a 3D submatrix of size [dx,dy,dz] in
31fcc7397dSJunchao Zhang   a parent matrix of size [X,Y,Z], with the submatrix's first index being <start>.
32cd620004SJunchao Zhang 
33fcc7397dSJunchao Zhang   E.g., for indices 1,2,3, 6,7,8, 11,12,13, the submatrix size is [3,3,1] with start=1, and the parent matrix's size
34fcc7397dSJunchao Zhang   is [5,3,1]. For simplicity, if any destination rank does not have this pattern, we give up the optimization.
35fcc7397dSJunchao Zhang 
36fcc7397dSJunchao Zhang   Note before using this per-rank optimization, one should check leafcontig[], rootcontig[], which say
37fcc7397dSJunchao Zhang   indices in whole are contiguous, and therefore much more useful than this one when true.
3840e23c03SJunchao Zhang  */
3940e23c03SJunchao Zhang struct _n_PetscSFPackOpt {
40fcc7397dSJunchao Zhang   PetscInt       *array;      /* [7*n+2] Memory pool for other fields in this struct. Used to easily copy this struct to GPU */
41b23bfdefSJunchao Zhang   PetscInt       n;           /* Number of destination ranks */
42fcc7397dSJunchao Zhang   PetscInt       *offset;     /* [n+1] Offsets of indices for each rank. offset[0]=0, offset[i+1]=offset[i]+dx[i]*dy[i]*dz[i] */
43fcc7397dSJunchao Zhang   PetscInt       *start;      /* [n] First index */
44fcc7397dSJunchao Zhang   PetscInt       *dx,*dy,*dz; /* [n] Lengths of the submatrix in X, Y, Z dimension. */
45fcc7397dSJunchao Zhang   PetscInt       *X,*Y;       /* [n] Lengths of the outer matrix in X, Y. We do not care Z. */
4640e23c03SJunchao Zhang };
4740e23c03SJunchao Zhang 
48eb02082bSJunchao Zhang /* An abstract class that defines a communication link, which includes how to pack/unpack data and send/recv buffers
4940e23c03SJunchao Zhang  */
50fcc7397dSJunchao Zhang struct _n_PetscSFLink {
5120c24465SJunchao Zhang   PetscErrorCode (*Memcpy)            (PetscSFLink,PetscMemType,void*,PetscMemType,const void*,size_t); /* Asynchronous copy might use stream in the link */
5220c24465SJunchao Zhang 
53fcc7397dSJunchao Zhang   PetscErrorCode (*h_Pack)            (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*);
54fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndInsert) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
55fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndAdd)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
56fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndMin)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
57fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndMax)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
58fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndMinloc) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
59fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndMaxloc) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
60fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndMult)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
61fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndLAND)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
62fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndBAND)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
63fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndLOR)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
64fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndBOR)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
65fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndLXOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
66fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndBXOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
67fcc7397dSJunchao Zhang   PetscErrorCode (*h_FetchAndAdd)     (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,      void*);
68fcc7397dSJunchao Zhang 
69fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndInsert)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
70fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndAdd)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
71fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndMin)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
72fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndMax)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
73fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndMinloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
74fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndMaxloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
75fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndMult)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
76fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndLAND)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
77fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndBAND)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
78fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndLOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
79fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndBOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
80fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndLXOR)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
81fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndBXOR)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
82fcc7397dSJunchao Zhang 
83fcc7397dSJunchao Zhang   PetscErrorCode (*h_FetchAndAddLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*);
84cd620004SJunchao Zhang 
85cd620004SJunchao Zhang   PetscBool      deviceinited;        /* Are device related fields initialized? */
867fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
87eb02082bSJunchao Zhang   /* These fields are lazily initialized in a sense that only when device pointers are passed to an SF, the SF
887fd2d3dbSJunchao Zhang      will set them, otherwise it just leaves them alone. Packing routines using regular ops when there are no data race chances.
89eb02082bSJunchao Zhang   */
9020c24465SJunchao Zhang   PetscErrorCode (*d_SyncDevice)      (PetscSFLink);
9120c24465SJunchao Zhang   PetscErrorCode (*d_SyncStream)      (PetscSFLink);
9220c24465SJunchao Zhang 
93fcc7397dSJunchao Zhang   PetscErrorCode (*d_Pack)            (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*);
94fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndInsert) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
95fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndAdd)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
96fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndMin)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
97fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndMax)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
98fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndMinloc) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
99fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndMaxloc) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
100fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndMult)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
101fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndLAND)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
102fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndBAND)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
103fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndLOR)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
104fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndBOR)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
105fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndLXOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
106fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndBXOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
107fcc7397dSJunchao Zhang   PetscErrorCode (*d_FetchAndAdd)     (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,      void*);
108eb02082bSJunchao Zhang 
109fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndInsert)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
110fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndAdd)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
111fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndMin)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
112fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndMax)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
113fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndMinloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
114fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndMaxloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
115fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndMult)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
116fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndLAND)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
117fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndBAND)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
118fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndLOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
119fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndBOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
120fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndLXOR)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
121fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndBXOR)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
122fcc7397dSJunchao Zhang   PetscErrorCode (*d_FetchAndAddLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*);
123eb02082bSJunchao Zhang 
124eb02082bSJunchao Zhang   /* Packing routines using atomics when there are data race chances */
125fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndInsert)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
126fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndAdd)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
127fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndMin)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
128fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndMax)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
129fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndMinloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
130fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndMaxloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
131fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndMult)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
132fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndLAND)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
133fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndBAND)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
134fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndLOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
135fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndBOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
136fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndLXOR)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
137fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndBXOR)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
138fcc7397dSJunchao Zhang   PetscErrorCode (*da_FetchAndAdd)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,      void*);
139cd620004SJunchao Zhang 
140fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndInsert)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
141fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndAdd)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
142fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndMin)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
143fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndMax)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
144fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndMinloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
145fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndMaxloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
146fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndMult)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
147fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndLAND)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
148fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndBAND)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
149fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndLOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
150fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndBOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
151fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndLXOR)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
152fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndBXOR)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
153fcc7397dSJunchao Zhang   PetscErrorCode (*da_FetchAndAddLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*);
154*59af0bd3SScott Kruger /* TODO:  Make runtime  */
1557fd2d3dbSJunchao Zhang #if defined (PETSC_HAVE_CUDA)
156e315309dSJunchao Zhang   PetscInt       maxResidentThreadsPerGPU;   /* It is a copy from SF for convenience */
157eb02082bSJunchao Zhang   cudaStream_t   stream;                     /* Stream to launch pack/unapck kernels if not using the default stream */
1587fd2d3dbSJunchao Zhang #elif defined (PETSC_HAVE_HIP)
159*59af0bd3SScott Kruger   PetscInt       maxResidentThreadsPerGPU;   /* It is a copy from SF for convenience */
1607fd2d3dbSJunchao Zhang   hipStream_t    stream;
161eb02082bSJunchao Zhang #endif
1627fd2d3dbSJunchao Zhang 
163f4af43b4SJunchao Zhang   PetscErrorCode (*Destroy)(PetscSFLink);    /* These two fields are meant to be used by SF_Kokkos, with spptr pointing to an execution space object */
164f4af43b4SJunchao Zhang   void           *spptr;                     /* for a given stream, but unused now due to a Kokkos bug, so that SF_Kokkos only supports null stream. */
1657fd2d3dbSJunchao Zhang #endif
166eb02082bSJunchao Zhang   PetscMPIInt  tag;                          /* Each link has a tag so we can perform multiple SF ops at the same time */
167cd620004SJunchao Zhang   MPI_Datatype unit;                         /* The MPI datatype this PetscSFLink is built for */
168eb02082bSJunchao Zhang   MPI_Datatype basicunit;                    /* unit is made of MPI builtin dataype basicunit */
169e07844bfSJunchao Zhang   PetscBool    isbuiltin;                    /* Is unit an MPI/PETSc builtin datatype? If it is true, then bs=1 and basicunit is equivalent to unit */
170eb02082bSJunchao Zhang   size_t       unitbytes;                    /* Number of bytes in a unit */
171eb02082bSJunchao Zhang   PetscInt     bs;                           /* Number of basic units in a unit */
172cd620004SJunchao Zhang   const void   *rootdata,*leafdata;          /* rootdata and leafdata the link is working on. They are used as keys for pending links. */
173cd620004SJunchao Zhang   PetscMemType rootmtype,leafmtype;          /* root/leafdata's memory type */
174cd620004SJunchao Zhang 
175cd620004SJunchao Zhang   /* For local and remote communication */
176cd620004SJunchao Zhang   PetscMemType rootmtype_mpi,leafmtype_mpi;  /* Mtypes of buffers passed to MPI. If use_gpu_aware_mpi, they are same as root/leafmtype. Otherwise they are PETSC_MEMTYPE_HOST */
177cd620004SJunchao Zhang   PetscBool    rootdirect[2],leafdirect[2];  /* Can root/leafdata be directly passed to SF (i.e., without buffering). In layout of [PETSCSF_LOCAL/REMOTE]. See more in PetscSFLinkCreate() */
178cd620004SJunchao Zhang   PetscInt     rootdirect_mpi,leafdirect_mpi;/* Can root/leafdata for remote be directly passed to MPI? 1: yes, 0: no. See more in PetscSFLinkCreate() */
179cd620004SJunchao Zhang   const void   *rootdatadirect[2][2];        /* The root/leafdata used to init root/leaf requests, in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE]. */
180cd620004SJunchao Zhang   const void   *leafdatadirect[2][2];        /* ... We need them to look up links when root/leafdirect_mpi are true */
181cd620004SJunchao Zhang   char         *rootbuf[2][2];               /* Buffers for packed roots, in layout of [PETSCSF_LOCAL/REMOTE][PETSC_MEMTYPE] */
182cd620004SJunchao Zhang   char         *rootbuf_alloc[2][2];         /* Log memory allocated by petsc. We need it since rootbuf[][] may point to rootdata given by user */
183cd620004SJunchao Zhang   char         *leafbuf[2][2];               /* Buffers for packed leaves, in layout of [PETSCSF_LOCAL/REMOTE][PETSC_MEMTYPE] */
184cd620004SJunchao Zhang   char         *leafbuf_alloc[2][2];
185cd620004SJunchao Zhang   MPI_Request  *rootreqs[2][2][2];           /* Root requests in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][rootdirect_mpi] */
186cd620004SJunchao Zhang   MPI_Request  *leafreqs[2][2][2];           /* Leaf requests in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][leafdirect_mpi] */
187cd620004SJunchao Zhang   PetscBool    rootreqsinited[2][2][2];      /* Are root requests initialized? Also in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][rootdirect_mpi]*/
188cd620004SJunchao Zhang   PetscBool    leafreqsinited[2][2][2];      /* Are leaf requests initialized? Also in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][leafdirect_mpi]*/
189cd620004SJunchao Zhang   MPI_Request  *reqs;                        /* An array of length (nrootreqs+nleafreqs)*8. Pointers in rootreqs[][][] and leafreqs[][][] point here */
190cd620004SJunchao Zhang   PetscSFLink  next;
19140e23c03SJunchao Zhang };
19240e23c03SJunchao Zhang 
193cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF,MPI_Datatype,const void*,const void*);
194b7c0d12aSJunchao Zhang 
195cd620004SJunchao Zhang /* Create/setup/retrieve/destroy a link */
196cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkCreate(PetscSF,MPI_Datatype,PetscMemType,const void*,PetscMemType,const void*,MPI_Op,PetscSFOperation,PetscSFLink*);
197cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_Host(PetscSF,PetscSFLink,MPI_Datatype);
198cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetInUse(PetscSF,MPI_Datatype,const void*,const void*,PetscCopyMode,PetscSFLink*);
199cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkReclaim(PetscSF,PetscSFLink*);
200cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkDestroy(PetscSF,PetscSFLink*);
201cd620004SJunchao Zhang 
202cd620004SJunchao Zhang /* Get pack/unpack function pointers from a link */
203fcc7397dSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkGetPack(PetscSFLink link,PetscMemType mtype,PetscErrorCode (**Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*))
204eb02082bSJunchao Zhang {
205eb02082bSJunchao Zhang   PetscFunctionBegin;
206eb02082bSJunchao Zhang   if (mtype == PETSC_MEMTYPE_HOST) *Pack = link->h_Pack;
2077fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
208cd620004SJunchao Zhang   else *Pack = link->d_Pack;
209eb02082bSJunchao Zhang #endif
210eb02082bSJunchao Zhang   PetscFunctionReturn(0);
211eb02082bSJunchao Zhang }
2127fd2d3dbSJunchao Zhang 
2137fd2d3dbSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkMPIWaitall(PetscSF sf,PetscSFLink link,PetscSFDirection direction)
2147fd2d3dbSJunchao Zhang {
2157fd2d3dbSJunchao Zhang   PetscErrorCode       ierr;
2167fd2d3dbSJunchao Zhang   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
2177fd2d3dbSJunchao Zhang   const PetscMemType   rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi;
2187fd2d3dbSJunchao Zhang   const PetscInt       rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi;
2197fd2d3dbSJunchao Zhang 
2207fd2d3dbSJunchao Zhang   PetscFunctionBegin;
2217fd2d3dbSJunchao Zhang   ierr = MPI_Waitall(bas->nrootreqs,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi],MPI_STATUSES_IGNORE);CHKERRQ(ierr);
2227fd2d3dbSJunchao Zhang   ierr = MPI_Waitall(sf->nleafreqs, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi],MPI_STATUSES_IGNORE);CHKERRQ(ierr);
2237fd2d3dbSJunchao Zhang   PetscFunctionReturn(0);
2247fd2d3dbSJunchao Zhang }
2257fd2d3dbSJunchao Zhang 
226fcc7397dSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*));
227fcc7397dSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetFetchAndOp (PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**FetchAndOp) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*));
228fcc7397dSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*));
229fcc7397dSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*));
230cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF,PetscSFLink,PetscSFDirection,void**,void**,MPI_Request**,MPI_Request**);
231b7c0d12aSJunchao Zhang 
232cd620004SJunchao Zhang /* Do Pack/Unpack/Fetch/Scatter with the link */
233cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkPackRootData  (PetscSF,PetscSFLink,PetscSFScope,const void*);
234cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkPackLeafData  (PetscSF,PetscSFLink,PetscSFScope,const void*);
235cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkUnpackRootData(PetscSF,PetscSFLink,PetscSFScope,void*,MPI_Op);
236cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF,PetscSFLink,PetscSFScope,void*,MPI_Op);
237cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkFetchRootData (PetscSF,PetscSFLink,PetscSFScope,void*,MPI_Op);
238cd620004SJunchao Zhang 
239cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkBcastAndOpLocal(PetscSF,PetscSFLink,const void*,void*,MPI_Op);
240cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkReduceLocal(PetscSF,PetscSFLink,const void*,void*,MPI_Op);
241cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF,PetscSFLink,void*,const void*,void*,MPI_Op);
242cd620004SJunchao Zhang 
2437fd2d3dbSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFSetUpPackFields(PetscSF);
2447fd2d3dbSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFResetPackFields(PetscSF);
2457fd2d3dbSJunchao Zhang 
24620c24465SJunchao Zhang #if defined(PETSC_HAVE_CUDA)
24720c24465SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_Cuda(PetscSF,PetscSFLink,MPI_Datatype);
24820c24465SJunchao Zhang #endif
24920c24465SJunchao Zhang 
250*59af0bd3SScott Kruger #if defined(PETSC_HAVE_HIP)
251*59af0bd3SScott Kruger PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_Hip(PetscSF,PetscSFLink,MPI_Datatype);
252*59af0bd3SScott Kruger #endif
253*59af0bd3SScott Kruger 
25420c24465SJunchao Zhang #if defined(PETSC_HAVE_KOKKOS)
25520c24465SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_Kokkos(PetscSF,PetscSFLink,MPI_Datatype);
25620c24465SJunchao Zhang #endif
25720c24465SJunchao Zhang 
2587fd2d3dbSJunchao Zhang /* A set of helper routines for Pack/Unpack/Scatter on GPUs */
2597fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
2607fd2d3dbSJunchao Zhang /* If SF does not know which stream root/leafdata is being computed on, it has to sync the device to
2617fd2d3dbSJunchao Zhang    make sure the data is ready for packing.
2627fd2d3dbSJunchao Zhang  */
2637fd2d3dbSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncDeviceBeforePackData(PetscSF sf,PetscSFLink link)
2647fd2d3dbSJunchao Zhang {
2657fd2d3dbSJunchao Zhang   PetscErrorCode ierr;
2667fd2d3dbSJunchao Zhang   PetscFunctionBegin;
2677fd2d3dbSJunchao Zhang   if (sf->use_default_stream) PetscFunctionReturn(0);
26820c24465SJunchao Zhang   if (link->rootmtype == PETSC_MEMTYPE_DEVICE || link->leafmtype == PETSC_MEMTYPE_DEVICE) {ierr = (*link->d_SyncDevice)(link);CHKERRQ(ierr);}
2697fd2d3dbSJunchao Zhang   PetscFunctionReturn(0);
2707fd2d3dbSJunchao Zhang }
2717fd2d3dbSJunchao Zhang 
2727fd2d3dbSJunchao Zhang /* PetscSFLinkSyncStreamAfterPackXxxData routines make sure root/leafbuf for the remote is ready for MPI */
2737fd2d3dbSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterPackRootData(PetscSF sf,PetscSFLink link)
2747fd2d3dbSJunchao Zhang {
2757fd2d3dbSJunchao Zhang   PetscErrorCode ierr;
2767fd2d3dbSJunchao Zhang   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
2777fd2d3dbSJunchao Zhang 
2787fd2d3dbSJunchao Zhang   PetscFunctionBegin;
2797fd2d3dbSJunchao Zhang   /* Do nothing if we use stream aware mpi || has nothing for remote */
2807fd2d3dbSJunchao Zhang   if (sf->use_stream_aware_mpi || link->rootmtype != PETSC_MEMTYPE_DEVICE || !bas->rootbuflen[PETSCSF_REMOTE]) PetscFunctionReturn(0);
2817fd2d3dbSJunchao Zhang   /* If we called a packing kernel || we async-copied rootdata from device to host || No cudaDeviceSynchronize was called (since default stream is assumed) */
28220c24465SJunchao Zhang   if (!link->rootdirect[PETSCSF_REMOTE] || !sf->use_gpu_aware_mpi || sf->use_default_stream) {ierr = (*link->d_SyncStream)(link);CHKERRQ(ierr);}
2837fd2d3dbSJunchao Zhang   PetscFunctionReturn(0);
2847fd2d3dbSJunchao Zhang }
2857fd2d3dbSJunchao Zhang 
2867fd2d3dbSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterPackLeafData(PetscSF sf,PetscSFLink link)
2877fd2d3dbSJunchao Zhang {
2887fd2d3dbSJunchao Zhang   PetscErrorCode ierr;
2897fd2d3dbSJunchao Zhang   PetscFunctionBegin;
2907fd2d3dbSJunchao Zhang   /* See comments above */
2917fd2d3dbSJunchao Zhang   if (sf->use_stream_aware_mpi || link->leafmtype != PETSC_MEMTYPE_DEVICE || !sf->leafbuflen[PETSCSF_REMOTE]) PetscFunctionReturn(0);
29220c24465SJunchao Zhang   if (!link->leafdirect[PETSCSF_REMOTE] || !sf->use_gpu_aware_mpi || sf->use_default_stream) {ierr = (*link->d_SyncStream)(link);CHKERRQ(ierr);}
2937fd2d3dbSJunchao Zhang   PetscFunctionReturn(0);
2947fd2d3dbSJunchao Zhang }
2957fd2d3dbSJunchao Zhang 
2967fd2d3dbSJunchao Zhang /* PetscSFLinkSyncStreamAfterUnpackXxx routines make sure root/leafdata (local & remote) is ready to use for SF callers, when SF
2977fd2d3dbSJunchao Zhang    does not know which stream the callers will use.
2987fd2d3dbSJunchao Zhang */
2997fd2d3dbSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterUnpackRootData(PetscSF sf,PetscSFLink link)
3007fd2d3dbSJunchao Zhang {
3017fd2d3dbSJunchao Zhang   PetscErrorCode ierr;
3027fd2d3dbSJunchao Zhang   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
3037fd2d3dbSJunchao Zhang   PetscBool      host2host = (link->rootmtype == PETSC_MEMTYPE_HOST) && (link->leafmtype == PETSC_MEMTYPE_HOST) ? PETSC_TRUE : PETSC_FALSE;
3047fd2d3dbSJunchao Zhang 
3057fd2d3dbSJunchao Zhang   PetscFunctionBegin;
3067fd2d3dbSJunchao Zhang   /* Do nothing if host2host OR we are allowed to asynchronously put rootdata on device through the default stream */
3077fd2d3dbSJunchao Zhang   if (host2host || (link->rootmtype == PETSC_MEMTYPE_DEVICE && sf->use_default_stream)) PetscFunctionReturn(0);
3087fd2d3dbSJunchao Zhang 
3097fd2d3dbSJunchao Zhang   /* If rootmtype is HOST or DEVICE:
3107fd2d3dbSJunchao Zhang      If we have data from local, then we called a scatter kernel (on link->stream), then we must sync it;
3117fd2d3dbSJunchao Zhang      If we have data from remote && no rootdirect(i.e., we called an unpack kernel), then we must also sycn it (if rootdirect,
3127fd2d3dbSJunchao Zhang      i.e., no unpack kernel after MPI, MPI guarentees rootbuf is ready to use so that we do not need the sync).
3137fd2d3dbSJunchao Zhang 
3147fd2d3dbSJunchao Zhang      Note a tricky case is when leafmtype=DEVICE, rootmtype=HOST on uni-processor, we must sync the stream otherwise
3157fd2d3dbSJunchao Zhang      CPU thread might use the yet-to-be-updated rootdata pending in the stream.
3167fd2d3dbSJunchao Zhang    */
31720c24465SJunchao Zhang   if (bas->rootbuflen[PETSCSF_LOCAL] || (bas->rootbuflen[PETSCSF_REMOTE] && !link->rootdirect[PETSCSF_REMOTE])) {ierr = (*link->d_SyncStream)(link);CHKERRQ(ierr);}
3187fd2d3dbSJunchao Zhang   PetscFunctionReturn(0);
3197fd2d3dbSJunchao Zhang }
3207fd2d3dbSJunchao Zhang 
3217fd2d3dbSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterUnpackLeafData(PetscSF sf,PetscSFLink link)
3227fd2d3dbSJunchao Zhang {
3237fd2d3dbSJunchao Zhang   PetscErrorCode ierr;
3247fd2d3dbSJunchao Zhang   PetscBool      host2host = (link->rootmtype == PETSC_MEMTYPE_HOST) && (link->leafmtype == PETSC_MEMTYPE_HOST) ? PETSC_TRUE : PETSC_FALSE;
3257fd2d3dbSJunchao Zhang 
3267fd2d3dbSJunchao Zhang   PetscFunctionBegin;
3277fd2d3dbSJunchao Zhang   /* See comments in PetscSFLinkSyncStreamAfterUnpackRootData*/
3287fd2d3dbSJunchao Zhang   if (host2host || (link->leafmtype == PETSC_MEMTYPE_DEVICE && sf->use_default_stream)) PetscFunctionReturn(0);
32920c24465SJunchao Zhang   if (sf->leafbuflen[PETSCSF_LOCAL] || (sf->leafbuflen[PETSCSF_REMOTE] && !link->leafdirect[PETSCSF_REMOTE])) {ierr = (*link->d_SyncStream)(link);CHKERRQ(ierr);}
3307fd2d3dbSJunchao Zhang   PetscFunctionReturn(0);
3317fd2d3dbSJunchao Zhang }
3327fd2d3dbSJunchao Zhang 
3337fd2d3dbSJunchao Zhang /* PetscSFLinkCopyXxxxBufferInCaseNotUseGpuAwareMPI routines are simple: if not use_gpu_aware_mpi, we need
3347fd2d3dbSJunchao Zhang    to copy the buffer from GPU to CPU before MPI calls, and from CPU to GPU after MPI calls.
3357fd2d3dbSJunchao Zhang */
3367fd2d3dbSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(PetscSF sf,PetscSFLink link,PetscBool device2host)
3377fd2d3dbSJunchao Zhang {
3387fd2d3dbSJunchao Zhang   PetscErrorCode ierr;
3397fd2d3dbSJunchao Zhang   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
3407fd2d3dbSJunchao Zhang 
3417fd2d3dbSJunchao Zhang   PetscFunctionBegin;
3427fd2d3dbSJunchao Zhang   if (link->rootmtype == PETSC_MEMTYPE_DEVICE && (link->rootmtype_mpi != link->rootmtype) && bas->rootbuflen[PETSCSF_REMOTE]) {
3437fd2d3dbSJunchao Zhang     void  *h_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
3447fd2d3dbSJunchao Zhang     void  *d_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE];
3457fd2d3dbSJunchao Zhang     size_t count = bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes;
3467fd2d3dbSJunchao Zhang     if (device2host) {
34720c24465SJunchao Zhang       ierr = (*link->Memcpy)(link,PETSC_MEMTYPE_HOST,h_buf,PETSC_MEMTYPE_DEVICE,d_buf,count);CHKERRQ(ierr);
3487fd2d3dbSJunchao Zhang       ierr = PetscLogGpuToCpu(count);CHKERRQ(ierr);
3497fd2d3dbSJunchao Zhang     } else {
35020c24465SJunchao Zhang       ierr = (*link->Memcpy)(link,PETSC_MEMTYPE_DEVICE,d_buf,PETSC_MEMTYPE_HOST,h_buf,count);CHKERRQ(ierr);
3517fd2d3dbSJunchao Zhang       ierr = PetscLogCpuToGpu(count);CHKERRQ(ierr);
3527fd2d3dbSJunchao Zhang     }
3537fd2d3dbSJunchao Zhang   }
3547fd2d3dbSJunchao Zhang   PetscFunctionReturn(0);
3557fd2d3dbSJunchao Zhang }
3567fd2d3dbSJunchao Zhang 
3577fd2d3dbSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(PetscSF sf,PetscSFLink link,PetscBool device2host)
3587fd2d3dbSJunchao Zhang {
3597fd2d3dbSJunchao Zhang   PetscErrorCode ierr;
3607fd2d3dbSJunchao Zhang 
3617fd2d3dbSJunchao Zhang   PetscFunctionBegin;
3627fd2d3dbSJunchao Zhang   if (link->leafmtype == PETSC_MEMTYPE_DEVICE && (link->leafmtype_mpi != link->leafmtype) && sf->leafbuflen[PETSCSF_REMOTE]) {
3637fd2d3dbSJunchao Zhang     void  *h_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
3647fd2d3dbSJunchao Zhang     void  *d_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE];
3657fd2d3dbSJunchao Zhang     size_t count = sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes;
3667fd2d3dbSJunchao Zhang     if (device2host) {
36720c24465SJunchao Zhang       ierr = (*link->Memcpy)(link,PETSC_MEMTYPE_HOST,h_buf,PETSC_MEMTYPE_DEVICE,d_buf,count);CHKERRQ(ierr);
3687fd2d3dbSJunchao Zhang       ierr = PetscLogGpuToCpu(count);CHKERRQ(ierr);
3697fd2d3dbSJunchao Zhang     } else {
37020c24465SJunchao Zhang       ierr = (*link->Memcpy)(link,PETSC_MEMTYPE_DEVICE,d_buf,PETSC_MEMTYPE_HOST,h_buf,count);CHKERRQ(ierr);
3717fd2d3dbSJunchao Zhang       ierr = PetscLogCpuToGpu(count);CHKERRQ(ierr);
3727fd2d3dbSJunchao Zhang     }
3737fd2d3dbSJunchao Zhang   }
3747fd2d3dbSJunchao Zhang   PetscFunctionReturn(0);
3757fd2d3dbSJunchao Zhang }
3767fd2d3dbSJunchao Zhang 
3777fd2d3dbSJunchao Zhang #else /* Host only */
3787fd2d3dbSJunchao Zhang #define PetscSFLinkSyncDeviceBeforePackData(a,b)                0
3797fd2d3dbSJunchao Zhang #define PetscSFLinkSyncStreamAfterPackRootData(a,b)             0
3807fd2d3dbSJunchao Zhang #define PetscSFLinkSyncStreamAfterPackLeafData(a,b)             0
3817fd2d3dbSJunchao Zhang #define PetscSFLinkSyncStreamAfterUnpackRootData(a,b)           0
3827fd2d3dbSJunchao Zhang #define PetscSFLinkSyncStreamAfterUnpackLeafData(a,b)           0
3837fd2d3dbSJunchao Zhang #define PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(a,b,c) 0
3847fd2d3dbSJunchao Zhang #define PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(a,b,c) 0
3857fd2d3dbSJunchao Zhang #endif
386cd620004SJunchao Zhang 
387cd620004SJunchao Zhang /* Get root indices used for pack/unpack
388cd620004SJunchao Zhang 
389cd620004SJunchao Zhang Input arguments:
390cd620004SJunchao Zhang   +sf    - StarForest
391cd620004SJunchao Zhang   .link  - The link, which provides the stream for the async memcpy (In SF, we make all GPU operations asynchronous to avoid unexpected pipeline stalls)
392cd620004SJunchao Zhang   .scope - Which part of the indices? (PETSCSF_LOCAL or PETSCSF_REMOTE)
393cd620004SJunchao Zhang   .mtype - In what type of memory? (PETSC_MEMTYPE_DEVICE or PETSC_MEMTYPE_HOST)
394cd620004SJunchao Zhang 
395cd620004SJunchao Zhang  Output arguments:
396cd620004SJunchao Zhang   +count   - Count of indices
397cd620004SJunchao Zhang   .start   - The first index (only useful when indices is NULL)
398cd620004SJunchao Zhang   -indices - indices of roots for pack/unpack. NULL means indices are contiguous
399cd620004SJunchao Zhang  */
400fcc7397dSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkGetRootPackOptAndIndices(PetscSF sf,PetscSFLink link,PetscMemType mtype,PetscSFScope scope,PetscInt *count,PetscInt *start,PetscSFPackOpt *opt,const PetscInt **indices)
401b7c0d12aSJunchao Zhang {
402cd620004SJunchao Zhang   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
403cd620004SJunchao Zhang   PetscInt       offset;
404b7c0d12aSJunchao Zhang 
405b7c0d12aSJunchao Zhang   PetscFunctionBegin;
406fcc7397dSJunchao Zhang   *count   = bas->rootbuflen[scope];
407fcc7397dSJunchao Zhang   *start   = bas->rootstart[scope];
408fcc7397dSJunchao Zhang   *opt     = NULL;
409fcc7397dSJunchao Zhang   *indices = NULL;
410fcc7397dSJunchao Zhang 
411fcc7397dSJunchao Zhang   /* We have these rules:
412fcc7397dSJunchao Zhang     1) opt == NULL && indices == NULL ==> indices are contiguous.
413fcc7397dSJunchao Zhang     2) opt != NULL ==> indices are in 3D but not contiguous. On host, indices != NULL since indices are already available and we do not
414fcc7397dSJunchao Zhang        want to enforce all operations to use opt; but on device, indices = NULL since we do not want to copy indices to device.
415fcc7397dSJunchao Zhang   */
416fcc7397dSJunchao Zhang   if (!bas->rootcontig[scope]) {
417cd620004SJunchao Zhang     offset = (scope == PETSCSF_LOCAL)? 0 : bas->ioffset[bas->ndiranks];
418fcc7397dSJunchao Zhang     if (mtype == PETSC_MEMTYPE_HOST) {*opt = bas->rootpackopt[scope]; *indices = bas->irootloc + offset;}
4197fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
420cd620004SJunchao Zhang     else {
421fcc7397dSJunchao Zhang       PetscErrorCode ierr;
422fcc7397dSJunchao Zhang       size_t         size;
423fcc7397dSJunchao Zhang       if (bas->rootpackopt[scope]) {
424fcc7397dSJunchao Zhang         if (!bas->rootpackopt_d[scope]) {
425fcc7397dSJunchao Zhang           ierr = PetscMalloc1(1,&bas->rootpackopt_d[scope]);CHKERRQ(ierr);
426fcc7397dSJunchao Zhang           ierr = PetscArraycpy(bas->rootpackopt_d[scope],bas->rootpackopt[scope],1);CHKERRQ(ierr); /* Make pointers in bas->rootpackopt_d[] still work on host */
427fcc7397dSJunchao Zhang           size = (bas->rootpackopt[scope]->n*7+2)*sizeof(PetscInt); /* See comments at struct _n_PetscSFPackOpt*/
42820c24465SJunchao Zhang           ierr = PetscSFMalloc(sf,PETSC_MEMTYPE_DEVICE,size,(void **)&bas->rootpackopt_d[scope]->array);CHKERRQ(ierr);
42920c24465SJunchao Zhang           ierr = (*link->Memcpy)(link,PETSC_MEMTYPE_DEVICE,bas->rootpackopt_d[scope]->array,PETSC_MEMTYPE_HOST,bas->rootpackopt[scope]->array,size);CHKERRQ(ierr);
430fcc7397dSJunchao Zhang         }
431fcc7397dSJunchao Zhang         *opt = bas->rootpackopt_d[scope];
432fcc7397dSJunchao Zhang       } else { /* On device, we only provide indices when there is no optimization. We're reluctant to copy indices to device. */
433fcc7397dSJunchao Zhang         if (!bas->irootloc_d[scope]) {
434fcc7397dSJunchao Zhang           size = bas->rootbuflen[scope]*sizeof(PetscInt);
43520c24465SJunchao Zhang           ierr = PetscSFMalloc(sf,PETSC_MEMTYPE_DEVICE,size,(void **)&bas->irootloc_d[scope]);CHKERRQ(ierr);
43620c24465SJunchao Zhang           ierr = (*link->Memcpy)(link,PETSC_MEMTYPE_DEVICE,bas->irootloc_d[scope],PETSC_MEMTYPE_HOST,bas->irootloc+offset,size);CHKERRQ(ierr);
437b7c0d12aSJunchao Zhang         }
438cd620004SJunchao Zhang         *indices = bas->irootloc_d[scope];
439cd620004SJunchao Zhang       }
440cd620004SJunchao Zhang     }
441fcc7397dSJunchao Zhang #endif
442cd620004SJunchao Zhang   }
443b7c0d12aSJunchao Zhang   PetscFunctionReturn(0);
444b7c0d12aSJunchao Zhang }
445b7c0d12aSJunchao Zhang 
446cd620004SJunchao Zhang /* Get leaf indices used for pack/unpack
447cd620004SJunchao Zhang 
448fcc7397dSJunchao Zhang   See also PetscSFLinkGetRootPackOptAndIndices()
449cd620004SJunchao Zhang  */
450fcc7397dSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkGetLeafPackOptAndIndices(PetscSF sf,PetscSFLink link,PetscMemType mtype,PetscSFScope scope,PetscInt *count,PetscInt *start,PetscSFPackOpt *opt,const PetscInt **indices)
451cd620004SJunchao Zhang {
452cd620004SJunchao Zhang   PetscInt   offset;
453cd620004SJunchao Zhang 
454cd620004SJunchao Zhang   PetscFunctionBegin;
455fcc7397dSJunchao Zhang   *count   = sf->leafbuflen[scope];
456fcc7397dSJunchao Zhang   *start   = sf->leafstart[scope];
457fcc7397dSJunchao Zhang   *opt     = NULL;
458fcc7397dSJunchao Zhang   *indices = NULL;
459fcc7397dSJunchao Zhang   if (!sf->leafcontig[scope]) {
460cd620004SJunchao Zhang     offset = (scope == PETSCSF_LOCAL)? 0 : sf->roffset[sf->ndranks];
461fcc7397dSJunchao Zhang     if (mtype == PETSC_MEMTYPE_HOST) {*opt = sf->leafpackopt[scope]; *indices = sf->rmine + offset;}
4627fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
463cd620004SJunchao Zhang     else {
464fcc7397dSJunchao Zhang       PetscErrorCode ierr;
465fcc7397dSJunchao Zhang       size_t         size;
466fcc7397dSJunchao Zhang       if (sf->leafpackopt[scope]) {
467fcc7397dSJunchao Zhang         if (!sf->leafpackopt_d[scope]) {
468fcc7397dSJunchao Zhang           ierr = PetscMalloc1(1,&sf->leafpackopt_d[scope]);CHKERRQ(ierr);
469fcc7397dSJunchao Zhang           ierr = PetscArraycpy(sf->leafpackopt_d[scope],sf->leafpackopt[scope],1);CHKERRQ(ierr);
470fcc7397dSJunchao Zhang           size = (sf->leafpackopt[scope]->n*7+2)*sizeof(PetscInt); /* See comments at struct _n_PetscSFPackOpt*/
47120c24465SJunchao Zhang           ierr = PetscSFMalloc(sf,PETSC_MEMTYPE_DEVICE,size,(void **)&sf->leafpackopt_d[scope]->array);CHKERRQ(ierr); /* Change ->array to a device pointer */
47220c24465SJunchao Zhang           ierr = (*link->Memcpy)(link,PETSC_MEMTYPE_DEVICE,sf->leafpackopt_d[scope]->array,PETSC_MEMTYPE_HOST,sf->leafpackopt[scope]->array,size);CHKERRQ(ierr);
473fcc7397dSJunchao Zhang         }
474fcc7397dSJunchao Zhang         *opt = sf->leafpackopt_d[scope];
475fcc7397dSJunchao Zhang       } else {
476fcc7397dSJunchao Zhang         if (!sf->rmine_d[scope]) {
477fcc7397dSJunchao Zhang           size = sf->leafbuflen[scope]*sizeof(PetscInt);
47820c24465SJunchao Zhang           ierr = PetscSFMalloc(sf,PETSC_MEMTYPE_DEVICE,size,(void **)&sf->rmine_d[scope]);CHKERRQ(ierr);
47920c24465SJunchao Zhang           ierr = (*link->Memcpy)(link,PETSC_MEMTYPE_DEVICE,sf->rmine_d[scope],PETSC_MEMTYPE_HOST,sf->rmine+offset,size);CHKERRQ(ierr);
480cd620004SJunchao Zhang         }
481cd620004SJunchao Zhang         *indices = sf->rmine_d[scope];
482cd620004SJunchao Zhang       }
483cd620004SJunchao Zhang     }
484fcc7397dSJunchao Zhang #endif
485cd620004SJunchao Zhang   }
486cd620004SJunchao Zhang   PetscFunctionReturn(0);
487cd620004SJunchao Zhang }
48840e23c03SJunchao Zhang #endif
489