xref: /petsc/src/vec/is/sf/impls/basic/sfpack.h (revision e07844bfbc769860c635334c99b15173da318563)
140e23c03SJunchao Zhang #if !defined(__SFPACK_H)
240e23c03SJunchao Zhang #define __SFPACK_H
340e23c03SJunchao Zhang 
440e23c03SJunchao Zhang #include <petsc/private/sfimpl.h> /*I "petscsf.h" I*/
540e23c03SJunchao Zhang 
/* Optimization plans in packing & unpacking for destination ranks.

  Suppose there are count indices stored in idx[], and two addresses u, p. We want to do packing:
     p[i] = u[idx[i]], for i in [0,count)

  Often, the indices are associated with n ranks. Each rank's indices are stored consecutively in idx[].
  We analyze the indices of each rank to see whether they follow a pattern that can be used to optimize the packing.
  The result is stored in PetscSFPackOpt. Packing for a rank might be non-optimizable, or optimized into
  a small number of contiguous memory copies or one strided memory copy.
 */
/* Kind of packing optimization selected for the indices of one destination rank */
typedef enum {
  PETSCSF_PACKOPT_NONE = 0,   /* Packing for the rank is not optimizable */
  PETSCSF_PACKOPT_MULTICOPY,  /* Packing is done with a small number of contiguous memory copies */
  PETSCSF_PACKOPT_STRIDE      /* Packing is done with one strided memory copy */
} PetscSFPackOptType;
17b23bfdefSJunchao Zhang 
1840e23c03SJunchao Zhang struct _n_PetscSFPackOpt {
19b23bfdefSJunchao Zhang   PetscInt           n;             /* Number of destination ranks */
20b23bfdefSJunchao Zhang   PetscSFPackOptType *type;         /* [n] Optimization types for the n ranks */
21b23bfdefSJunchao Zhang   PetscInt           *offset;       /* [n+1] Indices for i-th rank are in [offset[i],offset[i+1]) of idx[] */
22b23bfdefSJunchao Zhang   PetscInt           *copy_offset;  /* [n+1] If type[i] = PETSCSF_PACKOPT_MULTICOPY, packing for i-th rank is optimized into copies numbered between [copy_offset[i],copy_offset[i+1]) */
23b23bfdefSJunchao Zhang   PetscInt           *copy_start;   /* [*]     j-th copy starts at copy_start[j] in idx[]. In other words, there are copy_length[j] contiguous indices */
24eb02082bSJunchao Zhang   PetscInt           *copy_length;  /* [*]     starting at idx[copy_start[j]] */
25b23bfdefSJunchao Zhang   PetscInt           *stride_step;  /* [n]   If type[i] = PETSCSF_PACKOPT_STRIDE, then packing for i-th rank is strided, with first index being idx[offset[i]] and step stride_step[i], */
2640e23c03SJunchao Zhang   PetscInt           *stride_n;     /* [n]     and total stride_n[i] steps */
2740e23c03SJunchao Zhang };
2840e23c03SJunchao Zhang 
2940e23c03SJunchao Zhang typedef struct _n_PetscSFPack* PetscSFPack;
3040e23c03SJunchao Zhang 
31eb02082bSJunchao Zhang /* An abstract class that defines a communication link, which includes how to pack/unpack data and send/recv buffers
3240e23c03SJunchao Zhang  */
3340e23c03SJunchao Zhang struct _n_PetscSFPack {
34eb02082bSJunchao Zhang   PetscErrorCode (*h_Pack)            (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,const void*,void*);
35eb02082bSJunchao Zhang   PetscErrorCode (*h_UnpackAndInsert) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*);
36eb02082bSJunchao Zhang   PetscErrorCode (*h_UnpackAndAdd)    (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*);
37eb02082bSJunchao Zhang   PetscErrorCode (*h_UnpackAndMin)    (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*);
38eb02082bSJunchao Zhang   PetscErrorCode (*h_UnpackAndMax)    (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*);
39eb02082bSJunchao Zhang   PetscErrorCode (*h_UnpackAndMinloc) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*);
40eb02082bSJunchao Zhang   PetscErrorCode (*h_UnpackAndMaxloc) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*);
41eb02082bSJunchao Zhang   PetscErrorCode (*h_UnpackAndMult)   (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*);
42eb02082bSJunchao Zhang   PetscErrorCode (*h_UnpackAndLAND)   (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*);
43eb02082bSJunchao Zhang   PetscErrorCode (*h_UnpackAndBAND)   (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*);
44eb02082bSJunchao Zhang   PetscErrorCode (*h_UnpackAndLOR)    (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*);
45eb02082bSJunchao Zhang   PetscErrorCode (*h_UnpackAndBOR)    (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*);
46eb02082bSJunchao Zhang   PetscErrorCode (*h_UnpackAndLXOR)   (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*);
47eb02082bSJunchao Zhang   PetscErrorCode (*h_UnpackAndBXOR)   (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*);
48eb02082bSJunchao Zhang   PetscErrorCode (*h_FetchAndInsert)  (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,      void*);
49eb02082bSJunchao Zhang   PetscErrorCode (*h_FetchAndAdd)     (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,      void*);
50eb02082bSJunchao Zhang   PetscErrorCode (*h_FetchAndMin)     (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,      void*);
51eb02082bSJunchao Zhang   PetscErrorCode (*h_FetchAndMax)     (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,      void*);
52eb02082bSJunchao Zhang   PetscErrorCode (*h_FetchAndMinloc)  (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,      void*);
53eb02082bSJunchao Zhang   PetscErrorCode (*h_FetchAndMaxloc)  (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,      void*);
54eb02082bSJunchao Zhang   PetscErrorCode (*h_FetchAndMult)    (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,      void*);
55eb02082bSJunchao Zhang   PetscErrorCode (*h_FetchAndLAND)    (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,      void*);
56eb02082bSJunchao Zhang   PetscErrorCode (*h_FetchAndBAND)    (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,      void*);
57eb02082bSJunchao Zhang   PetscErrorCode (*h_FetchAndLOR)     (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,      void*);
58eb02082bSJunchao Zhang   PetscErrorCode (*h_FetchAndBOR)     (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,      void*);
59eb02082bSJunchao Zhang   PetscErrorCode (*h_FetchAndLXOR)    (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,      void*);
60eb02082bSJunchao Zhang   PetscErrorCode (*h_FetchAndBXOR)    (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,      void*);
61eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA)
62eb02082bSJunchao Zhang   /* These fields are lazily initialized in a sense that only when device pointers are passed to an SF, the SF
63eb02082bSJunchao Zhang      will set them, otherwise it just leaves them alone even though PETSC_HAVE_CUDA. Packing routines using
64eb02082bSJunchao Zhang      regular ops when there are no data race chances.
65eb02082bSJunchao Zhang   */
66eb02082bSJunchao Zhang   PetscBool      deviceinited;        /* Are device related fields initialized? */
67eb02082bSJunchao Zhang   PetscErrorCode (*d_Pack)            (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,const void*,void*);
68eb02082bSJunchao Zhang 
69eb02082bSJunchao Zhang   PetscErrorCode (*d_UnpackAndInsert) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*);
70eb02082bSJunchao Zhang   PetscErrorCode (*d_UnpackAndAdd)    (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*);
71eb02082bSJunchao Zhang   PetscErrorCode (*d_UnpackAndMin)    (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*);
72eb02082bSJunchao Zhang   PetscErrorCode (*d_UnpackAndMax)    (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*);
73eb02082bSJunchao Zhang   PetscErrorCode (*d_UnpackAndMinloc) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*);
74eb02082bSJunchao Zhang   PetscErrorCode (*d_UnpackAndMaxloc) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*);
75eb02082bSJunchao Zhang   PetscErrorCode (*d_UnpackAndMult)   (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*);
76eb02082bSJunchao Zhang   PetscErrorCode (*d_UnpackAndLAND)   (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*);
77eb02082bSJunchao Zhang   PetscErrorCode (*d_UnpackAndBAND)   (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*);
78eb02082bSJunchao Zhang   PetscErrorCode (*d_UnpackAndLOR)    (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*);
79eb02082bSJunchao Zhang   PetscErrorCode (*d_UnpackAndBOR)    (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*);
80eb02082bSJunchao Zhang   PetscErrorCode (*d_UnpackAndLXOR)   (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*);
81eb02082bSJunchao Zhang   PetscErrorCode (*d_UnpackAndBXOR)   (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*);
82eb02082bSJunchao Zhang   PetscErrorCode (*d_FetchAndInsert)  (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,      void*);
83eb02082bSJunchao Zhang   PetscErrorCode (*d_FetchAndAdd)     (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,      void*);
84eb02082bSJunchao Zhang   PetscErrorCode (*d_FetchAndMin)     (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,      void*);
85eb02082bSJunchao Zhang   PetscErrorCode (*d_FetchAndMax)     (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,      void*);
86eb02082bSJunchao Zhang   PetscErrorCode (*d_FetchAndMinloc)  (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,      void*);
87eb02082bSJunchao Zhang   PetscErrorCode (*d_FetchAndMaxloc)  (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,      void*);
88eb02082bSJunchao Zhang   PetscErrorCode (*d_FetchAndMult)    (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,      void*);
89eb02082bSJunchao Zhang   PetscErrorCode (*d_FetchAndLAND)    (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,      void*);
90eb02082bSJunchao Zhang   PetscErrorCode (*d_FetchAndBAND)    (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,      void*);
91eb02082bSJunchao Zhang   PetscErrorCode (*d_FetchAndLOR)     (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,      void*);
92eb02082bSJunchao Zhang   PetscErrorCode (*d_FetchAndBOR)     (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,      void*);
93eb02082bSJunchao Zhang   PetscErrorCode (*d_FetchAndLXOR)    (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,      void*);
94eb02082bSJunchao Zhang   PetscErrorCode (*d_FetchAndBXOR)    (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,      void*);
95eb02082bSJunchao Zhang 
96eb02082bSJunchao Zhang   /* Packing routines using atomics when there are data race chances */
97eb02082bSJunchao Zhang   PetscErrorCode (*da_UnpackAndInsert)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*);
98eb02082bSJunchao Zhang   PetscErrorCode (*da_UnpackAndAdd)   (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*);
99eb02082bSJunchao Zhang   PetscErrorCode (*da_UnpackAndMin)   (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*);
100eb02082bSJunchao Zhang   PetscErrorCode (*da_UnpackAndMax)   (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*);
101eb02082bSJunchao Zhang   PetscErrorCode (*da_UnpackAndMinloc)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*);
102eb02082bSJunchao Zhang   PetscErrorCode (*da_UnpackAndMaxloc)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*);
103eb02082bSJunchao Zhang   PetscErrorCode (*da_UnpackAndMult)  (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*);
104eb02082bSJunchao Zhang   PetscErrorCode (*da_UnpackAndLAND)  (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*);
105eb02082bSJunchao Zhang   PetscErrorCode (*da_UnpackAndBAND)  (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*);
106eb02082bSJunchao Zhang   PetscErrorCode (*da_UnpackAndLOR)   (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*);
107eb02082bSJunchao Zhang   PetscErrorCode (*da_UnpackAndBOR)   (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*);
108eb02082bSJunchao Zhang   PetscErrorCode (*da_UnpackAndLXOR)  (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*);
109eb02082bSJunchao Zhang   PetscErrorCode (*da_UnpackAndBXOR)  (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*);
110eb02082bSJunchao Zhang   PetscErrorCode (*da_FetchAndInsert) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,      void*);
111eb02082bSJunchao Zhang   PetscErrorCode (*da_FetchAndAdd)    (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,      void*);
112eb02082bSJunchao Zhang   PetscErrorCode (*da_FetchAndMin)    (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,      void*);
113eb02082bSJunchao Zhang   PetscErrorCode (*da_FetchAndMax)    (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,      void*);
114eb02082bSJunchao Zhang   PetscErrorCode (*da_FetchAndMinloc) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,      void*);
115eb02082bSJunchao Zhang   PetscErrorCode (*da_FetchAndMaxloc) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,      void*);
116eb02082bSJunchao Zhang   PetscErrorCode (*da_FetchAndMult)   (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,      void*);
117eb02082bSJunchao Zhang   PetscErrorCode (*da_FetchAndLAND)   (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,      void*);
118eb02082bSJunchao Zhang   PetscErrorCode (*da_FetchAndBAND)   (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,      void*);
119eb02082bSJunchao Zhang   PetscErrorCode (*da_FetchAndLOR)    (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,      void*);
120eb02082bSJunchao Zhang   PetscErrorCode (*da_FetchAndBOR)    (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,      void*);
121eb02082bSJunchao Zhang   PetscErrorCode (*da_FetchAndLXOR)   (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,      void*);
122eb02082bSJunchao Zhang   PetscErrorCode (*da_FetchAndBXOR)   (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,      void*);
123eb02082bSJunchao Zhang 
124eb02082bSJunchao Zhang   PetscInt       MAX_CORESIDENT_THREADS; /* It is a copy from SF for convenience. */
125eb02082bSJunchao Zhang   cudaStream_t   stream;                 /* Stream to launch pack/unapck kernels if not using the default stream */
126eb02082bSJunchao Zhang #endif
127eb02082bSJunchao Zhang   PetscMPIInt    tag;                    /* Each link has a tag so we can perform multiple SF ops at the same time */
1285ad15460SJunchao Zhang   MPI_Datatype   unit;                   /* The MPI datatype this PetscSFPack is built for */
129eb02082bSJunchao Zhang   MPI_Datatype   basicunit;              /* unit is made of MPI builtin dataype basicunit */
130*e07844bfSJunchao Zhang   PetscBool      isbuiltin;              /* Is unit an MPI/PETSc builtin datatype? If it is true, then bs=1 and basicunit is equivalent to unit */
131eb02082bSJunchao Zhang   size_t         unitbytes;              /* Number of bytes in a unit */
132eb02082bSJunchao Zhang   PetscInt       bs;                     /* Number of basic units in a unit */
133637e6665SJunchao Zhang   const void     *rootdata,*leafdata;    /* rootdata and leafdata used as keys for operation */
134eb02082bSJunchao Zhang   char           *rootbuf[2];            /* Buffer for packed roots on Host (0 or PETSC_MEMTYPE_HOST) or Device (1 or PETSC_MEMTYPE_DEVICE) */
135eb02082bSJunchao Zhang   char           *leafbuf[2];            /* Buffer for packed leaves on Host (0) or Device (1) */
136eb02082bSJunchao Zhang   char           *selfbuf[2];            /* Buffer for roots in self to self communication on Host (0) or Device (1) */
137eb02082bSJunchao Zhang   PetscInt       rootbuflen;             /* Length of root buffer in <unit> */
138eb02082bSJunchao Zhang   PetscInt       leafbuflen;             /* Length of leaf buffer in <unit> */
139eb02082bSJunchao Zhang   PetscInt       selfbuflen;             /* Length of self buffer in <unit> */
140eb02082bSJunchao Zhang   PetscMemType   rootmtype;              /* rootdata's memory type */
141eb02082bSJunchao Zhang   PetscMemType   leafmtype;              /* leafdata's memory type */
142eb02082bSJunchao Zhang   PetscMPIInt    nrootreqs;              /* Number of root requests */
143eb02082bSJunchao Zhang   PetscMPIInt    nleafreqs;              /* Number of leaf requests */
144eb02082bSJunchao Zhang   MPI_Request    *rootreqs[2][2];        /* Pointers to root requests in this layout [PETSCSF_DIRECTION][PETSC_MEMTYPE] */
145eb02082bSJunchao Zhang   MPI_Request    *leafreqs[2][2];        /* Pointers to leaf requests in this layout [PETSCSF_DIRECTION][PETSC_MEMTYPE] */
146eb02082bSJunchao Zhang   PetscBool      rootreqsinited[2][2];   /* Are root requests initialized? Also in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE]*/
147eb02082bSJunchao Zhang   PetscBool      leafreqsinited[2][2];   /* Are leaf requests initialized? Also in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE]*/
148eb02082bSJunchao Zhang   MPI_Request    *reqs;                  /* An array of length (nrootreqs+nleafreqs)*4. Pointers in rootreqs[][] and leafreqs[][] point here */
149eb02082bSJunchao Zhang   PetscSFPack    next;
15040e23c03SJunchao Zhang };
15140e23c03SJunchao Zhang 
1529d1c8addSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFPackGetInUse(PetscSF,MPI_Datatype,const void*,const void*,PetscCopyMode,PetscSFPack*);
15340e23c03SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFPackReclaim(PetscSF,PetscSFPack*);
15464f49babSJed Brown PETSC_INTERN PetscErrorCode PetscSFPackDestroyAvailable(PetscSFPack*);
155b7c0d12aSJunchao Zhang 
156eb02082bSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFPackGetPack(PetscSFPack link,PetscMemType mtype,PetscErrorCode (**Pack)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,const void*,void*))
157eb02082bSJunchao Zhang {
158eb02082bSJunchao Zhang   PetscFunctionBegin;
159eb02082bSJunchao Zhang   *Pack = NULL;
160eb02082bSJunchao Zhang   if (mtype == PETSC_MEMTYPE_HOST)        *Pack = link->h_Pack;
161eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA)
162eb02082bSJunchao Zhang   else if (mtype == PETSC_MEMTYPE_DEVICE) *Pack = link->d_Pack;
163eb02082bSJunchao Zhang #endif
164b7c0d12aSJunchao Zhang   else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType %d",(int)mtype);
165eb02082bSJunchao Zhang   PetscFunctionReturn(0);
166eb02082bSJunchao Zhang }
167eb02082bSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFPackGetUnpackAndOp(PetscSFPack,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**UnpackAndOp)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*));
168eb02082bSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFPackGetFetchAndOp (PetscSFPack,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**FetchAndOp) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,void*));
1699d1c8addSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFPackSetErrorOnUnsupportedOverlap(PetscSF,MPI_Datatype,const void*,const void*);
170b7c0d12aSJunchao Zhang 
171b7c0d12aSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFPackWaitall(PetscSFPack link,PetscSFDirection direction)
172b7c0d12aSJunchao Zhang {
173b7c0d12aSJunchao Zhang   PetscErrorCode ierr;
174b7c0d12aSJunchao Zhang   PetscMemType   rootmtype,leafmtype;
175b7c0d12aSJunchao Zhang 
176b7c0d12aSJunchao Zhang   PetscFunctionBegin;
177b7c0d12aSJunchao Zhang   if (use_gpu_aware_mpi) {
178b7c0d12aSJunchao Zhang     rootmtype = link->rootmtype;
179b7c0d12aSJunchao Zhang     leafmtype = link->leafmtype;
180b7c0d12aSJunchao Zhang   } else {
181b7c0d12aSJunchao Zhang     rootmtype = PETSC_MEMTYPE_HOST;
182b7c0d12aSJunchao Zhang     leafmtype = PETSC_MEMTYPE_HOST;
183b7c0d12aSJunchao Zhang   }
184b7c0d12aSJunchao Zhang   ierr = MPI_Waitall(link->nrootreqs,link->rootreqs[direction][rootmtype],MPI_STATUSES_IGNORE);CHKERRQ(ierr);
185b7c0d12aSJunchao Zhang   ierr = MPI_Waitall(link->nleafreqs,link->leafreqs[direction][leafmtype],MPI_STATUSES_IGNORE);CHKERRQ(ierr);
186b7c0d12aSJunchao Zhang   PetscFunctionReturn(0);
187b7c0d12aSJunchao Zhang }
188b7c0d12aSJunchao Zhang 
189eb02082bSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFPackSetUp_Host(PetscSF,PetscSFPack,MPI_Datatype);
190eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA)
191eb02082bSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFPackSetUp_Device(PetscSF,PetscSFPack,MPI_Datatype);
192eb02082bSJunchao Zhang #endif
193eb02082bSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFPackOptCreate(PetscInt,const PetscInt*,const PetscInt*,PetscSFPackOpt*);
19464f49babSJed Brown PETSC_INTERN PetscErrorCode PetscSFPackOptDestroy(PetscSFPackOpt *out);
19540e23c03SJunchao Zhang #endif
196