xref: /petsc/src/vec/is/sf/impls/basic/sfpack.h (revision 00816365229eff0fcb3567887feb4c4abe2b0e86)
1 #if !defined(__SFPACK_H)
2 #define __SFPACK_H
3 
4 #include <../src/vec/is/sf/impls/basic/sfbasic.h>
5 
6 /* We separate SF communications for SFBasic and SFNeighbor in two parts: local (self,intra-rank) and remote (inter-rank) */
7 typedef enum {PETSCSF_LOCAL=0, PETSCSF_REMOTE} PetscSFScope;
8 
9 /* Optimization plans in packing & unpacking for destination ranks.
10 
11   Suppose there are count indices stored in idx[], and two addresses u, p. We want to do packing:
12      p[i] = u[idx[i]], for i in [0,count)
13 
14   Often, the indices are associated with n ranks. Each rank's indices are stored consecutively in idx[].
15   We analyze indices for each rank and see if they are patterns that can be used to optimize the packing.
16   The result is stored in PetscSFPackOpt. Packing for a rank might be non-optimizable, or optimized into
17   a small number of contiguous memory copies or one strided memory copy.
18 
19   Note before using this per-rank optimization, one should check rleafloccontig, irootloccontig, which say
20   indices in whole start from 0 and are contiguous, and therefore much useful when true.
21  */
22 typedef enum {PETSCSF_PACKOPT_NONE=0, PETSCSF_PACKOPT_MULTICOPY, PETSCSF_PACKOPT_STRIDE} PetscSFPackOptType;
23 struct _n_PetscSFPackOpt {
24   PetscInt           n;             /* Number of destination ranks */
25   PetscSFPackOptType *type;         /* [n] Optimization types for the n ranks */
26   PetscInt           *offset;       /* [n+1] Indices for i-th rank are in [offset[i],offset[i+1]) of idx[] */
27   PetscInt           *copy_offset;  /* [n+1] If type[i] = PETSCSF_PACKOPT_MULTICOPY, packing for i-th rank is optimized into copies numbered between [copy_offset[i],copy_offset[i+1]) */
28   PetscInt           *copy_start;   /* [*]     j-th copy starts at copy_start[j] in idx[]. In other words, there are copy_length[j] contiguous indices */
29   PetscInt           *copy_length;  /* [*]     starting at idx[copy_start[j]] */
30   PetscInt           *stride_step;  /* [n]   If type[i] = PETSCSF_PACKOPT_STRIDE, then packing for i-th rank is strided, with first index being idx[offset[i]] and step stride_step[i], */
31   PetscInt           *stride_n;     /* [n]     and total stride_n[i] steps */
32 };
33 
34 /* An abstract class that defines a communication link, which includes how to pack/unpack data and send/recv buffers
35  */
36 struct _n_PetscSFLink {                /* link,    count,  start,   indices,      pack plan, root/leafdata, buffer*/
37   PetscErrorCode (*h_Pack)            (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,const void*,void*);
38   PetscErrorCode (*h_UnpackAndInsert) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
39   PetscErrorCode (*h_UnpackAndAdd)    (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
40   PetscErrorCode (*h_UnpackAndMin)    (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
41   PetscErrorCode (*h_UnpackAndMax)    (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
42   PetscErrorCode (*h_UnpackAndMinloc) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
43   PetscErrorCode (*h_UnpackAndMaxloc) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
44   PetscErrorCode (*h_UnpackAndMult)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
45   PetscErrorCode (*h_UnpackAndLAND)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
46   PetscErrorCode (*h_UnpackAndBAND)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
47   PetscErrorCode (*h_UnpackAndLOR)    (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
48   PetscErrorCode (*h_UnpackAndBOR)    (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
49   PetscErrorCode (*h_UnpackAndLXOR)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
50   PetscErrorCode (*h_UnpackAndBXOR)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
51   PetscErrorCode (*h_FetchAndAdd)     (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,      void*);
52                                       /*     link,   count, startx,            idx,      xdata,  starty,         idy,   ydata */
53   PetscErrorCode (*h_ScatterAndInsert)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
54   PetscErrorCode (*h_ScatterAndAdd)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
55   PetscErrorCode (*h_ScatterAndMin)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
56   PetscErrorCode (*h_ScatterAndMax)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
57   PetscErrorCode (*h_ScatterAndMinloc)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
58   PetscErrorCode (*h_ScatterAndMaxloc)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
59   PetscErrorCode (*h_ScatterAndMult)  (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
60   PetscErrorCode (*h_ScatterAndLAND)  (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
61   PetscErrorCode (*h_ScatterAndBAND)  (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
62   PetscErrorCode (*h_ScatterAndLOR)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
63   PetscErrorCode (*h_ScatterAndBOR)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
64   PetscErrorCode (*h_ScatterAndLXOR)  (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
65   PetscErrorCode (*h_ScatterAndBXOR)  (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
66                                        /* link, count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,leafupdate*/
67   PetscErrorCode (*h_FetchAndAddLocal)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,void*,PetscInt,const PetscInt*,const void*,void*);
68 
69   PetscBool      deviceinited;        /* Are device related fields initialized? */
70 #if defined(PETSC_HAVE_CUDA)
71   /* These fields are lazily initialized in a sense that only when device pointers are passed to an SF, the SF
72      will set them, otherwise it just leaves them alone even though PETSC_HAVE_CUDA. Packing routines using
73      regular ops when there are no data race chances.
74   */
75   PetscErrorCode (*d_Pack)            (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,const void*,void*);
76 
77   PetscErrorCode (*d_UnpackAndInsert) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
78   PetscErrorCode (*d_UnpackAndAdd)    (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
79   PetscErrorCode (*d_UnpackAndMin)    (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
80   PetscErrorCode (*d_UnpackAndMax)    (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
81   PetscErrorCode (*d_UnpackAndMinloc) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
82   PetscErrorCode (*d_UnpackAndMaxloc) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
83   PetscErrorCode (*d_UnpackAndMult)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
84   PetscErrorCode (*d_UnpackAndLAND)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
85   PetscErrorCode (*d_UnpackAndBAND)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
86   PetscErrorCode (*d_UnpackAndLOR)    (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
87   PetscErrorCode (*d_UnpackAndBOR)    (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
88   PetscErrorCode (*d_UnpackAndLXOR)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
89   PetscErrorCode (*d_UnpackAndBXOR)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
90   PetscErrorCode (*d_FetchAndAdd)     (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,      void*);
91 
92   PetscErrorCode (*d_ScatterAndInsert)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
93   PetscErrorCode (*d_ScatterAndAdd)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
94   PetscErrorCode (*d_ScatterAndMin)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
95   PetscErrorCode (*d_ScatterAndMax)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
96   PetscErrorCode (*d_ScatterAndMinloc)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
97   PetscErrorCode (*d_ScatterAndMaxloc)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
98   PetscErrorCode (*d_ScatterAndMult)  (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
99   PetscErrorCode (*d_ScatterAndLAND)  (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
100   PetscErrorCode (*d_ScatterAndBAND)  (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
101   PetscErrorCode (*d_ScatterAndLOR)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
102   PetscErrorCode (*d_ScatterAndBOR)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
103   PetscErrorCode (*d_ScatterAndLXOR)  (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
104   PetscErrorCode (*d_ScatterAndBXOR)  (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
105   PetscErrorCode (*d_FetchAndAddLocal)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,void*,PetscInt,const PetscInt*,const void*,void*);
106 
107   /* Packing routines using atomics when there are data race chances */
108   PetscErrorCode (*da_UnpackAndInsert)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
109   PetscErrorCode (*da_UnpackAndAdd)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
110   PetscErrorCode (*da_UnpackAndMin)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
111   PetscErrorCode (*da_UnpackAndMax)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
112   PetscErrorCode (*da_UnpackAndMinloc)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
113   PetscErrorCode (*da_UnpackAndMaxloc)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
114   PetscErrorCode (*da_UnpackAndMult)  (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
115   PetscErrorCode (*da_UnpackAndLAND)  (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
116   PetscErrorCode (*da_UnpackAndBAND)  (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
117   PetscErrorCode (*da_UnpackAndLOR)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
118   PetscErrorCode (*da_UnpackAndBOR)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
119   PetscErrorCode (*da_UnpackAndLXOR)  (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
120   PetscErrorCode (*da_UnpackAndBXOR)  (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
121   PetscErrorCode (*da_FetchAndAdd)    (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,      void*);
122 
123   PetscErrorCode (*da_ScatterAndInsert)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
124   PetscErrorCode (*da_ScatterAndAdd)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
125   PetscErrorCode (*da_ScatterAndMin)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
126   PetscErrorCode (*da_ScatterAndMax)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
127   PetscErrorCode (*da_ScatterAndMinloc)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
128   PetscErrorCode (*da_ScatterAndMaxloc)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
129   PetscErrorCode (*da_ScatterAndMult)  (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
130   PetscErrorCode (*da_ScatterAndLAND)  (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
131   PetscErrorCode (*da_ScatterAndBAND)  (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
132   PetscErrorCode (*da_ScatterAndLOR)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
133   PetscErrorCode (*da_ScatterAndBOR)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
134   PetscErrorCode (*da_ScatterAndLXOR)  (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
135   PetscErrorCode (*da_ScatterAndBXOR)  (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
136   PetscErrorCode (*da_FetchAndAddLocal)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,void*,PetscInt,const PetscInt*,const void*,void*);
137 
138   PetscInt     maxResidentThreadsPerGPU;     /* It is a copy from SF for convenience */
139   cudaStream_t stream;                       /* Stream to launch pack/unapck kernels if not using the default stream */
140 #endif
141   PetscMPIInt  tag;                          /* Each link has a tag so we can perform multiple SF ops at the same time */
142   MPI_Datatype unit;                         /* The MPI datatype this PetscSFLink is built for */
143   MPI_Datatype basicunit;                    /* unit is made of MPI builtin dataype basicunit */
144   PetscBool    isbuiltin;                    /* Is unit an MPI/PETSc builtin datatype? If it is true, then bs=1 and basicunit is equivalent to unit */
145   size_t       unitbytes;                    /* Number of bytes in a unit */
146   PetscInt     bs;                           /* Number of basic units in a unit */
147   const void   *rootdata,*leafdata;          /* rootdata and leafdata the link is working on. They are used as keys for pending links. */
148   PetscMemType rootmtype,leafmtype;          /* root/leafdata's memory type */
149 
150   /* For local and remote communication */
151   PetscMemType rootmtype_mpi,leafmtype_mpi;  /* Mtypes of buffers passed to MPI. If use_gpu_aware_mpi, they are same as root/leafmtype. Otherwise they are PETSC_MEMTYPE_HOST */
152   PetscBool    rootdirect[2],leafdirect[2];  /* Can root/leafdata be directly passed to SF (i.e., without buffering). In layout of [PETSCSF_LOCAL/REMOTE]. See more in PetscSFLinkCreate() */
153   PetscInt     rootdirect_mpi,leafdirect_mpi;/* Can root/leafdata for remote be directly passed to MPI? 1: yes, 0: no. See more in PetscSFLinkCreate() */
154   const void   *rootdatadirect[2][2];        /* The root/leafdata used to init root/leaf requests, in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE]. */
155   const void   *leafdatadirect[2][2];        /* ... We need them to look up links when root/leafdirect_mpi are true */
156   char         *rootbuf[2][2];               /* Buffers for packed roots, in layout of [PETSCSF_LOCAL/REMOTE][PETSC_MEMTYPE] */
157   char         *rootbuf_alloc[2][2];         /* Log memory allocated by petsc. We need it since rootbuf[][] may point to rootdata given by user */
158   char         *leafbuf[2][2];               /* Buffers for packed leaves, in layout of [PETSCSF_LOCAL/REMOTE][PETSC_MEMTYPE] */
159   char         *leafbuf_alloc[2][2];
160   MPI_Request  *rootreqs[2][2][2];           /* Root requests in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][rootdirect_mpi] */
161   MPI_Request  *leafreqs[2][2][2];           /* Leaf requests in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][leafdirect_mpi] */
162   PetscBool    rootreqsinited[2][2][2];      /* Are root requests initialized? Also in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][rootdirect_mpi]*/
163   PetscBool    leafreqsinited[2][2][2];      /* Are leaf requests initialized? Also in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][leafdirect_mpi]*/
164   MPI_Request  *reqs;                        /* An array of length (nrootreqs+nleafreqs)*8. Pointers in rootreqs[][][] and leafreqs[][][] point here */
165   PetscSFLink  next;
166 };
167 
168 #if defined(PETSC_USE_DEBUG)
169 PETSC_INTERN PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF,MPI_Datatype,const void*,const void*);
170 #else
171 #define PetscSFSetErrorOnUnsupportedOverlap(a,b,c,d) 0
172 #endif
173 
174 /* Create/setup/retrieve/destroy a link */
175 PETSC_INTERN PetscErrorCode PetscSFLinkCreate(PetscSF,MPI_Datatype,PetscMemType,const void*,PetscMemType,const void*,MPI_Op,PetscSFOperation,PetscSFLink*);
176 PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_Host(PetscSF,PetscSFLink,MPI_Datatype);
177 #if defined(PETSC_HAVE_CUDA)
178 PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_Device(PetscSF,PetscSFLink,MPI_Datatype);
179 #else
180 #define PetscSFLinkSetUp_Device(a,b,c) 0
181 #endif
182 PETSC_INTERN PetscErrorCode PetscSFLinkGetInUse(PetscSF,MPI_Datatype,const void*,const void*,PetscCopyMode,PetscSFLink*);
183 PETSC_INTERN PetscErrorCode PetscSFLinkReclaim(PetscSF,PetscSFLink*);
184 PETSC_INTERN PetscErrorCode PetscSFLinkDestroy(PetscSF,PetscSFLink*);
185 
186 /* Get pack/unpack function pointers from a link */
187 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkGetPack(PetscSFLink link,PetscMemType mtype,PetscErrorCode (**Pack)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,const void*,void*))
188 {
189   PetscFunctionBegin;
190   if (mtype == PETSC_MEMTYPE_HOST) *Pack = link->h_Pack;
191 #if defined(PETSC_HAVE_CUDA)
192   else *Pack = link->d_Pack;
193 #endif
194   PetscFunctionReturn(0);
195 }
196 PETSC_INTERN PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*));
197 PETSC_INTERN PetscErrorCode PetscSFLinkGetFetchAndOp (PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**FetchAndOp) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,void*));
198 PETSC_INTERN PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*));
199 PETSC_INTERN PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,void*,PetscInt,const PetscInt*,const void*,void*));
200 PETSC_INTERN PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF,PetscSFLink,PetscSFDirection,void**,void**,MPI_Request**,MPI_Request**);
201 
202 /* Do Pack/Unpack/Fetch/Scatter with the link */
203 PETSC_INTERN PetscErrorCode PetscSFLinkPackRootData  (PetscSF,PetscSFLink,PetscSFScope,const void*);
204 PETSC_INTERN PetscErrorCode PetscSFLinkPackLeafData  (PetscSF,PetscSFLink,PetscSFScope,const void*);
205 PETSC_INTERN PetscErrorCode PetscSFLinkUnpackRootData(PetscSF,PetscSFLink,PetscSFScope,void*,MPI_Op);
206 PETSC_INTERN PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF,PetscSFLink,PetscSFScope,void*,MPI_Op);
207 PETSC_INTERN PetscErrorCode PetscSFLinkFetchRootData (PetscSF,PetscSFLink,PetscSFScope,void*,MPI_Op);
208 
209 PETSC_INTERN PetscErrorCode PetscSFLinkBcastAndOpLocal(PetscSF,PetscSFLink,const void*,void*,MPI_Op);
210 PETSC_INTERN PetscErrorCode PetscSFLinkReduceLocal(PetscSF,PetscSFLink,const void*,void*,MPI_Op);
211 PETSC_INTERN PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF,PetscSFLink,void*,const void*,void*,MPI_Op);
212 
213 PETSC_INTERN PetscErrorCode PetscSFSetUpPackFields(PetscSF sf);
214 PETSC_INTERN PetscErrorCode PetscSFResetPackFields(PetscSF sf);
215 PETSC_INTERN PetscErrorCode PetscSFCreatePackOpt(PetscInt,const PetscInt*,const PetscInt*,PetscSFPackOpt*);
216 PETSC_INTERN PetscErrorCode PetscSFDestroyPackOpt(PetscSFPackOpt *out);
217 
218 
219 /* Get root indices used for pack/unpack
220 
221 Input arguments:
222   +sf    - StarForest
223   .link  - The link, which provides the stream for the async memcpy (In SF, we make all GPU operations asynchronous to avoid unexpected pipeline stalls)
224   .scope - Which part of the indices? (PETSCSF_LOCAL or PETSCSF_REMOTE)
225   .mtype - In what type of memory? (PETSC_MEMTYPE_DEVICE or PETSC_MEMTYPE_HOST)
226 
227  Output arguments:
228   +count   - Count of indices
229   .start   - The first index (only useful when indices is NULL)
230   -indices - indices of roots for pack/unpack. NULL means indices are contiguous
231  */
232 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkGetRootIndices(PetscSF sf,PetscSFLink link,PetscMemType mtype,PetscSFScope scope,PetscInt *count,PetscInt *start,const PetscInt **indices)
233 {
234   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
235   PetscInt       offset;
236 
237   PetscFunctionBegin;
238   if (count) *count = bas->rootbuflen[scope];
239   if (start) *start = bas->rootstart[scope];
240   if (indices) {
241     if (bas->rootcontig[scope]) *indices = NULL;
242     else {
243       offset = (scope == PETSCSF_LOCAL)? 0 : bas->ioffset[bas->ndiranks];
244       if (mtype == PETSC_MEMTYPE_HOST) {*indices = bas->irootloc + offset;}
245 #if defined(PETSC_HAVE_CUDA)
246       else {
247         if (!bas->irootloc_d[scope]) {
248           cudaError_t    cerr;
249           size_t         size = bas->rootbuflen[scope]*sizeof(PetscInt);
250           cerr = cudaMalloc((void **)&bas->irootloc_d[scope],size);CHKERRCUDA(cerr);
251           cerr = cudaMemcpyAsync(bas->irootloc_d[scope],bas->irootloc+offset,size,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr);
252         }
253         *indices = bas->irootloc_d[scope];
254       }
255 #endif
256     }
257   }
258   PetscFunctionReturn(0);
259 }
260 
261 /* Get leaf indices used for pack/unpack
262 
263   See also PetscSFLinkGetRootIndices()
264  */
265 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkGetLeafIndices(PetscSF sf,PetscSFLink link,PetscMemType mtype,PetscSFScope scope,PetscInt *count,PetscInt *start,const PetscInt **indices)
266 {
267   PetscInt   offset;
268 
269   PetscFunctionBegin;
270   if (count) *count = sf->leafbuflen[scope];
271   if (start) *start = sf->leafstart[scope];
272   if (indices) {
273     if (sf->leafcontig[scope]) *indices = NULL;
274     else {
275       offset = (scope == PETSCSF_LOCAL)? 0 : sf->roffset[sf->ndranks];
276       if (mtype == PETSC_MEMTYPE_HOST) {*indices = sf->rmine + offset;}
277   #if defined(PETSC_HAVE_CUDA)
278       else {
279         if (!sf->rmine_d[scope]) {
280           cudaError_t  cerr;
281           size_t       size = sf->leafbuflen[scope]*sizeof(PetscInt);
282           cerr = cudaMalloc((void **)&sf->rmine_d[scope],size);CHKERRCUDA(cerr);
283           cerr = cudaMemcpyAsync(sf->rmine_d[scope],sf->rmine+offset,size,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr);
284         }
285         *indices = sf->rmine_d[scope];
286       }
287   #endif
288     }
289   }
290   PetscFunctionReturn(0);
291 }
292 
293 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkMPIWaitall(PetscSF sf,PetscSFLink link,PetscSFDirection direction)
294 {
295   PetscErrorCode       ierr;
296   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
297   const PetscMemType   rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi;
298   const PetscInt       rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi;
299 
300   PetscFunctionBegin;
301   ierr = MPI_Waitall(bas->nrootreqs,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi],MPI_STATUSES_IGNORE);CHKERRQ(ierr);
302   ierr = MPI_Waitall(sf->nleafreqs, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi],MPI_STATUSES_IGNORE);CHKERRQ(ierr);
303   PetscFunctionReturn(0);
304 }
305 
306 #endif
307