xref: /petsc/src/vec/is/sf/impls/basic/sfpack.h (revision 2da392cc7c10228af19ad9843ce5155178acb644)
1 #if !defined(__SFPACK_H)
2 #define __SFPACK_H
3 
4 #include <../src/vec/is/sf/impls/basic/sfbasic.h>
5 
6 /* We separate SF communications for SFBasic and SFNeighbor in two parts: local (self,intra-rank) and remote (inter-rank) */
7 typedef enum {PETSCSF_LOCAL=0, PETSCSF_REMOTE} PetscSFScope;
8 
9 /* Optimizations in packing & unpacking for destination ranks.
10 
11   Suppose there are m indices stored in idx[], and two addresses u, p. We want to do packing:
12      p[i] = u[idx[i]], for i in [0,m)
13 
14   Indices are associated with n ranks and each rank's indices are stored consecutively in idx[].
15   We go through indices for each rank and see if they are indices of a 3D submatrix of size [dx,dy,dz] in
16   a parent matrix of size [X,Y,Z], with the submatrix's first index being <start>.
17 
18   E.g., for indices 1,2,3, 6,7,8, 11,12,13, the submatrix size is [3,3,1] with start=1, and the parent matrix's size
19   is [5,3,1]. For simplicity, if any destination rank does not have this pattern, we give up the optimization.
20 
21   Note before using this per-rank optimization, one should check leafcontig[], rootcontig[], which say
22   indices in whole are contiguous, and therefore much more useful than this one when true.
23  */
24 struct _n_PetscSFPackOpt {
25   PetscInt       *array;      /* [7*n+2] Memory pool for other fields in this struct. Used to easily copy this struct to GPU */
26   PetscInt       n;           /* Number of destination ranks */
27   PetscInt       *offset;     /* [n+1] Offsets of indices for each rank. offset[0]=0, offset[i+1]=offset[i]+dx[i]*dy[i]*dz[i] */
28   PetscInt       *start;      /* [n] First index */
29   PetscInt       *dx,*dy,*dz; /* [n] Lengths of the submatrix in X, Y, Z dimension. */
30   PetscInt       *X,*Y;       /* [n] Lengths of the outer matrix in X, Y. We do not care Z. */
31 };
32 
33 /* An abstract class that defines a communication link, which includes how to pack/unpack data and send/recv buffers
34  */
35 struct _n_PetscSFLink {
36   PetscErrorCode (*h_Pack)            (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*);
37   PetscErrorCode (*h_UnpackAndInsert) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
38   PetscErrorCode (*h_UnpackAndAdd)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
39   PetscErrorCode (*h_UnpackAndMin)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
40   PetscErrorCode (*h_UnpackAndMax)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
41   PetscErrorCode (*h_UnpackAndMinloc) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
42   PetscErrorCode (*h_UnpackAndMaxloc) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
43   PetscErrorCode (*h_UnpackAndMult)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
44   PetscErrorCode (*h_UnpackAndLAND)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
45   PetscErrorCode (*h_UnpackAndBAND)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
46   PetscErrorCode (*h_UnpackAndLOR)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
47   PetscErrorCode (*h_UnpackAndBOR)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
48   PetscErrorCode (*h_UnpackAndLXOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
49   PetscErrorCode (*h_UnpackAndBXOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
50   PetscErrorCode (*h_FetchAndAdd)     (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,      void*);
51 
52   PetscErrorCode (*h_ScatterAndInsert)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
53   PetscErrorCode (*h_ScatterAndAdd)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
54   PetscErrorCode (*h_ScatterAndMin)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
55   PetscErrorCode (*h_ScatterAndMax)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
56   PetscErrorCode (*h_ScatterAndMinloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
57   PetscErrorCode (*h_ScatterAndMaxloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
58   PetscErrorCode (*h_ScatterAndMult)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
59   PetscErrorCode (*h_ScatterAndLAND)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
60   PetscErrorCode (*h_ScatterAndBAND)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
61   PetscErrorCode (*h_ScatterAndLOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
62   PetscErrorCode (*h_ScatterAndBOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
63   PetscErrorCode (*h_ScatterAndLXOR)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
64   PetscErrorCode (*h_ScatterAndBXOR)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
65 
66   PetscErrorCode (*h_FetchAndAddLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*);
67 
68   PetscBool      deviceinited;        /* Are device related fields initialized? */
69 #if defined(PETSC_HAVE_CUDA)
70   /* These fields are lazily initialized in a sense that only when device pointers are passed to an SF, the SF
71      will set them, otherwise it just leaves them alone even though PETSC_HAVE_CUDA. Packing routines using
72      regular ops when there are no data race chances.
73   */
74   PetscErrorCode (*d_Pack)            (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*);
75   PetscErrorCode (*d_UnpackAndInsert) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
76   PetscErrorCode (*d_UnpackAndAdd)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
77   PetscErrorCode (*d_UnpackAndMin)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
78   PetscErrorCode (*d_UnpackAndMax)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
79   PetscErrorCode (*d_UnpackAndMinloc) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
80   PetscErrorCode (*d_UnpackAndMaxloc) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
81   PetscErrorCode (*d_UnpackAndMult)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
82   PetscErrorCode (*d_UnpackAndLAND)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
83   PetscErrorCode (*d_UnpackAndBAND)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
84   PetscErrorCode (*d_UnpackAndLOR)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
85   PetscErrorCode (*d_UnpackAndBOR)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
86   PetscErrorCode (*d_UnpackAndLXOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
87   PetscErrorCode (*d_UnpackAndBXOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
88   PetscErrorCode (*d_FetchAndAdd)     (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,      void*);
89 
90   PetscErrorCode (*d_ScatterAndInsert)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
91   PetscErrorCode (*d_ScatterAndAdd)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
92   PetscErrorCode (*d_ScatterAndMin)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
93   PetscErrorCode (*d_ScatterAndMax)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
94   PetscErrorCode (*d_ScatterAndMinloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
95   PetscErrorCode (*d_ScatterAndMaxloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
96   PetscErrorCode (*d_ScatterAndMult)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
97   PetscErrorCode (*d_ScatterAndLAND)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
98   PetscErrorCode (*d_ScatterAndBAND)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
99   PetscErrorCode (*d_ScatterAndLOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
100   PetscErrorCode (*d_ScatterAndBOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
101   PetscErrorCode (*d_ScatterAndLXOR)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
102   PetscErrorCode (*d_ScatterAndBXOR)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
103   PetscErrorCode (*d_FetchAndAddLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*);
104 
105   /* Packing routines using atomics when there are data race chances */
106   PetscErrorCode (*da_UnpackAndInsert)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
107   PetscErrorCode (*da_UnpackAndAdd)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
108   PetscErrorCode (*da_UnpackAndMin)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
109   PetscErrorCode (*da_UnpackAndMax)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
110   PetscErrorCode (*da_UnpackAndMinloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
111   PetscErrorCode (*da_UnpackAndMaxloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
112   PetscErrorCode (*da_UnpackAndMult)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
113   PetscErrorCode (*da_UnpackAndLAND)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
114   PetscErrorCode (*da_UnpackAndBAND)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
115   PetscErrorCode (*da_UnpackAndLOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
116   PetscErrorCode (*da_UnpackAndBOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
117   PetscErrorCode (*da_UnpackAndLXOR)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
118   PetscErrorCode (*da_UnpackAndBXOR)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
119   PetscErrorCode (*da_FetchAndAdd)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,      void*);
120 
121   PetscErrorCode (*da_ScatterAndInsert)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
122   PetscErrorCode (*da_ScatterAndAdd)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
123   PetscErrorCode (*da_ScatterAndMin)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
124   PetscErrorCode (*da_ScatterAndMax)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
125   PetscErrorCode (*da_ScatterAndMinloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
126   PetscErrorCode (*da_ScatterAndMaxloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
127   PetscErrorCode (*da_ScatterAndMult)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
128   PetscErrorCode (*da_ScatterAndLAND)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
129   PetscErrorCode (*da_ScatterAndBAND)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
130   PetscErrorCode (*da_ScatterAndLOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
131   PetscErrorCode (*da_ScatterAndBOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
132   PetscErrorCode (*da_ScatterAndLXOR)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
133   PetscErrorCode (*da_ScatterAndBXOR)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
134   PetscErrorCode (*da_FetchAndAddLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*);
135 
136   PetscInt     maxResidentThreadsPerGPU;     /* It is a copy from SF for convenience */
137   cudaStream_t stream;                       /* Stream to launch pack/unapck kernels if not using the default stream */
138 #endif
139   PetscMPIInt  tag;                          /* Each link has a tag so we can perform multiple SF ops at the same time */
140   MPI_Datatype unit;                         /* The MPI datatype this PetscSFLink is built for */
141   MPI_Datatype basicunit;                    /* unit is made of MPI builtin dataype basicunit */
142   PetscBool    isbuiltin;                    /* Is unit an MPI/PETSc builtin datatype? If it is true, then bs=1 and basicunit is equivalent to unit */
143   size_t       unitbytes;                    /* Number of bytes in a unit */
144   PetscInt     bs;                           /* Number of basic units in a unit */
145   const void   *rootdata,*leafdata;          /* rootdata and leafdata the link is working on. They are used as keys for pending links. */
146   PetscMemType rootmtype,leafmtype;          /* root/leafdata's memory type */
147 
148   /* For local and remote communication */
149   PetscMemType rootmtype_mpi,leafmtype_mpi;  /* Mtypes of buffers passed to MPI. If use_gpu_aware_mpi, they are same as root/leafmtype. Otherwise they are PETSC_MEMTYPE_HOST */
150   PetscBool    rootdirect[2],leafdirect[2];  /* Can root/leafdata be directly passed to SF (i.e., without buffering). In layout of [PETSCSF_LOCAL/REMOTE]. See more in PetscSFLinkCreate() */
151   PetscInt     rootdirect_mpi,leafdirect_mpi;/* Can root/leafdata for remote be directly passed to MPI? 1: yes, 0: no. See more in PetscSFLinkCreate() */
152   const void   *rootdatadirect[2][2];        /* The root/leafdata used to init root/leaf requests, in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE]. */
153   const void   *leafdatadirect[2][2];        /* ... We need them to look up links when root/leafdirect_mpi are true */
154   char         *rootbuf[2][2];               /* Buffers for packed roots, in layout of [PETSCSF_LOCAL/REMOTE][PETSC_MEMTYPE] */
155   char         *rootbuf_alloc[2][2];         /* Log memory allocated by petsc. We need it since rootbuf[][] may point to rootdata given by user */
156   char         *leafbuf[2][2];               /* Buffers for packed leaves, in layout of [PETSCSF_LOCAL/REMOTE][PETSC_MEMTYPE] */
157   char         *leafbuf_alloc[2][2];
158   MPI_Request  *rootreqs[2][2][2];           /* Root requests in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][rootdirect_mpi] */
159   MPI_Request  *leafreqs[2][2][2];           /* Leaf requests in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][leafdirect_mpi] */
160   PetscBool    rootreqsinited[2][2][2];      /* Are root requests initialized? Also in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][rootdirect_mpi]*/
161   PetscBool    leafreqsinited[2][2][2];      /* Are leaf requests initialized? Also in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][leafdirect_mpi]*/
162   MPI_Request  *reqs;                        /* An array of length (nrootreqs+nleafreqs)*8. Pointers in rootreqs[][][] and leafreqs[][][] point here */
163   PetscSFLink  next;
164 };
165 
166 PETSC_INTERN PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF,MPI_Datatype,const void*,const void*);
167 
168 /* Create/setup/retrieve/destroy a link */
169 PETSC_INTERN PetscErrorCode PetscSFLinkCreate(PetscSF,MPI_Datatype,PetscMemType,const void*,PetscMemType,const void*,MPI_Op,PetscSFOperation,PetscSFLink*);
170 PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_Host(PetscSF,PetscSFLink,MPI_Datatype);
171 #if defined(PETSC_HAVE_CUDA)
172 PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_Device(PetscSF,PetscSFLink,MPI_Datatype);
173 #else
174 #define PetscSFLinkSetUp_Device(a,b,c) 0
175 #endif
176 PETSC_INTERN PetscErrorCode PetscSFLinkGetInUse(PetscSF,MPI_Datatype,const void*,const void*,PetscCopyMode,PetscSFLink*);
177 PETSC_INTERN PetscErrorCode PetscSFLinkReclaim(PetscSF,PetscSFLink*);
178 PETSC_INTERN PetscErrorCode PetscSFLinkDestroy(PetscSF,PetscSFLink*);
179 
180 /* Get pack/unpack function pointers from a link */
181 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkGetPack(PetscSFLink link,PetscMemType mtype,PetscErrorCode (**Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*))
182 {
183   PetscFunctionBegin;
184   if (mtype == PETSC_MEMTYPE_HOST) *Pack = link->h_Pack;
185 #if defined(PETSC_HAVE_CUDA)
186   else *Pack = link->d_Pack;
187 #endif
188   PetscFunctionReturn(0);
189 }
190 PETSC_INTERN PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*));
191 PETSC_INTERN PetscErrorCode PetscSFLinkGetFetchAndOp (PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**FetchAndOp) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*));
192 PETSC_INTERN PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*));
193 PETSC_INTERN PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*));
194 PETSC_INTERN PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF,PetscSFLink,PetscSFDirection,void**,void**,MPI_Request**,MPI_Request**);
195 
196 /* Do Pack/Unpack/Fetch/Scatter with the link */
197 PETSC_INTERN PetscErrorCode PetscSFLinkPackRootData  (PetscSF,PetscSFLink,PetscSFScope,const void*);
198 PETSC_INTERN PetscErrorCode PetscSFLinkPackLeafData  (PetscSF,PetscSFLink,PetscSFScope,const void*);
199 PETSC_INTERN PetscErrorCode PetscSFLinkUnpackRootData(PetscSF,PetscSFLink,PetscSFScope,void*,MPI_Op);
200 PETSC_INTERN PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF,PetscSFLink,PetscSFScope,void*,MPI_Op);
201 PETSC_INTERN PetscErrorCode PetscSFLinkFetchRootData (PetscSF,PetscSFLink,PetscSFScope,void*,MPI_Op);
202 
203 PETSC_INTERN PetscErrorCode PetscSFLinkBcastAndOpLocal(PetscSF,PetscSFLink,const void*,void*,MPI_Op);
204 PETSC_INTERN PetscErrorCode PetscSFLinkReduceLocal(PetscSF,PetscSFLink,const void*,void*,MPI_Op);
205 PETSC_INTERN PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF,PetscSFLink,void*,const void*,void*,MPI_Op);
206 
207 PETSC_INTERN PetscErrorCode PetscSFSetUpPackFields(PetscSF sf);
208 PETSC_INTERN PetscErrorCode PetscSFResetPackFields(PetscSF sf);
209 
210 /* Get root indices used for pack/unpack
211 
212 Input arguments:
213   +sf    - StarForest
214   .link  - The link, which provides the stream for the async memcpy (In SF, we make all GPU operations asynchronous to avoid unexpected pipeline stalls)
215   .scope - Which part of the indices? (PETSCSF_LOCAL or PETSCSF_REMOTE)
216   .mtype - In what type of memory? (PETSC_MEMTYPE_DEVICE or PETSC_MEMTYPE_HOST)
217 
218  Output arguments:
219   +count   - Count of indices
220   .start   - The first index (only useful when indices is NULL)
221   -indices - indices of roots for pack/unpack. NULL means indices are contiguous
222  */
223 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkGetRootPackOptAndIndices(PetscSF sf,PetscSFLink link,PetscMemType mtype,PetscSFScope scope,PetscInt *count,PetscInt *start,PetscSFPackOpt *opt,const PetscInt **indices)
224 {
225   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
226   PetscInt       offset;
227 
228   PetscFunctionBegin;
229   *count   = bas->rootbuflen[scope];
230   *start   = bas->rootstart[scope];
231   *opt     = NULL;
232   *indices = NULL;
233 
234   /* We have these rules:
235     1) opt == NULL && indices == NULL ==> indices are contiguous.
236     2) opt != NULL ==> indices are in 3D but not contiguous. On host, indices != NULL since indices are already available and we do not
237        want to enforce all operations to use opt; but on device, indices = NULL since we do not want to copy indices to device.
238   */
239   if (!bas->rootcontig[scope]) {
240     offset = (scope == PETSCSF_LOCAL)? 0 : bas->ioffset[bas->ndiranks];
241     if (mtype == PETSC_MEMTYPE_HOST) {*opt = bas->rootpackopt[scope]; *indices = bas->irootloc + offset;}
242 #if defined(PETSC_HAVE_CUDA)
243     else {
244       PetscErrorCode ierr;
245       cudaError_t    cerr;
246       size_t         size;
247       if (bas->rootpackopt[scope]) {
248         if (!bas->rootpackopt_d[scope]) {
249           ierr = PetscMalloc1(1,&bas->rootpackopt_d[scope]);CHKERRQ(ierr);
250           ierr = PetscArraycpy(bas->rootpackopt_d[scope],bas->rootpackopt[scope],1);CHKERRQ(ierr); /* Make pointers in bas->rootpackopt_d[] still work on host */
251           size = (bas->rootpackopt[scope]->n*7+2)*sizeof(PetscInt); /* See comments at struct _n_PetscSFPackOpt*/
252           cerr = cudaMalloc((void **)&bas->rootpackopt_d[scope]->array,size);CHKERRCUDA(cerr);
253           cerr = cudaMemcpyAsync(bas->rootpackopt_d[scope]->array,bas->rootpackopt[scope]->array,size,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr);
254         }
255         *opt = bas->rootpackopt_d[scope];
256       } else { /* On device, we only provide indices when there is no optimization. We're reluctant to copy indices to device. */
257         if (!bas->irootloc_d[scope]) {
258           size = bas->rootbuflen[scope]*sizeof(PetscInt);
259           cerr = cudaMalloc((void **)&bas->irootloc_d[scope],size);CHKERRCUDA(cerr);
260           cerr = cudaMemcpyAsync(bas->irootloc_d[scope],bas->irootloc+offset,size,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr);
261         }
262         *indices = bas->irootloc_d[scope];
263       }
264     }
265 #endif
266   }
267   PetscFunctionReturn(0);
268 }
269 
270 /* Get leaf indices used for pack/unpack
271 
272   See also PetscSFLinkGetRootPackOptAndIndices()
273  */
274 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkGetLeafPackOptAndIndices(PetscSF sf,PetscSFLink link,PetscMemType mtype,PetscSFScope scope,PetscInt *count,PetscInt *start,PetscSFPackOpt *opt,const PetscInt **indices)
275 {
276   PetscInt   offset;
277 
278   PetscFunctionBegin;
279   *count   = sf->leafbuflen[scope];
280   *start   = sf->leafstart[scope];
281   *opt     = NULL;
282   *indices = NULL;
283   if (!sf->leafcontig[scope]) {
284     offset = (scope == PETSCSF_LOCAL)? 0 : sf->roffset[sf->ndranks];
285     if (mtype == PETSC_MEMTYPE_HOST) {*opt = sf->leafpackopt[scope]; *indices = sf->rmine + offset;}
286 #if defined(PETSC_HAVE_CUDA)
287     else {
288       PetscErrorCode ierr;
289       cudaError_t    cerr;
290       size_t         size;
291       if (sf->leafpackopt[scope]) {
292         if (!sf->leafpackopt_d[scope]) {
293           ierr = PetscMalloc1(1,&sf->leafpackopt_d[scope]);CHKERRQ(ierr);
294           ierr = PetscArraycpy(sf->leafpackopt_d[scope],sf->leafpackopt[scope],1);CHKERRQ(ierr);
295           size = (sf->leafpackopt[scope]->n*7+2)*sizeof(PetscInt); /* See comments at struct _n_PetscSFPackOpt*/
296           cerr = cudaMalloc((void **)&sf->leafpackopt_d[scope]->array,size);CHKERRCUDA(cerr); /* Change ->array to a device pointer */
297           cerr = cudaMemcpyAsync(sf->leafpackopt_d[scope]->array,sf->leafpackopt[scope]->array,size,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr);
298         }
299         *opt = sf->leafpackopt_d[scope];
300       } else {
301         if (!sf->rmine_d[scope]) {
302           size = sf->leafbuflen[scope]*sizeof(PetscInt);
303           cerr = cudaMalloc((void **)&sf->rmine_d[scope],size);CHKERRCUDA(cerr);
304           cerr = cudaMemcpyAsync(sf->rmine_d[scope],sf->rmine+offset,size,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr);
305         }
306         *indices = sf->rmine_d[scope];
307       }
308     }
309 #endif
310   }
311   PetscFunctionReturn(0);
312 }
313 
314 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkMPIWaitall(PetscSF sf,PetscSFLink link,PetscSFDirection direction)
315 {
316   PetscErrorCode       ierr;
317   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
318   const PetscMemType   rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi;
319   const PetscInt       rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi;
320 
321   PetscFunctionBegin;
322   ierr = MPI_Waitall(bas->nrootreqs,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi],MPI_STATUSES_IGNORE);CHKERRQ(ierr);
323   ierr = MPI_Waitall(sf->nleafreqs, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi],MPI_STATUSES_IGNORE);CHKERRQ(ierr);
324   PetscFunctionReturn(0);
325 }
326 
327 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkMemcpy(PetscSF sf,PetscSFLink link,PetscMemType dstmtype,void* dst,PetscMemType srcmtype,const void*src,size_t n)
328 {
329   PetscFunctionBegin;
330   if (n) {
331     if (dstmtype == PETSC_MEMTYPE_HOST && srcmtype == PETSC_MEMTYPE_HOST) {PetscErrorCode ierr = PetscMemcpy(dst,src,n);CHKERRQ(ierr);}
332 #if defined(PETSC_HAVE_CUDA)
333     else if (dstmtype == PETSC_MEMTYPE_DEVICE && srcmtype == PETSC_MEMTYPE_HOST)   {
334       cudaError_t    err  = cudaMemcpyAsync(dst,src,n,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(err);
335       PetscErrorCode ierr = PetscLogCpuToGpu(n);CHKERRQ(ierr);
336     } else if (dstmtype == PETSC_MEMTYPE_HOST && srcmtype == PETSC_MEMTYPE_DEVICE) {
337       cudaError_t    err  = cudaMemcpyAsync(dst,src,n,cudaMemcpyDeviceToHost,link->stream);CHKERRCUDA(err);
338       PetscErrorCode ierr = PetscLogGpuToCpu(n);CHKERRQ(ierr);
339     } else if (dstmtype == PETSC_MEMTYPE_DEVICE && srcmtype == PETSC_MEMTYPE_DEVICE) {cudaError_t err = cudaMemcpyAsync(dst,src,n,cudaMemcpyDeviceToDevice,link->stream);CHKERRCUDA(err);}
340 #endif
341     else SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType for dst %d and src %d",(int)dstmtype,(int)srcmtype);
342   }
343   PetscFunctionReturn(0);
344 }
345 #endif
346