xref: /petsc/src/vec/is/sf/impls/basic/sfpack.h (revision 9140fee14acbea959c6ed74f4836a1a89f708038)
1 #pragma once
2 
3 #include <../src/vec/is/sf/impls/basic/sfbasic.h>
4 #if defined(PETSC_HAVE_CUDA)
5   #include <petscdevice_cuda.h>
6 typedef cudaStream_t cupmStream_t;
7 typedef cudaEvent_t  cupmEvent_t;
8 #endif
9 
10 #if defined(PETSC_HAVE_HIP)
11   #include <petscdevice_hip.h>
12 typedef hipStream_t cupmStream_t;
13 typedef hipEvent_t  cupmEvent_t;
14 #endif
15 
16 /* In terms of function overloading, long long int is a different type than int64_t, which PetscInt might be defined to.
17    We prefer long long int over PetscInt (int64_t), since CUDA atomics are built around (unsigned) long long int.
18  */
19 typedef long long int          llint;
20 typedef unsigned long long int ullint;
21 
22 /* We separate SF communications for SFBasic and SFNeighbor in two parts: local (self,intra-rank) and remote (inter-rank) */
23 typedef enum {
24   PETSCSF_LOCAL = 0,
25   PETSCSF_REMOTE
26 } PetscSFScope;
27 
28 /* Optimizations in packing & unpacking for destination ranks.
29 
30   Suppose there are m indices stored in idx[], and two addresses u, p. We want to do packing:
31      p[i] = u[idx[i]], for i in [0,m)
32 
33   Indices are associated with n ranks and each rank's indices are stored consecutively in idx[].
34   We go through indices for each rank and see if they are indices of a 3D submatrix of size [dx,dy,dz] in
35   a parent matrix of size [X,Y,Z], with the submatrix's first index being <start>.
36 
37   E.g., for indices 1,2,3, 6,7,8, 11,12,13, the submatrix size is [3,3,1] with start=1, and the parent matrix's size
38   is [5,3,1]. For simplicity, if any destination rank does not have this pattern, we give up the optimization.
39 
40   Note before using this per-rank optimization, one should check leafcontig[], rootcontig[], which say
41   indices in whole are contiguous, and therefore much more useful than this one when true.
42  */
43 struct _n_PetscSFPackOpt {
44   PetscInt *array;        /* [7*n+2] Memory pool for other fields in this struct. Used to easily copy this struct to GPU */
45   PetscInt  n;            /* Number of destination ranks */
46   PetscInt *offset;       /* [n+1] Offsets of indices for each rank. offset[0]=0, offset[i+1]=offset[i]+dx[i]*dy[i]*dz[i] */
47   PetscInt *start;        /* [n] First index */
48   PetscInt *dx, *dy, *dz; /* [n] Lengths of the submatrix in X, Y, Z dimension. */
49   PetscInt *X, *Y;        /* [n] Lengths of the outer matrix in X, Y. We do not care Z. */
50 };
51 
52 /* An abstract class that defines a communication link, which includes how to pack/unpack data and send/recv buffers
53  */
54 struct _n_PetscSFLink {
55   PetscErrorCode (*Memcpy)(PetscSFLink, PetscMemType, void *, PetscMemType, const void *, size_t); /* Async device memcopy might use stream in the link */
56   PetscErrorCode (*PrePack)(PetscSF, PetscSFLink, PetscSFDirection);
57   PetscErrorCode (*PostUnpack)(PetscSF, PetscSFLink, PetscSFDirection);
58   PetscErrorCode (*InitMPIRequests)(PetscSF, PetscSFLink, PetscSFDirection); // init (persistent) MPI requests
59   PetscErrorCode (*StartCommunication)(PetscSF, PetscSFLink, PetscSFDirection);
60   PetscErrorCode (*FinishCommunication)(PetscSF, PetscSFLink, PetscSFDirection);
61   PetscErrorCode (*SyncDevice)(PetscSFLink);
62   PetscErrorCode (*SyncStream)(PetscSFLink);
63   PetscErrorCode (*Destroy)(PetscSF, PetscSFLink);
64 
65   PetscErrorCode (*BuildDependenceBegin)(PetscSF, PetscSFLink, PetscSFDirection);
66   PetscErrorCode (*BuildDependenceEnd)(PetscSF, PetscSFLink, PetscSFDirection);
67 
68   PetscErrorCode (*h_Pack)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *);
69   PetscErrorCode (*h_UnpackAndInsert)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
70   PetscErrorCode (*h_UnpackAndAdd)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
71   PetscErrorCode (*h_UnpackAndMin)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
72   PetscErrorCode (*h_UnpackAndMax)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
73   PetscErrorCode (*h_UnpackAndMinloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
74   PetscErrorCode (*h_UnpackAndMaxloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
75   PetscErrorCode (*h_UnpackAndMult)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
76   PetscErrorCode (*h_UnpackAndLAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
77   PetscErrorCode (*h_UnpackAndBAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
78   PetscErrorCode (*h_UnpackAndLOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
79   PetscErrorCode (*h_UnpackAndBOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
80   PetscErrorCode (*h_UnpackAndLXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
81   PetscErrorCode (*h_UnpackAndBXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
82   PetscErrorCode (*h_FetchAndAdd)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, void *);
83 
84   PetscErrorCode (*h_ScatterAndInsert)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
85   PetscErrorCode (*h_ScatterAndAdd)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
86   PetscErrorCode (*h_ScatterAndMin)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
87   PetscErrorCode (*h_ScatterAndMax)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
88   PetscErrorCode (*h_ScatterAndMinloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
89   PetscErrorCode (*h_ScatterAndMaxloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
90   PetscErrorCode (*h_ScatterAndMult)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
91   PetscErrorCode (*h_ScatterAndLAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
92   PetscErrorCode (*h_ScatterAndBAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
93   PetscErrorCode (*h_ScatterAndLOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
94   PetscErrorCode (*h_ScatterAndBOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
95   PetscErrorCode (*h_ScatterAndLXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
96   PetscErrorCode (*h_ScatterAndBXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
97 
98   PetscErrorCode (*h_FetchAndAddLocal)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *);
99 
100   PetscBool deviceinited; /* Are device related fields initialized? */
101 #if defined(PETSC_HAVE_DEVICE)
102   /* These fields are lazily initialized in a sense that only when device pointers are passed to an SF, the SF
103      will set them, otherwise it just leaves them alone. Packing routines using regular ops when there are no data race chances.
104   */
105   PetscErrorCode (*d_Pack)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *);
106   PetscErrorCode (*d_UnpackAndInsert)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
107   PetscErrorCode (*d_UnpackAndAdd)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
108   PetscErrorCode (*d_UnpackAndMin)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
109   PetscErrorCode (*d_UnpackAndMax)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
110   PetscErrorCode (*d_UnpackAndMinloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
111   PetscErrorCode (*d_UnpackAndMaxloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
112   PetscErrorCode (*d_UnpackAndMult)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
113   PetscErrorCode (*d_UnpackAndLAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
114   PetscErrorCode (*d_UnpackAndBAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
115   PetscErrorCode (*d_UnpackAndLOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
116   PetscErrorCode (*d_UnpackAndBOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
117   PetscErrorCode (*d_UnpackAndLXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
118   PetscErrorCode (*d_UnpackAndBXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
119   PetscErrorCode (*d_FetchAndAdd)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, void *);
120 
121   PetscErrorCode (*d_ScatterAndInsert)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
122   PetscErrorCode (*d_ScatterAndAdd)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
123   PetscErrorCode (*d_ScatterAndMin)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
124   PetscErrorCode (*d_ScatterAndMax)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
125   PetscErrorCode (*d_ScatterAndMinloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
126   PetscErrorCode (*d_ScatterAndMaxloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
127   PetscErrorCode (*d_ScatterAndMult)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
128   PetscErrorCode (*d_ScatterAndLAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
129   PetscErrorCode (*d_ScatterAndBAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
130   PetscErrorCode (*d_ScatterAndLOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
131   PetscErrorCode (*d_ScatterAndBOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
132   PetscErrorCode (*d_ScatterAndLXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
133   PetscErrorCode (*d_ScatterAndBXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
134   PetscErrorCode (*d_FetchAndAddLocal)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *);
135 
136   /* Packing routines using atomics when there are data race chances */
137   PetscErrorCode (*da_UnpackAndInsert)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
138   PetscErrorCode (*da_UnpackAndAdd)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
139   PetscErrorCode (*da_UnpackAndMin)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
140   PetscErrorCode (*da_UnpackAndMax)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
141   PetscErrorCode (*da_UnpackAndMinloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
142   PetscErrorCode (*da_UnpackAndMaxloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
143   PetscErrorCode (*da_UnpackAndMult)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
144   PetscErrorCode (*da_UnpackAndLAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
145   PetscErrorCode (*da_UnpackAndBAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
146   PetscErrorCode (*da_UnpackAndLOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
147   PetscErrorCode (*da_UnpackAndBOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
148   PetscErrorCode (*da_UnpackAndLXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
149   PetscErrorCode (*da_UnpackAndBXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
150   PetscErrorCode (*da_FetchAndAdd)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, void *);
151 
152   PetscErrorCode (*da_ScatterAndInsert)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
153   PetscErrorCode (*da_ScatterAndAdd)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
154   PetscErrorCode (*da_ScatterAndMin)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
155   PetscErrorCode (*da_ScatterAndMax)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
156   PetscErrorCode (*da_ScatterAndMinloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
157   PetscErrorCode (*da_ScatterAndMaxloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
158   PetscErrorCode (*da_ScatterAndMult)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
159   PetscErrorCode (*da_ScatterAndLAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
160   PetscErrorCode (*da_ScatterAndBAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
161   PetscErrorCode (*da_ScatterAndLOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
162   PetscErrorCode (*da_ScatterAndBOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
163   PetscErrorCode (*da_ScatterAndLXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
164   PetscErrorCode (*da_ScatterAndBXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
165   PetscErrorCode (*da_FetchAndAddLocal)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *);
166   #if defined(PETSC_HAVE_CUDA) || defined(PETSC_HAVE_HIP)
167   PetscInt     maxResidentThreadsPerGPU; /* It is a copy from SF for convenience */
168   cupmStream_t stream;                   /* stream on which input/output root/leafdata is computed on (default is PetscDefaultCudaStream) */
169   #endif
170 #endif
171   PetscMPIInt  tag;                  /* Each link has a tag so we can perform multiple SF ops at the same time */
172   MPI_Datatype unit;                 /* The MPI datatype this PetscSFLink is built for */
173   MPI_Datatype basicunit;            /* unit is made of MPI builtin dataype basicunit */
174   PetscBool    isbuiltin;            /* Is unit an MPI/PETSc builtin datatype? If it is true, then bs=1 and basicunit is equivalent to unit */
175   size_t       unitbytes;            /* Number of bytes in a unit */
176   PetscInt     bs;                   /* Number of basic units in a unit */
177   const void  *rootdata, *leafdata;  /* rootdata and leafdata the link is working on. They are used as keys for pending links. */
178   PetscMemType rootmtype, leafmtype; /* root/leafdata's memory type */
179 
180   /* For local and remote communication */
181   PetscMemType rootmtype_mpi, leafmtype_mpi;   /* Mtypes of buffers passed to MPI. If use_gpu_aware_mpi, they are same as root/leafmtype. Otherwise they are PETSC_MEMTYPE_HOST */
182   PetscBool    rootdirect[2], leafdirect[2];   /* Can root/leafdata be directly passed to SF (i.e., without buffering). In layout of [PETSCSF_LOCAL/REMOTE]. See more in PetscSFLinkCreate() */
183   PetscInt     rootdirect_mpi, leafdirect_mpi; /* Can root/leafdata for remote be directly passed to MPI? 1: yes, 0: no. See more in PetscSFLinkCreate() */
184   const void  *rootdatadirect[2][2];           /* The root/leafdata used to init root/leaf requests, in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE]. */
185   const void  *leafdatadirect[2][2];           /* ... We need them to look up links when root/leafdirect_mpi are true */
186   char        *rootbuf[2][2];                  /* Buffers for packed roots, in layout of [PETSCSF_LOCAL/REMOTE][PETSC_MEMTYPE]. PETSCSF_LOCAL does not need MPI, .. */
187                                                /* .. but in case rootmtype is different from leafmtype, we still need to pack local roots and then copy them to memory of leafmtype */
188   char        *rootbuf_alloc[2][2];            /* Log memory allocated by petsc. We need it since rootbuf[][] may point to rootdata given by user */
189   char        *leafbuf[2][2];                  /* Buffers for packed leaves, in layout of [PETSCSF_LOCAL/REMOTE][PETSC_MEMTYPE] */
190   char        *leafbuf_alloc[2][2];
191   MPI_Request *rootreqs[2][2][2];       /* Root requests in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][rootdirect_mpi] */
192   MPI_Request *leafreqs[2][2][2];       /* Leaf requests in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][leafdirect_mpi] */
193   PetscBool    rootreqsinited[2][2][2]; /* Are root requests initialized? Also in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][rootdirect_mpi]*/
194   PetscBool    leafreqsinited[2][2][2]; /* Are leaf requests initialized? Also in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][leafdirect_mpi]*/
195   MPI_Request *reqs;                    /* An array of length (nrootreqs+nleafreqs)*8. Pointers in rootreqs[][][] and leafreqs[][][] point here */
196   PetscSFLink  next;
197 
198   PetscBool use_nvshmem; /* Does this link use nvshem (vs. MPI) for communication? */
199 #if defined(PETSC_HAVE_NVSHMEM)
200   cupmEvent_t  dataReady;        /* Events to mark readiness of root/leafdata */
201   cupmEvent_t  endRemoteComm;    /* Events to mark end of local/remote communication */
202   cupmStream_t remoteCommStream; /* Streams for remote (i.e., inter-rank) communication */
203 
204   /* The buffers are allocated in device symmetric heap. Their length is the maximal length over all ranks in the comm, and therefore is the same. */
205   uint64_t *rootSendSig, *rootRecvSig; /* [max{niranks-ndiranks}], signals used when rootbuf works as send/recv buf */
206   uint64_t *leafSendSig, *leafRecvSig; /* [max{nranks-ndranks}], signals used when leafbuf works as send/recv buf */
207 #endif
208 };
209 
210 PETSC_INTERN PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF, MPI_Datatype, const void *, const void *);
211 
212 /* Create/setup/retrieve/destroy a link */
213 PETSC_INTERN PetscErrorCode PetscSFLinkCreate(PetscSF, MPI_Datatype, PetscMemType, const void *, PetscMemType, const void *, MPI_Op, PetscSFOperation, PetscSFLink *);
214 PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_Host(PetscSF, PetscSFLink, MPI_Datatype);
215 PETSC_INTERN PetscErrorCode PetscSFLinkGetInUse(PetscSF, MPI_Datatype, const void *, const void *, PetscCopyMode, PetscSFLink *);
216 PETSC_INTERN PetscErrorCode PetscSFLinkReclaim(PetscSF, PetscSFLink *);
217 PETSC_INTERN PetscErrorCode PetscSFLinkDestroy(PetscSF, PetscSFLink);
218 
219 /* Get pack/unpack function pointers from a link */
220 static inline PetscErrorCode PetscSFLinkGetPack(PetscSFLink link, PetscMemType mtype, PetscErrorCode (**Pack)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *))
221 {
222   PetscFunctionBegin;
223   if (PetscMemTypeHost(mtype)) *Pack = link->h_Pack;
224 #if defined(PETSC_HAVE_DEVICE)
225   else *Pack = link->d_Pack;
226 #endif
227   PetscFunctionReturn(PETSC_SUCCESS);
228 }
229 
230 PETSC_INTERN PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink, PetscMemType, MPI_Op, PetscBool, PetscErrorCode (**UnpackAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *));
231 PETSC_INTERN PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink, PetscMemType, MPI_Op, PetscBool, PetscErrorCode (**FetchAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, void *));
232 PETSC_INTERN PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink, PetscMemType, MPI_Op, PetscBool, PetscErrorCode (**ScatterAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *));
233 PETSC_INTERN PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink, PetscMemType, MPI_Op, PetscBool, PetscErrorCode (**FetchAndOpLocal)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *));
234 
235 /* Do Pack/Unpack/Fetch/Scatter with the link */
236 PETSC_INTERN PetscErrorCode PetscSFLinkPackRootData(PetscSF, PetscSFLink, PetscSFScope, const void *);
237 PETSC_INTERN PetscErrorCode PetscSFLinkPackLeafData(PetscSF, PetscSFLink, PetscSFScope, const void *);
238 PETSC_INTERN PetscErrorCode PetscSFLinkUnpackRootData(PetscSF, PetscSFLink, PetscSFScope, void *, MPI_Op);
239 PETSC_INTERN PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF, PetscSFLink, PetscSFScope, void *, MPI_Op);
240 PETSC_INTERN PetscErrorCode PetscSFLinkFetchAndOpRemote(PetscSF, PetscSFLink, void *, MPI_Op);
241 
242 PETSC_INTERN PetscErrorCode PetscSFLinkScatterLocal(PetscSF, PetscSFLink, PetscSFDirection, void *, void *, MPI_Op);
243 PETSC_INTERN PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF, PetscSFLink, void *, const void *, void *, MPI_Op);
244 
245 PETSC_INTERN PetscErrorCode PetscSFSetUpPackFields(PetscSF);
246 PETSC_INTERN PetscErrorCode PetscSFResetPackFields(PetscSF);
247 PETSC_INTERN PetscErrorCode PetscSFLinkCreate_MPI(PetscSF, MPI_Datatype, PetscMemType, const void *, PetscMemType, const void *, MPI_Op, PetscSFOperation, PetscSFLink *);
248 
249 #if defined(PETSC_HAVE_CUDA)
250 PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_CUDA(PetscSF, PetscSFLink, MPI_Datatype);
251 #endif
252 
253 #if defined(PETSC_HAVE_HIP)
254 PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_HIP(PetscSF, PetscSFLink, MPI_Datatype);
255 #endif
256 
257 #if defined(PETSC_HAVE_KOKKOS)
258 PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_Kokkos(PetscSF, PetscSFLink, MPI_Datatype);
259 #endif
260 
261 #if defined(PETSC_HAVE_NVSHMEM)
262 PETSC_INTERN PetscErrorCode PetscSFLinkCreate_NVSHMEM(PetscSF, MPI_Datatype, PetscMemType, const void *, PetscMemType, const void *, MPI_Op, PetscSFOperation, PetscSFLink *);
263 PETSC_INTERN PetscErrorCode PetscSFLinkNvshmemCheck(PetscSF, PetscMemType, const void *, PetscMemType, const void *, PetscBool *);
264 #endif
265 
266 static inline PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf, PetscSFLink link, PetscSFDirection direction, void **rootbuf, void **leafbuf, MPI_Request **rootreqs, MPI_Request **leafreqs)
267 {
268   const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi; /* memtype of buffers passed to MPI */
269   const PetscInt     rootdirect_mpi = link->rootdirect_mpi, leafdirect_mpi = link->leafdirect_mpi;
270 
271   PetscFunctionBegin;
272   if (link->InitMPIRequests) PetscCall((*link->InitMPIRequests)(sf, link, direction)); // init (persistent) MPI requests
273 
274   if (rootbuf) *rootbuf = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi];
275   if (leafbuf) *leafbuf = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
276   if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
277   if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi];
278   PetscFunctionReturn(PETSC_SUCCESS);
279 }
280 
281 static inline PetscErrorCode PetscSFLinkStartCommunication(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
282 {
283   PetscFunctionBegin;
284   if (link->StartCommunication) PetscCall((*link->StartCommunication)(sf, link, direction));
285   PetscFunctionReturn(PETSC_SUCCESS);
286 }
287 
288 static inline PetscErrorCode PetscSFLinkFinishCommunication(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
289 {
290   PetscFunctionBegin;
291   if (link->FinishCommunication) PetscCall((*link->FinishCommunication)(sf, link, direction));
292   PetscFunctionReturn(PETSC_SUCCESS);
293 }
294 
295 /* A set of helper routines for Pack/Unpack/Scatter on GPUs */
296 #if defined(PETSC_HAVE_CUDA) || defined(PETSC_HAVE_HIP) || defined(PETSC_HAVE_SYCL)
297 /* PetscSFLinkCopyXxxxBufferInCaseNotUseGpuAwareMPI routines are simple: if not use_gpu_aware_mpi, we need
298    to copy the buffer from GPU to CPU before MPI calls, and from CPU to GPU after MPI calls.
299 */
300 static inline PetscErrorCode PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(PetscSF sf, PetscSFLink link, PetscBool device2host)
301 {
302   PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
303 
304   PetscFunctionBegin;
305   /* rootdata is on device but we use regular MPI for communication */
306   if (PetscMemTypeDevice(link->rootmtype) && PetscMemTypeHost(link->rootmtype_mpi) && bas->rootbuflen[PETSCSF_REMOTE]) {
307     void  *h_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
308     void  *d_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE];
309     size_t count = bas->rootbuflen[PETSCSF_REMOTE] * link->unitbytes;
310     if (device2host) {
311       PetscCall((*link->Memcpy)(link, PETSC_MEMTYPE_HOST, h_buf, PETSC_MEMTYPE_DEVICE, d_buf, count));
312       PetscCall(PetscLogGpuToCpu(count));
313     } else {
314       PetscCall((*link->Memcpy)(link, PETSC_MEMTYPE_DEVICE, d_buf, PETSC_MEMTYPE_HOST, h_buf, count));
315       PetscCall(PetscLogCpuToGpu(count));
316     }
317   }
318   PetscFunctionReturn(PETSC_SUCCESS);
319 }
320 
321 static inline PetscErrorCode PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(PetscSF sf, PetscSFLink link, PetscBool device2host)
322 {
323   PetscFunctionBegin;
324   if (PetscMemTypeDevice(link->leafmtype) && PetscMemTypeHost(link->leafmtype_mpi) && sf->leafbuflen[PETSCSF_REMOTE]) {
325     void  *h_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
326     void  *d_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE];
327     size_t count = sf->leafbuflen[PETSCSF_REMOTE] * link->unitbytes;
328     if (device2host) {
329       PetscCall((*link->Memcpy)(link, PETSC_MEMTYPE_HOST, h_buf, PETSC_MEMTYPE_DEVICE, d_buf, count));
330       PetscCall(PetscLogGpuToCpu(count));
331     } else {
332       PetscCall((*link->Memcpy)(link, PETSC_MEMTYPE_DEVICE, d_buf, PETSC_MEMTYPE_HOST, h_buf, count));
333       PetscCall(PetscLogCpuToGpu(count));
334     }
335   }
336   PetscFunctionReturn(PETSC_SUCCESS);
337 }
338 
339 /* Make sure root/leafbuf for the remote is ready for MPI */
340 static inline PetscErrorCode PetscSFLinkSyncStreamBeforeCallMPI(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
341 {
342   PetscSF_Basic *bas;
343   PetscInt       buflen;
344   PetscMemType   mtype;
345 
346   PetscFunctionBegin;
347   if (direction == PETSCSF_ROOT2LEAF) {
348     bas    = (PetscSF_Basic *)sf->data;
349     mtype  = link->rootmtype;
350     buflen = bas->rootbuflen[PETSCSF_REMOTE];
351   } else {
352     mtype  = link->leafmtype;
353     buflen = sf->leafbuflen[PETSCSF_REMOTE];
354   }
355 
356   if (PetscMemTypeDevice(mtype) && buflen) PetscCall((*link->SyncStream)(link));
357   PetscFunctionReturn(PETSC_SUCCESS);
358 }
359 #else /* Host only */
360   #define PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(a, b, c) PETSC_SUCCESS
361   #define PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(a, b, c) PETSC_SUCCESS
362   #define PetscSFLinkSyncStreamBeforeCallMPI(a, b, c)               PETSC_SUCCESS
363 #endif
364 
365 /* Get root indices used for pack/unpack
366 
367 Input arguments:
368   +sf    - StarForest
369   .link  - The link, which provides the stream for the async memcpy (In SF, we make all GPU operations asynchronous to avoid unexpected pipeline stalls)
370   .mtype - In what type of memory? (PETSC_MEMTYPE_DEVICE or PETSC_MEMTYPE_HOST)
371   -scope - Which part of the indices? (PETSCSF_LOCAL or PETSCSF_REMOTE)
372 
373  Output arguments:
374   +count   - Count of indices
375   .start   - The first index (only useful when indices is NULL)
376   .opt     - Packing optimizations
377   -indices - Indices of roots for pack/unpack. NULL means indices are contiguous
378  */
379 static inline PetscErrorCode PetscSFLinkGetRootPackOptAndIndices(PetscSF sf, PetscSFLink link, PetscMemType mtype, PetscSFScope scope, PetscInt *count, PetscInt *start, PetscSFPackOpt *opt, const PetscInt **indices)
380 {
381   PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
382   PetscInt       offset;
383 
384   PetscFunctionBegin;
385   *count   = bas->rootbuflen[scope];
386   *start   = bas->rootstart[scope];
387   *opt     = NULL;
388   *indices = NULL;
389 
390   /* We have these rules:
391     1) opt == NULL && indices == NULL ==> indices are contiguous.
392     2) opt != NULL ==> indices are in 3D but not contiguous. On host, indices != NULL since indices are already available and we do not
393        want to enforce all operations to use opt; but on device, indices = NULL since we do not want to copy indices to device.
394   */
395   if (!bas->rootcontig[scope]) {
396     offset = (scope == PETSCSF_LOCAL) ? 0 : bas->ioffset[bas->ndiranks];
397     if (PetscMemTypeHost(mtype)) {
398       *opt     = bas->rootpackopt[scope];
399       *indices = bas->irootloc + offset;
400     } else {
401       size_t size;
402       if (bas->rootpackopt[scope]) {
403         if (!bas->rootpackopt_d[scope]) {
404           PetscCall(PetscMalloc1(1, &bas->rootpackopt_d[scope]));
405           PetscCall(PetscArraycpy(bas->rootpackopt_d[scope], bas->rootpackopt[scope], 1)); /* Make pointers in bas->rootpackopt_d[] still work on host */
406           size = (bas->rootpackopt[scope]->n * 7 + 2) * sizeof(PetscInt);                  /* See comments at struct _n_PetscSFPackOpt*/
407           PetscCall(PetscSFMalloc(sf, PETSC_MEMTYPE_DEVICE, size, (void **)&bas->rootpackopt_d[scope]->array));
408           PetscCall((*link->Memcpy)(link, PETSC_MEMTYPE_DEVICE, bas->rootpackopt_d[scope]->array, PETSC_MEMTYPE_HOST, bas->rootpackopt[scope]->array, size));
409         }
410         *opt = bas->rootpackopt_d[scope];
411       } else { /* On device, we only provide indices when there is no optimization. We're reluctant to copy indices to device. */
412         if (!bas->irootloc_d[scope]) {
413           size = bas->rootbuflen[scope] * sizeof(PetscInt);
414           PetscCall(PetscSFMalloc(sf, PETSC_MEMTYPE_DEVICE, size, (void **)&bas->irootloc_d[scope]));
415           PetscCall((*link->Memcpy)(link, PETSC_MEMTYPE_DEVICE, bas->irootloc_d[scope], PETSC_MEMTYPE_HOST, bas->irootloc + offset, size));
416         }
417         *indices = bas->irootloc_d[scope];
418       }
419     }
420   }
421   PetscFunctionReturn(PETSC_SUCCESS);
422 }
423 
424 /* Get leaf indices used for pack/unpack
425 
426   See also PetscSFLinkGetRootPackOptAndIndices()
427  */
428 static inline PetscErrorCode PetscSFLinkGetLeafPackOptAndIndices(PetscSF sf, PetscSFLink link, PetscMemType mtype, PetscSFScope scope, PetscInt *count, PetscInt *start, PetscSFPackOpt *opt, const PetscInt **indices)
429 {
430   PetscInt offset;
431 
432   PetscFunctionBegin;
433   *count   = sf->leafbuflen[scope];
434   *start   = sf->leafstart[scope];
435   *opt     = NULL;
436   *indices = NULL;
437   if (!sf->leafcontig[scope]) {
438     offset = (scope == PETSCSF_LOCAL) ? 0 : sf->roffset[sf->ndranks];
439     if (PetscMemTypeHost(mtype)) {
440       *opt     = sf->leafpackopt[scope];
441       *indices = sf->rmine + offset;
442     } else {
443       size_t size;
444       if (sf->leafpackopt[scope]) {
445         if (!sf->leafpackopt_d[scope]) {
446           PetscCall(PetscMalloc1(1, &sf->leafpackopt_d[scope]));
447           PetscCall(PetscArraycpy(sf->leafpackopt_d[scope], sf->leafpackopt[scope], 1));
448           size = (sf->leafpackopt[scope]->n * 7 + 2) * sizeof(PetscInt);                                       /* See comments at struct _n_PetscSFPackOpt*/
449           PetscCall(PetscSFMalloc(sf, PETSC_MEMTYPE_DEVICE, size, (void **)&sf->leafpackopt_d[scope]->array)); /* Change ->array to a device pointer */
450           PetscCall((*link->Memcpy)(link, PETSC_MEMTYPE_DEVICE, sf->leafpackopt_d[scope]->array, PETSC_MEMTYPE_HOST, sf->leafpackopt[scope]->array, size));
451         }
452         *opt = sf->leafpackopt_d[scope];
453       } else {
454         if (!sf->rmine_d[scope]) {
455           size = sf->leafbuflen[scope] * sizeof(PetscInt);
456           PetscCall(PetscSFMalloc(sf, PETSC_MEMTYPE_DEVICE, size, (void **)&sf->rmine_d[scope]));
457           PetscCall((*link->Memcpy)(link, PETSC_MEMTYPE_DEVICE, sf->rmine_d[scope], PETSC_MEMTYPE_HOST, sf->rmine + offset, size));
458         }
459         *indices = sf->rmine_d[scope];
460       }
461     }
462   }
463   PetscFunctionReturn(PETSC_SUCCESS);
464 }
465