xref: /petsc/src/vec/is/sf/impls/basic/sfpack.h (revision 872ab141943dee616010b2ad741aa550bd53b329)
1a4963045SJacob Faibussowitsch #pragma once
240e23c03SJunchao Zhang 
3cd620004SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfbasic.h>
/* Vendor-neutral aliases for the device stream and event types, so that the declarations below
   can be written once for both the CUDA and the HIP backend.
 */
#if defined(PETSC_HAVE_CUDA)
  #include <petscdevice_cuda.h>
typedef cudaStream_t cupmStream_t;
typedef cudaEvent_t  cupmEvent_t;
#endif

#if defined(PETSC_HAVE_HIP)
  #include <petscdevice_hip.h>
typedef hipStream_t cupmStream_t;
typedef hipEvent_t  cupmEvent_t;
#endif
15cd620004SJunchao Zhang 
/* In terms of function overloading, long long int is a different type than int64_t, which PetscInt might be defined to.
   We prefer long long int over PetscInt (int64_t), since CUDA atomics are built around (unsigned) long long int.
 */
typedef long long int          llint;
typedef unsigned long long int ullint;
21874d28e3SJunchao Zhang 
/* We separate SF communications for SFBasic and SFNeighbor in two parts: local (self, intra-rank) and remote (inter-rank).
   The enumerators are also used as array indices (see the [PETSCSF_LOCAL/REMOTE] layouts in struct _n_PetscSFLink),
   so their values 0 and 1 must not change.
 */
typedef enum {
  PETSCSF_LOCAL = 0,
  PETSCSF_REMOTE
} PetscSFScope;
2740e23c03SJunchao Zhang 
28fcc7397dSJunchao Zhang /* Optimizations in packing & unpacking for destination ranks.
2940e23c03SJunchao Zhang 
30fcc7397dSJunchao Zhang   Suppose there are m indices stored in idx[], and two addresses u, p. We want to do packing:
31fcc7397dSJunchao Zhang      p[i] = u[idx[i]], for i in [0,m)
3240e23c03SJunchao Zhang 
33fcc7397dSJunchao Zhang   Indices are associated with n ranks and each rank's indices are stored consecutively in idx[].
34fcc7397dSJunchao Zhang   We go through indices for each rank and see if they are indices of a 3D submatrix of size [dx,dy,dz] in
35fcc7397dSJunchao Zhang   a parent matrix of size [X,Y,Z], with the submatrix's first index being <start>.
36cd620004SJunchao Zhang 
37fcc7397dSJunchao Zhang   E.g., for indices 1,2,3, 6,7,8, 11,12,13, the submatrix size is [3,3,1] with start=1, and the parent matrix's size
38fcc7397dSJunchao Zhang   is [5,3,1]. For simplicity, if any destination rank does not have this pattern, we give up the optimization.
39fcc7397dSJunchao Zhang 
40fcc7397dSJunchao Zhang   Note before using this per-rank optimization, one should check leafcontig[], rootcontig[], which say
41fcc7397dSJunchao Zhang   indices in whole are contiguous, and therefore much more useful than this one when true.
4240e23c03SJunchao Zhang  */
4340e23c03SJunchao Zhang struct _n_PetscSFPackOpt {
44fcc7397dSJunchao Zhang   PetscInt *array;        /* [7*n+2] Memory pool for other fields in this struct. Used to easily copy this struct to GPU */
45b23bfdefSJunchao Zhang   PetscInt  n;            /* Number of destination ranks */
46fcc7397dSJunchao Zhang   PetscInt *offset;       /* [n+1] Offsets of indices for each rank. offset[0]=0, offset[i+1]=offset[i]+dx[i]*dy[i]*dz[i] */
47fcc7397dSJunchao Zhang   PetscInt *start;        /* [n] First index */
48fcc7397dSJunchao Zhang   PetscInt *dx, *dy, *dz; /* [n] Lengths of the submatrix in X, Y, Z dimension. */
49fcc7397dSJunchao Zhang   PetscInt *X, *Y;        /* [n] Lengths of the outer matrix in X, Y. We do not care Z. */
5040e23c03SJunchao Zhang };
5140e23c03SJunchao Zhang 
52eb02082bSJunchao Zhang /* An abstract class that defines a communication link, which includes how to pack/unpack data and send/recv buffers
5340e23c03SJunchao Zhang  */
54fcc7397dSJunchao Zhang struct _n_PetscSFLink {
5571438e86SJunchao Zhang   PetscErrorCode (*Memcpy)(PetscSFLink, PetscMemType, void *, PetscMemType, const void *, size_t); /* Async device memcopy might use stream in the link */
5671438e86SJunchao Zhang   PetscErrorCode (*PrePack)(PetscSF, PetscSFLink, PetscSFDirection);
5771438e86SJunchao Zhang   PetscErrorCode (*PostUnpack)(PetscSF, PetscSFLink, PetscSFDirection);
58f5d27ee7SJunchao Zhang   PetscErrorCode (*InitMPIRequests)(PetscSF, PetscSFLink, PetscSFDirection); // init (persistent) MPI requests
5971438e86SJunchao Zhang   PetscErrorCode (*StartCommunication)(PetscSF, PetscSFLink, PetscSFDirection);
6071438e86SJunchao Zhang   PetscErrorCode (*FinishCommunication)(PetscSF, PetscSFLink, PetscSFDirection);
6171438e86SJunchao Zhang   PetscErrorCode (*SyncDevice)(PetscSFLink);
6271438e86SJunchao Zhang   PetscErrorCode (*SyncStream)(PetscSFLink);
6371438e86SJunchao Zhang   PetscErrorCode (*Destroy)(PetscSF, PetscSFLink);
6471438e86SJunchao Zhang 
6571438e86SJunchao Zhang   PetscErrorCode (*BuildDependenceBegin)(PetscSF, PetscSFLink, PetscSFDirection);
6671438e86SJunchao Zhang   PetscErrorCode (*BuildDependenceEnd)(PetscSF, PetscSFLink, PetscSFDirection);
6720c24465SJunchao Zhang 
68fcc7397dSJunchao Zhang   PetscErrorCode (*h_Pack)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *);
69fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndInsert)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
70fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndAdd)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
71fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndMin)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
72fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndMax)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
73fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndMinloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
74fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndMaxloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
75fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndMult)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
76fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndLAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
77fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndBAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
78fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndLOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
79fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndBOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
80fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndLXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
81fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndBXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
82fcc7397dSJunchao Zhang   PetscErrorCode (*h_FetchAndAdd)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, void *);
83fcc7397dSJunchao Zhang 
84fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndInsert)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
85fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndAdd)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
86fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndMin)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
87fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndMax)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
88fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndMinloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
89fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndMaxloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
90fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndMult)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
91fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndLAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
92fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndBAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
93fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndLOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
94fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndBOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
95fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndLXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
96fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndBXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
97fcc7397dSJunchao Zhang 
98fcc7397dSJunchao Zhang   PetscErrorCode (*h_FetchAndAddLocal)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *);
99cd620004SJunchao Zhang 
100cd620004SJunchao Zhang   PetscBool deviceinited; /* Are device related fields initialized? */
1017fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
102eb02082bSJunchao Zhang   /* These fields are lazily initialized in a sense that only when device pointers are passed to an SF, the SF
1037fd2d3dbSJunchao Zhang      will set them, otherwise it just leaves them alone. Packing routines using regular ops when there are no data race chances.
104eb02082bSJunchao Zhang   */
105fcc7397dSJunchao Zhang   PetscErrorCode (*d_Pack)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *);
106fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndInsert)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
107fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndAdd)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
108fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndMin)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
109fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndMax)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
110fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndMinloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
111fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndMaxloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
112fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndMult)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
113fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndLAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
114fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndBAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
115fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndLOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
116fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndBOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
117fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndLXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
118fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndBXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
119fcc7397dSJunchao Zhang   PetscErrorCode (*d_FetchAndAdd)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, void *);
120eb02082bSJunchao Zhang 
121fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndInsert)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
122fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndAdd)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
123fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndMin)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
124fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndMax)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
125fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndMinloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
126fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndMaxloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
127fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndMult)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
128fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndLAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
129fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndBAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
130fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndLOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
131fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndBOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
132fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndLXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
133fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndBXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
134fcc7397dSJunchao Zhang   PetscErrorCode (*d_FetchAndAddLocal)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *);
135eb02082bSJunchao Zhang 
136eb02082bSJunchao Zhang   /* Packing routines using atomics when there are data race chances */
137fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndInsert)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
138fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndAdd)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
139fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndMin)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
140fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndMax)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
141fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndMinloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
142fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndMaxloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
143fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndMult)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
144fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndLAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
145fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndBAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
146fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndLOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
147fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndBOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
148fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndLXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
149fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndBXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *);
150fcc7397dSJunchao Zhang   PetscErrorCode (*da_FetchAndAdd)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, void *);
151cd620004SJunchao Zhang 
152fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndInsert)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
153fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndAdd)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
154fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndMin)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
155fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndMax)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
156fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndMinloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
157fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndMaxloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
158fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndMult)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
159fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndLAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
160fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndBAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
161fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndLOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
162fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndBOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
163fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndLXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
164fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndBXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *);
165fcc7397dSJunchao Zhang   PetscErrorCode (*da_FetchAndAddLocal)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *);
16671438e86SJunchao Zhang   #if defined(PETSC_HAVE_CUDA) || defined(PETSC_HAVE_HIP)
167e315309dSJunchao Zhang   PetscInt     maxResidentThreadsPerGPU; /* It is a copy from SF for convenience */
16871438e86SJunchao Zhang   cupmStream_t stream;                   /* stream on which input/output root/leafdata is computed on (default is PetscDefaultCudaStream) */
169eb02082bSJunchao Zhang   #endif
1707fd2d3dbSJunchao Zhang #endif
171eb02082bSJunchao Zhang   PetscMPIInt  tag;                  /* Each link has a tag so we can perform multiple SF ops at the same time */
172cd620004SJunchao Zhang   MPI_Datatype unit;                 /* The MPI datatype this PetscSFLink is built for */
173eb02082bSJunchao Zhang   MPI_Datatype basicunit;            /* unit is made of MPI builtin dataype basicunit */
174e07844bfSJunchao Zhang   PetscBool    isbuiltin;            /* Is unit an MPI/PETSc builtin datatype? If it is true, then bs=1 and basicunit is equivalent to unit */
175eb02082bSJunchao Zhang   size_t       unitbytes;            /* Number of bytes in a unit */
176eb02082bSJunchao Zhang   PetscInt     bs;                   /* Number of basic units in a unit */
177cd620004SJunchao Zhang   const void  *rootdata, *leafdata;  /* rootdata and leafdata the link is working on. They are used as keys for pending links. */
178cd620004SJunchao Zhang   PetscMemType rootmtype, leafmtype; /* root/leafdata's memory type */
179cd620004SJunchao Zhang 
180cd620004SJunchao Zhang   /* For local and remote communication */
181cd620004SJunchao Zhang   PetscMemType rootmtype_mpi, leafmtype_mpi;   /* Mtypes of buffers passed to MPI. If use_gpu_aware_mpi, they are same as root/leafmtype. Otherwise they are PETSC_MEMTYPE_HOST */
182cd620004SJunchao Zhang   PetscBool    rootdirect[2], leafdirect[2];   /* Can root/leafdata be directly passed to SF (i.e., without buffering). In layout of [PETSCSF_LOCAL/REMOTE]. See more in PetscSFLinkCreate() */
183cd620004SJunchao Zhang   PetscInt     rootdirect_mpi, leafdirect_mpi; /* Can root/leafdata for remote be directly passed to MPI? 1: yes, 0: no. See more in PetscSFLinkCreate() */
184cd620004SJunchao Zhang   const void  *rootdatadirect[2][2];           /* The root/leafdata used to init root/leaf requests, in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE]. */
185cd620004SJunchao Zhang   const void  *leafdatadirect[2][2];           /* ... We need them to look up links when root/leafdirect_mpi are true */
18671438e86SJunchao Zhang   char        *rootbuf[2][2];                  /* Buffers for packed roots, in layout of [PETSCSF_LOCAL/REMOTE][PETSC_MEMTYPE]. PETSCSF_LOCAL does not need MPI, .. */
18771438e86SJunchao Zhang                                                /* .. but in case rootmtype is different from leafmtype, we still need to pack local roots and then copy them to memory of leafmtype */
188cd620004SJunchao Zhang   char        *rootbuf_alloc[2][2];            /* Log memory allocated by petsc. We need it since rootbuf[][] may point to rootdata given by user */
189cd620004SJunchao Zhang   char        *leafbuf[2][2];                  /* Buffers for packed leaves, in layout of [PETSCSF_LOCAL/REMOTE][PETSC_MEMTYPE] */
190cd620004SJunchao Zhang   char        *leafbuf_alloc[2][2];
191cd620004SJunchao Zhang   MPI_Request *rootreqs[2][2][2];       /* Root requests in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][rootdirect_mpi] */
192cd620004SJunchao Zhang   MPI_Request *leafreqs[2][2][2];       /* Leaf requests in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][leafdirect_mpi] */
193cd620004SJunchao Zhang   PetscBool    rootreqsinited[2][2][2]; /* Are root requests initialized? Also in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][rootdirect_mpi]*/
194cd620004SJunchao Zhang   PetscBool    leafreqsinited[2][2][2]; /* Are leaf requests initialized? Also in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][leafdirect_mpi]*/
195cd620004SJunchao Zhang   MPI_Request *reqs;                    /* An array of length (nrootreqs+nleafreqs)*8. Pointers in rootreqs[][][] and leafreqs[][][] point here */
196cd620004SJunchao Zhang   PetscSFLink  next;
19771438e86SJunchao Zhang 
19871438e86SJunchao Zhang   PetscBool use_nvshmem; /* Does this link use nvshem (vs. MPI) for communication? */
19971438e86SJunchao Zhang #if defined(PETSC_HAVE_NVSHMEM)
20071438e86SJunchao Zhang   cupmEvent_t  dataReady;        /* Events to mark readiness of root/leafdata */
20171438e86SJunchao Zhang   cupmEvent_t  endRemoteComm;    /* Events to mark end of local/remote communication */
20271438e86SJunchao Zhang   cupmStream_t remoteCommStream; /* Streams for remote (i.e., inter-rank) communication */
20371438e86SJunchao Zhang 
20471438e86SJunchao Zhang   /* The buffers are allocated in device symmetric heap. Their length is the maximal length over all ranks in the comm, and therefore is the same. */
20571438e86SJunchao Zhang   uint64_t *rootSendSig, *rootRecvSig; /* [max{niranks-ndiranks}], signals used when rootbuf works as send/recv buf */
20671438e86SJunchao Zhang   uint64_t *leafSendSig, *leafRecvSig; /* [max{nranks-ndranks}], signals used when leafbuf works as send/recv buf */
20771438e86SJunchao Zhang #endif
20840e23c03SJunchao Zhang };
20940e23c03SJunchao Zhang 
210cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF, MPI_Datatype, const void *, const void *);
211b7c0d12aSJunchao Zhang 
212cd620004SJunchao Zhang /* Create/setup/retrieve/destroy a link */
213cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkCreate(PetscSF, MPI_Datatype, PetscMemType, const void *, PetscMemType, const void *, MPI_Op, PetscSFOperation, PetscSFLink *);
214cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_Host(PetscSF, PetscSFLink, MPI_Datatype);
215cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetInUse(PetscSF, MPI_Datatype, const void *, const void *, PetscCopyMode, PetscSFLink *);
216cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkReclaim(PetscSF, PetscSFLink *);
21771438e86SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkDestroy(PetscSF, PetscSFLink);
218cd620004SJunchao Zhang 
219cd620004SJunchao Zhang /* Get pack/unpack function pointers from a link */
PetscSFLinkGetPack(PetscSFLink link,PetscMemType mtype,PetscErrorCode (** Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt *,const void *,void *))220d71ae5a4SJacob Faibussowitsch static inline PetscErrorCode PetscSFLinkGetPack(PetscSFLink link, PetscMemType mtype, PetscErrorCode (**Pack)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *))
221d71ae5a4SJacob Faibussowitsch {
222eb02082bSJunchao Zhang   PetscFunctionBegin;
22371438e86SJunchao Zhang   if (PetscMemTypeHost(mtype)) *Pack = link->h_Pack;
2247fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE)
225cd620004SJunchao Zhang   else *Pack = link->d_Pack;
226eb02082bSJunchao Zhang #endif
2273ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
228eb02082bSJunchao Zhang }
2297fd2d3dbSJunchao Zhang 
230fcc7397dSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink, PetscMemType, MPI_Op, PetscBool, PetscErrorCode (**UnpackAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *));
231fcc7397dSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink, PetscMemType, MPI_Op, PetscBool, PetscErrorCode (**FetchAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, void *));
232fcc7397dSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink, PetscMemType, MPI_Op, PetscBool, PetscErrorCode (**ScatterAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *));
233fcc7397dSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink, PetscMemType, MPI_Op, PetscBool, PetscErrorCode (**FetchAndOpLocal)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *));
234b7c0d12aSJunchao Zhang 
235cd620004SJunchao Zhang /* Do Pack/Unpack/Fetch/Scatter with the link */
236cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkPackRootData(PetscSF, PetscSFLink, PetscSFScope, const void *);
237cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkPackLeafData(PetscSF, PetscSFLink, PetscSFScope, const void *);
238cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkUnpackRootData(PetscSF, PetscSFLink, PetscSFScope, void *, MPI_Op);
239cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF, PetscSFLink, PetscSFScope, void *, MPI_Op);
24071438e86SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkFetchAndOpRemote(PetscSF, PetscSFLink, void *, MPI_Op);
241cd620004SJunchao Zhang 
24271438e86SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkScatterLocal(PetscSF, PetscSFLink, PetscSFDirection, void *, void *, MPI_Op);
243cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF, PetscSFLink, void *, const void *, void *, MPI_Op);
244cd620004SJunchao Zhang 
2457fd2d3dbSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFSetUpPackFields(PetscSF);
2467fd2d3dbSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFResetPackFields(PetscSF);
24771438e86SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkCreate_MPI(PetscSF, MPI_Datatype, PetscMemType, const void *, PetscMemType, const void *, MPI_Op, PetscSFOperation, PetscSFLink *);
2487fd2d3dbSJunchao Zhang 
/* Backend-specific link routines; each one is declared only when the matching backend is configured */
#if defined(PETSC_HAVE_CUDA)
PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_CUDA(PetscSF, PetscSFLink, MPI_Datatype);
#endif

#if defined(PETSC_HAVE_HIP)
PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_HIP(PetscSF, PetscSFLink, MPI_Datatype);
#endif

#if defined(PETSC_HAVE_KOKKOS)
PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_Kokkos(PetscSF, PetscSFLink, MPI_Datatype);
#endif

#if defined(PETSC_HAVE_NVSHMEM)
PETSC_INTERN PetscErrorCode PetscSFLinkCreate_NVSHMEM(PetscSF, MPI_Datatype, PetscMemType, const void *, PetscMemType, const void *, MPI_Op, PetscSFOperation, PetscSFLink *);
PETSC_INTERN PetscErrorCode PetscSFLinkNvshmemCheck(PetscSF, PetscMemType, const void *, PetscMemType, const void *, PetscBool *);
#endif
26571438e86SJunchao Zhang 
PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void ** rootbuf,void ** leafbuf,MPI_Request ** rootreqs,MPI_Request ** leafreqs)266f5d27ee7SJunchao Zhang static inline PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf, PetscSFLink link, PetscSFDirection direction, void **rootbuf, void **leafbuf, MPI_Request **rootreqs, MPI_Request **leafreqs)
267f5d27ee7SJunchao Zhang {
268f5d27ee7SJunchao Zhang   const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi; /* memtype of buffers passed to MPI */
269f5d27ee7SJunchao Zhang   const PetscInt     rootdirect_mpi = link->rootdirect_mpi, leafdirect_mpi = link->leafdirect_mpi;
270f5d27ee7SJunchao Zhang 
271f5d27ee7SJunchao Zhang   PetscFunctionBegin;
272f5d27ee7SJunchao Zhang   if (link->InitMPIRequests) PetscCall((*link->InitMPIRequests)(sf, link, direction)); // init (persistent) MPI requests
273f5d27ee7SJunchao Zhang 
274f5d27ee7SJunchao Zhang   if (rootbuf) *rootbuf = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi];
275f5d27ee7SJunchao Zhang   if (leafbuf) *leafbuf = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
276f5d27ee7SJunchao Zhang   if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
277f5d27ee7SJunchao Zhang   if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi];
278f5d27ee7SJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
279f5d27ee7SJunchao Zhang }
280f5d27ee7SJunchao Zhang 
PetscSFLinkStartCommunication(PetscSF sf,PetscSFLink link,PetscSFDirection direction)281d71ae5a4SJacob Faibussowitsch static inline PetscErrorCode PetscSFLinkStartCommunication(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
282d71ae5a4SJacob Faibussowitsch {
28371438e86SJunchao Zhang   PetscFunctionBegin;
2849566063dSJacob Faibussowitsch   if (link->StartCommunication) PetscCall((*link->StartCommunication)(sf, link, direction));
2853ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
28671438e86SJunchao Zhang }
28771438e86SJunchao Zhang 
PetscSFLinkFinishCommunication(PetscSF sf,PetscSFLink link,PetscSFDirection direction)288d71ae5a4SJacob Faibussowitsch static inline PetscErrorCode PetscSFLinkFinishCommunication(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
289d71ae5a4SJacob Faibussowitsch {
29071438e86SJunchao Zhang   PetscFunctionBegin;
2919566063dSJacob Faibussowitsch   if (link->FinishCommunication) PetscCall((*link->FinishCommunication)(sf, link, direction));
2923ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
29371438e86SJunchao Zhang }
29471438e86SJunchao Zhang 
/* A set of helper routines for Pack/Unpack/Scatter on GPUs */
2964d9d436bSJunchao Zhang #if defined(PETSC_HAVE_CUDA) || defined(PETSC_HAVE_HIP) || defined(PETSC_HAVE_SYCL)
/* The PetscSFLinkCopyXxxxBufferInCaseNotUseGpuAwareMPI routines are simple: when GPU-aware MPI is not used, we need
   to copy the buffer from GPU to CPU before MPI calls, and from CPU to GPU after MPI calls.
*/
PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(PetscSF sf,PetscSFLink link,PetscBool device2host)300d71ae5a4SJacob Faibussowitsch static inline PetscErrorCode PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(PetscSF sf, PetscSFLink link, PetscBool device2host)
301d71ae5a4SJacob Faibussowitsch {
3027fd2d3dbSJunchao Zhang   PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
3037fd2d3dbSJunchao Zhang 
3047fd2d3dbSJunchao Zhang   PetscFunctionBegin;
30571438e86SJunchao Zhang   /* rootdata is on device but we use regular MPI for communication */
30671438e86SJunchao Zhang   if (PetscMemTypeDevice(link->rootmtype) && PetscMemTypeHost(link->rootmtype_mpi) && bas->rootbuflen[PETSCSF_REMOTE]) {
3077fd2d3dbSJunchao Zhang     void  *h_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
3087fd2d3dbSJunchao Zhang     void  *d_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE];
3097fd2d3dbSJunchao Zhang     size_t count = bas->rootbuflen[PETSCSF_REMOTE] * link->unitbytes;
3107fd2d3dbSJunchao Zhang     if (device2host) {
3119566063dSJacob Faibussowitsch       PetscCall((*link->Memcpy)(link, PETSC_MEMTYPE_HOST, h_buf, PETSC_MEMTYPE_DEVICE, d_buf, count));
3129566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuToCpu(count));
3137fd2d3dbSJunchao Zhang     } else {
3149566063dSJacob Faibussowitsch       PetscCall((*link->Memcpy)(link, PETSC_MEMTYPE_DEVICE, d_buf, PETSC_MEMTYPE_HOST, h_buf, count));
3159566063dSJacob Faibussowitsch       PetscCall(PetscLogCpuToGpu(count));
3167fd2d3dbSJunchao Zhang     }
3177fd2d3dbSJunchao Zhang   }
3183ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3197fd2d3dbSJunchao Zhang }
3207fd2d3dbSJunchao Zhang 
PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(PetscSF sf,PetscSFLink link,PetscBool device2host)321d71ae5a4SJacob Faibussowitsch static inline PetscErrorCode PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(PetscSF sf, PetscSFLink link, PetscBool device2host)
322d71ae5a4SJacob Faibussowitsch {
3237fd2d3dbSJunchao Zhang   PetscFunctionBegin;
32471438e86SJunchao Zhang   if (PetscMemTypeDevice(link->leafmtype) && PetscMemTypeHost(link->leafmtype_mpi) && sf->leafbuflen[PETSCSF_REMOTE]) {
3257fd2d3dbSJunchao Zhang     void  *h_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
3267fd2d3dbSJunchao Zhang     void  *d_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE];
3277fd2d3dbSJunchao Zhang     size_t count = sf->leafbuflen[PETSCSF_REMOTE] * link->unitbytes;
3287fd2d3dbSJunchao Zhang     if (device2host) {
3299566063dSJacob Faibussowitsch       PetscCall((*link->Memcpy)(link, PETSC_MEMTYPE_HOST, h_buf, PETSC_MEMTYPE_DEVICE, d_buf, count));
3309566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuToCpu(count));
3317fd2d3dbSJunchao Zhang     } else {
3329566063dSJacob Faibussowitsch       PetscCall((*link->Memcpy)(link, PETSC_MEMTYPE_DEVICE, d_buf, PETSC_MEMTYPE_HOST, h_buf, count));
3339566063dSJacob Faibussowitsch       PetscCall(PetscLogCpuToGpu(count));
3347fd2d3dbSJunchao Zhang     }
3357fd2d3dbSJunchao Zhang   }
3363ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3377fd2d3dbSJunchao Zhang }
3387fd2d3dbSJunchao Zhang 
33971438e86SJunchao Zhang /* Make sure root/leafbuf for the remote is ready for MPI */
PetscSFLinkSyncStreamBeforeCallMPI(PetscSF sf,PetscSFLink link)340*646b835dSJunchao Zhang static inline PetscErrorCode PetscSFLinkSyncStreamBeforeCallMPI(PetscSF sf, PetscSFLink link)
341d71ae5a4SJacob Faibussowitsch {
342*646b835dSJunchao Zhang   PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
34371438e86SJunchao Zhang 
34471438e86SJunchao Zhang   PetscFunctionBegin;
345*646b835dSJunchao Zhang   // Make sendbuf ready to read, recvbuf ready to write (other previous operations on recvbuf might finish after MPI_Waitall() if they use different streams)
346*646b835dSJunchao Zhang   if ((PetscMemTypeDevice(link->rootmtype) && bas->rootbuflen[PETSCSF_REMOTE]) || (PetscMemTypeDevice(link->leafmtype) && sf->leafbuflen[PETSCSF_REMOTE])) PetscCall((*link->SyncStream)(link));
3473ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
34871438e86SJunchao Zhang }
3497fd2d3dbSJunchao Zhang #else /* Host only */
/* Without a GPU backend there is nothing to copy or synchronize: each helper expands to PETSC_SUCCESS and ignores its arguments */
  #define PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf, link, device2host) PETSC_SUCCESS
  #define PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf, link, device2host) PETSC_SUCCESS
  #define PetscSFLinkSyncStreamBeforeCallMPI(sf, link)                            PETSC_SUCCESS
3537fd2d3dbSJunchao Zhang #endif
354cd620004SJunchao Zhang 
355cd620004SJunchao Zhang /* Get root indices used for pack/unpack
356cd620004SJunchao Zhang 
357cd620004SJunchao Zhang Input arguments:
358cd620004SJunchao Zhang   +sf    - StarForest
359cd620004SJunchao Zhang   .link  - The link, which provides the stream for the async memcpy (In SF, we make all GPU operations asynchronous to avoid unexpected pipeline stalls)
360cd620004SJunchao Zhang   .mtype - In what type of memory? (PETSC_MEMTYPE_DEVICE or PETSC_MEMTYPE_HOST)
36171438e86SJunchao Zhang   -scope - Which part of the indices? (PETSCSF_LOCAL or PETSCSF_REMOTE)
362cd620004SJunchao Zhang 
363cd620004SJunchao Zhang  Output arguments:
364cd620004SJunchao Zhang   +count   - Count of indices
365cd620004SJunchao Zhang   .start   - The first index (only useful when indices is NULL)
36671438e86SJunchao Zhang   .opt     - Packing optimizations
36771438e86SJunchao Zhang   -indices - Indices of roots for pack/unpack. NULL means indices are contiguous
368cd620004SJunchao Zhang  */
PetscSFLinkGetRootPackOptAndIndices(PetscSF sf,PetscSFLink link,PetscMemType mtype,PetscSFScope scope,PetscInt * count,PetscInt * start,PetscSFPackOpt * opt,const PetscInt ** indices)369d71ae5a4SJacob Faibussowitsch static inline PetscErrorCode PetscSFLinkGetRootPackOptAndIndices(PetscSF sf, PetscSFLink link, PetscMemType mtype, PetscSFScope scope, PetscInt *count, PetscInt *start, PetscSFPackOpt *opt, const PetscInt **indices)
370d71ae5a4SJacob Faibussowitsch {
371cd620004SJunchao Zhang   PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
372cd620004SJunchao Zhang   PetscInt       offset;
373b7c0d12aSJunchao Zhang 
374b7c0d12aSJunchao Zhang   PetscFunctionBegin;
375fcc7397dSJunchao Zhang   *count   = bas->rootbuflen[scope];
376fcc7397dSJunchao Zhang   *start   = bas->rootstart[scope];
377fcc7397dSJunchao Zhang   *opt     = NULL;
378fcc7397dSJunchao Zhang   *indices = NULL;
379fcc7397dSJunchao Zhang 
380fcc7397dSJunchao Zhang   /* We have these rules:
381fcc7397dSJunchao Zhang     1) opt == NULL && indices == NULL ==> indices are contiguous.
382fcc7397dSJunchao Zhang     2) opt != NULL ==> indices are in 3D but not contiguous. On host, indices != NULL since indices are already available and we do not
383fcc7397dSJunchao Zhang        want to enforce all operations to use opt; but on device, indices = NULL since we do not want to copy indices to device.
384fcc7397dSJunchao Zhang   */
385fcc7397dSJunchao Zhang   if (!bas->rootcontig[scope]) {
386cd620004SJunchao Zhang     offset = (scope == PETSCSF_LOCAL) ? 0 : bas->ioffset[bas->ndiranks];
3879371c9d4SSatish Balay     if (PetscMemTypeHost(mtype)) {
3889371c9d4SSatish Balay       *opt     = bas->rootpackopt[scope];
3899371c9d4SSatish Balay       *indices = bas->irootloc + offset;
3909371c9d4SSatish Balay     } else {
391fcc7397dSJunchao Zhang       size_t size;
392fcc7397dSJunchao Zhang       if (bas->rootpackopt[scope]) {
393fcc7397dSJunchao Zhang         if (!bas->rootpackopt_d[scope]) {
3949566063dSJacob Faibussowitsch           PetscCall(PetscMalloc1(1, &bas->rootpackopt_d[scope]));
3959566063dSJacob Faibussowitsch           PetscCall(PetscArraycpy(bas->rootpackopt_d[scope], bas->rootpackopt[scope], 1)); /* Make pointers in bas->rootpackopt_d[] still work on host */
396fcc7397dSJunchao Zhang           size = (bas->rootpackopt[scope]->n * 7 + 2) * sizeof(PetscInt);                  /* See comments at struct _n_PetscSFPackOpt*/
3979566063dSJacob Faibussowitsch           PetscCall(PetscSFMalloc(sf, PETSC_MEMTYPE_DEVICE, size, (void **)&bas->rootpackopt_d[scope]->array));
3989566063dSJacob Faibussowitsch           PetscCall((*link->Memcpy)(link, PETSC_MEMTYPE_DEVICE, bas->rootpackopt_d[scope]->array, PETSC_MEMTYPE_HOST, bas->rootpackopt[scope]->array, size));
399fcc7397dSJunchao Zhang         }
400fcc7397dSJunchao Zhang         *opt = bas->rootpackopt_d[scope];
401fcc7397dSJunchao Zhang       } else { /* On device, we only provide indices when there is no optimization. We're reluctant to copy indices to device. */
402fcc7397dSJunchao Zhang         if (!bas->irootloc_d[scope]) {
403fcc7397dSJunchao Zhang           size = bas->rootbuflen[scope] * sizeof(PetscInt);
4049566063dSJacob Faibussowitsch           PetscCall(PetscSFMalloc(sf, PETSC_MEMTYPE_DEVICE, size, (void **)&bas->irootloc_d[scope]));
4059566063dSJacob Faibussowitsch           PetscCall((*link->Memcpy)(link, PETSC_MEMTYPE_DEVICE, bas->irootloc_d[scope], PETSC_MEMTYPE_HOST, bas->irootloc + offset, size));
406b7c0d12aSJunchao Zhang         }
407cd620004SJunchao Zhang         *indices = bas->irootloc_d[scope];
408cd620004SJunchao Zhang       }
409cd620004SJunchao Zhang     }
410cd620004SJunchao Zhang   }
4113ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
412b7c0d12aSJunchao Zhang }
413b7c0d12aSJunchao Zhang 
414cd620004SJunchao Zhang /* Get leaf indices used for pack/unpack
415cd620004SJunchao Zhang 
416fcc7397dSJunchao Zhang   See also PetscSFLinkGetRootPackOptAndIndices()
417cd620004SJunchao Zhang  */
PetscSFLinkGetLeafPackOptAndIndices(PetscSF sf,PetscSFLink link,PetscMemType mtype,PetscSFScope scope,PetscInt * count,PetscInt * start,PetscSFPackOpt * opt,const PetscInt ** indices)418d71ae5a4SJacob Faibussowitsch static inline PetscErrorCode PetscSFLinkGetLeafPackOptAndIndices(PetscSF sf, PetscSFLink link, PetscMemType mtype, PetscSFScope scope, PetscInt *count, PetscInt *start, PetscSFPackOpt *opt, const PetscInt **indices)
419d71ae5a4SJacob Faibussowitsch {
420cd620004SJunchao Zhang   PetscInt offset;
421cd620004SJunchao Zhang 
422cd620004SJunchao Zhang   PetscFunctionBegin;
423fcc7397dSJunchao Zhang   *count   = sf->leafbuflen[scope];
424fcc7397dSJunchao Zhang   *start   = sf->leafstart[scope];
425fcc7397dSJunchao Zhang   *opt     = NULL;
426fcc7397dSJunchao Zhang   *indices = NULL;
427fcc7397dSJunchao Zhang   if (!sf->leafcontig[scope]) {
428cd620004SJunchao Zhang     offset = (scope == PETSCSF_LOCAL) ? 0 : sf->roffset[sf->ndranks];
4299371c9d4SSatish Balay     if (PetscMemTypeHost(mtype)) {
4309371c9d4SSatish Balay       *opt     = sf->leafpackopt[scope];
4319371c9d4SSatish Balay       *indices = sf->rmine + offset;
4329371c9d4SSatish Balay     } else {
433fcc7397dSJunchao Zhang       size_t size;
434fcc7397dSJunchao Zhang       if (sf->leafpackopt[scope]) {
435fcc7397dSJunchao Zhang         if (!sf->leafpackopt_d[scope]) {
4369566063dSJacob Faibussowitsch           PetscCall(PetscMalloc1(1, &sf->leafpackopt_d[scope]));
4379566063dSJacob Faibussowitsch           PetscCall(PetscArraycpy(sf->leafpackopt_d[scope], sf->leafpackopt[scope], 1));
438fcc7397dSJunchao Zhang           size = (sf->leafpackopt[scope]->n * 7 + 2) * sizeof(PetscInt);                                       /* See comments at struct _n_PetscSFPackOpt*/
4399566063dSJacob Faibussowitsch           PetscCall(PetscSFMalloc(sf, PETSC_MEMTYPE_DEVICE, size, (void **)&sf->leafpackopt_d[scope]->array)); /* Change ->array to a device pointer */
4409566063dSJacob Faibussowitsch           PetscCall((*link->Memcpy)(link, PETSC_MEMTYPE_DEVICE, sf->leafpackopt_d[scope]->array, PETSC_MEMTYPE_HOST, sf->leafpackopt[scope]->array, size));
441fcc7397dSJunchao Zhang         }
442fcc7397dSJunchao Zhang         *opt = sf->leafpackopt_d[scope];
443fcc7397dSJunchao Zhang       } else {
444fcc7397dSJunchao Zhang         if (!sf->rmine_d[scope]) {
445fcc7397dSJunchao Zhang           size = sf->leafbuflen[scope] * sizeof(PetscInt);
4469566063dSJacob Faibussowitsch           PetscCall(PetscSFMalloc(sf, PETSC_MEMTYPE_DEVICE, size, (void **)&sf->rmine_d[scope]));
4479566063dSJacob Faibussowitsch           PetscCall((*link->Memcpy)(link, PETSC_MEMTYPE_DEVICE, sf->rmine_d[scope], PETSC_MEMTYPE_HOST, sf->rmine + offset, size));
448cd620004SJunchao Zhang         }
449cd620004SJunchao Zhang         *indices = sf->rmine_d[scope];
450cd620004SJunchao Zhang       }
451cd620004SJunchao Zhang     }
452cd620004SJunchao Zhang   }
4533ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
454cd620004SJunchao Zhang }
455