xref: /petsc/src/vec/is/sf/impls/basic/sfpack.h (revision f01131f0670950e9d98c29f16b982bc6d3023227)
140e23c03SJunchao Zhang #if !defined(__SFPACK_H)
240e23c03SJunchao Zhang #define __SFPACK_H
340e23c03SJunchao Zhang 
4cd620004SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfbasic.h>
5cd620004SJunchao Zhang 
6cd620004SJunchao Zhang /* We separate SF communications for SFBasic and SFNeighbor in two parts: local (self,intra-rank) and remote (inter-rank) */
7cd620004SJunchao Zhang typedef enum {PETSCSF_LOCAL=0, PETSCSF_REMOTE} PetscSFScope;
840e23c03SJunchao Zhang 
9b23bfdefSJunchao Zhang /* Optimization plans in packing & unpacking for destination ranks.
1040e23c03SJunchao Zhang 
11b23bfdefSJunchao Zhang   Suppose there are count indices stored in idx[], and two addresses u, p. We want to do packing:
12b23bfdefSJunchao Zhang      p[i] = u[idx[i]], for i in [0,count)
1340e23c03SJunchao Zhang 
14b23bfdefSJunchao Zhang   Often, the indices are associated with n ranks. Each rank's indices are stored consecutively in idx[].
15b23bfdefSJunchao Zhang   We analyze indices for each rank and see if they are patterns that can be used to optimize the packing.
16eb02082bSJunchao Zhang   The result is stored in PetscSFPackOpt. Packing for a rank might be non-optimizable, or optimized into
17eb02082bSJunchao Zhang   a small number of contiguous memory copies or one strided memory copy.
18cd620004SJunchao Zhang 
19cd620004SJunchao Zhang   Note before using this per-rank optimization, one should check rleafloccontig, irootloccontig, which say
20cd620004SJunchao Zhang   indices in whole start from 0 and are contiguous, and therefore much useful when true.
2140e23c03SJunchao Zhang  */
22b23bfdefSJunchao Zhang typedef enum {PETSCSF_PACKOPT_NONE=0, PETSCSF_PACKOPT_MULTICOPY, PETSCSF_PACKOPT_STRIDE} PetscSFPackOptType;
2340e23c03SJunchao Zhang struct _n_PetscSFPackOpt {
24b23bfdefSJunchao Zhang   PetscInt           n;             /* Number of destination ranks */
25b23bfdefSJunchao Zhang   PetscSFPackOptType *type;         /* [n] Optimization types for the n ranks */
26b23bfdefSJunchao Zhang   PetscInt           *offset;       /* [n+1] Indices for i-th rank are in [offset[i],offset[i+1]) of idx[] */
27b23bfdefSJunchao Zhang   PetscInt           *copy_offset;  /* [n+1] If type[i] = PETSCSF_PACKOPT_MULTICOPY, packing for i-th rank is optimized into copies numbered between [copy_offset[i],copy_offset[i+1]) */
28b23bfdefSJunchao Zhang   PetscInt           *copy_start;   /* [*]     j-th copy starts at copy_start[j] in idx[]. In other words, there are copy_length[j] contiguous indices */
29eb02082bSJunchao Zhang   PetscInt           *copy_length;  /* [*]     starting at idx[copy_start[j]] */
30b23bfdefSJunchao Zhang   PetscInt           *stride_step;  /* [n]   If type[i] = PETSCSF_PACKOPT_STRIDE, then packing for i-th rank is strided, with first index being idx[offset[i]] and step stride_step[i], */
3140e23c03SJunchao Zhang   PetscInt           *stride_n;     /* [n]     and total stride_n[i] steps */
3240e23c03SJunchao Zhang };
3340e23c03SJunchao Zhang 
34eb02082bSJunchao Zhang /* An abstract class that defines a communication link, which includes how to pack/unpack data and send/recv buffers
3540e23c03SJunchao Zhang  */
36cd620004SJunchao Zhang struct _n_PetscSFLink {                /* link,    count,  start,   indices,      pack plan, root/leafdata, buffer*/
37cd620004SJunchao Zhang   PetscErrorCode (*h_Pack)            (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,const void*,void*);
38cd620004SJunchao Zhang   PetscErrorCode (*h_UnpackAndInsert) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
39cd620004SJunchao Zhang   PetscErrorCode (*h_UnpackAndAdd)    (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
40cd620004SJunchao Zhang   PetscErrorCode (*h_UnpackAndMin)    (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
41cd620004SJunchao Zhang   PetscErrorCode (*h_UnpackAndMax)    (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
42cd620004SJunchao Zhang   PetscErrorCode (*h_UnpackAndMinloc) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
43cd620004SJunchao Zhang   PetscErrorCode (*h_UnpackAndMaxloc) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
44cd620004SJunchao Zhang   PetscErrorCode (*h_UnpackAndMult)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
45cd620004SJunchao Zhang   PetscErrorCode (*h_UnpackAndLAND)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
46cd620004SJunchao Zhang   PetscErrorCode (*h_UnpackAndBAND)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
47cd620004SJunchao Zhang   PetscErrorCode (*h_UnpackAndLOR)    (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
48cd620004SJunchao Zhang   PetscErrorCode (*h_UnpackAndBOR)    (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
49cd620004SJunchao Zhang   PetscErrorCode (*h_UnpackAndLXOR)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
50cd620004SJunchao Zhang   PetscErrorCode (*h_UnpackAndBXOR)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
51cd620004SJunchao Zhang   PetscErrorCode (*h_FetchAndAdd)     (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,      void*);
52cd620004SJunchao Zhang                                       /*     link,   count, startx,            idx,      xdata,  starty,         idy,   ydata */
53cd620004SJunchao Zhang   PetscErrorCode (*h_ScatterAndInsert)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
54cd620004SJunchao Zhang   PetscErrorCode (*h_ScatterAndAdd)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
55cd620004SJunchao Zhang   PetscErrorCode (*h_ScatterAndMin)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
56cd620004SJunchao Zhang   PetscErrorCode (*h_ScatterAndMax)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
57cd620004SJunchao Zhang   PetscErrorCode (*h_ScatterAndMinloc)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
58cd620004SJunchao Zhang   PetscErrorCode (*h_ScatterAndMaxloc)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
59cd620004SJunchao Zhang   PetscErrorCode (*h_ScatterAndMult)  (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
60cd620004SJunchao Zhang   PetscErrorCode (*h_ScatterAndLAND)  (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
61cd620004SJunchao Zhang   PetscErrorCode (*h_ScatterAndBAND)  (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
62cd620004SJunchao Zhang   PetscErrorCode (*h_ScatterAndLOR)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
63cd620004SJunchao Zhang   PetscErrorCode (*h_ScatterAndBOR)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
64cd620004SJunchao Zhang   PetscErrorCode (*h_ScatterAndLXOR)  (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
65cd620004SJunchao Zhang   PetscErrorCode (*h_ScatterAndBXOR)  (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
66cd620004SJunchao Zhang                                        /* link, count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,leafupdate*/
67cd620004SJunchao Zhang   PetscErrorCode (*h_FetchAndAddLocal)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,void*,PetscInt,const PetscInt*,const void*,void*);
68cd620004SJunchao Zhang 
69cd620004SJunchao Zhang   PetscBool      deviceinited;        /* Are device related fields initialized? */
70eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA)
71eb02082bSJunchao Zhang   /* These fields are lazily initialized in a sense that only when device pointers are passed to an SF, the SF
72eb02082bSJunchao Zhang      will set them, otherwise it just leaves them alone even though PETSC_HAVE_CUDA. Packing routines using
73eb02082bSJunchao Zhang      regular ops when there are no data race chances.
74eb02082bSJunchao Zhang   */
75cd620004SJunchao Zhang   PetscErrorCode (*d_Pack)            (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,const void*,void*);
76eb02082bSJunchao Zhang 
77cd620004SJunchao Zhang   PetscErrorCode (*d_UnpackAndInsert) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
78cd620004SJunchao Zhang   PetscErrorCode (*d_UnpackAndAdd)    (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
79cd620004SJunchao Zhang   PetscErrorCode (*d_UnpackAndMin)    (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
80cd620004SJunchao Zhang   PetscErrorCode (*d_UnpackAndMax)    (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
81cd620004SJunchao Zhang   PetscErrorCode (*d_UnpackAndMinloc) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
82cd620004SJunchao Zhang   PetscErrorCode (*d_UnpackAndMaxloc) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
83cd620004SJunchao Zhang   PetscErrorCode (*d_UnpackAndMult)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
84cd620004SJunchao Zhang   PetscErrorCode (*d_UnpackAndLAND)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
85cd620004SJunchao Zhang   PetscErrorCode (*d_UnpackAndBAND)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
86cd620004SJunchao Zhang   PetscErrorCode (*d_UnpackAndLOR)    (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
87cd620004SJunchao Zhang   PetscErrorCode (*d_UnpackAndBOR)    (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
88cd620004SJunchao Zhang   PetscErrorCode (*d_UnpackAndLXOR)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
89cd620004SJunchao Zhang   PetscErrorCode (*d_UnpackAndBXOR)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
90cd620004SJunchao Zhang   PetscErrorCode (*d_FetchAndAdd)     (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,      void*);
91cd620004SJunchao Zhang 
92cd620004SJunchao Zhang   PetscErrorCode (*d_ScatterAndInsert)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
93cd620004SJunchao Zhang   PetscErrorCode (*d_ScatterAndAdd)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
94cd620004SJunchao Zhang   PetscErrorCode (*d_ScatterAndMin)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
95cd620004SJunchao Zhang   PetscErrorCode (*d_ScatterAndMax)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
96cd620004SJunchao Zhang   PetscErrorCode (*d_ScatterAndMinloc)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
97cd620004SJunchao Zhang   PetscErrorCode (*d_ScatterAndMaxloc)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
98cd620004SJunchao Zhang   PetscErrorCode (*d_ScatterAndMult)  (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
99cd620004SJunchao Zhang   PetscErrorCode (*d_ScatterAndLAND)  (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
100cd620004SJunchao Zhang   PetscErrorCode (*d_ScatterAndBAND)  (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
101cd620004SJunchao Zhang   PetscErrorCode (*d_ScatterAndLOR)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
102cd620004SJunchao Zhang   PetscErrorCode (*d_ScatterAndBOR)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
103cd620004SJunchao Zhang   PetscErrorCode (*d_ScatterAndLXOR)  (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
104cd620004SJunchao Zhang   PetscErrorCode (*d_ScatterAndBXOR)  (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
105cd620004SJunchao Zhang   PetscErrorCode (*d_FetchAndAddLocal)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,void*,PetscInt,const PetscInt*,const void*,void*);
106eb02082bSJunchao Zhang 
107eb02082bSJunchao Zhang   /* Packing routines using atomics when there are data race chances */
108cd620004SJunchao Zhang   PetscErrorCode (*da_UnpackAndInsert)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
109cd620004SJunchao Zhang   PetscErrorCode (*da_UnpackAndAdd)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
110cd620004SJunchao Zhang   PetscErrorCode (*da_UnpackAndMin)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
111cd620004SJunchao Zhang   PetscErrorCode (*da_UnpackAndMax)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
112cd620004SJunchao Zhang   PetscErrorCode (*da_UnpackAndMinloc)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
113cd620004SJunchao Zhang   PetscErrorCode (*da_UnpackAndMaxloc)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
114cd620004SJunchao Zhang   PetscErrorCode (*da_UnpackAndMult)  (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
115cd620004SJunchao Zhang   PetscErrorCode (*da_UnpackAndLAND)  (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
116cd620004SJunchao Zhang   PetscErrorCode (*da_UnpackAndBAND)  (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
117cd620004SJunchao Zhang   PetscErrorCode (*da_UnpackAndLOR)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
118cd620004SJunchao Zhang   PetscErrorCode (*da_UnpackAndBOR)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
119cd620004SJunchao Zhang   PetscErrorCode (*da_UnpackAndLXOR)  (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
120cd620004SJunchao Zhang   PetscErrorCode (*da_UnpackAndBXOR)  (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
121cd620004SJunchao Zhang   PetscErrorCode (*da_FetchAndAdd)    (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,      void*);
122cd620004SJunchao Zhang 
123cd620004SJunchao Zhang   PetscErrorCode (*da_ScatterAndInsert)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
124cd620004SJunchao Zhang   PetscErrorCode (*da_ScatterAndAdd)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
125cd620004SJunchao Zhang   PetscErrorCode (*da_ScatterAndMin)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
126cd620004SJunchao Zhang   PetscErrorCode (*da_ScatterAndMax)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
127cd620004SJunchao Zhang   PetscErrorCode (*da_ScatterAndMinloc)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
128cd620004SJunchao Zhang   PetscErrorCode (*da_ScatterAndMaxloc)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
129cd620004SJunchao Zhang   PetscErrorCode (*da_ScatterAndMult)  (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
130cd620004SJunchao Zhang   PetscErrorCode (*da_ScatterAndLAND)  (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
131cd620004SJunchao Zhang   PetscErrorCode (*da_ScatterAndBAND)  (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
132cd620004SJunchao Zhang   PetscErrorCode (*da_ScatterAndLOR)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
133cd620004SJunchao Zhang   PetscErrorCode (*da_ScatterAndBOR)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
134cd620004SJunchao Zhang   PetscErrorCode (*da_ScatterAndLXOR)  (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
135cd620004SJunchao Zhang   PetscErrorCode (*da_ScatterAndBXOR)  (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
136cd620004SJunchao Zhang   PetscErrorCode (*da_FetchAndAddLocal)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,void*,PetscInt,const PetscInt*,const void*,void*);
137eb02082bSJunchao Zhang 
138e315309dSJunchao Zhang   PetscInt     maxResidentThreadsPerGPU;     /* It is a copy from SF for convenience */
139eb02082bSJunchao Zhang   cudaStream_t stream;                       /* Stream to launch pack/unapck kernels if not using the default stream */
140eb02082bSJunchao Zhang #endif
141eb02082bSJunchao Zhang   PetscMPIInt  tag;                          /* Each link has a tag so we can perform multiple SF ops at the same time */
142cd620004SJunchao Zhang   MPI_Datatype unit;                         /* The MPI datatype this PetscSFLink is built for */
143eb02082bSJunchao Zhang   MPI_Datatype basicunit;                    /* unit is made of MPI builtin dataype basicunit */
144e07844bfSJunchao Zhang   PetscBool    isbuiltin;                    /* Is unit an MPI/PETSc builtin datatype? If it is true, then bs=1 and basicunit is equivalent to unit */
145eb02082bSJunchao Zhang   size_t       unitbytes;                    /* Number of bytes in a unit */
146eb02082bSJunchao Zhang   PetscInt     bs;                           /* Number of basic units in a unit */
147cd620004SJunchao Zhang   const void   *rootdata,*leafdata;          /* rootdata and leafdata the link is working on. They are used as keys for pending links. */
148cd620004SJunchao Zhang   PetscMemType rootmtype,leafmtype;          /* root/leafdata's memory type */
149cd620004SJunchao Zhang 
150cd620004SJunchao Zhang   /* For local and remote communication */
151cd620004SJunchao Zhang   PetscMemType rootmtype_mpi,leafmtype_mpi;  /* Mtypes of buffers passed to MPI. If use_gpu_aware_mpi, they are same as root/leafmtype. Otherwise they are PETSC_MEMTYPE_HOST */
152cd620004SJunchao Zhang   PetscBool    rootdirect[2],leafdirect[2];  /* Can root/leafdata be directly passed to SF (i.e., without buffering). In layout of [PETSCSF_LOCAL/REMOTE]. See more in PetscSFLinkCreate() */
153cd620004SJunchao Zhang   PetscInt     rootdirect_mpi,leafdirect_mpi;/* Can root/leafdata for remote be directly passed to MPI? 1: yes, 0: no. See more in PetscSFLinkCreate() */
154cd620004SJunchao Zhang   const void   *rootdatadirect[2][2];        /* The root/leafdata used to init root/leaf requests, in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE]. */
155cd620004SJunchao Zhang   const void   *leafdatadirect[2][2];        /* ... We need them to look up links when root/leafdirect_mpi are true */
156cd620004SJunchao Zhang   char         *rootbuf[2][2];               /* Buffers for packed roots, in layout of [PETSCSF_LOCAL/REMOTE][PETSC_MEMTYPE] */
157cd620004SJunchao Zhang   char         *rootbuf_alloc[2][2];         /* Log memory allocated by petsc. We need it since rootbuf[][] may point to rootdata given by user */
158cd620004SJunchao Zhang   char         *leafbuf[2][2];               /* Buffers for packed leaves, in layout of [PETSCSF_LOCAL/REMOTE][PETSC_MEMTYPE] */
159cd620004SJunchao Zhang   char         *leafbuf_alloc[2][2];
160cd620004SJunchao Zhang   MPI_Request  *rootreqs[2][2][2];           /* Root requests in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][rootdirect_mpi] */
161cd620004SJunchao Zhang   MPI_Request  *leafreqs[2][2][2];           /* Leaf requests in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][leafdirect_mpi] */
162cd620004SJunchao Zhang   PetscBool    rootreqsinited[2][2][2];      /* Are root requests initialized? Also in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][rootdirect_mpi]*/
163cd620004SJunchao Zhang   PetscBool    leafreqsinited[2][2][2];      /* Are leaf requests initialized? Also in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][leafdirect_mpi]*/
164cd620004SJunchao Zhang   MPI_Request  *reqs;                        /* An array of length (nrootreqs+nleafreqs)*8. Pointers in rootreqs[][][] and leafreqs[][][] point here */
165cd620004SJunchao Zhang   PetscSFLink  next;
16640e23c03SJunchao Zhang };
16740e23c03SJunchao Zhang 
168cd620004SJunchao Zhang #if defined(PETSC_USE_DEBUG)
169cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF,MPI_Datatype,const void*,const void*);
170cd620004SJunchao Zhang #else
171cd620004SJunchao Zhang #define PetscSFSetErrorOnUnsupportedOverlap(a,b,c,d) 0
172cd620004SJunchao Zhang #endif
173b7c0d12aSJunchao Zhang 
174cd620004SJunchao Zhang /* Create/setup/retrieve/destroy a link */
175cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkCreate(PetscSF,MPI_Datatype,PetscMemType,const void*,PetscMemType,const void*,MPI_Op,PetscSFOperation,PetscSFLink*);
176cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_Host(PetscSF,PetscSFLink,MPI_Datatype);
177cd620004SJunchao Zhang #if defined(PETSC_HAVE_CUDA)
178cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_Device(PetscSF,PetscSFLink,MPI_Datatype);
179cd620004SJunchao Zhang #else
180cd620004SJunchao Zhang #define PetscSFLinkSetUp_Device(a,b,c) 0
181cd620004SJunchao Zhang #endif
182cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetInUse(PetscSF,MPI_Datatype,const void*,const void*,PetscCopyMode,PetscSFLink*);
183cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkReclaim(PetscSF,PetscSFLink*);
184cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkDestroy(PetscSF,PetscSFLink*);
185cd620004SJunchao Zhang 
186cd620004SJunchao Zhang /* Get pack/unpack function pointers from a link */
187cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkGetPack(PetscSFLink link,PetscMemType mtype,PetscErrorCode (**Pack)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,const void*,void*))
188eb02082bSJunchao Zhang {
189eb02082bSJunchao Zhang   PetscFunctionBegin;
190eb02082bSJunchao Zhang   if (mtype == PETSC_MEMTYPE_HOST) *Pack = link->h_Pack;
191eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA)
192cd620004SJunchao Zhang   else *Pack = link->d_Pack;
193eb02082bSJunchao Zhang #endif
194eb02082bSJunchao Zhang   PetscFunctionReturn(0);
195eb02082bSJunchao Zhang }
196cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*));
197cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetFetchAndOp (PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**FetchAndOp) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,void*));
198cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*));
199cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,void*,PetscInt,const PetscInt*,const void*,void*));
200cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF,PetscSFLink,PetscSFDirection,void**,void**,MPI_Request**,MPI_Request**);
201b7c0d12aSJunchao Zhang 
202cd620004SJunchao Zhang /* Do Pack/Unpack/Fetch/Scatter with the link */
203cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkPackRootData  (PetscSF,PetscSFLink,PetscSFScope,const void*);
204cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkPackLeafData  (PetscSF,PetscSFLink,PetscSFScope,const void*);
205cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkUnpackRootData(PetscSF,PetscSFLink,PetscSFScope,void*,MPI_Op);
206cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF,PetscSFLink,PetscSFScope,void*,MPI_Op);
207cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkFetchRootData (PetscSF,PetscSFLink,PetscSFScope,void*,MPI_Op);
208cd620004SJunchao Zhang 
209cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkBcastAndOpLocal(PetscSF,PetscSFLink,const void*,void*,MPI_Op);
210cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkReduceLocal(PetscSF,PetscSFLink,const void*,void*,MPI_Op);
211cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF,PetscSFLink,void*,const void*,void*,MPI_Op);
212cd620004SJunchao Zhang 
213cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFSetUpPackFields(PetscSF sf);
214cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFResetPackFields(PetscSF sf);
215cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFCreatePackOpt(PetscInt,const PetscInt*,const PetscInt*,PetscSFPackOpt*);
216cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFDestroyPackOpt(PetscSFPackOpt *out);
217cd620004SJunchao Zhang 
218cd620004SJunchao Zhang 
219cd620004SJunchao Zhang /* Get root indices used for pack/unpack
220cd620004SJunchao Zhang 
221cd620004SJunchao Zhang Input arguments:
222cd620004SJunchao Zhang   +sf    - StarForest
223cd620004SJunchao Zhang   .link  - The link, which provides the stream for the async memcpy (In SF, we make all GPU operations asynchronous to avoid unexpected pipeline stalls)
224cd620004SJunchao Zhang   .scope - Which part of the indices? (PETSCSF_LOCAL or PETSCSF_REMOTE)
225cd620004SJunchao Zhang   .mtype - In what type of memory? (PETSC_MEMTYPE_DEVICE or PETSC_MEMTYPE_HOST)
226cd620004SJunchao Zhang 
227cd620004SJunchao Zhang  Output arguments:
228cd620004SJunchao Zhang   +count   - Count of indices
229cd620004SJunchao Zhang   .start   - The first index (only useful when indices is NULL)
230cd620004SJunchao Zhang   -indices - indices of roots for pack/unpack. NULL means indices are contiguous
231cd620004SJunchao Zhang  */
232cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkGetRootIndices(PetscSF sf,PetscSFLink link,PetscMemType mtype,PetscSFScope scope,PetscInt *count,PetscInt *start,const PetscInt **indices)
233b7c0d12aSJunchao Zhang {
234cd620004SJunchao Zhang   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
235cd620004SJunchao Zhang   PetscInt       offset;
236b7c0d12aSJunchao Zhang 
237b7c0d12aSJunchao Zhang   PetscFunctionBegin;
238cd620004SJunchao Zhang   if (count) *count = bas->rootbuflen[scope];
239cd620004SJunchao Zhang   if (start) *start = bas->rootstart[scope];
240cd620004SJunchao Zhang   if (indices) {
241cd620004SJunchao Zhang     if (bas->rootcontig[scope]) *indices = NULL;
242cd620004SJunchao Zhang     else {
243cd620004SJunchao Zhang       offset = (scope == PETSCSF_LOCAL)? 0 : bas->ioffset[bas->ndiranks];
244cd620004SJunchao Zhang       if (mtype == PETSC_MEMTYPE_HOST) {*indices = bas->irootloc + offset;}
245cd620004SJunchao Zhang #if defined(PETSC_HAVE_CUDA)
246cd620004SJunchao Zhang       else {
247cd620004SJunchao Zhang         if (!bas->irootloc_d[scope]) {
248cd620004SJunchao Zhang           cudaError_t    cerr;
249cd620004SJunchao Zhang           size_t         size = bas->rootbuflen[scope]*sizeof(PetscInt);
250cd620004SJunchao Zhang           cerr = cudaMalloc((void **)&bas->irootloc_d[scope],size);CHKERRCUDA(cerr);
251cd620004SJunchao Zhang           cerr = cudaMemcpyAsync(bas->irootloc_d[scope],bas->irootloc+offset,size,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr);
252b7c0d12aSJunchao Zhang         }
253cd620004SJunchao Zhang         *indices = bas->irootloc_d[scope];
254cd620004SJunchao Zhang       }
255cd620004SJunchao Zhang #endif
256cd620004SJunchao Zhang     }
257cd620004SJunchao Zhang   }
258b7c0d12aSJunchao Zhang   PetscFunctionReturn(0);
259b7c0d12aSJunchao Zhang }
260b7c0d12aSJunchao Zhang 
261cd620004SJunchao Zhang /* Get leaf indices used for pack/unpack
262cd620004SJunchao Zhang 
263cd620004SJunchao Zhang   See also PetscSFLinkGetRootIndices()
264cd620004SJunchao Zhang  */
265cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkGetLeafIndices(PetscSF sf,PetscSFLink link,PetscMemType mtype,PetscSFScope scope,PetscInt *count,PetscInt *start,const PetscInt **indices)
266cd620004SJunchao Zhang {
267cd620004SJunchao Zhang   PetscInt   offset;
268cd620004SJunchao Zhang 
269cd620004SJunchao Zhang   PetscFunctionBegin;
270cd620004SJunchao Zhang   if (count) *count = sf->leafbuflen[scope];
271cd620004SJunchao Zhang   if (start) *start = sf->leafstart[scope];
272cd620004SJunchao Zhang   if (indices) {
273cd620004SJunchao Zhang     if (sf->leafcontig[scope]) *indices = NULL;
274cd620004SJunchao Zhang     else {
275cd620004SJunchao Zhang       offset = (scope == PETSCSF_LOCAL)? 0 : sf->roffset[sf->ndranks];
276cd620004SJunchao Zhang       if (mtype == PETSC_MEMTYPE_HOST) {*indices = sf->rmine + offset;}
277eb02082bSJunchao Zhang   #if defined(PETSC_HAVE_CUDA)
278cd620004SJunchao Zhang       else {
279cd620004SJunchao Zhang         if (!sf->rmine_d[scope]) {
280cd620004SJunchao Zhang           cudaError_t  cerr;
281cd620004SJunchao Zhang           size_t       size = sf->leafbuflen[scope]*sizeof(PetscInt);
282cd620004SJunchao Zhang           cerr = cudaMalloc((void **)&sf->rmine_d[scope],size);CHKERRCUDA(cerr);
283cd620004SJunchao Zhang           cerr = cudaMemcpyAsync(sf->rmine_d[scope],sf->rmine+offset,size,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr);
284cd620004SJunchao Zhang         }
285cd620004SJunchao Zhang         *indices = sf->rmine_d[scope];
286cd620004SJunchao Zhang       }
287eb02082bSJunchao Zhang   #endif
288cd620004SJunchao Zhang     }
289cd620004SJunchao Zhang   }
290cd620004SJunchao Zhang   PetscFunctionReturn(0);
291cd620004SJunchao Zhang }
292cd620004SJunchao Zhang 
293cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkMPIWaitall(PetscSF sf,PetscSFLink link,PetscSFDirection direction)
294cd620004SJunchao Zhang {
295cd620004SJunchao Zhang   PetscErrorCode       ierr;
296cd620004SJunchao Zhang   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
297cd620004SJunchao Zhang   const PetscMemType   rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi;
298cd620004SJunchao Zhang   const PetscInt       rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi;
299cd620004SJunchao Zhang 
300cd620004SJunchao Zhang   PetscFunctionBegin;
301cd620004SJunchao Zhang   ierr = MPI_Waitall(bas->nrootreqs,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi],MPI_STATUSES_IGNORE);CHKERRQ(ierr);
302cd620004SJunchao Zhang   ierr = MPI_Waitall(sf->nleafreqs, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi],MPI_STATUSES_IGNORE);CHKERRQ(ierr);
303cd620004SJunchao Zhang   PetscFunctionReturn(0);
304cd620004SJunchao Zhang }
305cd620004SJunchao Zhang 
306*f01131f0SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkMemcpy(PetscSF sf,PetscSFLink link,PetscMemType dstmtype,void* dst,PetscMemType srcmtype,const void*src,size_t n)
307*f01131f0SJunchao Zhang {
308*f01131f0SJunchao Zhang   PetscFunctionBegin;
309*f01131f0SJunchao Zhang   if (n) {
310*f01131f0SJunchao Zhang     if (dstmtype == PETSC_MEMTYPE_HOST && srcmtype == PETSC_MEMTYPE_HOST) {PetscErrorCode ierr = PetscMemcpy(dst,src,n);CHKERRQ(ierr);}
311*f01131f0SJunchao Zhang #if defined(PETSC_HAVE_CUDA)
312*f01131f0SJunchao Zhang     else if (dstmtype == PETSC_MEMTYPE_DEVICE && srcmtype == PETSC_MEMTYPE_HOST)   {
313*f01131f0SJunchao Zhang       cudaError_t    err  = cudaMemcpyAsync(dst,src,n,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(err);
314*f01131f0SJunchao Zhang       PetscErrorCode ierr = PetscLogCpuToGpu(n);CHKERRQ(ierr);
315*f01131f0SJunchao Zhang     } else if (dstmtype == PETSC_MEMTYPE_HOST && srcmtype == PETSC_MEMTYPE_DEVICE) {
316*f01131f0SJunchao Zhang       cudaError_t    err  = cudaMemcpyAsync(dst,src,n,cudaMemcpyDeviceToHost,link->stream);CHKERRCUDA(err);
317*f01131f0SJunchao Zhang       PetscErrorCode ierr = PetscLogGpuToCpu(n);CHKERRQ(ierr);
318*f01131f0SJunchao Zhang     } else if (dstmtype == PETSC_MEMTYPE_DEVICE && srcmtype == PETSC_MEMTYPE_DEVICE) {cudaError_t err = cudaMemcpyAsync(dst,src,n,cudaMemcpyDeviceToDevice,link->stream);CHKERRCUDA(err);}
319*f01131f0SJunchao Zhang #endif
320*f01131f0SJunchao Zhang     else SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType for dst %d and src %d",(int)dstmtype,(int)srcmtype);
321*f01131f0SJunchao Zhang   }
322*f01131f0SJunchao Zhang   PetscFunctionReturn(0);
323*f01131f0SJunchao Zhang }
32440e23c03SJunchao Zhang #endif
325