xref: /petsc/src/vec/is/sf/impls/basic/sfpack.h (revision cd62000487352d7c447823794a883667dfb859d7)
140e23c03SJunchao Zhang #if !defined(__SFPACK_H)
240e23c03SJunchao Zhang #define __SFPACK_H
340e23c03SJunchao Zhang 
4*cd620004SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfbasic.h>
5*cd620004SJunchao Zhang 
6*cd620004SJunchao Zhang /* We separate SF communications for SFBasic and SFNeighbor in two parts: local (self,intra-rank) and remote (inter-rank) */
7*cd620004SJunchao Zhang typedef enum {PETSCSF_LOCAL=0, PETSCSF_REMOTE} PetscSFScope;
840e23c03SJunchao Zhang 
9b23bfdefSJunchao Zhang /* Optimization plans in packing & unpacking for destination ranks.
1040e23c03SJunchao Zhang 
11b23bfdefSJunchao Zhang   Suppose there are count indices stored in idx[], and two addresses u, p. We want to do packing:
12b23bfdefSJunchao Zhang      p[i] = u[idx[i]], for i in [0,count)
1340e23c03SJunchao Zhang 
14b23bfdefSJunchao Zhang   Often, the indices are associated with n ranks. Each rank's indices are stored consecutively in idx[].
15b23bfdefSJunchao Zhang   We analyze indices for each rank and see if they are patterns that can be used to optimize the packing.
16eb02082bSJunchao Zhang   The result is stored in PetscSFPackOpt. Packing for a rank might be non-optimizable, or optimized into
17eb02082bSJunchao Zhang   a small number of contiguous memory copies or one strided memory copy.
18*cd620004SJunchao Zhang 
19*cd620004SJunchao Zhang   Note before using this per-rank optimization, one should check rleafloccontig, irootloccontig, which say
20*cd620004SJunchao Zhang   indices in whole start from 0 and are contiguous, and therefore much useful when true.
2140e23c03SJunchao Zhang  */
22b23bfdefSJunchao Zhang typedef enum {PETSCSF_PACKOPT_NONE=0, PETSCSF_PACKOPT_MULTICOPY, PETSCSF_PACKOPT_STRIDE} PetscSFPackOptType;
2340e23c03SJunchao Zhang struct _n_PetscSFPackOpt {
24b23bfdefSJunchao Zhang   PetscInt           n;             /* Number of destination ranks */
25b23bfdefSJunchao Zhang   PetscSFPackOptType *type;         /* [n] Optimization types for the n ranks */
26b23bfdefSJunchao Zhang   PetscInt           *offset;       /* [n+1] Indices for i-th rank are in [offset[i],offset[i+1]) of idx[] */
27b23bfdefSJunchao Zhang   PetscInt           *copy_offset;  /* [n+1] If type[i] = PETSCSF_PACKOPT_MULTICOPY, packing for i-th rank is optimized into copies numbered between [copy_offset[i],copy_offset[i+1]) */
28b23bfdefSJunchao Zhang   PetscInt           *copy_start;   /* [*]     j-th copy starts at copy_start[j] in idx[]. In other words, there are copy_length[j] contiguous indices */
29eb02082bSJunchao Zhang   PetscInt           *copy_length;  /* [*]     starting at idx[copy_start[j]] */
30b23bfdefSJunchao Zhang   PetscInt           *stride_step;  /* [n]   If type[i] = PETSCSF_PACKOPT_STRIDE, then packing for i-th rank is strided, with first index being idx[offset[i]] and step stride_step[i], */
3140e23c03SJunchao Zhang   PetscInt           *stride_n;     /* [n]     and total stride_n[i] steps */
3240e23c03SJunchao Zhang };
3340e23c03SJunchao Zhang 
34eb02082bSJunchao Zhang /* An abstract class that defines a communication link, which includes how to pack/unpack data and send/recv buffers
3540e23c03SJunchao Zhang  */
36*cd620004SJunchao Zhang struct _n_PetscSFLink {                /* link,    count,  start,   indices,      pack plan, root/leafdata, buffer*/
37*cd620004SJunchao Zhang   PetscErrorCode (*h_Pack)            (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,const void*,void*);
38*cd620004SJunchao Zhang   PetscErrorCode (*h_UnpackAndInsert) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
39*cd620004SJunchao Zhang   PetscErrorCode (*h_UnpackAndAdd)    (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
40*cd620004SJunchao Zhang   PetscErrorCode (*h_UnpackAndMin)    (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
41*cd620004SJunchao Zhang   PetscErrorCode (*h_UnpackAndMax)    (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
42*cd620004SJunchao Zhang   PetscErrorCode (*h_UnpackAndMinloc) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
43*cd620004SJunchao Zhang   PetscErrorCode (*h_UnpackAndMaxloc) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
44*cd620004SJunchao Zhang   PetscErrorCode (*h_UnpackAndMult)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
45*cd620004SJunchao Zhang   PetscErrorCode (*h_UnpackAndLAND)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
46*cd620004SJunchao Zhang   PetscErrorCode (*h_UnpackAndBAND)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
47*cd620004SJunchao Zhang   PetscErrorCode (*h_UnpackAndLOR)    (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
48*cd620004SJunchao Zhang   PetscErrorCode (*h_UnpackAndBOR)    (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
49*cd620004SJunchao Zhang   PetscErrorCode (*h_UnpackAndLXOR)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
50*cd620004SJunchao Zhang   PetscErrorCode (*h_UnpackAndBXOR)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
51*cd620004SJunchao Zhang   PetscErrorCode (*h_FetchAndAdd)     (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,      void*);
52*cd620004SJunchao Zhang                                       /*     link,   count, startx,            idx,      xdata,  starty,         idy,   ydata */
53*cd620004SJunchao Zhang   PetscErrorCode (*h_ScatterAndInsert)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
54*cd620004SJunchao Zhang   PetscErrorCode (*h_ScatterAndAdd)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
55*cd620004SJunchao Zhang   PetscErrorCode (*h_ScatterAndMin)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
56*cd620004SJunchao Zhang   PetscErrorCode (*h_ScatterAndMax)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
57*cd620004SJunchao Zhang   PetscErrorCode (*h_ScatterAndMinloc)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
58*cd620004SJunchao Zhang   PetscErrorCode (*h_ScatterAndMaxloc)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
59*cd620004SJunchao Zhang   PetscErrorCode (*h_ScatterAndMult)  (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
60*cd620004SJunchao Zhang   PetscErrorCode (*h_ScatterAndLAND)  (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
61*cd620004SJunchao Zhang   PetscErrorCode (*h_ScatterAndBAND)  (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
62*cd620004SJunchao Zhang   PetscErrorCode (*h_ScatterAndLOR)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
63*cd620004SJunchao Zhang   PetscErrorCode (*h_ScatterAndBOR)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
64*cd620004SJunchao Zhang   PetscErrorCode (*h_ScatterAndLXOR)  (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
65*cd620004SJunchao Zhang   PetscErrorCode (*h_ScatterAndBXOR)  (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
66*cd620004SJunchao Zhang                                        /* link, count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,leafupdate*/
67*cd620004SJunchao Zhang   PetscErrorCode (*h_FetchAndAddLocal)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,void*,PetscInt,const PetscInt*,const void*,void*);
68*cd620004SJunchao Zhang 
69*cd620004SJunchao Zhang   PetscBool      deviceinited;        /* Are device related fields initialized? */
70eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA)
71eb02082bSJunchao Zhang   /* These fields are lazily initialized in a sense that only when device pointers are passed to an SF, the SF
72eb02082bSJunchao Zhang      will set them, otherwise it just leaves them alone even though PETSC_HAVE_CUDA. Packing routines using
73eb02082bSJunchao Zhang      regular ops when there are no data race chances.
74eb02082bSJunchao Zhang   */
75*cd620004SJunchao Zhang   PetscErrorCode (*d_Pack)            (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,const void*,void*);
76eb02082bSJunchao Zhang 
77*cd620004SJunchao Zhang   PetscErrorCode (*d_UnpackAndInsert) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
78*cd620004SJunchao Zhang   PetscErrorCode (*d_UnpackAndAdd)    (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
79*cd620004SJunchao Zhang   PetscErrorCode (*d_UnpackAndMin)    (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
80*cd620004SJunchao Zhang   PetscErrorCode (*d_UnpackAndMax)    (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
81*cd620004SJunchao Zhang   PetscErrorCode (*d_UnpackAndMinloc) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
82*cd620004SJunchao Zhang   PetscErrorCode (*d_UnpackAndMaxloc) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
83*cd620004SJunchao Zhang   PetscErrorCode (*d_UnpackAndMult)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
84*cd620004SJunchao Zhang   PetscErrorCode (*d_UnpackAndLAND)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
85*cd620004SJunchao Zhang   PetscErrorCode (*d_UnpackAndBAND)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
86*cd620004SJunchao Zhang   PetscErrorCode (*d_UnpackAndLOR)    (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
87*cd620004SJunchao Zhang   PetscErrorCode (*d_UnpackAndBOR)    (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
88*cd620004SJunchao Zhang   PetscErrorCode (*d_UnpackAndLXOR)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
89*cd620004SJunchao Zhang   PetscErrorCode (*d_UnpackAndBXOR)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
90*cd620004SJunchao Zhang   PetscErrorCode (*d_FetchAndAdd)     (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,      void*);
91*cd620004SJunchao Zhang 
92*cd620004SJunchao Zhang   PetscErrorCode (*d_ScatterAndInsert)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
93*cd620004SJunchao Zhang   PetscErrorCode (*d_ScatterAndAdd)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
94*cd620004SJunchao Zhang   PetscErrorCode (*d_ScatterAndMin)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
95*cd620004SJunchao Zhang   PetscErrorCode (*d_ScatterAndMax)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
96*cd620004SJunchao Zhang   PetscErrorCode (*d_ScatterAndMinloc)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
97*cd620004SJunchao Zhang   PetscErrorCode (*d_ScatterAndMaxloc)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
98*cd620004SJunchao Zhang   PetscErrorCode (*d_ScatterAndMult)  (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
99*cd620004SJunchao Zhang   PetscErrorCode (*d_ScatterAndLAND)  (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
100*cd620004SJunchao Zhang   PetscErrorCode (*d_ScatterAndBAND)  (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
101*cd620004SJunchao Zhang   PetscErrorCode (*d_ScatterAndLOR)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
102*cd620004SJunchao Zhang   PetscErrorCode (*d_ScatterAndBOR)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
103*cd620004SJunchao Zhang   PetscErrorCode (*d_ScatterAndLXOR)  (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
104*cd620004SJunchao Zhang   PetscErrorCode (*d_ScatterAndBXOR)  (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
105*cd620004SJunchao Zhang   PetscErrorCode (*d_FetchAndAddLocal)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,void*,PetscInt,const PetscInt*,const void*,void*);
106eb02082bSJunchao Zhang 
107eb02082bSJunchao Zhang   /* Packing routines using atomics when there are data race chances */
108*cd620004SJunchao Zhang   PetscErrorCode (*da_UnpackAndInsert)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
109*cd620004SJunchao Zhang   PetscErrorCode (*da_UnpackAndAdd)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
110*cd620004SJunchao Zhang   PetscErrorCode (*da_UnpackAndMin)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
111*cd620004SJunchao Zhang   PetscErrorCode (*da_UnpackAndMax)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
112*cd620004SJunchao Zhang   PetscErrorCode (*da_UnpackAndMinloc)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
113*cd620004SJunchao Zhang   PetscErrorCode (*da_UnpackAndMaxloc)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
114*cd620004SJunchao Zhang   PetscErrorCode (*da_UnpackAndMult)  (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
115*cd620004SJunchao Zhang   PetscErrorCode (*da_UnpackAndLAND)  (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
116*cd620004SJunchao Zhang   PetscErrorCode (*da_UnpackAndBAND)  (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
117*cd620004SJunchao Zhang   PetscErrorCode (*da_UnpackAndLOR)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
118*cd620004SJunchao Zhang   PetscErrorCode (*da_UnpackAndBOR)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
119*cd620004SJunchao Zhang   PetscErrorCode (*da_UnpackAndLXOR)  (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
120*cd620004SJunchao Zhang   PetscErrorCode (*da_UnpackAndBXOR)  (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*);
121*cd620004SJunchao Zhang   PetscErrorCode (*da_FetchAndAdd)    (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,      void*);
122*cd620004SJunchao Zhang 
123*cd620004SJunchao Zhang   PetscErrorCode (*da_ScatterAndInsert)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
124*cd620004SJunchao Zhang   PetscErrorCode (*da_ScatterAndAdd)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
125*cd620004SJunchao Zhang   PetscErrorCode (*da_ScatterAndMin)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
126*cd620004SJunchao Zhang   PetscErrorCode (*da_ScatterAndMax)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
127*cd620004SJunchao Zhang   PetscErrorCode (*da_ScatterAndMinloc)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
128*cd620004SJunchao Zhang   PetscErrorCode (*da_ScatterAndMaxloc)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
129*cd620004SJunchao Zhang   PetscErrorCode (*da_ScatterAndMult)  (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
130*cd620004SJunchao Zhang   PetscErrorCode (*da_ScatterAndLAND)  (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
131*cd620004SJunchao Zhang   PetscErrorCode (*da_ScatterAndBAND)  (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
132*cd620004SJunchao Zhang   PetscErrorCode (*da_ScatterAndLOR)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
133*cd620004SJunchao Zhang   PetscErrorCode (*da_ScatterAndBOR)   (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
134*cd620004SJunchao Zhang   PetscErrorCode (*da_ScatterAndLXOR)  (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
135*cd620004SJunchao Zhang   PetscErrorCode (*da_ScatterAndBXOR)  (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*);
136*cd620004SJunchao Zhang   PetscErrorCode (*da_FetchAndAddLocal)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,void*,PetscInt,const PetscInt*,const void*,void*);
137eb02082bSJunchao Zhang 
138e315309dSJunchao Zhang   PetscInt     maxResidentThreadsPerGPU;     /* It is a copy from SF for convenience */
139eb02082bSJunchao Zhang   cudaStream_t stream;                       /* Stream to launch pack/unapck kernels if not using the default stream */
140eb02082bSJunchao Zhang #endif
141eb02082bSJunchao Zhang   PetscMPIInt  tag;                          /* Each link has a tag so we can perform multiple SF ops at the same time */
142*cd620004SJunchao Zhang   MPI_Datatype unit;                         /* The MPI datatype this PetscSFLink is built for */
143eb02082bSJunchao Zhang   MPI_Datatype basicunit;                    /* unit is made of MPI builtin dataype basicunit */
144e07844bfSJunchao Zhang   PetscBool    isbuiltin;                    /* Is unit an MPI/PETSc builtin datatype? If it is true, then bs=1 and basicunit is equivalent to unit */
145eb02082bSJunchao Zhang   size_t       unitbytes;                    /* Number of bytes in a unit */
146eb02082bSJunchao Zhang   PetscInt     bs;                           /* Number of basic units in a unit */
147*cd620004SJunchao Zhang   const void   *rootdata,*leafdata;          /* rootdata and leafdata the link is working on. They are used as keys for pending links. */
148*cd620004SJunchao Zhang   PetscMemType rootmtype,leafmtype;          /* root/leafdata's memory type */
149*cd620004SJunchao Zhang 
150*cd620004SJunchao Zhang   /* For local and remote communication */
151*cd620004SJunchao Zhang   PetscMemType rootmtype_mpi,leafmtype_mpi;  /* Mtypes of buffers passed to MPI. If use_gpu_aware_mpi, they are same as root/leafmtype. Otherwise they are PETSC_MEMTYPE_HOST */
152*cd620004SJunchao Zhang   PetscBool    rootdirect[2],leafdirect[2];  /* Can root/leafdata be directly passed to SF (i.e., without buffering). In layout of [PETSCSF_LOCAL/REMOTE]. See more in PetscSFLinkCreate() */
153*cd620004SJunchao Zhang   PetscInt     rootdirect_mpi,leafdirect_mpi;/* Can root/leafdata for remote be directly passed to MPI? 1: yes, 0: no. See more in PetscSFLinkCreate() */
154*cd620004SJunchao Zhang   const void   *rootdatadirect[2][2];        /* The root/leafdata used to init root/leaf requests, in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE]. */
155*cd620004SJunchao Zhang   const void   *leafdatadirect[2][2];        /* ... We need them to look up links when root/leafdirect_mpi are true */
156*cd620004SJunchao Zhang   char         *rootbuf[2][2];               /* Buffers for packed roots, in layout of [PETSCSF_LOCAL/REMOTE][PETSC_MEMTYPE] */
157*cd620004SJunchao Zhang   char         *rootbuf_alloc[2][2];         /* Log memory allocated by petsc. We need it since rootbuf[][] may point to rootdata given by user */
158*cd620004SJunchao Zhang   char         *leafbuf[2][2];               /* Buffers for packed leaves, in layout of [PETSCSF_LOCAL/REMOTE][PETSC_MEMTYPE] */
159*cd620004SJunchao Zhang   char         *leafbuf_alloc[2][2];
160*cd620004SJunchao Zhang   MPI_Request  *rootreqs[2][2][2];           /* Root requests in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][rootdirect_mpi] */
161*cd620004SJunchao Zhang   MPI_Request  *leafreqs[2][2][2];           /* Leaf requests in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][leafdirect_mpi] */
162*cd620004SJunchao Zhang   PetscBool    rootreqsinited[2][2][2];      /* Are root requests initialized? Also in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][rootdirect_mpi]*/
163*cd620004SJunchao Zhang   PetscBool    leafreqsinited[2][2][2];      /* Are leaf requests initialized? Also in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][leafdirect_mpi]*/
164*cd620004SJunchao Zhang   MPI_Request  *reqs;                        /* An array of length (nrootreqs+nleafreqs)*8. Pointers in rootreqs[][][] and leafreqs[][][] point here */
165*cd620004SJunchao Zhang   PetscSFLink  next;
16640e23c03SJunchao Zhang };
16740e23c03SJunchao Zhang 
168*cd620004SJunchao Zhang #if defined(PETSC_USE_DEBUG)
169*cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF,MPI_Datatype,const void*,const void*);
170*cd620004SJunchao Zhang #else
171*cd620004SJunchao Zhang #define PetscSFSetErrorOnUnsupportedOverlap(a,b,c,d) 0
172*cd620004SJunchao Zhang #endif
173b7c0d12aSJunchao Zhang 
174*cd620004SJunchao Zhang /* Create/setup/retrieve/destroy a link */
175*cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkCreate(PetscSF,MPI_Datatype,PetscMemType,const void*,PetscMemType,const void*,MPI_Op,PetscSFOperation,PetscSFLink*);
176*cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_Host(PetscSF,PetscSFLink,MPI_Datatype);
177*cd620004SJunchao Zhang #if defined(PETSC_HAVE_CUDA)
178*cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_Device(PetscSF,PetscSFLink,MPI_Datatype);
179*cd620004SJunchao Zhang #else
180*cd620004SJunchao Zhang #define PetscSFLinkSetUp_Device(a,b,c) 0
181*cd620004SJunchao Zhang #endif
182*cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetInUse(PetscSF,MPI_Datatype,const void*,const void*,PetscCopyMode,PetscSFLink*);
183*cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkReclaim(PetscSF,PetscSFLink*);
184*cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkDestroy(PetscSF,PetscSFLink*);
185*cd620004SJunchao Zhang 
186*cd620004SJunchao Zhang /* Get pack/unpack function pointers from a link */
187*cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkGetPack(PetscSFLink link,PetscMemType mtype,PetscErrorCode (**Pack)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,const void*,void*))
188eb02082bSJunchao Zhang {
189eb02082bSJunchao Zhang   PetscFunctionBegin;
190eb02082bSJunchao Zhang   if (mtype == PETSC_MEMTYPE_HOST) *Pack = link->h_Pack;
191eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA)
192*cd620004SJunchao Zhang   else *Pack = link->d_Pack;
193eb02082bSJunchao Zhang #endif
194eb02082bSJunchao Zhang   PetscFunctionReturn(0);
195eb02082bSJunchao Zhang }
196*cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*));
197*cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetFetchAndOp (PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**FetchAndOp) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,void*));
198*cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*));
199*cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,void*,PetscInt,const PetscInt*,const void*,void*));
200*cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF,PetscSFLink,PetscSFDirection,void**,void**,MPI_Request**,MPI_Request**);
201b7c0d12aSJunchao Zhang 
202*cd620004SJunchao Zhang /* Do Pack/Unpack/Fetch/Scatter with the link */
203*cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkPackRootData  (PetscSF,PetscSFLink,PetscSFScope,const void*);
204*cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkPackLeafData  (PetscSF,PetscSFLink,PetscSFScope,const void*);
205*cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkUnpackRootData(PetscSF,PetscSFLink,PetscSFScope,void*,MPI_Op);
206*cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF,PetscSFLink,PetscSFScope,void*,MPI_Op);
207*cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkFetchRootData (PetscSF,PetscSFLink,PetscSFScope,void*,MPI_Op);
208*cd620004SJunchao Zhang 
209*cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkBcastAndOpLocal(PetscSF,PetscSFLink,const void*,void*,MPI_Op);
210*cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkReduceLocal(PetscSF,PetscSFLink,const void*,void*,MPI_Op);
211*cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF,PetscSFLink,void*,const void*,void*,MPI_Op);
212*cd620004SJunchao Zhang 
213*cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFSetUpPackFields(PetscSF sf);
214*cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFResetPackFields(PetscSF sf);
215*cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFCreatePackOpt(PetscInt,const PetscInt*,const PetscInt*,PetscSFPackOpt*);
216*cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFDestroyPackOpt(PetscSFPackOpt *out);
217*cd620004SJunchao Zhang 
218*cd620004SJunchao Zhang 
219*cd620004SJunchao Zhang /* Get root indices used for pack/unpack
220*cd620004SJunchao Zhang 
221*cd620004SJunchao Zhang Input arguments:
222*cd620004SJunchao Zhang   +sf    - StarForest
223*cd620004SJunchao Zhang   .link  - The link, which provides the stream for the async memcpy (In SF, we make all GPU operations asynchronous to avoid unexpected pipeline stalls)
224*cd620004SJunchao Zhang   .scope - Which part of the indices? (PETSCSF_LOCAL or PETSCSF_REMOTE)
225*cd620004SJunchao Zhang   .mtype - In what type of memory? (PETSC_MEMTYPE_DEVICE or PETSC_MEMTYPE_HOST)
226*cd620004SJunchao Zhang 
227*cd620004SJunchao Zhang  Output arguments:
228*cd620004SJunchao Zhang   +count   - Count of indices
229*cd620004SJunchao Zhang   .start   - The first index (only useful when indices is NULL)
230*cd620004SJunchao Zhang   -indices - indices of roots for pack/unpack. NULL means indices are contiguous
231*cd620004SJunchao Zhang  */
232*cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkGetRootIndices(PetscSF sf,PetscSFLink link,PetscMemType mtype,PetscSFScope scope,PetscInt *count,PetscInt *start,const PetscInt **indices)
233b7c0d12aSJunchao Zhang {
234*cd620004SJunchao Zhang   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
235*cd620004SJunchao Zhang   PetscInt       offset;
236b7c0d12aSJunchao Zhang 
237b7c0d12aSJunchao Zhang   PetscFunctionBegin;
238*cd620004SJunchao Zhang   if (count) *count = bas->rootbuflen[scope];
239*cd620004SJunchao Zhang   if (start) *start = bas->rootstart[scope];
240*cd620004SJunchao Zhang   if (indices) {
241*cd620004SJunchao Zhang     if (bas->rootcontig[scope]) *indices = NULL;
242*cd620004SJunchao Zhang     else {
243*cd620004SJunchao Zhang       offset = (scope == PETSCSF_LOCAL)? 0 : bas->ioffset[bas->ndiranks];
244*cd620004SJunchao Zhang       if (mtype == PETSC_MEMTYPE_HOST) {*indices = bas->irootloc + offset;}
245*cd620004SJunchao Zhang #if defined(PETSC_HAVE_CUDA)
246*cd620004SJunchao Zhang       else {
247*cd620004SJunchao Zhang         if (!bas->irootloc_d[scope]) {
248*cd620004SJunchao Zhang           cudaError_t    cerr;
249*cd620004SJunchao Zhang           size_t         size = bas->rootbuflen[scope]*sizeof(PetscInt);
250*cd620004SJunchao Zhang           cerr = cudaMalloc((void **)&bas->irootloc_d[scope],size);CHKERRCUDA(cerr);
251*cd620004SJunchao Zhang           cerr = cudaMemcpyAsync(bas->irootloc_d[scope],bas->irootloc+offset,size,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr);
252b7c0d12aSJunchao Zhang         }
253*cd620004SJunchao Zhang         *indices = bas->irootloc_d[scope];
254*cd620004SJunchao Zhang       }
255*cd620004SJunchao Zhang #endif
256*cd620004SJunchao Zhang     }
257*cd620004SJunchao Zhang   }
258b7c0d12aSJunchao Zhang   PetscFunctionReturn(0);
259b7c0d12aSJunchao Zhang }
260b7c0d12aSJunchao Zhang 
261*cd620004SJunchao Zhang /* Get leaf indices used for pack/unpack
262*cd620004SJunchao Zhang 
263*cd620004SJunchao Zhang   See also PetscSFLinkGetRootIndices()
264*cd620004SJunchao Zhang  */
265*cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkGetLeafIndices(PetscSF sf,PetscSFLink link,PetscMemType mtype,PetscSFScope scope,PetscInt *count,PetscInt *start,const PetscInt **indices)
266*cd620004SJunchao Zhang {
267*cd620004SJunchao Zhang   PetscInt   offset;
268*cd620004SJunchao Zhang 
269*cd620004SJunchao Zhang   PetscFunctionBegin;
270*cd620004SJunchao Zhang   if (count) *count = sf->leafbuflen[scope];
271*cd620004SJunchao Zhang   if (start) *start = sf->leafstart[scope];
272*cd620004SJunchao Zhang   if (indices) {
273*cd620004SJunchao Zhang     if (sf->leafcontig[scope]) *indices = NULL;
274*cd620004SJunchao Zhang     else {
275*cd620004SJunchao Zhang       offset = (scope == PETSCSF_LOCAL)? 0 : sf->roffset[sf->ndranks];
276*cd620004SJunchao Zhang       if (mtype == PETSC_MEMTYPE_HOST) {*indices = sf->rmine + offset;}
277eb02082bSJunchao Zhang   #if defined(PETSC_HAVE_CUDA)
278*cd620004SJunchao Zhang       else {
279*cd620004SJunchao Zhang         if (!sf->rmine_d[scope]) {
280*cd620004SJunchao Zhang           cudaError_t  cerr;
281*cd620004SJunchao Zhang           size_t       size = sf->leafbuflen[scope]*sizeof(PetscInt);
282*cd620004SJunchao Zhang           cerr = cudaMalloc((void **)&sf->rmine_d[scope],size);CHKERRCUDA(cerr);
283*cd620004SJunchao Zhang           cerr = cudaMemcpyAsync(sf->rmine_d[scope],sf->rmine+offset,size,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr);
284*cd620004SJunchao Zhang         }
285*cd620004SJunchao Zhang         *indices = sf->rmine_d[scope];
286*cd620004SJunchao Zhang       }
287eb02082bSJunchao Zhang   #endif
288*cd620004SJunchao Zhang     }
289*cd620004SJunchao Zhang   }
290*cd620004SJunchao Zhang   PetscFunctionReturn(0);
291*cd620004SJunchao Zhang }
292*cd620004SJunchao Zhang 
293*cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkMPIWaitall(PetscSF sf,PetscSFLink link,PetscSFDirection direction)
294*cd620004SJunchao Zhang {
295*cd620004SJunchao Zhang   PetscErrorCode       ierr;
296*cd620004SJunchao Zhang   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
297*cd620004SJunchao Zhang   const PetscMemType   rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi;
298*cd620004SJunchao Zhang   const PetscInt       rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi;
299*cd620004SJunchao Zhang 
300*cd620004SJunchao Zhang   PetscFunctionBegin;
301*cd620004SJunchao Zhang   ierr = MPI_Waitall(bas->nrootreqs,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi],MPI_STATUSES_IGNORE);CHKERRQ(ierr);
302*cd620004SJunchao Zhang   ierr = MPI_Waitall(sf->nleafreqs, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi],MPI_STATUSES_IGNORE);CHKERRQ(ierr);
303*cd620004SJunchao Zhang   PetscFunctionReturn(0);
304*cd620004SJunchao Zhang }
305*cd620004SJunchao Zhang 
30640e23c03SJunchao Zhang #endif
307