xref: /petsc/src/vec/is/sf/impls/basic/sfpack.h (revision fcc7397d86458e87bf2d3842cdf889c011d18f65)
140e23c03SJunchao Zhang #if !defined(__SFPACK_H)
240e23c03SJunchao Zhang #define __SFPACK_H
340e23c03SJunchao Zhang 
4cd620004SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfbasic.h>
5cd620004SJunchao Zhang 
6cd620004SJunchao Zhang /* We separate SF communications for SFBasic and SFNeighbor in two parts: local (self,intra-rank) and remote (inter-rank) */
7cd620004SJunchao Zhang typedef enum {PETSCSF_LOCAL=0, PETSCSF_REMOTE} PetscSFScope;
840e23c03SJunchao Zhang 
9*fcc7397dSJunchao Zhang /* Optimizations in packing & unpacking for destination ranks.
1040e23c03SJunchao Zhang 
11*fcc7397dSJunchao Zhang   Suppose there are m indices stored in idx[], and two addresses u, p. We want to do packing:
12*fcc7397dSJunchao Zhang      p[i] = u[idx[i]], for i in [0,m)
1340e23c03SJunchao Zhang 
14*fcc7397dSJunchao Zhang   Indices are associated with n ranks and each rank's indices are stored consecutively in idx[].
15*fcc7397dSJunchao Zhang   We go through indices for each rank and see if they are indices of a 3D submatrix of size [dx,dy,dz] in
16*fcc7397dSJunchao Zhang   a parent matrix of size [X,Y,Z], with the submatrix's first index being <start>.
17cd620004SJunchao Zhang 
18*fcc7397dSJunchao Zhang   E.g., for indices 1,2,3, 6,7,8, 11,12,13, the submatrix size is [3,3,1] with start=1, and the parent matrix's size
19*fcc7397dSJunchao Zhang   is [5,3,1]. For simplicity, if any destination rank does not have this pattern, we give up the optimization.
20*fcc7397dSJunchao Zhang 
21*fcc7397dSJunchao Zhang   Note before using this per-rank optimization, one should check leafcontig[], rootcontig[], which say
22*fcc7397dSJunchao Zhang   indices in whole are contiguous, and therefore much more useful than this one when true.
2340e23c03SJunchao Zhang  */
2440e23c03SJunchao Zhang struct _n_PetscSFPackOpt {
25*fcc7397dSJunchao Zhang   PetscInt       *array;      /* [7*n+2] Memory pool for other fields in this struct. Used to easily copy this struct to GPU */
26b23bfdefSJunchao Zhang   PetscInt       n;           /* Number of destination ranks */
27*fcc7397dSJunchao Zhang   PetscInt       *offset;     /* [n+1] Offsets of indices for each rank. offset[0]=0, offset[i+1]=offset[i]+dx[i]*dy[i]*dz[i] */
28*fcc7397dSJunchao Zhang   PetscInt       *start;      /* [n] First index */
29*fcc7397dSJunchao Zhang   PetscInt       *dx,*dy,*dz; /* [n] Lengths of the submatrix in X, Y, Z dimension. */
30*fcc7397dSJunchao Zhang   PetscInt       *X,*Y;       /* [n] Lengths of the outer matrix in X, Y. We do not care Z. */
3140e23c03SJunchao Zhang };
3240e23c03SJunchao Zhang 
33eb02082bSJunchao Zhang /* An abstract class that defines a communication link, which includes how to pack/unpack data and send/recv buffers
3440e23c03SJunchao Zhang  */
35*fcc7397dSJunchao Zhang struct _n_PetscSFLink {
36*fcc7397dSJunchao Zhang   PetscErrorCode (*h_Pack)            (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*);
37*fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndInsert) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
38*fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndAdd)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
39*fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndMin)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
40*fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndMax)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
41*fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndMinloc) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
42*fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndMaxloc) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
43*fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndMult)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
44*fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndLAND)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
45*fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndBAND)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
46*fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndLOR)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
47*fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndBOR)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
48*fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndLXOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
49*fcc7397dSJunchao Zhang   PetscErrorCode (*h_UnpackAndBXOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
50*fcc7397dSJunchao Zhang   PetscErrorCode (*h_FetchAndAdd)     (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,      void*);
51*fcc7397dSJunchao Zhang 
52*fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndInsert)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
53*fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndAdd)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
54*fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndMin)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
55*fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndMax)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
56*fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndMinloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
57*fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndMaxloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
58*fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndMult)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
59*fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndLAND)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
60*fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndBAND)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
61*fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndLOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
62*fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndBOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
63*fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndLXOR)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
64*fcc7397dSJunchao Zhang   PetscErrorCode (*h_ScatterAndBXOR)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
65*fcc7397dSJunchao Zhang 
66*fcc7397dSJunchao Zhang   PetscErrorCode (*h_FetchAndAddLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*);
67cd620004SJunchao Zhang 
68cd620004SJunchao Zhang   PetscBool      deviceinited;        /* Are device related fields initialized? */
69eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA)
70eb02082bSJunchao Zhang   /* These fields are lazily initialized in a sense that only when device pointers are passed to an SF, the SF
71eb02082bSJunchao Zhang      will set them, otherwise it just leaves them alone even though PETSC_HAVE_CUDA. Packing routines using
72eb02082bSJunchao Zhang      regular ops when there are no data race chances.
73eb02082bSJunchao Zhang   */
74*fcc7397dSJunchao Zhang   PetscErrorCode (*d_Pack)            (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*);
75*fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndInsert) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
76*fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndAdd)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
77*fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndMin)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
78*fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndMax)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
79*fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndMinloc) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
80*fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndMaxloc) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
81*fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndMult)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
82*fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndLAND)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
83*fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndBAND)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
84*fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndLOR)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
85*fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndBOR)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
86*fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndLXOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
87*fcc7397dSJunchao Zhang   PetscErrorCode (*d_UnpackAndBXOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
88*fcc7397dSJunchao Zhang   PetscErrorCode (*d_FetchAndAdd)     (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,      void*);
89eb02082bSJunchao Zhang 
90*fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndInsert)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
91*fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndAdd)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
92*fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndMin)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
93*fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndMax)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
94*fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndMinloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
95*fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndMaxloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
96*fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndMult)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
97*fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndLAND)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
98*fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndBAND)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
99*fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndLOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
100*fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndBOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
101*fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndLXOR)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
102*fcc7397dSJunchao Zhang   PetscErrorCode (*d_ScatterAndBXOR)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
103*fcc7397dSJunchao Zhang   PetscErrorCode (*d_FetchAndAddLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*);
104eb02082bSJunchao Zhang 
105eb02082bSJunchao Zhang   /* Packing routines using atomics when there are data race chances */
106*fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndInsert)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
107*fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndAdd)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
108*fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndMin)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
109*fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndMax)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
110*fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndMinloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
111*fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndMaxloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
112*fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndMult)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
113*fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndLAND)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
114*fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndBAND)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
115*fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndLOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
116*fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndBOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
117*fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndLXOR)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
118*fcc7397dSJunchao Zhang   PetscErrorCode (*da_UnpackAndBXOR)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*);
119*fcc7397dSJunchao Zhang   PetscErrorCode (*da_FetchAndAdd)    (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,      void*);
120cd620004SJunchao Zhang 
121*fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndInsert)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
122*fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndAdd)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
123*fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndMin)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
124*fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndMax)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
125*fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndMinloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
126*fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndMaxloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
127*fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndMult)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
128*fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndLAND)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
129*fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndBAND)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
130*fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndLOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
131*fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndBOR)   (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
132*fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndLXOR)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
133*fcc7397dSJunchao Zhang   PetscErrorCode (*da_ScatterAndBXOR)  (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*);
134*fcc7397dSJunchao Zhang   PetscErrorCode (*da_FetchAndAddLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*);
135eb02082bSJunchao Zhang 
136e315309dSJunchao Zhang   PetscInt     maxResidentThreadsPerGPU;     /* It is a copy from SF for convenience */
137eb02082bSJunchao Zhang   cudaStream_t stream;                       /* Stream to launch pack/unapck kernels if not using the default stream */
138eb02082bSJunchao Zhang #endif
139eb02082bSJunchao Zhang   PetscMPIInt  tag;                          /* Each link has a tag so we can perform multiple SF ops at the same time */
140cd620004SJunchao Zhang   MPI_Datatype unit;                         /* The MPI datatype this PetscSFLink is built for */
141eb02082bSJunchao Zhang   MPI_Datatype basicunit;                    /* unit is made of MPI builtin dataype basicunit */
142e07844bfSJunchao Zhang   PetscBool    isbuiltin;                    /* Is unit an MPI/PETSc builtin datatype? If it is true, then bs=1 and basicunit is equivalent to unit */
143eb02082bSJunchao Zhang   size_t       unitbytes;                    /* Number of bytes in a unit */
144eb02082bSJunchao Zhang   PetscInt     bs;                           /* Number of basic units in a unit */
145cd620004SJunchao Zhang   const void   *rootdata,*leafdata;          /* rootdata and leafdata the link is working on. They are used as keys for pending links. */
146cd620004SJunchao Zhang   PetscMemType rootmtype,leafmtype;          /* root/leafdata's memory type */
147cd620004SJunchao Zhang 
148cd620004SJunchao Zhang   /* For local and remote communication */
149cd620004SJunchao Zhang   PetscMemType rootmtype_mpi,leafmtype_mpi;  /* Mtypes of buffers passed to MPI. If use_gpu_aware_mpi, they are same as root/leafmtype. Otherwise they are PETSC_MEMTYPE_HOST */
150cd620004SJunchao Zhang   PetscBool    rootdirect[2],leafdirect[2];  /* Can root/leafdata be directly passed to SF (i.e., without buffering). In layout of [PETSCSF_LOCAL/REMOTE]. See more in PetscSFLinkCreate() */
151cd620004SJunchao Zhang   PetscInt     rootdirect_mpi,leafdirect_mpi;/* Can root/leafdata for remote be directly passed to MPI? 1: yes, 0: no. See more in PetscSFLinkCreate() */
152cd620004SJunchao Zhang   const void   *rootdatadirect[2][2];        /* The root/leafdata used to init root/leaf requests, in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE]. */
153cd620004SJunchao Zhang   const void   *leafdatadirect[2][2];        /* ... We need them to look up links when root/leafdirect_mpi are true */
154cd620004SJunchao Zhang   char         *rootbuf[2][2];               /* Buffers for packed roots, in layout of [PETSCSF_LOCAL/REMOTE][PETSC_MEMTYPE] */
155cd620004SJunchao Zhang   char         *rootbuf_alloc[2][2];         /* Log memory allocated by petsc. We need it since rootbuf[][] may point to rootdata given by user */
156cd620004SJunchao Zhang   char         *leafbuf[2][2];               /* Buffers for packed leaves, in layout of [PETSCSF_LOCAL/REMOTE][PETSC_MEMTYPE] */
157cd620004SJunchao Zhang   char         *leafbuf_alloc[2][2];
158cd620004SJunchao Zhang   MPI_Request  *rootreqs[2][2][2];           /* Root requests in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][rootdirect_mpi] */
159cd620004SJunchao Zhang   MPI_Request  *leafreqs[2][2][2];           /* Leaf requests in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][leafdirect_mpi] */
160cd620004SJunchao Zhang   PetscBool    rootreqsinited[2][2][2];      /* Are root requests initialized? Also in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][rootdirect_mpi]*/
161cd620004SJunchao Zhang   PetscBool    leafreqsinited[2][2][2];      /* Are leaf requests initialized? Also in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][leafdirect_mpi]*/
162cd620004SJunchao Zhang   MPI_Request  *reqs;                        /* An array of length (nrootreqs+nleafreqs)*8. Pointers in rootreqs[][][] and leafreqs[][][] point here */
163cd620004SJunchao Zhang   PetscSFLink  next;
16440e23c03SJunchao Zhang };
16540e23c03SJunchao Zhang 
166cd620004SJunchao Zhang #if defined(PETSC_USE_DEBUG)
167cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF,MPI_Datatype,const void*,const void*);
168cd620004SJunchao Zhang #else
169cd620004SJunchao Zhang #define PetscSFSetErrorOnUnsupportedOverlap(a,b,c,d) 0
170cd620004SJunchao Zhang #endif
171b7c0d12aSJunchao Zhang 
172cd620004SJunchao Zhang /* Create/setup/retrieve/destroy a link */
173cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkCreate(PetscSF,MPI_Datatype,PetscMemType,const void*,PetscMemType,const void*,MPI_Op,PetscSFOperation,PetscSFLink*);
174cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_Host(PetscSF,PetscSFLink,MPI_Datatype);
175cd620004SJunchao Zhang #if defined(PETSC_HAVE_CUDA)
176cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_Device(PetscSF,PetscSFLink,MPI_Datatype);
177cd620004SJunchao Zhang #else
178cd620004SJunchao Zhang #define PetscSFLinkSetUp_Device(a,b,c) 0
179cd620004SJunchao Zhang #endif
180cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetInUse(PetscSF,MPI_Datatype,const void*,const void*,PetscCopyMode,PetscSFLink*);
181cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkReclaim(PetscSF,PetscSFLink*);
182cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkDestroy(PetscSF,PetscSFLink*);
183cd620004SJunchao Zhang 
184cd620004SJunchao Zhang /* Get pack/unpack function pointers from a link */
185*fcc7397dSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkGetPack(PetscSFLink link,PetscMemType mtype,PetscErrorCode (**Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*))
186eb02082bSJunchao Zhang {
187eb02082bSJunchao Zhang   PetscFunctionBegin;
188eb02082bSJunchao Zhang   if (mtype == PETSC_MEMTYPE_HOST) *Pack = link->h_Pack;
189eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA)
190cd620004SJunchao Zhang   else *Pack = link->d_Pack;
191eb02082bSJunchao Zhang #endif
192eb02082bSJunchao Zhang   PetscFunctionReturn(0);
193eb02082bSJunchao Zhang }
194*fcc7397dSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*));
195*fcc7397dSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetFetchAndOp (PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**FetchAndOp) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*));
196*fcc7397dSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*));
197*fcc7397dSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*));
198cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF,PetscSFLink,PetscSFDirection,void**,void**,MPI_Request**,MPI_Request**);
199b7c0d12aSJunchao Zhang 
200cd620004SJunchao Zhang /* Do Pack/Unpack/Fetch/Scatter with the link */
201cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkPackRootData  (PetscSF,PetscSFLink,PetscSFScope,const void*);
202cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkPackLeafData  (PetscSF,PetscSFLink,PetscSFScope,const void*);
203cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkUnpackRootData(PetscSF,PetscSFLink,PetscSFScope,void*,MPI_Op);
204cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF,PetscSFLink,PetscSFScope,void*,MPI_Op);
205cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkFetchRootData (PetscSF,PetscSFLink,PetscSFScope,void*,MPI_Op);
206cd620004SJunchao Zhang 
207cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkBcastAndOpLocal(PetscSF,PetscSFLink,const void*,void*,MPI_Op);
208cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkReduceLocal(PetscSF,PetscSFLink,const void*,void*,MPI_Op);
209cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF,PetscSFLink,void*,const void*,void*,MPI_Op);
210cd620004SJunchao Zhang 
211cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFSetUpPackFields(PetscSF sf);
212cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFResetPackFields(PetscSF sf);
213cd620004SJunchao Zhang 
214cd620004SJunchao Zhang /* Get root indices used for pack/unpack
215cd620004SJunchao Zhang 
216cd620004SJunchao Zhang Input arguments:
217cd620004SJunchao Zhang   +sf    - StarForest
218cd620004SJunchao Zhang   .link  - The link, which provides the stream for the async memcpy (In SF, we make all GPU operations asynchronous to avoid unexpected pipeline stalls)
219cd620004SJunchao Zhang   .scope - Which part of the indices? (PETSCSF_LOCAL or PETSCSF_REMOTE)
220cd620004SJunchao Zhang   .mtype - In what type of memory? (PETSC_MEMTYPE_DEVICE or PETSC_MEMTYPE_HOST)
221cd620004SJunchao Zhang 
222cd620004SJunchao Zhang  Output arguments:
223cd620004SJunchao Zhang   +count   - Count of indices
224cd620004SJunchao Zhang   .start   - The first index (only useful when indices is NULL)
225cd620004SJunchao Zhang   -indices - indices of roots for pack/unpack. NULL means indices are contiguous
226cd620004SJunchao Zhang  */
227*fcc7397dSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkGetRootPackOptAndIndices(PetscSF sf,PetscSFLink link,PetscMemType mtype,PetscSFScope scope,PetscInt *count,PetscInt *start,PetscSFPackOpt *opt,const PetscInt **indices)
228b7c0d12aSJunchao Zhang {
229cd620004SJunchao Zhang   PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
230cd620004SJunchao Zhang   PetscInt       offset;
231b7c0d12aSJunchao Zhang 
232b7c0d12aSJunchao Zhang   PetscFunctionBegin;
233*fcc7397dSJunchao Zhang   *count   = bas->rootbuflen[scope];
234*fcc7397dSJunchao Zhang   *start   = bas->rootstart[scope];
235*fcc7397dSJunchao Zhang   *opt     = NULL;
236*fcc7397dSJunchao Zhang   *indices = NULL;
237*fcc7397dSJunchao Zhang 
238*fcc7397dSJunchao Zhang   /* We have these rules:
239*fcc7397dSJunchao Zhang     1) opt == NULL && indices == NULL ==> indices are contiguous.
240*fcc7397dSJunchao Zhang     2) opt != NULL ==> indices are in 3D but not contiguous. On host, indices != NULL since indices are already available and we do not
241*fcc7397dSJunchao Zhang        want to enforce all operations to use opt; but on device, indices = NULL since we do not want to copy indices to device.
242*fcc7397dSJunchao Zhang   */
243*fcc7397dSJunchao Zhang   if (!bas->rootcontig[scope]) {
244cd620004SJunchao Zhang     offset = (scope == PETSCSF_LOCAL)? 0 : bas->ioffset[bas->ndiranks];
245*fcc7397dSJunchao Zhang     if (mtype == PETSC_MEMTYPE_HOST) {*opt = bas->rootpackopt[scope]; *indices = bas->irootloc + offset;}
246cd620004SJunchao Zhang #if defined(PETSC_HAVE_CUDA)
247cd620004SJunchao Zhang     else {
248*fcc7397dSJunchao Zhang       PetscErrorCode ierr;
249cd620004SJunchao Zhang       cudaError_t    cerr;
250*fcc7397dSJunchao Zhang       size_t         size;
251*fcc7397dSJunchao Zhang       if (bas->rootpackopt[scope]) {
252*fcc7397dSJunchao Zhang         if (!bas->rootpackopt_d[scope]) {
253*fcc7397dSJunchao Zhang           ierr = PetscMalloc1(1,&bas->rootpackopt_d[scope]);CHKERRQ(ierr);
254*fcc7397dSJunchao Zhang           ierr = PetscArraycpy(bas->rootpackopt_d[scope],bas->rootpackopt[scope],1);CHKERRQ(ierr); /* Make pointers in bas->rootpackopt_d[] still work on host */
255*fcc7397dSJunchao Zhang           size = (bas->rootpackopt[scope]->n*7+2)*sizeof(PetscInt); /* See comments at struct _n_PetscSFPackOpt*/
256*fcc7397dSJunchao Zhang           cerr = cudaMalloc((void **)&bas->rootpackopt_d[scope]->array,size);CHKERRCUDA(cerr);
257*fcc7397dSJunchao Zhang           cerr = cudaMemcpyAsync(bas->rootpackopt_d[scope]->array,bas->rootpackopt[scope]->array,size,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr);
258*fcc7397dSJunchao Zhang         }
259*fcc7397dSJunchao Zhang         *opt = bas->rootpackopt_d[scope];
260*fcc7397dSJunchao Zhang       } else { /* On device, we only provide indices when there is no optimization. We're reluctant to copy indices to device. */
261*fcc7397dSJunchao Zhang         if (!bas->irootloc_d[scope]) {
262*fcc7397dSJunchao Zhang           size = bas->rootbuflen[scope]*sizeof(PetscInt);
263cd620004SJunchao Zhang           cerr = cudaMalloc((void **)&bas->irootloc_d[scope],size);CHKERRCUDA(cerr);
264cd620004SJunchao Zhang           cerr = cudaMemcpyAsync(bas->irootloc_d[scope],bas->irootloc+offset,size,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr);
265b7c0d12aSJunchao Zhang         }
266cd620004SJunchao Zhang         *indices = bas->irootloc_d[scope];
267cd620004SJunchao Zhang       }
268cd620004SJunchao Zhang     }
269*fcc7397dSJunchao Zhang #endif
270cd620004SJunchao Zhang   }
271b7c0d12aSJunchao Zhang   PetscFunctionReturn(0);
272b7c0d12aSJunchao Zhang }
273b7c0d12aSJunchao Zhang 
274cd620004SJunchao Zhang /* Get leaf indices used for pack/unpack
275cd620004SJunchao Zhang 
276*fcc7397dSJunchao Zhang   See also PetscSFLinkGetRootPackOptAndIndices()
277cd620004SJunchao Zhang  */
278*fcc7397dSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkGetLeafPackOptAndIndices(PetscSF sf,PetscSFLink link,PetscMemType mtype,PetscSFScope scope,PetscInt *count,PetscInt *start,PetscSFPackOpt *opt,const PetscInt **indices)
279cd620004SJunchao Zhang {
280cd620004SJunchao Zhang   PetscInt   offset;
281cd620004SJunchao Zhang 
282cd620004SJunchao Zhang   PetscFunctionBegin;
283*fcc7397dSJunchao Zhang   *count   = sf->leafbuflen[scope];
284*fcc7397dSJunchao Zhang   *start   = sf->leafstart[scope];
285*fcc7397dSJunchao Zhang   *opt     = NULL;
286*fcc7397dSJunchao Zhang   *indices = NULL;
287*fcc7397dSJunchao Zhang   if (!sf->leafcontig[scope]) {
288cd620004SJunchao Zhang     offset = (scope == PETSCSF_LOCAL)? 0 : sf->roffset[sf->ndranks];
289*fcc7397dSJunchao Zhang     if (mtype == PETSC_MEMTYPE_HOST) {*opt = sf->leafpackopt[scope]; *indices = sf->rmine + offset;}
290eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA)
291cd620004SJunchao Zhang     else {
292*fcc7397dSJunchao Zhang       PetscErrorCode ierr;
293cd620004SJunchao Zhang       cudaError_t    cerr;
294*fcc7397dSJunchao Zhang       size_t         size;
295*fcc7397dSJunchao Zhang       if (sf->leafpackopt[scope]) {
296*fcc7397dSJunchao Zhang         if (!sf->leafpackopt_d[scope]) {
297*fcc7397dSJunchao Zhang           ierr = PetscMalloc1(1,&sf->leafpackopt_d[scope]);CHKERRQ(ierr);
298*fcc7397dSJunchao Zhang           ierr = PetscArraycpy(sf->leafpackopt_d[scope],sf->leafpackopt[scope],1);CHKERRQ(ierr);
299*fcc7397dSJunchao Zhang           size = (sf->leafpackopt[scope]->n*7+2)*sizeof(PetscInt); /* See comments at struct _n_PetscSFPackOpt*/
300*fcc7397dSJunchao Zhang           cerr = cudaMalloc((void **)&sf->leafpackopt_d[scope]->array,size);CHKERRCUDA(cerr); /* Change ->array to a device pointer */
301*fcc7397dSJunchao Zhang           cerr = cudaMemcpyAsync(sf->leafpackopt_d[scope]->array,sf->leafpackopt[scope]->array,size,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr);
302*fcc7397dSJunchao Zhang         }
303*fcc7397dSJunchao Zhang         *opt = sf->leafpackopt_d[scope];
304*fcc7397dSJunchao Zhang       } else {
305*fcc7397dSJunchao Zhang         if (!sf->rmine_d[scope]) {
306*fcc7397dSJunchao Zhang           size = sf->leafbuflen[scope]*sizeof(PetscInt);
307cd620004SJunchao Zhang           cerr = cudaMalloc((void **)&sf->rmine_d[scope],size);CHKERRCUDA(cerr);
308cd620004SJunchao Zhang           cerr = cudaMemcpyAsync(sf->rmine_d[scope],sf->rmine+offset,size,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr);
309cd620004SJunchao Zhang         }
310cd620004SJunchao Zhang         *indices = sf->rmine_d[scope];
311cd620004SJunchao Zhang       }
312cd620004SJunchao Zhang     }
313*fcc7397dSJunchao Zhang #endif
314cd620004SJunchao Zhang   }
315cd620004SJunchao Zhang   PetscFunctionReturn(0);
316cd620004SJunchao Zhang }
317cd620004SJunchao Zhang 
318cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkMPIWaitall(PetscSF sf,PetscSFLink link,PetscSFDirection direction)
319cd620004SJunchao Zhang {
320cd620004SJunchao Zhang   PetscErrorCode       ierr;
321cd620004SJunchao Zhang   PetscSF_Basic        *bas = (PetscSF_Basic*)sf->data;
322cd620004SJunchao Zhang   const PetscMemType   rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi;
323cd620004SJunchao Zhang   const PetscInt       rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi;
324cd620004SJunchao Zhang 
325cd620004SJunchao Zhang   PetscFunctionBegin;
326cd620004SJunchao Zhang   ierr = MPI_Waitall(bas->nrootreqs,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi],MPI_STATUSES_IGNORE);CHKERRQ(ierr);
327cd620004SJunchao Zhang   ierr = MPI_Waitall(sf->nleafreqs, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi],MPI_STATUSES_IGNORE);CHKERRQ(ierr);
328cd620004SJunchao Zhang   PetscFunctionReturn(0);
329cd620004SJunchao Zhang }
330cd620004SJunchao Zhang 
331f01131f0SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkMemcpy(PetscSF sf,PetscSFLink link,PetscMemType dstmtype,void* dst,PetscMemType srcmtype,const void*src,size_t n)
332f01131f0SJunchao Zhang {
333f01131f0SJunchao Zhang   PetscFunctionBegin;
334f01131f0SJunchao Zhang   if (n) {
335f01131f0SJunchao Zhang     if (dstmtype == PETSC_MEMTYPE_HOST && srcmtype == PETSC_MEMTYPE_HOST) {PetscErrorCode ierr = PetscMemcpy(dst,src,n);CHKERRQ(ierr);}
336f01131f0SJunchao Zhang #if defined(PETSC_HAVE_CUDA)
337f01131f0SJunchao Zhang     else if (dstmtype == PETSC_MEMTYPE_DEVICE && srcmtype == PETSC_MEMTYPE_HOST)   {
338f01131f0SJunchao Zhang       cudaError_t    err  = cudaMemcpyAsync(dst,src,n,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(err);
339f01131f0SJunchao Zhang       PetscErrorCode ierr = PetscLogCpuToGpu(n);CHKERRQ(ierr);
340f01131f0SJunchao Zhang     } else if (dstmtype == PETSC_MEMTYPE_HOST && srcmtype == PETSC_MEMTYPE_DEVICE) {
341f01131f0SJunchao Zhang       cudaError_t    err  = cudaMemcpyAsync(dst,src,n,cudaMemcpyDeviceToHost,link->stream);CHKERRCUDA(err);
342f01131f0SJunchao Zhang       PetscErrorCode ierr = PetscLogGpuToCpu(n);CHKERRQ(ierr);
343f01131f0SJunchao Zhang     } else if (dstmtype == PETSC_MEMTYPE_DEVICE && srcmtype == PETSC_MEMTYPE_DEVICE) {cudaError_t err = cudaMemcpyAsync(dst,src,n,cudaMemcpyDeviceToDevice,link->stream);CHKERRCUDA(err);}
344f01131f0SJunchao Zhang #endif
345f01131f0SJunchao Zhang     else SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType for dst %d and src %d",(int)dstmtype,(int)srcmtype);
346f01131f0SJunchao Zhang   }
347f01131f0SJunchao Zhang   PetscFunctionReturn(0);
348f01131f0SJunchao Zhang }
34940e23c03SJunchao Zhang #endif
350