140e23c03SJunchao Zhang #if !defined(__SFPACK_H) 240e23c03SJunchao Zhang #define __SFPACK_H 340e23c03SJunchao Zhang 4cd620004SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfbasic.h> 5*7fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_CUDA) 6*7fd2d3dbSJunchao Zhang #include <cuda_runtime.h> /* For cudaStream_t */ 7*7fd2d3dbSJunchao Zhang #include <petsccublas.h> /* For CHKERRCUDA */ 8*7fd2d3dbSJunchao Zhang #endif 9*7fd2d3dbSJunchao Zhang 10*7fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_HIP) 11*7fd2d3dbSJunchao Zhang #include <hip/hip_runtime.h> /* For hipStream_t */ 12*7fd2d3dbSJunchao Zhang #endif 13cd620004SJunchao Zhang 14cd620004SJunchao Zhang /* We separate SF communications for SFBasic and SFNeighbor in two parts: local (self,intra-rank) and remote (inter-rank) */ 15cd620004SJunchao Zhang typedef enum {PETSCSF_LOCAL=0, PETSCSF_REMOTE} PetscSFScope; 1640e23c03SJunchao Zhang 17fcc7397dSJunchao Zhang /* Optimizations in packing & unpacking for destination ranks. 1840e23c03SJunchao Zhang 19fcc7397dSJunchao Zhang Suppose there are m indices stored in idx[], and two addresses u, p. We want to do packing: 20fcc7397dSJunchao Zhang p[i] = u[idx[i]], for i in [0,m) 2140e23c03SJunchao Zhang 22fcc7397dSJunchao Zhang Indices are associated with n ranks and each rank's indices are stored consecutively in idx[]. 23fcc7397dSJunchao Zhang We go through indices for each rank and see if they are indices of a 3D submatrix of size [dx,dy,dz] in 24fcc7397dSJunchao Zhang a parent matrix of size [X,Y,Z], with the submatrix's first index being <start>. 25cd620004SJunchao Zhang 26fcc7397dSJunchao Zhang E.g., for indices 1,2,3, 6,7,8, 11,12,13, the submatrix size is [3,3,1] with start=1, and the parent matrix's size 27fcc7397dSJunchao Zhang is [5,3,1]. For simplicity, if any destination rank does not have this pattern, we give up the optimization. 28fcc7397dSJunchao Zhang 29fcc7397dSJunchao Zhang Note before using this per-rank optimization, one should check leafcontig[], rootcontig[], which say 30fcc7397dSJunchao Zhang indices in whole are contiguous, and therefore much more useful than this one when true. 3140e23c03SJunchao Zhang */ 3240e23c03SJunchao Zhang struct _n_PetscSFPackOpt { 33fcc7397dSJunchao Zhang PetscInt *array; /* [7*n+2] Memory pool for other fields in this struct. Used to easily copy this struct to GPU */ 34b23bfdefSJunchao Zhang PetscInt n; /* Number of destination ranks */ 35fcc7397dSJunchao Zhang PetscInt *offset; /* [n+1] Offsets of indices for each rank. offset[0]=0, offset[i+1]=offset[i]+dx[i]*dy[i]*dz[i] */ 36fcc7397dSJunchao Zhang PetscInt *start; /* [n] First index */ 37fcc7397dSJunchao Zhang PetscInt *dx,*dy,*dz; /* [n] Lengths of the submatrix in X, Y, Z dimension. */ 38fcc7397dSJunchao Zhang PetscInt *X,*Y; /* [n] Lengths of the outer matrix in X, Y. We do not care Z. */ 3940e23c03SJunchao Zhang }; 4040e23c03SJunchao Zhang 41eb02082bSJunchao Zhang /* An abstract class that defines a communication link, which includes how to pack/unpack data and send/recv buffers 4240e23c03SJunchao Zhang */ 43fcc7397dSJunchao Zhang struct _n_PetscSFLink { 44fcc7397dSJunchao Zhang PetscErrorCode (*h_Pack) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*); 45fcc7397dSJunchao Zhang PetscErrorCode (*h_UnpackAndInsert) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 46fcc7397dSJunchao Zhang PetscErrorCode (*h_UnpackAndAdd) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 47fcc7397dSJunchao Zhang PetscErrorCode (*h_UnpackAndMin) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 48fcc7397dSJunchao Zhang PetscErrorCode (*h_UnpackAndMax) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 49fcc7397dSJunchao Zhang PetscErrorCode (*h_UnpackAndMinloc) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 50fcc7397dSJunchao Zhang PetscErrorCode (*h_UnpackAndMaxloc) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 51fcc7397dSJunchao Zhang PetscErrorCode (*h_UnpackAndMult) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 52fcc7397dSJunchao Zhang PetscErrorCode (*h_UnpackAndLAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 53fcc7397dSJunchao Zhang PetscErrorCode (*h_UnpackAndBAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 54fcc7397dSJunchao Zhang PetscErrorCode (*h_UnpackAndLOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 55fcc7397dSJunchao Zhang PetscErrorCode (*h_UnpackAndBOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 56fcc7397dSJunchao Zhang PetscErrorCode (*h_UnpackAndLXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 57fcc7397dSJunchao Zhang PetscErrorCode (*h_UnpackAndBXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 58fcc7397dSJunchao Zhang PetscErrorCode (*h_FetchAndAdd) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*, void*); 59fcc7397dSJunchao Zhang 60fcc7397dSJunchao Zhang PetscErrorCode (*h_ScatterAndInsert)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 61fcc7397dSJunchao Zhang PetscErrorCode (*h_ScatterAndAdd) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 62fcc7397dSJunchao Zhang PetscErrorCode (*h_ScatterAndMin) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 63fcc7397dSJunchao Zhang PetscErrorCode (*h_ScatterAndMax) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 64fcc7397dSJunchao Zhang PetscErrorCode (*h_ScatterAndMinloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 65fcc7397dSJunchao Zhang PetscErrorCode (*h_ScatterAndMaxloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 66fcc7397dSJunchao Zhang PetscErrorCode (*h_ScatterAndMult) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 67fcc7397dSJunchao Zhang PetscErrorCode (*h_ScatterAndLAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 68fcc7397dSJunchao Zhang PetscErrorCode (*h_ScatterAndBAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 69fcc7397dSJunchao Zhang PetscErrorCode (*h_ScatterAndLOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 70fcc7397dSJunchao Zhang PetscErrorCode (*h_ScatterAndBOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 71fcc7397dSJunchao Zhang PetscErrorCode (*h_ScatterAndLXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 72fcc7397dSJunchao Zhang PetscErrorCode (*h_ScatterAndBXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 73fcc7397dSJunchao Zhang 74fcc7397dSJunchao Zhang PetscErrorCode (*h_FetchAndAddLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*); 75cd620004SJunchao Zhang 76cd620004SJunchao Zhang PetscBool deviceinited; /* Are device related fields initialized? */ 77*7fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE) 78eb02082bSJunchao Zhang /* These fields are lazily initialized in a sense that only when device pointers are passed to an SF, the SF 79*7fd2d3dbSJunchao Zhang will set them, otherwise it just leaves them alone. Packing routines using regular ops when there are no data race chances. 80eb02082bSJunchao Zhang */ 81fcc7397dSJunchao Zhang PetscErrorCode (*d_Pack) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*); 82fcc7397dSJunchao Zhang PetscErrorCode (*d_UnpackAndInsert) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 83fcc7397dSJunchao Zhang PetscErrorCode (*d_UnpackAndAdd) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 84fcc7397dSJunchao Zhang PetscErrorCode (*d_UnpackAndMin) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 85fcc7397dSJunchao Zhang PetscErrorCode (*d_UnpackAndMax) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 86fcc7397dSJunchao Zhang PetscErrorCode (*d_UnpackAndMinloc) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 87fcc7397dSJunchao Zhang PetscErrorCode (*d_UnpackAndMaxloc) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 88fcc7397dSJunchao Zhang PetscErrorCode (*d_UnpackAndMult) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 89fcc7397dSJunchao Zhang PetscErrorCode (*d_UnpackAndLAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 90fcc7397dSJunchao Zhang PetscErrorCode (*d_UnpackAndBAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 91fcc7397dSJunchao Zhang PetscErrorCode (*d_UnpackAndLOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 92fcc7397dSJunchao Zhang PetscErrorCode (*d_UnpackAndBOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 93fcc7397dSJunchao Zhang PetscErrorCode (*d_UnpackAndLXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 94fcc7397dSJunchao Zhang PetscErrorCode (*d_UnpackAndBXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 95fcc7397dSJunchao Zhang PetscErrorCode (*d_FetchAndAdd) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*, void*); 96eb02082bSJunchao Zhang 97fcc7397dSJunchao Zhang PetscErrorCode (*d_ScatterAndInsert)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 98fcc7397dSJunchao Zhang PetscErrorCode (*d_ScatterAndAdd) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 99fcc7397dSJunchao Zhang PetscErrorCode (*d_ScatterAndMin) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 100fcc7397dSJunchao Zhang PetscErrorCode (*d_ScatterAndMax) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 101fcc7397dSJunchao Zhang PetscErrorCode (*d_ScatterAndMinloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 102fcc7397dSJunchao Zhang PetscErrorCode (*d_ScatterAndMaxloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 103fcc7397dSJunchao Zhang PetscErrorCode (*d_ScatterAndMult) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 104fcc7397dSJunchao Zhang PetscErrorCode (*d_ScatterAndLAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 105fcc7397dSJunchao Zhang PetscErrorCode (*d_ScatterAndBAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 106fcc7397dSJunchao Zhang PetscErrorCode (*d_ScatterAndLOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 107fcc7397dSJunchao Zhang PetscErrorCode (*d_ScatterAndBOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 108fcc7397dSJunchao Zhang PetscErrorCode (*d_ScatterAndLXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 109fcc7397dSJunchao Zhang PetscErrorCode (*d_ScatterAndBXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 110fcc7397dSJunchao Zhang PetscErrorCode (*d_FetchAndAddLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*); 111eb02082bSJunchao Zhang 112eb02082bSJunchao Zhang /* Packing routines using atomics when there are data race chances */ 113fcc7397dSJunchao Zhang PetscErrorCode (*da_UnpackAndInsert)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 114fcc7397dSJunchao Zhang PetscErrorCode (*da_UnpackAndAdd) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 115fcc7397dSJunchao Zhang PetscErrorCode (*da_UnpackAndMin) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 116fcc7397dSJunchao Zhang PetscErrorCode (*da_UnpackAndMax) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 117fcc7397dSJunchao Zhang PetscErrorCode (*da_UnpackAndMinloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 118fcc7397dSJunchao Zhang PetscErrorCode (*da_UnpackAndMaxloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 119fcc7397dSJunchao Zhang PetscErrorCode (*da_UnpackAndMult) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 120fcc7397dSJunchao Zhang PetscErrorCode (*da_UnpackAndLAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 121fcc7397dSJunchao Zhang PetscErrorCode (*da_UnpackAndBAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 122fcc7397dSJunchao Zhang PetscErrorCode (*da_UnpackAndLOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 123fcc7397dSJunchao Zhang PetscErrorCode (*da_UnpackAndBOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 124fcc7397dSJunchao Zhang PetscErrorCode (*da_UnpackAndLXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 125fcc7397dSJunchao Zhang PetscErrorCode (*da_UnpackAndBXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 126fcc7397dSJunchao Zhang PetscErrorCode (*da_FetchAndAdd) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*, void*); 127cd620004SJunchao Zhang 128fcc7397dSJunchao Zhang PetscErrorCode (*da_ScatterAndInsert)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 129fcc7397dSJunchao Zhang PetscErrorCode (*da_ScatterAndAdd) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 130fcc7397dSJunchao Zhang PetscErrorCode (*da_ScatterAndMin) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 131fcc7397dSJunchao Zhang PetscErrorCode (*da_ScatterAndMax) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 132fcc7397dSJunchao Zhang PetscErrorCode (*da_ScatterAndMinloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 133fcc7397dSJunchao Zhang PetscErrorCode (*da_ScatterAndMaxloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 134fcc7397dSJunchao Zhang PetscErrorCode (*da_ScatterAndMult) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 135fcc7397dSJunchao Zhang PetscErrorCode (*da_ScatterAndLAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 136fcc7397dSJunchao Zhang PetscErrorCode (*da_ScatterAndBAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 137fcc7397dSJunchao Zhang PetscErrorCode (*da_ScatterAndLOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 138fcc7397dSJunchao Zhang PetscErrorCode (*da_ScatterAndBOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 139fcc7397dSJunchao Zhang PetscErrorCode (*da_ScatterAndLXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 140fcc7397dSJunchao Zhang PetscErrorCode (*da_ScatterAndBXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 141fcc7397dSJunchao Zhang PetscErrorCode (*da_FetchAndAddLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*); 142*7fd2d3dbSJunchao Zhang #if defined (PETSC_HAVE_CUDA) 143e315309dSJunchao Zhang PetscInt maxResidentThreadsPerGPU; /* It is a copy from SF for convenience */ 144eb02082bSJunchao Zhang cudaStream_t stream; /* Stream to launch pack/unapck kernels if not using the default stream */ 145*7fd2d3dbSJunchao Zhang #elif defined (PETSC_HAVE_HIP) 146*7fd2d3dbSJunchao Zhang hipStream_t stream; 147eb02082bSJunchao Zhang #endif 148*7fd2d3dbSJunchao Zhang 149*7fd2d3dbSJunchao Zhang PetscErrorCode (*Destroy)(PetscSFLink); /* Device specific destroy function */ 150*7fd2d3dbSJunchao Zhang void *sptr; 151*7fd2d3dbSJunchao Zhang #endif 152*7fd2d3dbSJunchao Zhang 153eb02082bSJunchao Zhang PetscMPIInt tag; /* Each link has a tag so we can perform multiple SF ops at the same time */ 154cd620004SJunchao Zhang MPI_Datatype unit; /* The MPI datatype this PetscSFLink is built for */ 155eb02082bSJunchao Zhang MPI_Datatype basicunit; /* unit is made of MPI builtin dataype basicunit */ 156e07844bfSJunchao Zhang PetscBool isbuiltin; /* Is unit an MPI/PETSc builtin datatype? If it is true, then bs=1 and basicunit is equivalent to unit */ 157eb02082bSJunchao Zhang size_t unitbytes; /* Number of bytes in a unit */ 158eb02082bSJunchao Zhang PetscInt bs; /* Number of basic units in a unit */ 159cd620004SJunchao Zhang const void *rootdata,*leafdata; /* rootdata and leafdata the link is working on. They are used as keys for pending links. */ 160cd620004SJunchao Zhang PetscMemType rootmtype,leafmtype; /* root/leafdata's memory type */ 161cd620004SJunchao Zhang 162cd620004SJunchao Zhang /* For local and remote communication */ 163cd620004SJunchao Zhang PetscMemType rootmtype_mpi,leafmtype_mpi; /* Mtypes of buffers passed to MPI. If use_gpu_aware_mpi, they are same as root/leafmtype. Otherwise they are PETSC_MEMTYPE_HOST */ 164cd620004SJunchao Zhang PetscBool rootdirect[2],leafdirect[2]; /* Can root/leafdata be directly passed to SF (i.e., without buffering). In layout of [PETSCSF_LOCAL/REMOTE]. See more in PetscSFLinkCreate() */ 165cd620004SJunchao Zhang PetscInt rootdirect_mpi,leafdirect_mpi;/* Can root/leafdata for remote be directly passed to MPI? 1: yes, 0: no. See more in PetscSFLinkCreate() */ 166cd620004SJunchao Zhang const void *rootdatadirect[2][2]; /* The root/leafdata used to init root/leaf requests, in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE]. */ 167cd620004SJunchao Zhang const void *leafdatadirect[2][2]; /* ... We need them to look up links when root/leafdirect_mpi are true */ 168cd620004SJunchao Zhang char *rootbuf[2][2]; /* Buffers for packed roots, in layout of [PETSCSF_LOCAL/REMOTE][PETSC_MEMTYPE] */ 169cd620004SJunchao Zhang char *rootbuf_alloc[2][2]; /* Log memory allocated by petsc. We need it since rootbuf[][] may point to rootdata given by user */ 170cd620004SJunchao Zhang char *leafbuf[2][2]; /* Buffers for packed leaves, in layout of [PETSCSF_LOCAL/REMOTE][PETSC_MEMTYPE] */ 171cd620004SJunchao Zhang char *leafbuf_alloc[2][2]; 172cd620004SJunchao Zhang MPI_Request *rootreqs[2][2][2]; /* Root requests in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][rootdirect_mpi] */ 173cd620004SJunchao Zhang MPI_Request *leafreqs[2][2][2]; /* Leaf requests in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][leafdirect_mpi] */ 174cd620004SJunchao Zhang PetscBool rootreqsinited[2][2][2]; /* Are root requests initialized? Also in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][rootdirect_mpi]*/ 175cd620004SJunchao Zhang PetscBool leafreqsinited[2][2][2]; /* Are leaf requests initialized? Also in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][leafdirect_mpi]*/ 176cd620004SJunchao Zhang MPI_Request *reqs; /* An array of length (nrootreqs+nleafreqs)*8. Pointers in rootreqs[][][] and leafreqs[][][] point here */ 177cd620004SJunchao Zhang PetscSFLink next; 17840e23c03SJunchao Zhang }; 17940e23c03SJunchao Zhang 180cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF,MPI_Datatype,const void*,const void*); 181b7c0d12aSJunchao Zhang 182cd620004SJunchao Zhang /* Create/setup/retrieve/destroy a link */ 183cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkCreate(PetscSF,MPI_Datatype,PetscMemType,const void*,PetscMemType,const void*,MPI_Op,PetscSFOperation,PetscSFLink*); 184cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_Host(PetscSF,PetscSFLink,MPI_Datatype); 185cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetInUse(PetscSF,MPI_Datatype,const void*,const void*,PetscCopyMode,PetscSFLink*); 186cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkReclaim(PetscSF,PetscSFLink*); 187cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkDestroy(PetscSF,PetscSFLink*); 188cd620004SJunchao Zhang 189cd620004SJunchao Zhang /* Get pack/unpack function pointers from a link */ 190fcc7397dSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkGetPack(PetscSFLink link,PetscMemType mtype,PetscErrorCode (**Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*)) 191eb02082bSJunchao Zhang { 192eb02082bSJunchao Zhang PetscFunctionBegin; 193eb02082bSJunchao Zhang if (mtype == PETSC_MEMTYPE_HOST) *Pack = link->h_Pack; 194*7fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE) 195cd620004SJunchao Zhang else *Pack = link->d_Pack; 196eb02082bSJunchao Zhang #endif 197eb02082bSJunchao Zhang PetscFunctionReturn(0); 198eb02082bSJunchao Zhang } 199*7fd2d3dbSJunchao Zhang 200*7fd2d3dbSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkMPIWaitall(PetscSF sf,PetscSFLink link,PetscSFDirection direction) 201*7fd2d3dbSJunchao Zhang { 202*7fd2d3dbSJunchao Zhang PetscErrorCode ierr; 203*7fd2d3dbSJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 204*7fd2d3dbSJunchao Zhang const PetscMemType rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; 205*7fd2d3dbSJunchao Zhang const PetscInt rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi; 206*7fd2d3dbSJunchao Zhang 207*7fd2d3dbSJunchao Zhang PetscFunctionBegin; 208*7fd2d3dbSJunchao Zhang ierr = MPI_Waitall(bas->nrootreqs,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi],MPI_STATUSES_IGNORE);CHKERRQ(ierr); 209*7fd2d3dbSJunchao Zhang ierr = MPI_Waitall(sf->nleafreqs, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi],MPI_STATUSES_IGNORE);CHKERRQ(ierr); 210*7fd2d3dbSJunchao Zhang PetscFunctionReturn(0); 211*7fd2d3dbSJunchao Zhang } 212*7fd2d3dbSJunchao Zhang 213fcc7397dSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*)); 214fcc7397dSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetFetchAndOp (PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**FetchAndOp) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*)); 215fcc7397dSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*)); 216fcc7397dSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*)); 217cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF,PetscSFLink,PetscSFDirection,void**,void**,MPI_Request**,MPI_Request**); 218b7c0d12aSJunchao Zhang 219cd620004SJunchao Zhang /* Do Pack/Unpack/Fetch/Scatter with the link */ 220cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkPackRootData (PetscSF,PetscSFLink,PetscSFScope,const void*); 221cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkPackLeafData (PetscSF,PetscSFLink,PetscSFScope,const void*); 222cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkUnpackRootData(PetscSF,PetscSFLink,PetscSFScope,void*,MPI_Op); 223cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF,PetscSFLink,PetscSFScope,void*,MPI_Op); 224cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkFetchRootData (PetscSF,PetscSFLink,PetscSFScope,void*,MPI_Op); 225cd620004SJunchao Zhang 226cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkBcastAndOpLocal(PetscSF,PetscSFLink,const void*,void*,MPI_Op); 227cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkReduceLocal(PetscSF,PetscSFLink,const void*,void*,MPI_Op); 228cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF,PetscSFLink,void*,const void*,void*,MPI_Op); 229cd620004SJunchao Zhang 230*7fd2d3dbSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFSetUpPackFields(PetscSF); 231*7fd2d3dbSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFResetPackFields(PetscSF); 232*7fd2d3dbSJunchao Zhang 233*7fd2d3dbSJunchao Zhang /* A set of helper routines for Pack/Unpack/Scatter on GPUs */ 234*7fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE) 235*7fd2d3dbSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_Device(PetscSF,PetscSFLink,MPI_Datatype); 236*7fd2d3dbSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkSyncDevice(PetscSF,PetscSFLink); 237*7fd2d3dbSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkSyncStream(PetscSF,PetscSFLink); 238*7fd2d3dbSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkMemcpy(PetscSF,PetscSFLink,PetscMemType,void*,PetscMemType,const void*,size_t); 239*7fd2d3dbSJunchao Zhang 240*7fd2d3dbSJunchao Zhang /* If SF does not know which stream root/leafdata is being computed on, it has to sync the device to 241*7fd2d3dbSJunchao Zhang make sure the data is ready for packing. 242*7fd2d3dbSJunchao Zhang */ 243*7fd2d3dbSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncDeviceBeforePackData(PetscSF sf,PetscSFLink link) 244*7fd2d3dbSJunchao Zhang { 245*7fd2d3dbSJunchao Zhang PetscErrorCode ierr; 246*7fd2d3dbSJunchao Zhang PetscFunctionBegin; 247*7fd2d3dbSJunchao Zhang if (sf->use_default_stream) PetscFunctionReturn(0); 248*7fd2d3dbSJunchao Zhang if (link->rootmtype == PETSC_MEMTYPE_DEVICE || link->leafmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscSFLinkSyncDevice(sf,link);CHKERRQ(ierr);} 249*7fd2d3dbSJunchao Zhang PetscFunctionReturn(0); 250*7fd2d3dbSJunchao Zhang } 251*7fd2d3dbSJunchao Zhang 252*7fd2d3dbSJunchao Zhang /* PetscSFLinkSyncStreamAfterPackXxxData routines make sure root/leafbuf for the remote is ready for MPI */ 253*7fd2d3dbSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterPackRootData(PetscSF sf,PetscSFLink link) 254*7fd2d3dbSJunchao Zhang { 255*7fd2d3dbSJunchao Zhang PetscErrorCode ierr; 256*7fd2d3dbSJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 257*7fd2d3dbSJunchao Zhang 258*7fd2d3dbSJunchao Zhang PetscFunctionBegin; 259*7fd2d3dbSJunchao Zhang /* Do nothing if we use stream aware mpi || has nothing for remote */ 260*7fd2d3dbSJunchao Zhang if (sf->use_stream_aware_mpi || link->rootmtype != PETSC_MEMTYPE_DEVICE || !bas->rootbuflen[PETSCSF_REMOTE]) PetscFunctionReturn(0); 261*7fd2d3dbSJunchao Zhang /* If we called a packing kernel || we async-copied rootdata from device to host || No cudaDeviceSynchronize was called (since default stream is assumed) */ 262*7fd2d3dbSJunchao Zhang if (!link->rootdirect[PETSCSF_REMOTE] || !sf->use_gpu_aware_mpi || sf->use_default_stream) {ierr = PetscSFLinkSyncStream(sf,link);CHKERRQ(ierr);} 263*7fd2d3dbSJunchao Zhang PetscFunctionReturn(0); 264*7fd2d3dbSJunchao Zhang } 265*7fd2d3dbSJunchao Zhang 266*7fd2d3dbSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterPackLeafData(PetscSF sf,PetscSFLink link) 267*7fd2d3dbSJunchao Zhang { 268*7fd2d3dbSJunchao Zhang PetscErrorCode ierr; 269*7fd2d3dbSJunchao Zhang PetscFunctionBegin; 270*7fd2d3dbSJunchao Zhang /* See comments above */ 271*7fd2d3dbSJunchao Zhang if (sf->use_stream_aware_mpi || link->leafmtype != PETSC_MEMTYPE_DEVICE || !sf->leafbuflen[PETSCSF_REMOTE]) PetscFunctionReturn(0); 272*7fd2d3dbSJunchao Zhang if (!link->leafdirect[PETSCSF_REMOTE] || !sf->use_gpu_aware_mpi || sf->use_default_stream) {ierr = PetscSFLinkSyncStream(sf,link);CHKERRQ(ierr);} 273*7fd2d3dbSJunchao Zhang PetscFunctionReturn(0); 274*7fd2d3dbSJunchao Zhang } 275*7fd2d3dbSJunchao Zhang 276*7fd2d3dbSJunchao Zhang /* PetscSFLinkSyncStreamAfterUnpackXxx routines make sure root/leafdata (local & remote) is ready to use for SF callers, when SF 277*7fd2d3dbSJunchao Zhang does not know which stream the callers will use. 278*7fd2d3dbSJunchao Zhang */ 279*7fd2d3dbSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterUnpackRootData(PetscSF sf,PetscSFLink link) 280*7fd2d3dbSJunchao Zhang { 281*7fd2d3dbSJunchao Zhang PetscErrorCode ierr; 282*7fd2d3dbSJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 283*7fd2d3dbSJunchao Zhang PetscBool host2host = (link->rootmtype == PETSC_MEMTYPE_HOST) && (link->leafmtype == PETSC_MEMTYPE_HOST) ? PETSC_TRUE : PETSC_FALSE; 284*7fd2d3dbSJunchao Zhang 285*7fd2d3dbSJunchao Zhang PetscFunctionBegin; 286*7fd2d3dbSJunchao Zhang /* Do nothing if host2host OR we are allowed to asynchronously put rootdata on device through the default stream */ 287*7fd2d3dbSJunchao Zhang if (host2host || (link->rootmtype == PETSC_MEMTYPE_DEVICE && sf->use_default_stream)) PetscFunctionReturn(0); 288*7fd2d3dbSJunchao Zhang 289*7fd2d3dbSJunchao Zhang /* If rootmtype is HOST or DEVICE: 290*7fd2d3dbSJunchao Zhang If we have data from local, then we called a scatter kernel (on link->stream), then we must sync it; 291*7fd2d3dbSJunchao Zhang If we have data from remote && no rootdirect(i.e., we called an unpack kernel), then we must also sycn it (if rootdirect, 292*7fd2d3dbSJunchao Zhang i.e., no unpack kernel after MPI, MPI guarentees rootbuf is ready to use so that we do not need the sync). 293*7fd2d3dbSJunchao Zhang 294*7fd2d3dbSJunchao Zhang Note a tricky case is when leafmtype=DEVICE, rootmtype=HOST on uni-processor, we must sync the stream otherwise 295*7fd2d3dbSJunchao Zhang CPU thread might use the yet-to-be-updated rootdata pending in the stream. 296*7fd2d3dbSJunchao Zhang */ 297*7fd2d3dbSJunchao Zhang if (bas->rootbuflen[PETSCSF_LOCAL] || (bas->rootbuflen[PETSCSF_REMOTE] && !link->rootdirect[PETSCSF_REMOTE])) {ierr = PetscSFLinkSyncStream(sf,link);CHKERRQ(ierr);} 298*7fd2d3dbSJunchao Zhang PetscFunctionReturn(0); 299*7fd2d3dbSJunchao Zhang } 300*7fd2d3dbSJunchao Zhang 301*7fd2d3dbSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterUnpackLeafData(PetscSF sf,PetscSFLink link) 302*7fd2d3dbSJunchao Zhang { 303*7fd2d3dbSJunchao Zhang PetscErrorCode ierr; 304*7fd2d3dbSJunchao Zhang PetscBool host2host = (link->rootmtype == PETSC_MEMTYPE_HOST) && (link->leafmtype == PETSC_MEMTYPE_HOST) ? PETSC_TRUE : PETSC_FALSE; 305*7fd2d3dbSJunchao Zhang 306*7fd2d3dbSJunchao Zhang PetscFunctionBegin; 307*7fd2d3dbSJunchao Zhang /* See comments in PetscSFLinkSyncStreamAfterUnpackRootData*/ 308*7fd2d3dbSJunchao Zhang if (host2host || (link->leafmtype == PETSC_MEMTYPE_DEVICE && sf->use_default_stream)) PetscFunctionReturn(0); 309*7fd2d3dbSJunchao Zhang if (sf->leafbuflen[PETSCSF_LOCAL] || (sf->leafbuflen[PETSCSF_REMOTE] && !link->leafdirect[PETSCSF_REMOTE])) {ierr = PetscSFLinkSyncStream(sf,link);CHKERRQ(ierr);} 310*7fd2d3dbSJunchao Zhang PetscFunctionReturn(0); 311*7fd2d3dbSJunchao Zhang } 312*7fd2d3dbSJunchao Zhang 313*7fd2d3dbSJunchao Zhang /* PetscSFLinkCopyXxxxBufferInCaseNotUseGpuAwareMPI routines are simple: if not use_gpu_aware_mpi, we need 314*7fd2d3dbSJunchao Zhang to copy the buffer from GPU to CPU before MPI calls, and from CPU to GPU after MPI calls. 315*7fd2d3dbSJunchao Zhang */ 316*7fd2d3dbSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(PetscSF sf,PetscSFLink link,PetscBool device2host) 317*7fd2d3dbSJunchao Zhang { 318*7fd2d3dbSJunchao Zhang PetscErrorCode ierr; 319*7fd2d3dbSJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 320*7fd2d3dbSJunchao Zhang 321*7fd2d3dbSJunchao Zhang PetscFunctionBegin; 322*7fd2d3dbSJunchao Zhang if (link->rootmtype == PETSC_MEMTYPE_DEVICE && (link->rootmtype_mpi != link->rootmtype) && bas->rootbuflen[PETSCSF_REMOTE]) { 323*7fd2d3dbSJunchao Zhang void *h_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]; 324*7fd2d3dbSJunchao Zhang void *d_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE]; 325*7fd2d3dbSJunchao Zhang size_t count = bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes; 326*7fd2d3dbSJunchao Zhang if (device2host) { 327*7fd2d3dbSJunchao Zhang ierr = PetscSFLinkMemcpy(sf,link,PETSC_MEMTYPE_HOST,h_buf,PETSC_MEMTYPE_DEVICE,d_buf,count);CHKERRQ(ierr); 328*7fd2d3dbSJunchao Zhang ierr = PetscLogGpuToCpu(count);CHKERRQ(ierr); 329*7fd2d3dbSJunchao Zhang } else { 330*7fd2d3dbSJunchao Zhang ierr = PetscSFLinkMemcpy(sf,link,PETSC_MEMTYPE_DEVICE,d_buf,PETSC_MEMTYPE_HOST,h_buf,count);CHKERRQ(ierr); 331*7fd2d3dbSJunchao Zhang ierr = PetscLogCpuToGpu(count);CHKERRQ(ierr); 332*7fd2d3dbSJunchao Zhang } 333*7fd2d3dbSJunchao Zhang } 334*7fd2d3dbSJunchao Zhang PetscFunctionReturn(0); 335*7fd2d3dbSJunchao Zhang } 336*7fd2d3dbSJunchao Zhang 337*7fd2d3dbSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(PetscSF sf,PetscSFLink link,PetscBool device2host) 338*7fd2d3dbSJunchao Zhang { 339*7fd2d3dbSJunchao Zhang PetscErrorCode ierr; 340*7fd2d3dbSJunchao Zhang 341*7fd2d3dbSJunchao Zhang PetscFunctionBegin; 342*7fd2d3dbSJunchao Zhang if (link->leafmtype == PETSC_MEMTYPE_DEVICE && (link->leafmtype_mpi != link->leafmtype) && sf->leafbuflen[PETSCSF_REMOTE]) { 343*7fd2d3dbSJunchao Zhang void *h_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]; 344*7fd2d3dbSJunchao Zhang void *d_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE]; 345*7fd2d3dbSJunchao Zhang size_t count = sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes; 346*7fd2d3dbSJunchao Zhang if (device2host) { 347*7fd2d3dbSJunchao Zhang ierr = PetscSFLinkMemcpy(sf,link,PETSC_MEMTYPE_HOST,h_buf,PETSC_MEMTYPE_DEVICE,d_buf,count);CHKERRQ(ierr); 348*7fd2d3dbSJunchao Zhang ierr = PetscLogGpuToCpu(count);CHKERRQ(ierr); 349*7fd2d3dbSJunchao Zhang } else { 350*7fd2d3dbSJunchao Zhang ierr = PetscSFLinkMemcpy(sf,link,PETSC_MEMTYPE_DEVICE,d_buf,PETSC_MEMTYPE_HOST,h_buf,count);CHKERRQ(ierr); 351*7fd2d3dbSJunchao Zhang ierr = PetscLogCpuToGpu(count);CHKERRQ(ierr); 352*7fd2d3dbSJunchao Zhang } 353*7fd2d3dbSJunchao Zhang } 354*7fd2d3dbSJunchao Zhang PetscFunctionReturn(0); 355*7fd2d3dbSJunchao Zhang } 356*7fd2d3dbSJunchao Zhang 357*7fd2d3dbSJunchao Zhang #else /* Host only */ 358*7fd2d3dbSJunchao Zhang #define PetscSFLinkSyncDeviceBeforePackData(a,b) 0 359*7fd2d3dbSJunchao Zhang #define PetscSFLinkSyncStreamAfterPackRootData(a,b) 0 360*7fd2d3dbSJunchao Zhang #define PetscSFLinkSyncStreamAfterPackLeafData(a,b) 0 361*7fd2d3dbSJunchao Zhang #define PetscSFLinkSyncStreamAfterUnpackRootData(a,b) 0 362*7fd2d3dbSJunchao Zhang #define PetscSFLinkSyncStreamAfterUnpackLeafData(a,b) 0 363*7fd2d3dbSJunchao Zhang #define PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(a,b,c) 0 364*7fd2d3dbSJunchao Zhang #define PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(a,b,c) 0 365*7fd2d3dbSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkMemcpy(PetscSF sf,PetscSFLink link,PetscMemType dstmtype,void* dst,PetscMemType srcmtype,const void*src,size_t n) 366*7fd2d3dbSJunchao Zhang { 367*7fd2d3dbSJunchao Zhang PetscFunctionBegin; 368*7fd2d3dbSJunchao Zhang if (n) {PetscErrorCode ierr = PetscMemcpy(dst,src,n);CHKERRQ(ierr);} 369*7fd2d3dbSJunchao Zhang PetscFunctionReturn(0); 370*7fd2d3dbSJunchao Zhang } 371*7fd2d3dbSJunchao Zhang #endif 372cd620004SJunchao Zhang 373cd620004SJunchao Zhang /* Get root indices used for pack/unpack 374cd620004SJunchao Zhang 375cd620004SJunchao Zhang Input arguments: 376cd620004SJunchao Zhang +sf - StarForest 377cd620004SJunchao Zhang .link - The link, which provides the stream for the async memcpy (In SF, we make all GPU operations asynchronous to avoid unexpected pipeline stalls) 378cd620004SJunchao Zhang .scope - Which part of the indices? (PETSCSF_LOCAL or PETSCSF_REMOTE) 379cd620004SJunchao Zhang .mtype - In what type of memory? (PETSC_MEMTYPE_DEVICE or PETSC_MEMTYPE_HOST) 380cd620004SJunchao Zhang 381cd620004SJunchao Zhang Output arguments: 382cd620004SJunchao Zhang +count - Count of indices 383cd620004SJunchao Zhang .start - The first index (only useful when indices is NULL) 384cd620004SJunchao Zhang -indices - indices of roots for pack/unpack. NULL means indices are contiguous 385cd620004SJunchao Zhang */ 386fcc7397dSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkGetRootPackOptAndIndices(PetscSF sf,PetscSFLink link,PetscMemType mtype,PetscSFScope scope,PetscInt *count,PetscInt *start,PetscSFPackOpt *opt,const PetscInt **indices) 387b7c0d12aSJunchao Zhang { 388cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 389cd620004SJunchao Zhang PetscInt offset; 390b7c0d12aSJunchao Zhang 391b7c0d12aSJunchao Zhang PetscFunctionBegin; 392fcc7397dSJunchao Zhang *count = bas->rootbuflen[scope]; 393fcc7397dSJunchao Zhang *start = bas->rootstart[scope]; 394fcc7397dSJunchao Zhang *opt = NULL; 395fcc7397dSJunchao Zhang *indices = NULL; 396fcc7397dSJunchao Zhang 397fcc7397dSJunchao Zhang /* We have these rules: 398fcc7397dSJunchao Zhang 1) opt == NULL && indices == NULL ==> indices are contiguous. 399fcc7397dSJunchao Zhang 2) opt != NULL ==> indices are in 3D but not contiguous. On host, indices != NULL since indices are already available and we do not 400fcc7397dSJunchao Zhang want to enforce all operations to use opt; but on device, indices = NULL since we do not want to copy indices to device. 401fcc7397dSJunchao Zhang */ 402fcc7397dSJunchao Zhang if (!bas->rootcontig[scope]) { 403cd620004SJunchao Zhang offset = (scope == PETSCSF_LOCAL)? 0 : bas->ioffset[bas->ndiranks]; 404fcc7397dSJunchao Zhang if (mtype == PETSC_MEMTYPE_HOST) {*opt = bas->rootpackopt[scope]; *indices = bas->irootloc + offset;} 405*7fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE) 406cd620004SJunchao Zhang else { 407fcc7397dSJunchao Zhang PetscErrorCode ierr; 408fcc7397dSJunchao Zhang size_t size; 409fcc7397dSJunchao Zhang if (bas->rootpackopt[scope]) { 410fcc7397dSJunchao Zhang if (!bas->rootpackopt_d[scope]) { 411fcc7397dSJunchao Zhang ierr = PetscMalloc1(1,&bas->rootpackopt_d[scope]);CHKERRQ(ierr); 412fcc7397dSJunchao Zhang ierr = PetscArraycpy(bas->rootpackopt_d[scope],bas->rootpackopt[scope],1);CHKERRQ(ierr); /* Make pointers in bas->rootpackopt_d[] still work on host */ 413fcc7397dSJunchao Zhang size = (bas->rootpackopt[scope]->n*7+2)*sizeof(PetscInt); /* See comments at struct _n_PetscSFPackOpt*/ 414*7fd2d3dbSJunchao Zhang ierr = PetscSFMalloc(PETSC_MEMTYPE_DEVICE,size,(void **)&bas->rootpackopt_d[scope]->array);CHKERRQ(ierr); 415*7fd2d3dbSJunchao Zhang ierr = PetscSFLinkMemcpy(sf,link,PETSC_MEMTYPE_DEVICE,bas->rootpackopt_d[scope]->array,PETSC_MEMTYPE_HOST,bas->rootpackopt[scope]->array,size);CHKERRQ(ierr); 416fcc7397dSJunchao Zhang } 417fcc7397dSJunchao Zhang *opt = bas->rootpackopt_d[scope]; 418fcc7397dSJunchao Zhang } else { /* On device, we only provide indices when there is no optimization. We're reluctant to copy indices to device. */ 419fcc7397dSJunchao Zhang if (!bas->irootloc_d[scope]) { 420fcc7397dSJunchao Zhang size = bas->rootbuflen[scope]*sizeof(PetscInt); 421*7fd2d3dbSJunchao Zhang ierr = PetscSFMalloc(PETSC_MEMTYPE_DEVICE,size,(void **)&bas->irootloc_d[scope]);CHKERRQ(ierr); 422*7fd2d3dbSJunchao Zhang ierr = PetscSFLinkMemcpy(sf,link,PETSC_MEMTYPE_DEVICE,bas->irootloc_d[scope],PETSC_MEMTYPE_HOST,bas->irootloc+offset,size);CHKERRQ(ierr); 423b7c0d12aSJunchao Zhang } 424cd620004SJunchao Zhang *indices = bas->irootloc_d[scope]; 425cd620004SJunchao Zhang } 426cd620004SJunchao Zhang } 427fcc7397dSJunchao Zhang #endif 428cd620004SJunchao Zhang } 429b7c0d12aSJunchao Zhang PetscFunctionReturn(0); 430b7c0d12aSJunchao Zhang } 431b7c0d12aSJunchao Zhang 432cd620004SJunchao Zhang /* Get leaf indices used for pack/unpack 433cd620004SJunchao Zhang 434fcc7397dSJunchao Zhang See also PetscSFLinkGetRootPackOptAndIndices() 435cd620004SJunchao Zhang */ 436fcc7397dSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkGetLeafPackOptAndIndices(PetscSF sf,PetscSFLink link,PetscMemType mtype,PetscSFScope scope,PetscInt *count,PetscInt *start,PetscSFPackOpt *opt,const PetscInt **indices) 437cd620004SJunchao Zhang { 438cd620004SJunchao Zhang PetscInt offset; 439cd620004SJunchao Zhang 440cd620004SJunchao Zhang PetscFunctionBegin; 441fcc7397dSJunchao Zhang *count = sf->leafbuflen[scope]; 442fcc7397dSJunchao Zhang *start = sf->leafstart[scope]; 443fcc7397dSJunchao Zhang *opt = NULL; 444fcc7397dSJunchao Zhang *indices = NULL; 445fcc7397dSJunchao Zhang if (!sf->leafcontig[scope]) { 446cd620004SJunchao Zhang offset = (scope == PETSCSF_LOCAL)? 0 : sf->roffset[sf->ndranks]; 447fcc7397dSJunchao Zhang if (mtype == PETSC_MEMTYPE_HOST) {*opt = sf->leafpackopt[scope]; *indices = sf->rmine + offset;} 448*7fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE) 449cd620004SJunchao Zhang else { 450fcc7397dSJunchao Zhang PetscErrorCode ierr; 451fcc7397dSJunchao Zhang size_t size; 452fcc7397dSJunchao Zhang if (sf->leafpackopt[scope]) { 453fcc7397dSJunchao Zhang if (!sf->leafpackopt_d[scope]) { 454fcc7397dSJunchao Zhang ierr = PetscMalloc1(1,&sf->leafpackopt_d[scope]);CHKERRQ(ierr); 455fcc7397dSJunchao Zhang ierr = PetscArraycpy(sf->leafpackopt_d[scope],sf->leafpackopt[scope],1);CHKERRQ(ierr); 456fcc7397dSJunchao Zhang size = (sf->leafpackopt[scope]->n*7+2)*sizeof(PetscInt); /* See comments at struct _n_PetscSFPackOpt*/ 457*7fd2d3dbSJunchao Zhang ierr = PetscSFMalloc(PETSC_MEMTYPE_DEVICE,size,(void **)&sf->leafpackopt_d[scope]->array);CHKERRQ(ierr); /* Change ->array to a device pointer */ 458*7fd2d3dbSJunchao Zhang ierr = PetscSFLinkMemcpy(sf,link,PETSC_MEMTYPE_DEVICE,sf->leafpackopt_d[scope]->array,PETSC_MEMTYPE_HOST,sf->leafpackopt[scope]->array,size);CHKERRQ(ierr); 459fcc7397dSJunchao Zhang } 460fcc7397dSJunchao Zhang *opt = sf->leafpackopt_d[scope]; 461fcc7397dSJunchao Zhang } else { 462fcc7397dSJunchao Zhang if (!sf->rmine_d[scope]) { 463fcc7397dSJunchao Zhang size = sf->leafbuflen[scope]*sizeof(PetscInt); 464*7fd2d3dbSJunchao Zhang ierr = PetscSFMalloc(PETSC_MEMTYPE_DEVICE,size,(void **)&sf->rmine_d[scope]);CHKERRQ(ierr); 465*7fd2d3dbSJunchao Zhang ierr = PetscSFLinkMemcpy(sf,link,PETSC_MEMTYPE_DEVICE,sf->rmine_d[scope],PETSC_MEMTYPE_HOST,sf->rmine+offset,size);CHKERRQ(ierr); 466cd620004SJunchao Zhang } 467cd620004SJunchao Zhang *indices = sf->rmine_d[scope]; 468cd620004SJunchao Zhang } 469cd620004SJunchao Zhang } 470fcc7397dSJunchao Zhang #endif 471cd620004SJunchao Zhang } 472cd620004SJunchao Zhang PetscFunctionReturn(0); 473cd620004SJunchao Zhang } 47440e23c03SJunchao Zhang #endif 475