1 #if !defined(__SFPACK_H) 2 #define __SFPACK_H 3 4 #include <../src/vec/is/sf/impls/basic/sfbasic.h> 5 #if defined(PETSC_HAVE_CUDA) 6 #include <cuda_runtime.h> /* For cudaStream_t */ 7 #include <petsccublas.h> /* For CHKERRCUDA */ 8 #endif 9 10 #if defined(PETSC_HAVE_HIP) 11 #include <hip/hip_runtime.h> /* For hipStream_t */ 12 #endif 13 14 /* We separate SF communications for SFBasic and SFNeighbor in two parts: local (self,intra-rank) and remote (inter-rank) */ 15 typedef enum {PETSCSF_LOCAL=0, PETSCSF_REMOTE} PetscSFScope; 16 17 /* Optimizations in packing & unpacking for destination ranks. 18 19 Suppose there are m indices stored in idx[], and two addresses u, p. We want to do packing: 20 p[i] = u[idx[i]], for i in [0,m) 21 22 Indices are associated with n ranks and each rank's indices are stored consecutively in idx[]. 23 We go through indices for each rank and see if they are indices of a 3D submatrix of size [dx,dy,dz] in 24 a parent matrix of size [X,Y,Z], with the submatrix's first index being <start>. 25 26 E.g., for indices 1,2,3, 6,7,8, 11,12,13, the submatrix size is [3,3,1] with start=1, and the parent matrix's size 27 is [5,3,1]. For simplicity, if any destination rank does not have this pattern, we give up the optimization. 28 29 Note before using this per-rank optimization, one should check leafcontig[], rootcontig[], which say 30 indices in whole are contiguous, and therefore much more useful than this one when true. 31 */ 32 struct _n_PetscSFPackOpt { 33 PetscInt *array; /* [7*n+2] Memory pool for other fields in this struct. Used to easily copy this struct to GPU */ 34 PetscInt n; /* Number of destination ranks */ 35 PetscInt *offset; /* [n+1] Offsets of indices for each rank. offset[0]=0, offset[i+1]=offset[i]+dx[i]*dy[i]*dz[i] */ 36 PetscInt *start; /* [n] First index */ 37 PetscInt *dx,*dy,*dz; /* [n] Lengths of the submatrix in X, Y, Z dimension. */ 38 PetscInt *X,*Y; /* [n] Lengths of the outer matrix in X, Y. We do not care Z. */ 39 }; 40 41 /* An abstract class that defines a communication link, which includes how to pack/unpack data and send/recv buffers 42 */ 43 struct _n_PetscSFLink { 44 PetscErrorCode (*h_Pack) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*); 45 PetscErrorCode (*h_UnpackAndInsert) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 46 PetscErrorCode (*h_UnpackAndAdd) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 47 PetscErrorCode (*h_UnpackAndMin) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 48 PetscErrorCode (*h_UnpackAndMax) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 49 PetscErrorCode (*h_UnpackAndMinloc) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 50 PetscErrorCode (*h_UnpackAndMaxloc) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 51 PetscErrorCode (*h_UnpackAndMult) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 52 PetscErrorCode (*h_UnpackAndLAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 53 PetscErrorCode (*h_UnpackAndBAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 54 PetscErrorCode (*h_UnpackAndLOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 55 PetscErrorCode (*h_UnpackAndBOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 56 PetscErrorCode (*h_UnpackAndLXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 57 PetscErrorCode (*h_UnpackAndBXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 58 PetscErrorCode (*h_FetchAndAdd) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*, void*); 59 60 PetscErrorCode (*h_ScatterAndInsert)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 61 PetscErrorCode (*h_ScatterAndAdd) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 62 PetscErrorCode (*h_ScatterAndMin) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 63 PetscErrorCode (*h_ScatterAndMax) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 64 PetscErrorCode (*h_ScatterAndMinloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 65 PetscErrorCode (*h_ScatterAndMaxloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 66 PetscErrorCode (*h_ScatterAndMult) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 67 PetscErrorCode (*h_ScatterAndLAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 68 PetscErrorCode (*h_ScatterAndBAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 69 PetscErrorCode (*h_ScatterAndLOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 70 PetscErrorCode (*h_ScatterAndBOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 71 PetscErrorCode (*h_ScatterAndLXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 72 PetscErrorCode (*h_ScatterAndBXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 73 74 PetscErrorCode (*h_FetchAndAddLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*); 75 76 PetscBool deviceinited; /* Are device related fields initialized? */ 77 #if defined(PETSC_HAVE_DEVICE) 78 /* These fields are lazily initialized in a sense that only when device pointers are passed to an SF, the SF 79 will set them, otherwise it just leaves them alone. Packing routines using regular ops when there are no data race chances. 80 */ 81 PetscErrorCode (*d_Pack) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*); 82 PetscErrorCode (*d_UnpackAndInsert) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 83 PetscErrorCode (*d_UnpackAndAdd) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 84 PetscErrorCode (*d_UnpackAndMin) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 85 PetscErrorCode (*d_UnpackAndMax) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 86 PetscErrorCode (*d_UnpackAndMinloc) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 87 PetscErrorCode (*d_UnpackAndMaxloc) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 88 PetscErrorCode (*d_UnpackAndMult) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 89 PetscErrorCode (*d_UnpackAndLAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 90 PetscErrorCode (*d_UnpackAndBAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 91 PetscErrorCode (*d_UnpackAndLOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 92 PetscErrorCode (*d_UnpackAndBOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 93 PetscErrorCode (*d_UnpackAndLXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 94 PetscErrorCode (*d_UnpackAndBXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 95 PetscErrorCode (*d_FetchAndAdd) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*, void*); 96 97 PetscErrorCode (*d_ScatterAndInsert)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 98 PetscErrorCode (*d_ScatterAndAdd) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 99 PetscErrorCode (*d_ScatterAndMin) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 100 PetscErrorCode (*d_ScatterAndMax) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 101 PetscErrorCode (*d_ScatterAndMinloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 102 PetscErrorCode (*d_ScatterAndMaxloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 103 PetscErrorCode (*d_ScatterAndMult) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 104 PetscErrorCode (*d_ScatterAndLAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 105 PetscErrorCode (*d_ScatterAndBAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 106 PetscErrorCode (*d_ScatterAndLOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 107 PetscErrorCode (*d_ScatterAndBOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 108 PetscErrorCode (*d_ScatterAndLXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 109 PetscErrorCode (*d_ScatterAndBXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 110 PetscErrorCode (*d_FetchAndAddLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*); 111 112 /* Packing routines using atomics when there are data race chances */ 113 PetscErrorCode (*da_UnpackAndInsert)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 114 PetscErrorCode (*da_UnpackAndAdd) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 115 PetscErrorCode (*da_UnpackAndMin) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 116 PetscErrorCode (*da_UnpackAndMax) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 117 PetscErrorCode (*da_UnpackAndMinloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 118 PetscErrorCode (*da_UnpackAndMaxloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 119 PetscErrorCode (*da_UnpackAndMult) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 120 PetscErrorCode (*da_UnpackAndLAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 121 PetscErrorCode (*da_UnpackAndBAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 122 PetscErrorCode (*da_UnpackAndLOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 123 PetscErrorCode (*da_UnpackAndBOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 124 PetscErrorCode (*da_UnpackAndLXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 125 PetscErrorCode (*da_UnpackAndBXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 126 PetscErrorCode (*da_FetchAndAdd) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*, void*); 127 128 PetscErrorCode (*da_ScatterAndInsert)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 129 PetscErrorCode (*da_ScatterAndAdd) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 130 PetscErrorCode (*da_ScatterAndMin) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 131 PetscErrorCode (*da_ScatterAndMax) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 132 PetscErrorCode (*da_ScatterAndMinloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 133 PetscErrorCode (*da_ScatterAndMaxloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 134 PetscErrorCode (*da_ScatterAndMult) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 135 PetscErrorCode (*da_ScatterAndLAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 136 PetscErrorCode (*da_ScatterAndBAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 137 PetscErrorCode (*da_ScatterAndLOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 138 PetscErrorCode (*da_ScatterAndBOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 139 PetscErrorCode (*da_ScatterAndLXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 140 PetscErrorCode (*da_ScatterAndBXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 141 PetscErrorCode (*da_FetchAndAddLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*); 142 #if defined (PETSC_HAVE_CUDA) 143 PetscInt maxResidentThreadsPerGPU; /* It is a copy from SF for convenience */ 144 cudaStream_t stream; /* Stream to launch pack/unapck kernels if not using the default stream */ 145 #elif defined (PETSC_HAVE_HIP) 146 hipStream_t stream; 147 #endif 148 149 PetscErrorCode (*Destroy)(PetscSFLink); /* Device specific destroy function */ 150 void *sptr; 151 #endif 152 153 PetscMPIInt tag; /* Each link has a tag so we can perform multiple SF ops at the same time */ 154 MPI_Datatype unit; /* The MPI datatype this PetscSFLink is built for */ 155 MPI_Datatype basicunit; /* unit is made of MPI builtin dataype basicunit */ 156 PetscBool isbuiltin; /* Is unit an MPI/PETSc builtin datatype? If it is true, then bs=1 and basicunit is equivalent to unit */ 157 size_t unitbytes; /* Number of bytes in a unit */ 158 PetscInt bs; /* Number of basic units in a unit */ 159 const void *rootdata,*leafdata; /* rootdata and leafdata the link is working on. They are used as keys for pending links. */ 160 PetscMemType rootmtype,leafmtype; /* root/leafdata's memory type */ 161 162 /* For local and remote communication */ 163 PetscMemType rootmtype_mpi,leafmtype_mpi; /* Mtypes of buffers passed to MPI. If use_gpu_aware_mpi, they are same as root/leafmtype. Otherwise they are PETSC_MEMTYPE_HOST */ 164 PetscBool rootdirect[2],leafdirect[2]; /* Can root/leafdata be directly passed to SF (i.e., without buffering). In layout of [PETSCSF_LOCAL/REMOTE]. See more in PetscSFLinkCreate() */ 165 PetscInt rootdirect_mpi,leafdirect_mpi;/* Can root/leafdata for remote be directly passed to MPI? 1: yes, 0: no. See more in PetscSFLinkCreate() */ 166 const void *rootdatadirect[2][2]; /* The root/leafdata used to init root/leaf requests, in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE]. */ 167 const void *leafdatadirect[2][2]; /* ... We need them to look up links when root/leafdirect_mpi are true */ 168 char *rootbuf[2][2]; /* Buffers for packed roots, in layout of [PETSCSF_LOCAL/REMOTE][PETSC_MEMTYPE] */ 169 char *rootbuf_alloc[2][2]; /* Log memory allocated by petsc. We need it since rootbuf[][] may point to rootdata given by user */ 170 char *leafbuf[2][2]; /* Buffers for packed leaves, in layout of [PETSCSF_LOCAL/REMOTE][PETSC_MEMTYPE] */ 171 char *leafbuf_alloc[2][2]; 172 MPI_Request *rootreqs[2][2][2]; /* Root requests in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][rootdirect_mpi] */ 173 MPI_Request *leafreqs[2][2][2]; /* Leaf requests in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][leafdirect_mpi] */ 174 PetscBool rootreqsinited[2][2][2]; /* Are root requests initialized? Also in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][rootdirect_mpi]*/ 175 PetscBool leafreqsinited[2][2][2]; /* Are leaf requests initialized? Also in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][leafdirect_mpi]*/ 176 MPI_Request *reqs; /* An array of length (nrootreqs+nleafreqs)*8. Pointers in rootreqs[][][] and leafreqs[][][] point here */ 177 PetscSFLink next; 178 }; 179 180 PETSC_INTERN PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF,MPI_Datatype,const void*,const void*); 181 182 /* Create/setup/retrieve/destroy a link */ 183 PETSC_INTERN PetscErrorCode PetscSFLinkCreate(PetscSF,MPI_Datatype,PetscMemType,const void*,PetscMemType,const void*,MPI_Op,PetscSFOperation,PetscSFLink*); 184 PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_Host(PetscSF,PetscSFLink,MPI_Datatype); 185 PETSC_INTERN PetscErrorCode PetscSFLinkGetInUse(PetscSF,MPI_Datatype,const void*,const void*,PetscCopyMode,PetscSFLink*); 186 PETSC_INTERN PetscErrorCode PetscSFLinkReclaim(PetscSF,PetscSFLink*); 187 PETSC_INTERN PetscErrorCode PetscSFLinkDestroy(PetscSF,PetscSFLink*); 188 189 /* Get pack/unpack function pointers from a link */ 190 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkGetPack(PetscSFLink link,PetscMemType mtype,PetscErrorCode (**Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*)) 191 { 192 PetscFunctionBegin; 193 if (mtype == PETSC_MEMTYPE_HOST) *Pack = link->h_Pack; 194 #if defined(PETSC_HAVE_DEVICE) 195 else *Pack = link->d_Pack; 196 #endif 197 PetscFunctionReturn(0); 198 } 199 200 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkMPIWaitall(PetscSF sf,PetscSFLink link,PetscSFDirection direction) 201 { 202 PetscErrorCode ierr; 203 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 204 const PetscMemType rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; 205 const PetscInt rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi; 206 207 PetscFunctionBegin; 208 ierr = MPI_Waitall(bas->nrootreqs,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi],MPI_STATUSES_IGNORE);CHKERRQ(ierr); 209 ierr = MPI_Waitall(sf->nleafreqs, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi],MPI_STATUSES_IGNORE);CHKERRQ(ierr); 210 PetscFunctionReturn(0); 211 } 212 213 PETSC_INTERN PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*)); 214 PETSC_INTERN PetscErrorCode PetscSFLinkGetFetchAndOp (PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**FetchAndOp) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*)); 215 PETSC_INTERN PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*)); 216 PETSC_INTERN PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*)); 217 PETSC_INTERN PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF,PetscSFLink,PetscSFDirection,void**,void**,MPI_Request**,MPI_Request**); 218 219 /* Do Pack/Unpack/Fetch/Scatter with the link */ 220 PETSC_INTERN PetscErrorCode PetscSFLinkPackRootData (PetscSF,PetscSFLink,PetscSFScope,const void*); 221 PETSC_INTERN PetscErrorCode PetscSFLinkPackLeafData (PetscSF,PetscSFLink,PetscSFScope,const void*); 222 PETSC_INTERN PetscErrorCode PetscSFLinkUnpackRootData(PetscSF,PetscSFLink,PetscSFScope,void*,MPI_Op); 223 PETSC_INTERN PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF,PetscSFLink,PetscSFScope,void*,MPI_Op); 224 PETSC_INTERN PetscErrorCode PetscSFLinkFetchRootData (PetscSF,PetscSFLink,PetscSFScope,void*,MPI_Op); 225 226 PETSC_INTERN PetscErrorCode PetscSFLinkBcastAndOpLocal(PetscSF,PetscSFLink,const void*,void*,MPI_Op); 227 PETSC_INTERN PetscErrorCode PetscSFLinkReduceLocal(PetscSF,PetscSFLink,const void*,void*,MPI_Op); 228 PETSC_INTERN PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF,PetscSFLink,void*,const void*,void*,MPI_Op); 229 230 PETSC_INTERN PetscErrorCode PetscSFSetUpPackFields(PetscSF); 231 PETSC_INTERN PetscErrorCode PetscSFResetPackFields(PetscSF); 232 233 /* A set of helper routines for Pack/Unpack/Scatter on GPUs */ 234 #if defined(PETSC_HAVE_DEVICE) 235 PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_Device(PetscSF,PetscSFLink,MPI_Datatype); 236 PETSC_INTERN PetscErrorCode PetscSFLinkSyncDevice(PetscSF,PetscSFLink); 237 PETSC_INTERN PetscErrorCode PetscSFLinkSyncStream(PetscSF,PetscSFLink); 238 PETSC_INTERN PetscErrorCode PetscSFLinkMemcpy(PetscSF,PetscSFLink,PetscMemType,void*,PetscMemType,const void*,size_t); 239 240 /* If SF does not know which stream root/leafdata is being computed on, it has to sync the device to 241 make sure the data is ready for packing. 242 */ 243 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncDeviceBeforePackData(PetscSF sf,PetscSFLink link) 244 { 245 PetscErrorCode ierr; 246 PetscFunctionBegin; 247 if (sf->use_default_stream) PetscFunctionReturn(0); 248 if (link->rootmtype == PETSC_MEMTYPE_DEVICE || link->leafmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscSFLinkSyncDevice(sf,link);CHKERRQ(ierr);} 249 PetscFunctionReturn(0); 250 } 251 252 /* PetscSFLinkSyncStreamAfterPackXxxData routines make sure root/leafbuf for the remote is ready for MPI */ 253 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterPackRootData(PetscSF sf,PetscSFLink link) 254 { 255 PetscErrorCode ierr; 256 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 257 258 PetscFunctionBegin; 259 /* Do nothing if we use stream aware mpi || has nothing for remote */ 260 if (sf->use_stream_aware_mpi || link->rootmtype != PETSC_MEMTYPE_DEVICE || !bas->rootbuflen[PETSCSF_REMOTE]) PetscFunctionReturn(0); 261 /* If we called a packing kernel || we async-copied rootdata from device to host || No cudaDeviceSynchronize was called (since default stream is assumed) */ 262 if (!link->rootdirect[PETSCSF_REMOTE] || !sf->use_gpu_aware_mpi || sf->use_default_stream) {ierr = PetscSFLinkSyncStream(sf,link);CHKERRQ(ierr);} 263 PetscFunctionReturn(0); 264 } 265 266 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterPackLeafData(PetscSF sf,PetscSFLink link) 267 { 268 PetscErrorCode ierr; 269 PetscFunctionBegin; 270 /* See comments above */ 271 if (sf->use_stream_aware_mpi || link->leafmtype != PETSC_MEMTYPE_DEVICE || !sf->leafbuflen[PETSCSF_REMOTE]) PetscFunctionReturn(0); 272 if (!link->leafdirect[PETSCSF_REMOTE] || !sf->use_gpu_aware_mpi || sf->use_default_stream) {ierr = PetscSFLinkSyncStream(sf,link);CHKERRQ(ierr);} 273 PetscFunctionReturn(0); 274 } 275 276 /* PetscSFLinkSyncStreamAfterUnpackXxx routines make sure root/leafdata (local & remote) is ready to use for SF callers, when SF 277 does not know which stream the callers will use. 278 */ 279 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterUnpackRootData(PetscSF sf,PetscSFLink link) 280 { 281 PetscErrorCode ierr; 282 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 283 PetscBool host2host = (link->rootmtype == PETSC_MEMTYPE_HOST) && (link->leafmtype == PETSC_MEMTYPE_HOST) ? PETSC_TRUE : PETSC_FALSE; 284 285 PetscFunctionBegin; 286 /* Do nothing if host2host OR we are allowed to asynchronously put rootdata on device through the default stream */ 287 if (host2host || (link->rootmtype == PETSC_MEMTYPE_DEVICE && sf->use_default_stream)) PetscFunctionReturn(0); 288 289 /* If rootmtype is HOST or DEVICE: 290 If we have data from local, then we called a scatter kernel (on link->stream), then we must sync it; 291 If we have data from remote && no rootdirect(i.e., we called an unpack kernel), then we must also sycn it (if rootdirect, 292 i.e., no unpack kernel after MPI, MPI guarentees rootbuf is ready to use so that we do not need the sync). 293 294 Note a tricky case is when leafmtype=DEVICE, rootmtype=HOST on uni-processor, we must sync the stream otherwise 295 CPU thread might use the yet-to-be-updated rootdata pending in the stream. 296 */ 297 if (bas->rootbuflen[PETSCSF_LOCAL] || (bas->rootbuflen[PETSCSF_REMOTE] && !link->rootdirect[PETSCSF_REMOTE])) {ierr = PetscSFLinkSyncStream(sf,link);CHKERRQ(ierr);} 298 PetscFunctionReturn(0); 299 } 300 301 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterUnpackLeafData(PetscSF sf,PetscSFLink link) 302 { 303 PetscErrorCode ierr; 304 PetscBool host2host = (link->rootmtype == PETSC_MEMTYPE_HOST) && (link->leafmtype == PETSC_MEMTYPE_HOST) ? PETSC_TRUE : PETSC_FALSE; 305 306 PetscFunctionBegin; 307 /* See comments in PetscSFLinkSyncStreamAfterUnpackRootData*/ 308 if (host2host || (link->leafmtype == PETSC_MEMTYPE_DEVICE && sf->use_default_stream)) PetscFunctionReturn(0); 309 if (sf->leafbuflen[PETSCSF_LOCAL] || (sf->leafbuflen[PETSCSF_REMOTE] && !link->leafdirect[PETSCSF_REMOTE])) {ierr = PetscSFLinkSyncStream(sf,link);CHKERRQ(ierr);} 310 PetscFunctionReturn(0); 311 } 312 313 /* PetscSFLinkCopyXxxxBufferInCaseNotUseGpuAwareMPI routines are simple: if not use_gpu_aware_mpi, we need 314 to copy the buffer from GPU to CPU before MPI calls, and from CPU to GPU after MPI calls. 315 */ 316 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(PetscSF sf,PetscSFLink link,PetscBool device2host) 317 { 318 PetscErrorCode ierr; 319 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 320 321 PetscFunctionBegin; 322 if (link->rootmtype == PETSC_MEMTYPE_DEVICE && (link->rootmtype_mpi != link->rootmtype) && bas->rootbuflen[PETSCSF_REMOTE]) { 323 void *h_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]; 324 void *d_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE]; 325 size_t count = bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes; 326 if (device2host) { 327 ierr = PetscSFLinkMemcpy(sf,link,PETSC_MEMTYPE_HOST,h_buf,PETSC_MEMTYPE_DEVICE,d_buf,count);CHKERRQ(ierr); 328 ierr = PetscLogGpuToCpu(count);CHKERRQ(ierr); 329 } else { 330 ierr = PetscSFLinkMemcpy(sf,link,PETSC_MEMTYPE_DEVICE,d_buf,PETSC_MEMTYPE_HOST,h_buf,count);CHKERRQ(ierr); 331 ierr = PetscLogCpuToGpu(count);CHKERRQ(ierr); 332 } 333 } 334 PetscFunctionReturn(0); 335 } 336 337 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(PetscSF sf,PetscSFLink link,PetscBool device2host) 338 { 339 PetscErrorCode ierr; 340 341 PetscFunctionBegin; 342 if (link->leafmtype == PETSC_MEMTYPE_DEVICE && (link->leafmtype_mpi != link->leafmtype) && sf->leafbuflen[PETSCSF_REMOTE]) { 343 void *h_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]; 344 void *d_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE]; 345 size_t count = sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes; 346 if (device2host) { 347 ierr = PetscSFLinkMemcpy(sf,link,PETSC_MEMTYPE_HOST,h_buf,PETSC_MEMTYPE_DEVICE,d_buf,count);CHKERRQ(ierr); 348 ierr = PetscLogGpuToCpu(count);CHKERRQ(ierr); 349 } else { 350 ierr = PetscSFLinkMemcpy(sf,link,PETSC_MEMTYPE_DEVICE,d_buf,PETSC_MEMTYPE_HOST,h_buf,count);CHKERRQ(ierr); 351 ierr = PetscLogCpuToGpu(count);CHKERRQ(ierr); 352 } 353 } 354 PetscFunctionReturn(0); 355 } 356 357 #else /* Host only */ 358 #define PetscSFLinkSyncDeviceBeforePackData(a,b) 0 359 #define PetscSFLinkSyncStreamAfterPackRootData(a,b) 0 360 #define PetscSFLinkSyncStreamAfterPackLeafData(a,b) 0 361 #define PetscSFLinkSyncStreamAfterUnpackRootData(a,b) 0 362 #define PetscSFLinkSyncStreamAfterUnpackLeafData(a,b) 0 363 #define PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(a,b,c) 0 364 #define PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(a,b,c) 0 365 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkMemcpy(PetscSF sf,PetscSFLink link,PetscMemType dstmtype,void* dst,PetscMemType srcmtype,const void*src,size_t n) 366 { 367 PetscFunctionBegin; 368 if (n) {PetscErrorCode ierr = PetscMemcpy(dst,src,n);CHKERRQ(ierr);} 369 PetscFunctionReturn(0); 370 } 371 #endif 372 373 /* Get root indices used for pack/unpack 374 375 Input arguments: 376 +sf - StarForest 377 .link - The link, which provides the stream for the async memcpy (In SF, we make all GPU operations asynchronous to avoid unexpected pipeline stalls) 378 .scope - Which part of the indices? (PETSCSF_LOCAL or PETSCSF_REMOTE) 379 .mtype - In what type of memory? (PETSC_MEMTYPE_DEVICE or PETSC_MEMTYPE_HOST) 380 381 Output arguments: 382 +count - Count of indices 383 .start - The first index (only useful when indices is NULL) 384 -indices - indices of roots for pack/unpack. NULL means indices are contiguous 385 */ 386 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkGetRootPackOptAndIndices(PetscSF sf,PetscSFLink link,PetscMemType mtype,PetscSFScope scope,PetscInt *count,PetscInt *start,PetscSFPackOpt *opt,const PetscInt **indices) 387 { 388 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 389 PetscInt offset; 390 391 PetscFunctionBegin; 392 *count = bas->rootbuflen[scope]; 393 *start = bas->rootstart[scope]; 394 *opt = NULL; 395 *indices = NULL; 396 397 /* We have these rules: 398 1) opt == NULL && indices == NULL ==> indices are contiguous. 399 2) opt != NULL ==> indices are in 3D but not contiguous. On host, indices != NULL since indices are already available and we do not 400 want to enforce all operations to use opt; but on device, indices = NULL since we do not want to copy indices to device. 401 */ 402 if (!bas->rootcontig[scope]) { 403 offset = (scope == PETSCSF_LOCAL)? 0 : bas->ioffset[bas->ndiranks]; 404 if (mtype == PETSC_MEMTYPE_HOST) {*opt = bas->rootpackopt[scope]; *indices = bas->irootloc + offset;} 405 #if defined(PETSC_HAVE_DEVICE) 406 else { 407 PetscErrorCode ierr; 408 size_t size; 409 if (bas->rootpackopt[scope]) { 410 if (!bas->rootpackopt_d[scope]) { 411 ierr = PetscMalloc1(1,&bas->rootpackopt_d[scope]);CHKERRQ(ierr); 412 ierr = PetscArraycpy(bas->rootpackopt_d[scope],bas->rootpackopt[scope],1);CHKERRQ(ierr); /* Make pointers in bas->rootpackopt_d[] still work on host */ 413 size = (bas->rootpackopt[scope]->n*7+2)*sizeof(PetscInt); /* See comments at struct _n_PetscSFPackOpt*/ 414 ierr = PetscSFMalloc(PETSC_MEMTYPE_DEVICE,size,(void **)&bas->rootpackopt_d[scope]->array);CHKERRQ(ierr); 415 ierr = PetscSFLinkMemcpy(sf,link,PETSC_MEMTYPE_DEVICE,bas->rootpackopt_d[scope]->array,PETSC_MEMTYPE_HOST,bas->rootpackopt[scope]->array,size);CHKERRQ(ierr); 416 } 417 *opt = bas->rootpackopt_d[scope]; 418 } else { /* On device, we only provide indices when there is no optimization. We're reluctant to copy indices to device. */ 419 if (!bas->irootloc_d[scope]) { 420 size = bas->rootbuflen[scope]*sizeof(PetscInt); 421 ierr = PetscSFMalloc(PETSC_MEMTYPE_DEVICE,size,(void **)&bas->irootloc_d[scope]);CHKERRQ(ierr); 422 ierr = PetscSFLinkMemcpy(sf,link,PETSC_MEMTYPE_DEVICE,bas->irootloc_d[scope],PETSC_MEMTYPE_HOST,bas->irootloc+offset,size);CHKERRQ(ierr); 423 } 424 *indices = bas->irootloc_d[scope]; 425 } 426 } 427 #endif 428 } 429 PetscFunctionReturn(0); 430 } 431 432 /* Get leaf indices used for pack/unpack 433 434 See also PetscSFLinkGetRootPackOptAndIndices() 435 */ 436 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkGetLeafPackOptAndIndices(PetscSF sf,PetscSFLink link,PetscMemType mtype,PetscSFScope scope,PetscInt *count,PetscInt *start,PetscSFPackOpt *opt,const PetscInt **indices) 437 { 438 PetscInt offset; 439 440 PetscFunctionBegin; 441 *count = sf->leafbuflen[scope]; 442 *start = sf->leafstart[scope]; 443 *opt = NULL; 444 *indices = NULL; 445 if (!sf->leafcontig[scope]) { 446 offset = (scope == PETSCSF_LOCAL)? 0 : sf->roffset[sf->ndranks]; 447 if (mtype == PETSC_MEMTYPE_HOST) {*opt = sf->leafpackopt[scope]; *indices = sf->rmine + offset;} 448 #if defined(PETSC_HAVE_DEVICE) 449 else { 450 PetscErrorCode ierr; 451 size_t size; 452 if (sf->leafpackopt[scope]) { 453 if (!sf->leafpackopt_d[scope]) { 454 ierr = PetscMalloc1(1,&sf->leafpackopt_d[scope]);CHKERRQ(ierr); 455 ierr = PetscArraycpy(sf->leafpackopt_d[scope],sf->leafpackopt[scope],1);CHKERRQ(ierr); 456 size = (sf->leafpackopt[scope]->n*7+2)*sizeof(PetscInt); /* See comments at struct _n_PetscSFPackOpt*/ 457 ierr = PetscSFMalloc(PETSC_MEMTYPE_DEVICE,size,(void **)&sf->leafpackopt_d[scope]->array);CHKERRQ(ierr); /* Change ->array to a device pointer */ 458 ierr = PetscSFLinkMemcpy(sf,link,PETSC_MEMTYPE_DEVICE,sf->leafpackopt_d[scope]->array,PETSC_MEMTYPE_HOST,sf->leafpackopt[scope]->array,size);CHKERRQ(ierr); 459 } 460 *opt = sf->leafpackopt_d[scope]; 461 } else { 462 if (!sf->rmine_d[scope]) { 463 size = sf->leafbuflen[scope]*sizeof(PetscInt); 464 ierr = PetscSFMalloc(PETSC_MEMTYPE_DEVICE,size,(void **)&sf->rmine_d[scope]);CHKERRQ(ierr); 465 ierr = PetscSFLinkMemcpy(sf,link,PETSC_MEMTYPE_DEVICE,sf->rmine_d[scope],PETSC_MEMTYPE_HOST,sf->rmine+offset,size);CHKERRQ(ierr); 466 } 467 *indices = sf->rmine_d[scope]; 468 } 469 } 470 #endif 471 } 472 PetscFunctionReturn(0); 473 } 474 #endif 475