1 #if !defined(__SFPACK_H) 2 #define __SFPACK_H 3 4 #include <petsc/private/sfimpl.h> /*I "petscsf.h" I*/ 5 6 /* Optimization plans in packing & unpacking for destination ranks. 7 8 Suppose there are count indices stored in idx[], and two addresses u, p. We want to do packing: 9 p[i] = u[idx[i]], for i in [0,count) 10 11 Often, the indices are associated with n ranks. Each rank's indices are stored consecutively in idx[]. 12 We analyze indices for each rank and see if they are patterns that can be used to optimize the packing. 13 The result is stored in PetscSFPackOpt. Packing for a rank might be non-optimizable, or optimized into 14 a small number of contiguous memory copies or one strided memory copy. 15 */ 16 typedef enum {PETSCSF_PACKOPT_NONE=0, PETSCSF_PACKOPT_MULTICOPY, PETSCSF_PACKOPT_STRIDE} PetscSFPackOptType; 17 18 struct _n_PetscSFPackOpt { 19 PetscInt n; /* Number of destination ranks */ 20 PetscSFPackOptType *type; /* [n] Optimization types for the n ranks */ 21 PetscInt *offset; /* [n+1] Indices for i-th rank are in [offset[i],offset[i+1]) of idx[] */ 22 PetscInt *copy_offset; /* [n+1] If type[i] = PETSCSF_PACKOPT_MULTICOPY, packing for i-th rank is optimized into copies numbered between [copy_offset[i],copy_offset[i+1]) */ 23 PetscInt *copy_start; /* [*] j-th copy starts at copy_start[j] in idx[]. In other words, there are copy_length[j] contiguous indices */ 24 PetscInt *copy_length; /* [*] starting at idx[copy_start[j]] */ 25 PetscInt *stride_step; /* [n] If type[i] = PETSCSF_PACKOPT_STRIDE, then packing for i-th rank is strided, with first index being idx[offset[i]] and step stride_step[i], */ 26 PetscInt *stride_n; /* [n] and total stride_n[i] steps */ 27 }; 28 29 typedef struct _n_PetscSFPack* PetscSFPack; 30 31 /* An abstract class that defines a communication link, which includes how to pack/unpack data and send/recv buffers 32 */ 33 struct _n_PetscSFPack { 34 PetscErrorCode (*h_Pack) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,const void*,void*); 35 PetscErrorCode (*h_UnpackAndInsert) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*); 36 PetscErrorCode (*h_UnpackAndAdd) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*); 37 PetscErrorCode (*h_UnpackAndMin) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*); 38 PetscErrorCode (*h_UnpackAndMax) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*); 39 PetscErrorCode (*h_UnpackAndMinloc) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*); 40 PetscErrorCode (*h_UnpackAndMaxloc) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*); 41 PetscErrorCode (*h_UnpackAndMult) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*); 42 PetscErrorCode (*h_UnpackAndLAND) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*); 43 PetscErrorCode (*h_UnpackAndBAND) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*); 44 PetscErrorCode (*h_UnpackAndLOR) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*); 45 PetscErrorCode (*h_UnpackAndBOR) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*); 46 PetscErrorCode (*h_UnpackAndLXOR) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*); 47 PetscErrorCode (*h_UnpackAndBXOR) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*); 48 PetscErrorCode (*h_FetchAndInsert) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*, void*); 49 PetscErrorCode (*h_FetchAndAdd) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*, void*); 50 PetscErrorCode (*h_FetchAndMin) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*, void*); 51 PetscErrorCode (*h_FetchAndMax) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*, void*); 52 PetscErrorCode (*h_FetchAndMinloc) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*, void*); 53 PetscErrorCode (*h_FetchAndMaxloc) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*, void*); 54 PetscErrorCode (*h_FetchAndMult) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*, void*); 55 PetscErrorCode (*h_FetchAndLAND) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*, void*); 56 PetscErrorCode (*h_FetchAndBAND) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*, void*); 57 PetscErrorCode (*h_FetchAndLOR) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*, void*); 58 PetscErrorCode (*h_FetchAndBOR) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*, void*); 59 PetscErrorCode (*h_FetchAndLXOR) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*, void*); 60 PetscErrorCode (*h_FetchAndBXOR) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*, void*); 61 #if defined(PETSC_HAVE_CUDA) 62 /* These fields are lazily initialized in a sense that only when device pointers are passed to an SF, the SF 63 will set them, otherwise it just leaves them alone even though PETSC_HAVE_CUDA. Packing routines using 64 regular ops when there are no data race chances. 65 */ 66 PetscBool deviceinited; /* Are device related fields initialized? */ 67 PetscErrorCode (*d_Pack) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,const void*,void*); 68 69 PetscErrorCode (*d_UnpackAndInsert) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*); 70 PetscErrorCode (*d_UnpackAndAdd) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*); 71 PetscErrorCode (*d_UnpackAndMin) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*); 72 PetscErrorCode (*d_UnpackAndMax) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*); 73 PetscErrorCode (*d_UnpackAndMinloc) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*); 74 PetscErrorCode (*d_UnpackAndMaxloc) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*); 75 PetscErrorCode (*d_UnpackAndMult) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*); 76 PetscErrorCode (*d_UnpackAndLAND) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*); 77 PetscErrorCode (*d_UnpackAndBAND) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*); 78 PetscErrorCode (*d_UnpackAndLOR) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*); 79 PetscErrorCode (*d_UnpackAndBOR) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*); 80 PetscErrorCode (*d_UnpackAndLXOR) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*); 81 PetscErrorCode (*d_UnpackAndBXOR) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*); 82 PetscErrorCode (*d_FetchAndInsert) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*, void*); 83 PetscErrorCode (*d_FetchAndAdd) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*, void*); 84 PetscErrorCode (*d_FetchAndMin) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*, void*); 85 PetscErrorCode (*d_FetchAndMax) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*, void*); 86 PetscErrorCode (*d_FetchAndMinloc) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*, void*); 87 PetscErrorCode (*d_FetchAndMaxloc) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*, void*); 88 PetscErrorCode (*d_FetchAndMult) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*, void*); 89 PetscErrorCode (*d_FetchAndLAND) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*, void*); 90 PetscErrorCode (*d_FetchAndBAND) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*, void*); 91 PetscErrorCode (*d_FetchAndLOR) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*, void*); 92 PetscErrorCode (*d_FetchAndBOR) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*, void*); 93 PetscErrorCode (*d_FetchAndLXOR) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*, void*); 94 PetscErrorCode (*d_FetchAndBXOR) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*, void*); 95 96 /* Packing routines using atomics when there are data race chances */ 97 PetscErrorCode (*da_UnpackAndInsert)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*); 98 PetscErrorCode (*da_UnpackAndAdd) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*); 99 PetscErrorCode (*da_UnpackAndMin) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*); 100 PetscErrorCode (*da_UnpackAndMax) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*); 101 PetscErrorCode (*da_UnpackAndMinloc)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*); 102 PetscErrorCode (*da_UnpackAndMaxloc)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*); 103 PetscErrorCode (*da_UnpackAndMult) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*); 104 PetscErrorCode (*da_UnpackAndLAND) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*); 105 PetscErrorCode (*da_UnpackAndBAND) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*); 106 PetscErrorCode (*da_UnpackAndLOR) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*); 107 PetscErrorCode (*da_UnpackAndBOR) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*); 108 PetscErrorCode (*da_UnpackAndLXOR) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*); 109 PetscErrorCode (*da_UnpackAndBXOR) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*); 110 PetscErrorCode (*da_FetchAndInsert) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*, void*); 111 PetscErrorCode (*da_FetchAndAdd) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*, void*); 112 PetscErrorCode (*da_FetchAndMin) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*, void*); 113 PetscErrorCode (*da_FetchAndMax) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*, void*); 114 PetscErrorCode (*da_FetchAndMinloc) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*, void*); 115 PetscErrorCode (*da_FetchAndMaxloc) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*, void*); 116 PetscErrorCode (*da_FetchAndMult) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*, void*); 117 PetscErrorCode (*da_FetchAndLAND) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*, void*); 118 PetscErrorCode (*da_FetchAndBAND) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*, void*); 119 PetscErrorCode (*da_FetchAndLOR) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*, void*); 120 PetscErrorCode (*da_FetchAndBOR) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*, void*); 121 PetscErrorCode (*da_FetchAndLXOR) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*, void*); 122 PetscErrorCode (*da_FetchAndBXOR) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*, void*); 123 124 PetscInt maxResidentThreadsPerGPU; /* It is a copy from SF for convenience */ 125 cudaStream_t stream; /* Stream to launch pack/unapck kernels if not using the default stream */ 126 #endif 127 PetscMPIInt tag; /* Each link has a tag so we can perform multiple SF ops at the same time */ 128 MPI_Datatype unit; /* The MPI datatype this PetscSFPack is built for */ 129 MPI_Datatype basicunit; /* unit is made of MPI builtin dataype basicunit */ 130 PetscBool isbuiltin; /* Is unit an MPI/PETSc builtin datatype? If it is true, then bs=1 and basicunit is equivalent to unit */ 131 size_t unitbytes; /* Number of bytes in a unit */ 132 PetscInt bs; /* Number of basic units in a unit */ 133 const void *rootdata,*leafdata; /* rootdata and leafdata used as keys for operation */ 134 char *rootbuf[2]; /* Buffer for packed roots on Host (0 or PETSC_MEMTYPE_HOST) or Device (1 or PETSC_MEMTYPE_DEVICE) */ 135 char *leafbuf[2]; /* Buffer for packed leaves on Host (0) or Device (1) */ 136 char *selfbuf[2]; /* Buffer for roots in self to self communication on Host (0) or Device (1) */ 137 PetscInt rootbuflen; /* Length of root buffer in <unit> */ 138 PetscInt leafbuflen; /* Length of leaf buffer in <unit> */ 139 PetscInt selfbuflen; /* Length of self buffer in <unit> */ 140 PetscMemType rootmtype; /* rootdata's memory type */ 141 PetscMemType leafmtype; /* leafdata's memory type */ 142 PetscMPIInt nrootreqs; /* Number of root requests */ 143 PetscMPIInt nleafreqs; /* Number of leaf requests */ 144 MPI_Request *rootreqs[2][2]; /* Pointers to root requests in this layout [PETSCSF_DIRECTION][PETSC_MEMTYPE] */ 145 MPI_Request *leafreqs[2][2]; /* Pointers to leaf requests in this layout [PETSCSF_DIRECTION][PETSC_MEMTYPE] */ 146 PetscBool rootreqsinited[2][2]; /* Are root requests initialized? Also in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE]*/ 147 PetscBool leafreqsinited[2][2]; /* Are leaf requests initialized? Also in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE]*/ 148 MPI_Request *reqs; /* An array of length (nrootreqs+nleafreqs)*4. Pointers in rootreqs[][] and leafreqs[][] point here */ 149 PetscSFPack next; 150 }; 151 152 PETSC_INTERN PetscErrorCode PetscSFPackGetInUse(PetscSF,MPI_Datatype,const void*,const void*,PetscCopyMode,PetscSFPack*); 153 PETSC_INTERN PetscErrorCode PetscSFPackReclaim(PetscSF,PetscSFPack*); 154 PETSC_INTERN PetscErrorCode PetscSFPackDestroyAvailable(PetscSF,PetscSFPack*); 155 156 PETSC_STATIC_INLINE PetscErrorCode PetscSFPackGetPack(PetscSFPack link,PetscMemType mtype,PetscErrorCode (**Pack)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,const void*,void*)) 157 { 158 PetscFunctionBegin; 159 *Pack = NULL; 160 if (mtype == PETSC_MEMTYPE_HOST) *Pack = link->h_Pack; 161 #if defined(PETSC_HAVE_CUDA) 162 else if (mtype == PETSC_MEMTYPE_DEVICE) *Pack = link->d_Pack; 163 #endif 164 else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType %d",(int)mtype); 165 PetscFunctionReturn(0); 166 } 167 PETSC_INTERN PetscErrorCode PetscSFPackGetUnpackAndOp(PetscSFPack,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**UnpackAndOp)(PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,const void*)); 168 PETSC_INTERN PetscErrorCode PetscSFPackGetFetchAndOp (PetscSFPack,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**FetchAndOp) (PetscInt,const PetscInt*,PetscSFPack,PetscSFPackOpt,void*,void*)); 169 PETSC_INTERN PetscErrorCode PetscSFPackSetErrorOnUnsupportedOverlap(PetscSF,MPI_Datatype,const void*,const void*); 170 171 PETSC_STATIC_INLINE PetscErrorCode PetscSFPackWaitall(PetscSFPack link,PetscSFDirection direction) 172 { 173 PetscErrorCode ierr; 174 PetscMemType rootmtype,leafmtype; 175 176 PetscFunctionBegin; 177 if (use_gpu_aware_mpi) { 178 rootmtype = link->rootmtype; 179 leafmtype = link->leafmtype; 180 } else { 181 rootmtype = PETSC_MEMTYPE_HOST; 182 leafmtype = PETSC_MEMTYPE_HOST; 183 } 184 ierr = MPI_Waitall(link->nrootreqs,link->rootreqs[direction][rootmtype],MPI_STATUSES_IGNORE);CHKERRQ(ierr); 185 ierr = MPI_Waitall(link->nleafreqs,link->leafreqs[direction][leafmtype],MPI_STATUSES_IGNORE);CHKERRQ(ierr); 186 PetscFunctionReturn(0); 187 } 188 189 PETSC_INTERN PetscErrorCode PetscSFPackSetUp_Host(PetscSF,PetscSFPack,MPI_Datatype); 190 #if defined(PETSC_HAVE_CUDA) 191 PETSC_INTERN PetscErrorCode PetscSFPackSetUp_Device(PetscSF,PetscSFPack,MPI_Datatype); 192 #endif 193 PETSC_INTERN PetscErrorCode PetscSFPackOptCreate(PetscInt,const PetscInt*,const PetscInt*,PetscSFPackOpt*); 194 PETSC_INTERN PetscErrorCode PetscSFPackOptDestroy(PetscSFPackOpt *out); 195 #endif 196