1 #if !defined(__SFPACK_H) 2 #define __SFPACK_H 3 4 #include <../src/vec/is/sf/impls/basic/sfbasic.h> 5 6 /* We separate SF communications for SFBasic and SFNeighbor in two parts: local (self,intra-rank) and remote (inter-rank) */ 7 typedef enum {PETSCSF_LOCAL=0, PETSCSF_REMOTE} PetscSFScope; 8 9 /* Optimizations in packing & unpacking for destination ranks. 10 11 Suppose there are m indices stored in idx[], and two addresses u, p. We want to do packing: 12 p[i] = u[idx[i]], for i in [0,m) 13 14 Indices are associated with n ranks and each rank's indices are stored consecutively in idx[]. 15 We go through indices for each rank and see if they are indices of a 3D submatrix of size [dx,dy,dz] in 16 a parent matrix of size [X,Y,Z], with the submatrix's first index being <start>. 17 18 E.g., for indices 1,2,3, 6,7,8, 11,12,13, the submatrix size is [3,3,1] with start=1, and the parent matrix's size 19 is [5,3,1]. For simplicity, if any destination rank does not have this pattern, we give up the optimization. 20 21 Note before using this per-rank optimization, one should check leafcontig[], rootcontig[], which say 22 indices in whole are contiguous, and therefore much more useful than this one when true. 23 */ 24 struct _n_PetscSFPackOpt { 25 PetscInt *array; /* [7*n+2] Memory pool for other fields in this struct. Used to easily copy this struct to GPU */ 26 PetscInt n; /* Number of destination ranks */ 27 PetscInt *offset; /* [n+1] Offsets of indices for each rank. offset[0]=0, offset[i+1]=offset[i]+dx[i]*dy[i]*dz[i] */ 28 PetscInt *start; /* [n] First index */ 29 PetscInt *dx,*dy,*dz; /* [n] Lengths of the submatrix in X, Y, Z dimension. */ 30 PetscInt *X,*Y; /* [n] Lengths of the outer matrix in X, Y. We do not care Z. */ 31 }; 32 33 /* An abstract class that defines a communication link, which includes how to pack/unpack data and send/recv buffers 34 */ 35 struct _n_PetscSFLink { 36 PetscErrorCode (*h_Pack) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*); 37 PetscErrorCode (*h_UnpackAndInsert) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 38 PetscErrorCode (*h_UnpackAndAdd) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 39 PetscErrorCode (*h_UnpackAndMin) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 40 PetscErrorCode (*h_UnpackAndMax) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 41 PetscErrorCode (*h_UnpackAndMinloc) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 42 PetscErrorCode (*h_UnpackAndMaxloc) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 43 PetscErrorCode (*h_UnpackAndMult) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 44 PetscErrorCode (*h_UnpackAndLAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 45 PetscErrorCode (*h_UnpackAndBAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 46 PetscErrorCode (*h_UnpackAndLOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 47 PetscErrorCode (*h_UnpackAndBOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 48 PetscErrorCode (*h_UnpackAndLXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 49 PetscErrorCode (*h_UnpackAndBXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 50 PetscErrorCode (*h_FetchAndAdd) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*, void*); 51 52 PetscErrorCode (*h_ScatterAndInsert)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 53 PetscErrorCode (*h_ScatterAndAdd) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 54 PetscErrorCode (*h_ScatterAndMin) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 55 PetscErrorCode (*h_ScatterAndMax) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 56 PetscErrorCode (*h_ScatterAndMinloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 57 PetscErrorCode (*h_ScatterAndMaxloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 58 PetscErrorCode (*h_ScatterAndMult) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 59 PetscErrorCode (*h_ScatterAndLAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 60 PetscErrorCode (*h_ScatterAndBAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 61 PetscErrorCode (*h_ScatterAndLOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 62 PetscErrorCode (*h_ScatterAndBOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 63 PetscErrorCode (*h_ScatterAndLXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 64 PetscErrorCode (*h_ScatterAndBXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 65 66 PetscErrorCode (*h_FetchAndAddLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*); 67 68 PetscBool deviceinited; /* Are device related fields initialized? */ 69 #if defined(PETSC_HAVE_CUDA) 70 /* These fields are lazily initialized in a sense that only when device pointers are passed to an SF, the SF 71 will set them, otherwise it just leaves them alone even though PETSC_HAVE_CUDA. Packing routines using 72 regular ops when there are no data race chances. 73 */ 74 PetscErrorCode (*d_Pack) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*); 75 PetscErrorCode (*d_UnpackAndInsert) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 76 PetscErrorCode (*d_UnpackAndAdd) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 77 PetscErrorCode (*d_UnpackAndMin) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 78 PetscErrorCode (*d_UnpackAndMax) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 79 PetscErrorCode (*d_UnpackAndMinloc) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 80 PetscErrorCode (*d_UnpackAndMaxloc) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 81 PetscErrorCode (*d_UnpackAndMult) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 82 PetscErrorCode (*d_UnpackAndLAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 83 PetscErrorCode (*d_UnpackAndBAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 84 PetscErrorCode (*d_UnpackAndLOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 85 PetscErrorCode (*d_UnpackAndBOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 86 PetscErrorCode (*d_UnpackAndLXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 87 PetscErrorCode (*d_UnpackAndBXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 88 PetscErrorCode (*d_FetchAndAdd) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*, void*); 89 90 PetscErrorCode (*d_ScatterAndInsert)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 91 PetscErrorCode (*d_ScatterAndAdd) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 92 PetscErrorCode (*d_ScatterAndMin) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 93 PetscErrorCode (*d_ScatterAndMax) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 94 PetscErrorCode (*d_ScatterAndMinloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 95 PetscErrorCode (*d_ScatterAndMaxloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 96 PetscErrorCode (*d_ScatterAndMult) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 97 PetscErrorCode (*d_ScatterAndLAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 98 PetscErrorCode (*d_ScatterAndBAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 99 PetscErrorCode (*d_ScatterAndLOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 100 PetscErrorCode (*d_ScatterAndBOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 101 PetscErrorCode (*d_ScatterAndLXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 102 PetscErrorCode (*d_ScatterAndBXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 103 PetscErrorCode (*d_FetchAndAddLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*); 104 105 /* Packing routines using atomics when there are data race chances */ 106 PetscErrorCode (*da_UnpackAndInsert)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 107 PetscErrorCode (*da_UnpackAndAdd) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 108 PetscErrorCode (*da_UnpackAndMin) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 109 PetscErrorCode (*da_UnpackAndMax) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 110 PetscErrorCode (*da_UnpackAndMinloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 111 PetscErrorCode (*da_UnpackAndMaxloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 112 PetscErrorCode (*da_UnpackAndMult) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 113 PetscErrorCode (*da_UnpackAndLAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 114 PetscErrorCode (*da_UnpackAndBAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 115 PetscErrorCode (*da_UnpackAndLOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 116 PetscErrorCode (*da_UnpackAndBOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 117 PetscErrorCode (*da_UnpackAndLXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 118 PetscErrorCode (*da_UnpackAndBXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 119 PetscErrorCode (*da_FetchAndAdd) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*, void*); 120 121 PetscErrorCode (*da_ScatterAndInsert)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 122 PetscErrorCode (*da_ScatterAndAdd) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 123 PetscErrorCode (*da_ScatterAndMin) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 124 PetscErrorCode (*da_ScatterAndMax) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 125 PetscErrorCode (*da_ScatterAndMinloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 126 PetscErrorCode (*da_ScatterAndMaxloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 127 PetscErrorCode (*da_ScatterAndMult) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 128 PetscErrorCode (*da_ScatterAndLAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 129 PetscErrorCode (*da_ScatterAndBAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 130 PetscErrorCode (*da_ScatterAndLOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 131 PetscErrorCode (*da_ScatterAndBOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 132 PetscErrorCode (*da_ScatterAndLXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 133 PetscErrorCode (*da_ScatterAndBXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 134 PetscErrorCode (*da_FetchAndAddLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*); 135 136 PetscInt maxResidentThreadsPerGPU; /* It is a copy from SF for convenience */ 137 cudaStream_t stream; /* Stream to launch pack/unapck kernels if not using the default stream */ 138 #endif 139 PetscMPIInt tag; /* Each link has a tag so we can perform multiple SF ops at the same time */ 140 MPI_Datatype unit; /* The MPI datatype this PetscSFLink is built for */ 141 MPI_Datatype basicunit; /* unit is made of MPI builtin dataype basicunit */ 142 PetscBool isbuiltin; /* Is unit an MPI/PETSc builtin datatype? If it is true, then bs=1 and basicunit is equivalent to unit */ 143 size_t unitbytes; /* Number of bytes in a unit */ 144 PetscInt bs; /* Number of basic units in a unit */ 145 const void *rootdata,*leafdata; /* rootdata and leafdata the link is working on. They are used as keys for pending links. */ 146 PetscMemType rootmtype,leafmtype; /* root/leafdata's memory type */ 147 148 /* For local and remote communication */ 149 PetscMemType rootmtype_mpi,leafmtype_mpi; /* Mtypes of buffers passed to MPI. If use_gpu_aware_mpi, they are same as root/leafmtype. Otherwise they are PETSC_MEMTYPE_HOST */ 150 PetscBool rootdirect[2],leafdirect[2]; /* Can root/leafdata be directly passed to SF (i.e., without buffering). In layout of [PETSCSF_LOCAL/REMOTE]. See more in PetscSFLinkCreate() */ 151 PetscInt rootdirect_mpi,leafdirect_mpi;/* Can root/leafdata for remote be directly passed to MPI? 1: yes, 0: no. See more in PetscSFLinkCreate() */ 152 const void *rootdatadirect[2][2]; /* The root/leafdata used to init root/leaf requests, in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE]. */ 153 const void *leafdatadirect[2][2]; /* ... We need them to look up links when root/leafdirect_mpi are true */ 154 char *rootbuf[2][2]; /* Buffers for packed roots, in layout of [PETSCSF_LOCAL/REMOTE][PETSC_MEMTYPE] */ 155 char *rootbuf_alloc[2][2]; /* Log memory allocated by petsc. We need it since rootbuf[][] may point to rootdata given by user */ 156 char *leafbuf[2][2]; /* Buffers for packed leaves, in layout of [PETSCSF_LOCAL/REMOTE][PETSC_MEMTYPE] */ 157 char *leafbuf_alloc[2][2]; 158 MPI_Request *rootreqs[2][2][2]; /* Root requests in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][rootdirect_mpi] */ 159 MPI_Request *leafreqs[2][2][2]; /* Leaf requests in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][leafdirect_mpi] */ 160 PetscBool rootreqsinited[2][2][2]; /* Are root requests initialized? Also in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][rootdirect_mpi]*/ 161 PetscBool leafreqsinited[2][2][2]; /* Are leaf requests initialized? Also in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][leafdirect_mpi]*/ 162 MPI_Request *reqs; /* An array of length (nrootreqs+nleafreqs)*8. Pointers in rootreqs[][][] and leafreqs[][][] point here */ 163 PetscSFLink next; 164 }; 165 166 #if defined(PETSC_USE_DEBUG) 167 PETSC_INTERN PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF,MPI_Datatype,const void*,const void*); 168 #else 169 #define PetscSFSetErrorOnUnsupportedOverlap(a,b,c,d) 0 170 #endif 171 172 /* Create/setup/retrieve/destroy a link */ 173 PETSC_INTERN PetscErrorCode PetscSFLinkCreate(PetscSF,MPI_Datatype,PetscMemType,const void*,PetscMemType,const void*,MPI_Op,PetscSFOperation,PetscSFLink*); 174 PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_Host(PetscSF,PetscSFLink,MPI_Datatype); 175 #if defined(PETSC_HAVE_CUDA) 176 PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_Device(PetscSF,PetscSFLink,MPI_Datatype); 177 #else 178 #define PetscSFLinkSetUp_Device(a,b,c) 0 179 #endif 180 PETSC_INTERN PetscErrorCode PetscSFLinkGetInUse(PetscSF,MPI_Datatype,const void*,const void*,PetscCopyMode,PetscSFLink*); 181 PETSC_INTERN PetscErrorCode PetscSFLinkReclaim(PetscSF,PetscSFLink*); 182 PETSC_INTERN PetscErrorCode PetscSFLinkDestroy(PetscSF,PetscSFLink*); 183 184 /* Get pack/unpack function pointers from a link */ 185 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkGetPack(PetscSFLink link,PetscMemType mtype,PetscErrorCode (**Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*)) 186 { 187 PetscFunctionBegin; 188 if (mtype == PETSC_MEMTYPE_HOST) *Pack = link->h_Pack; 189 #if defined(PETSC_HAVE_CUDA) 190 else *Pack = link->d_Pack; 191 #endif 192 PetscFunctionReturn(0); 193 } 194 PETSC_INTERN PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*)); 195 PETSC_INTERN PetscErrorCode PetscSFLinkGetFetchAndOp (PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**FetchAndOp) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*)); 196 PETSC_INTERN PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*)); 197 PETSC_INTERN PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*)); 198 PETSC_INTERN PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF,PetscSFLink,PetscSFDirection,void**,void**,MPI_Request**,MPI_Request**); 199 200 /* Do Pack/Unpack/Fetch/Scatter with the link */ 201 PETSC_INTERN PetscErrorCode PetscSFLinkPackRootData (PetscSF,PetscSFLink,PetscSFScope,const void*); 202 PETSC_INTERN PetscErrorCode PetscSFLinkPackLeafData (PetscSF,PetscSFLink,PetscSFScope,const void*); 203 PETSC_INTERN PetscErrorCode PetscSFLinkUnpackRootData(PetscSF,PetscSFLink,PetscSFScope,void*,MPI_Op); 204 PETSC_INTERN PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF,PetscSFLink,PetscSFScope,void*,MPI_Op); 205 PETSC_INTERN PetscErrorCode PetscSFLinkFetchRootData (PetscSF,PetscSFLink,PetscSFScope,void*,MPI_Op); 206 207 PETSC_INTERN PetscErrorCode PetscSFLinkBcastAndOpLocal(PetscSF,PetscSFLink,const void*,void*,MPI_Op); 208 PETSC_INTERN PetscErrorCode PetscSFLinkReduceLocal(PetscSF,PetscSFLink,const void*,void*,MPI_Op); 209 PETSC_INTERN PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF,PetscSFLink,void*,const void*,void*,MPI_Op); 210 211 PETSC_INTERN PetscErrorCode PetscSFSetUpPackFields(PetscSF sf); 212 PETSC_INTERN PetscErrorCode PetscSFResetPackFields(PetscSF sf); 213 214 /* Get root indices used for pack/unpack 215 216 Input arguments: 217 +sf - StarForest 218 .link - The link, which provides the stream for the async memcpy (In SF, we make all GPU operations asynchronous to avoid unexpected pipeline stalls) 219 .scope - Which part of the indices? (PETSCSF_LOCAL or PETSCSF_REMOTE) 220 .mtype - In what type of memory? (PETSC_MEMTYPE_DEVICE or PETSC_MEMTYPE_HOST) 221 222 Output arguments: 223 +count - Count of indices 224 .start - The first index (only useful when indices is NULL) 225 -indices - indices of roots for pack/unpack. NULL means indices are contiguous 226 */ 227 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkGetRootPackOptAndIndices(PetscSF sf,PetscSFLink link,PetscMemType mtype,PetscSFScope scope,PetscInt *count,PetscInt *start,PetscSFPackOpt *opt,const PetscInt **indices) 228 { 229 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 230 PetscInt offset; 231 232 PetscFunctionBegin; 233 *count = bas->rootbuflen[scope]; 234 *start = bas->rootstart[scope]; 235 *opt = NULL; 236 *indices = NULL; 237 238 /* We have these rules: 239 1) opt == NULL && indices == NULL ==> indices are contiguous. 240 2) opt != NULL ==> indices are in 3D but not contiguous. On host, indices != NULL since indices are already available and we do not 241 want to enforce all operations to use opt; but on device, indices = NULL since we do not want to copy indices to device. 242 */ 243 if (!bas->rootcontig[scope]) { 244 offset = (scope == PETSCSF_LOCAL)? 0 : bas->ioffset[bas->ndiranks]; 245 if (mtype == PETSC_MEMTYPE_HOST) {*opt = bas->rootpackopt[scope]; *indices = bas->irootloc + offset;} 246 #if defined(PETSC_HAVE_CUDA) 247 else { 248 PetscErrorCode ierr; 249 cudaError_t cerr; 250 size_t size; 251 if (bas->rootpackopt[scope]) { 252 if (!bas->rootpackopt_d[scope]) { 253 ierr = PetscMalloc1(1,&bas->rootpackopt_d[scope]);CHKERRQ(ierr); 254 ierr = PetscArraycpy(bas->rootpackopt_d[scope],bas->rootpackopt[scope],1);CHKERRQ(ierr); /* Make pointers in bas->rootpackopt_d[] still work on host */ 255 size = (bas->rootpackopt[scope]->n*7+2)*sizeof(PetscInt); /* See comments at struct _n_PetscSFPackOpt*/ 256 cerr = cudaMalloc((void **)&bas->rootpackopt_d[scope]->array,size);CHKERRCUDA(cerr); 257 cerr = cudaMemcpyAsync(bas->rootpackopt_d[scope]->array,bas->rootpackopt[scope]->array,size,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr); 258 } 259 *opt = bas->rootpackopt_d[scope]; 260 } else { /* On device, we only provide indices when there is no optimization. We're reluctant to copy indices to device. */ 261 if (!bas->irootloc_d[scope]) { 262 size = bas->rootbuflen[scope]*sizeof(PetscInt); 263 cerr = cudaMalloc((void **)&bas->irootloc_d[scope],size);CHKERRCUDA(cerr); 264 cerr = cudaMemcpyAsync(bas->irootloc_d[scope],bas->irootloc+offset,size,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr); 265 } 266 *indices = bas->irootloc_d[scope]; 267 } 268 } 269 #endif 270 } 271 PetscFunctionReturn(0); 272 } 273 274 /* Get leaf indices used for pack/unpack 275 276 See also PetscSFLinkGetRootPackOptAndIndices() 277 */ 278 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkGetLeafPackOptAndIndices(PetscSF sf,PetscSFLink link,PetscMemType mtype,PetscSFScope scope,PetscInt *count,PetscInt *start,PetscSFPackOpt *opt,const PetscInt **indices) 279 { 280 PetscInt offset; 281 282 PetscFunctionBegin; 283 *count = sf->leafbuflen[scope]; 284 *start = sf->leafstart[scope]; 285 *opt = NULL; 286 *indices = NULL; 287 if (!sf->leafcontig[scope]) { 288 offset = (scope == PETSCSF_LOCAL)? 0 : sf->roffset[sf->ndranks]; 289 if (mtype == PETSC_MEMTYPE_HOST) {*opt = sf->leafpackopt[scope]; *indices = sf->rmine + offset;} 290 #if defined(PETSC_HAVE_CUDA) 291 else { 292 PetscErrorCode ierr; 293 cudaError_t cerr; 294 size_t size; 295 if (sf->leafpackopt[scope]) { 296 if (!sf->leafpackopt_d[scope]) { 297 ierr = PetscMalloc1(1,&sf->leafpackopt_d[scope]);CHKERRQ(ierr); 298 ierr = PetscArraycpy(sf->leafpackopt_d[scope],sf->leafpackopt[scope],1);CHKERRQ(ierr); 299 size = (sf->leafpackopt[scope]->n*7+2)*sizeof(PetscInt); /* See comments at struct _n_PetscSFPackOpt*/ 300 cerr = cudaMalloc((void **)&sf->leafpackopt_d[scope]->array,size);CHKERRCUDA(cerr); /* Change ->array to a device pointer */ 301 cerr = cudaMemcpyAsync(sf->leafpackopt_d[scope]->array,sf->leafpackopt[scope]->array,size,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr); 302 } 303 *opt = sf->leafpackopt_d[scope]; 304 } else { 305 if (!sf->rmine_d[scope]) { 306 size = sf->leafbuflen[scope]*sizeof(PetscInt); 307 cerr = cudaMalloc((void **)&sf->rmine_d[scope],size);CHKERRCUDA(cerr); 308 cerr = cudaMemcpyAsync(sf->rmine_d[scope],sf->rmine+offset,size,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr); 309 } 310 *indices = sf->rmine_d[scope]; 311 } 312 } 313 #endif 314 } 315 PetscFunctionReturn(0); 316 } 317 318 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkMPIWaitall(PetscSF sf,PetscSFLink link,PetscSFDirection direction) 319 { 320 PetscErrorCode ierr; 321 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 322 const PetscMemType rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; 323 const PetscInt rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi; 324 325 PetscFunctionBegin; 326 ierr = MPI_Waitall(bas->nrootreqs,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi],MPI_STATUSES_IGNORE);CHKERRQ(ierr); 327 ierr = MPI_Waitall(sf->nleafreqs, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi],MPI_STATUSES_IGNORE);CHKERRQ(ierr); 328 PetscFunctionReturn(0); 329 } 330 331 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkMemcpy(PetscSF sf,PetscSFLink link,PetscMemType dstmtype,void* dst,PetscMemType srcmtype,const void*src,size_t n) 332 { 333 PetscFunctionBegin; 334 if (n) { 335 if (dstmtype == PETSC_MEMTYPE_HOST && srcmtype == PETSC_MEMTYPE_HOST) {PetscErrorCode ierr = PetscMemcpy(dst,src,n);CHKERRQ(ierr);} 336 #if defined(PETSC_HAVE_CUDA) 337 else if (dstmtype == PETSC_MEMTYPE_DEVICE && srcmtype == PETSC_MEMTYPE_HOST) { 338 cudaError_t err = cudaMemcpyAsync(dst,src,n,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(err); 339 PetscErrorCode ierr = PetscLogCpuToGpu(n);CHKERRQ(ierr); 340 } else if (dstmtype == PETSC_MEMTYPE_HOST && srcmtype == PETSC_MEMTYPE_DEVICE) { 341 cudaError_t err = cudaMemcpyAsync(dst,src,n,cudaMemcpyDeviceToHost,link->stream);CHKERRCUDA(err); 342 PetscErrorCode ierr = PetscLogGpuToCpu(n);CHKERRQ(ierr); 343 } else if (dstmtype == PETSC_MEMTYPE_DEVICE && srcmtype == PETSC_MEMTYPE_DEVICE) {cudaError_t err = cudaMemcpyAsync(dst,src,n,cudaMemcpyDeviceToDevice,link->stream);CHKERRCUDA(err);} 344 #endif 345 else SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType for dst %d and src %d",(int)dstmtype,(int)srcmtype); 346 } 347 PetscFunctionReturn(0); 348 } 349 #endif 350