1 #if !defined(__SFPACK_H) 2 #define __SFPACK_H 3 4 #include <../src/vec/is/sf/impls/basic/sfbasic.h> 5 #if defined(PETSC_HAVE_CUDA) 6 #include <cuda_runtime.h> /* For cudaStream_t */ 7 #include <petsccublas.h> /* For CHKERRCUDA */ 8 #endif 9 10 #if defined(PETSC_HAVE_HIP) 11 #include <hip/hip_runtime.h> /* For hipStream_t */ 12 #endif 13 14 /* In terms of function overloading, long long int is a different type than int64_t, which PetscInt might be defined to. 15 We perfer long long int over PetscInt (int64_t), since CUDA atomics are built around (unsigned) long long int. 16 */ 17 typedef long long int llint; 18 typedef unsigned long long int ullint; 19 20 /* We separate SF communications for SFBasic and SFNeighbor in two parts: local (self,intra-rank) and remote (inter-rank) */ 21 typedef enum {PETSCSF_LOCAL=0, PETSCSF_REMOTE} PetscSFScope; 22 23 /* Optimizations in packing & unpacking for destination ranks. 24 25 Suppose there are m indices stored in idx[], and two addresses u, p. We want to do packing: 26 p[i] = u[idx[i]], for i in [0,m) 27 28 Indices are associated with n ranks and each rank's indices are stored consecutively in idx[]. 29 We go through indices for each rank and see if they are indices of a 3D submatrix of size [dx,dy,dz] in 30 a parent matrix of size [X,Y,Z], with the submatrix's first index being <start>. 31 32 E.g., for indices 1,2,3, 6,7,8, 11,12,13, the submatrix size is [3,3,1] with start=1, and the parent matrix's size 33 is [5,3,1]. For simplicity, if any destination rank does not have this pattern, we give up the optimization. 34 35 Note before using this per-rank optimization, one should check leafcontig[], rootcontig[], which say 36 indices in whole are contiguous, and therefore much more useful than this one when true. 37 */ 38 struct _n_PetscSFPackOpt { 39 PetscInt *array; /* [7*n+2] Memory pool for other fields in this struct. Used to easily copy this struct to GPU */ 40 PetscInt n; /* Number of destination ranks */ 41 PetscInt *offset; /* [n+1] Offsets of indices for each rank. offset[0]=0, offset[i+1]=offset[i]+dx[i]*dy[i]*dz[i] */ 42 PetscInt *start; /* [n] First index */ 43 PetscInt *dx,*dy,*dz; /* [n] Lengths of the submatrix in X, Y, Z dimension. */ 44 PetscInt *X,*Y; /* [n] Lengths of the outer matrix in X, Y. We do not care Z. */ 45 }; 46 47 /* An abstract class that defines a communication link, which includes how to pack/unpack data and send/recv buffers 48 */ 49 struct _n_PetscSFLink { 50 PetscErrorCode (*Memcpy) (PetscSFLink,PetscMemType,void*,PetscMemType,const void*,size_t); /* Asynchronous copy might use stream in the link */ 51 52 PetscErrorCode (*h_Pack) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*); 53 PetscErrorCode (*h_UnpackAndInsert) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 54 PetscErrorCode (*h_UnpackAndAdd) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 55 PetscErrorCode (*h_UnpackAndMin) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 56 PetscErrorCode (*h_UnpackAndMax) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 57 PetscErrorCode (*h_UnpackAndMinloc) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 58 PetscErrorCode (*h_UnpackAndMaxloc) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 59 PetscErrorCode (*h_UnpackAndMult) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 60 PetscErrorCode (*h_UnpackAndLAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 61 PetscErrorCode (*h_UnpackAndBAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 62 PetscErrorCode (*h_UnpackAndLOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 63 PetscErrorCode (*h_UnpackAndBOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 64 PetscErrorCode (*h_UnpackAndLXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 65 PetscErrorCode (*h_UnpackAndBXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 66 PetscErrorCode (*h_FetchAndAdd) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*, void*); 67 68 PetscErrorCode (*h_ScatterAndInsert)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 69 PetscErrorCode (*h_ScatterAndAdd) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 70 PetscErrorCode (*h_ScatterAndMin) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 71 PetscErrorCode (*h_ScatterAndMax) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 72 PetscErrorCode (*h_ScatterAndMinloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 73 PetscErrorCode (*h_ScatterAndMaxloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 74 PetscErrorCode (*h_ScatterAndMult) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 75 PetscErrorCode (*h_ScatterAndLAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 76 PetscErrorCode (*h_ScatterAndBAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 77 PetscErrorCode (*h_ScatterAndLOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 78 PetscErrorCode (*h_ScatterAndBOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 79 PetscErrorCode (*h_ScatterAndLXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 80 PetscErrorCode (*h_ScatterAndBXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 81 82 PetscErrorCode (*h_FetchAndAddLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*); 83 84 PetscBool deviceinited; /* Are device related fields initialized? */ 85 #if defined(PETSC_HAVE_DEVICE) 86 /* These fields are lazily initialized in a sense that only when device pointers are passed to an SF, the SF 87 will set them, otherwise it just leaves them alone. Packing routines using regular ops when there are no data race chances. 88 */ 89 PetscErrorCode (*d_SyncDevice) (PetscSFLink); 90 PetscErrorCode (*d_SyncStream) (PetscSFLink); 91 92 PetscErrorCode (*d_Pack) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*); 93 PetscErrorCode (*d_UnpackAndInsert) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 94 PetscErrorCode (*d_UnpackAndAdd) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 95 PetscErrorCode (*d_UnpackAndMin) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 96 PetscErrorCode (*d_UnpackAndMax) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 97 PetscErrorCode (*d_UnpackAndMinloc) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 98 PetscErrorCode (*d_UnpackAndMaxloc) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 99 PetscErrorCode (*d_UnpackAndMult) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 100 PetscErrorCode (*d_UnpackAndLAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 101 PetscErrorCode (*d_UnpackAndBAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 102 PetscErrorCode (*d_UnpackAndLOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 103 PetscErrorCode (*d_UnpackAndBOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 104 PetscErrorCode (*d_UnpackAndLXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 105 PetscErrorCode (*d_UnpackAndBXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 106 PetscErrorCode (*d_FetchAndAdd) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*, void*); 107 108 PetscErrorCode (*d_ScatterAndInsert)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 109 PetscErrorCode (*d_ScatterAndAdd) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 110 PetscErrorCode (*d_ScatterAndMin) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 111 PetscErrorCode (*d_ScatterAndMax) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 112 PetscErrorCode (*d_ScatterAndMinloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 113 PetscErrorCode (*d_ScatterAndMaxloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 114 PetscErrorCode (*d_ScatterAndMult) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 115 PetscErrorCode (*d_ScatterAndLAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 116 PetscErrorCode (*d_ScatterAndBAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 117 PetscErrorCode (*d_ScatterAndLOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 118 PetscErrorCode (*d_ScatterAndBOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 119 PetscErrorCode (*d_ScatterAndLXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 120 PetscErrorCode (*d_ScatterAndBXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 121 PetscErrorCode (*d_FetchAndAddLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*); 122 123 /* Packing routines using atomics when there are data race chances */ 124 PetscErrorCode (*da_UnpackAndInsert)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 125 PetscErrorCode (*da_UnpackAndAdd) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 126 PetscErrorCode (*da_UnpackAndMin) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 127 PetscErrorCode (*da_UnpackAndMax) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 128 PetscErrorCode (*da_UnpackAndMinloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 129 PetscErrorCode (*da_UnpackAndMaxloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 130 PetscErrorCode (*da_UnpackAndMult) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 131 PetscErrorCode (*da_UnpackAndLAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 132 PetscErrorCode (*da_UnpackAndBAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 133 PetscErrorCode (*da_UnpackAndLOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 134 PetscErrorCode (*da_UnpackAndBOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 135 PetscErrorCode (*da_UnpackAndLXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 136 PetscErrorCode (*da_UnpackAndBXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 137 PetscErrorCode (*da_FetchAndAdd) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*, void*); 138 139 PetscErrorCode (*da_ScatterAndInsert)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 140 PetscErrorCode (*da_ScatterAndAdd) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 141 PetscErrorCode (*da_ScatterAndMin) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 142 PetscErrorCode (*da_ScatterAndMax) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 143 PetscErrorCode (*da_ScatterAndMinloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 144 PetscErrorCode (*da_ScatterAndMaxloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 145 PetscErrorCode (*da_ScatterAndMult) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 146 PetscErrorCode (*da_ScatterAndLAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 147 PetscErrorCode (*da_ScatterAndBAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 148 PetscErrorCode (*da_ScatterAndLOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 149 PetscErrorCode (*da_ScatterAndBOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 150 PetscErrorCode (*da_ScatterAndLXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 151 PetscErrorCode (*da_ScatterAndBXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 152 PetscErrorCode (*da_FetchAndAddLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*); 153 #if defined (PETSC_HAVE_CUDA) 154 PetscInt maxResidentThreadsPerGPU; /* It is a copy from SF for convenience */ 155 cudaStream_t stream; /* Stream to launch pack/unapck kernels if not using the default stream */ 156 #elif defined (PETSC_HAVE_HIP) 157 hipStream_t stream; 158 #endif 159 160 PetscErrorCode (*Destroy)(PetscSFLink); /* These two fields are meant to be used by SF_Kokkos, with spptr pointing to an execution space object */ 161 void *spptr; /* for a given stream, but unused now due to a Kokkos bug, so that SF_Kokkos only supports null stream. */ 162 #endif 163 164 PetscMPIInt tag; /* Each link has a tag so we can perform multiple SF ops at the same time */ 165 MPI_Datatype unit; /* The MPI datatype this PetscSFLink is built for */ 166 MPI_Datatype basicunit; /* unit is made of MPI builtin dataype basicunit */ 167 PetscBool isbuiltin; /* Is unit an MPI/PETSc builtin datatype? If it is true, then bs=1 and basicunit is equivalent to unit */ 168 size_t unitbytes; /* Number of bytes in a unit */ 169 PetscInt bs; /* Number of basic units in a unit */ 170 const void *rootdata,*leafdata; /* rootdata and leafdata the link is working on. They are used as keys for pending links. */ 171 PetscMemType rootmtype,leafmtype; /* root/leafdata's memory type */ 172 173 /* For local and remote communication */ 174 PetscMemType rootmtype_mpi,leafmtype_mpi; /* Mtypes of buffers passed to MPI. If use_gpu_aware_mpi, they are same as root/leafmtype. Otherwise they are PETSC_MEMTYPE_HOST */ 175 PetscBool rootdirect[2],leafdirect[2]; /* Can root/leafdata be directly passed to SF (i.e., without buffering). In layout of [PETSCSF_LOCAL/REMOTE]. See more in PetscSFLinkCreate() */ 176 PetscInt rootdirect_mpi,leafdirect_mpi;/* Can root/leafdata for remote be directly passed to MPI? 1: yes, 0: no. See more in PetscSFLinkCreate() */ 177 const void *rootdatadirect[2][2]; /* The root/leafdata used to init root/leaf requests, in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE]. */ 178 const void *leafdatadirect[2][2]; /* ... We need them to look up links when root/leafdirect_mpi are true */ 179 char *rootbuf[2][2]; /* Buffers for packed roots, in layout of [PETSCSF_LOCAL/REMOTE][PETSC_MEMTYPE] */ 180 char *rootbuf_alloc[2][2]; /* Log memory allocated by petsc. We need it since rootbuf[][] may point to rootdata given by user */ 181 char *leafbuf[2][2]; /* Buffers for packed leaves, in layout of [PETSCSF_LOCAL/REMOTE][PETSC_MEMTYPE] */ 182 char *leafbuf_alloc[2][2]; 183 MPI_Request *rootreqs[2][2][2]; /* Root requests in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][rootdirect_mpi] */ 184 MPI_Request *leafreqs[2][2][2]; /* Leaf requests in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][leafdirect_mpi] */ 185 PetscBool rootreqsinited[2][2][2]; /* Are root requests initialized? Also in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][rootdirect_mpi]*/ 186 PetscBool leafreqsinited[2][2][2]; /* Are leaf requests initialized? Also in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][leafdirect_mpi]*/ 187 MPI_Request *reqs; /* An array of length (nrootreqs+nleafreqs)*8. Pointers in rootreqs[][][] and leafreqs[][][] point here */ 188 PetscSFLink next; 189 }; 190 191 PETSC_INTERN PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF,MPI_Datatype,const void*,const void*); 192 193 /* Create/setup/retrieve/destroy a link */ 194 PETSC_INTERN PetscErrorCode PetscSFLinkCreate(PetscSF,MPI_Datatype,PetscMemType,const void*,PetscMemType,const void*,MPI_Op,PetscSFOperation,PetscSFLink*); 195 PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_Host(PetscSF,PetscSFLink,MPI_Datatype); 196 PETSC_INTERN PetscErrorCode PetscSFLinkGetInUse(PetscSF,MPI_Datatype,const void*,const void*,PetscCopyMode,PetscSFLink*); 197 PETSC_INTERN PetscErrorCode PetscSFLinkReclaim(PetscSF,PetscSFLink*); 198 PETSC_INTERN PetscErrorCode PetscSFLinkDestroy(PetscSF,PetscSFLink*); 199 200 /* Get pack/unpack function pointers from a link */ 201 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkGetPack(PetscSFLink link,PetscMemType mtype,PetscErrorCode (**Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*)) 202 { 203 PetscFunctionBegin; 204 if (mtype == PETSC_MEMTYPE_HOST) *Pack = link->h_Pack; 205 #if defined(PETSC_HAVE_DEVICE) 206 else *Pack = link->d_Pack; 207 #endif 208 PetscFunctionReturn(0); 209 } 210 211 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkMPIWaitall(PetscSF sf,PetscSFLink link,PetscSFDirection direction) 212 { 213 PetscErrorCode ierr; 214 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 215 const PetscMemType rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; 216 const PetscInt rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi; 217 218 PetscFunctionBegin; 219 ierr = MPI_Waitall(bas->nrootreqs,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi],MPI_STATUSES_IGNORE);CHKERRQ(ierr); 220 ierr = MPI_Waitall(sf->nleafreqs, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi],MPI_STATUSES_IGNORE);CHKERRQ(ierr); 221 PetscFunctionReturn(0); 222 } 223 224 PETSC_INTERN PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*)); 225 PETSC_INTERN PetscErrorCode PetscSFLinkGetFetchAndOp (PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**FetchAndOp) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*)); 226 PETSC_INTERN PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*)); 227 PETSC_INTERN PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*)); 228 PETSC_INTERN PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF,PetscSFLink,PetscSFDirection,void**,void**,MPI_Request**,MPI_Request**); 229 230 /* Do Pack/Unpack/Fetch/Scatter with the link */ 231 PETSC_INTERN PetscErrorCode PetscSFLinkPackRootData (PetscSF,PetscSFLink,PetscSFScope,const void*); 232 PETSC_INTERN PetscErrorCode PetscSFLinkPackLeafData (PetscSF,PetscSFLink,PetscSFScope,const void*); 233 PETSC_INTERN PetscErrorCode PetscSFLinkUnpackRootData(PetscSF,PetscSFLink,PetscSFScope,void*,MPI_Op); 234 PETSC_INTERN PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF,PetscSFLink,PetscSFScope,void*,MPI_Op); 235 PETSC_INTERN PetscErrorCode PetscSFLinkFetchRootData (PetscSF,PetscSFLink,PetscSFScope,void*,MPI_Op); 236 237 PETSC_INTERN PetscErrorCode PetscSFLinkBcastAndOpLocal(PetscSF,PetscSFLink,const void*,void*,MPI_Op); 238 PETSC_INTERN PetscErrorCode PetscSFLinkReduceLocal(PetscSF,PetscSFLink,const void*,void*,MPI_Op); 239 PETSC_INTERN PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF,PetscSFLink,void*,const void*,void*,MPI_Op); 240 241 PETSC_INTERN PetscErrorCode PetscSFSetUpPackFields(PetscSF); 242 PETSC_INTERN PetscErrorCode PetscSFResetPackFields(PetscSF); 243 244 #if defined(PETSC_HAVE_CUDA) 245 PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_Cuda(PetscSF,PetscSFLink,MPI_Datatype); 246 #endif 247 248 #if defined(PETSC_HAVE_KOKKOS) 249 PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_Kokkos(PetscSF,PetscSFLink,MPI_Datatype); 250 #endif 251 252 /* A set of helper routines for Pack/Unpack/Scatter on GPUs */ 253 #if defined(PETSC_HAVE_DEVICE) 254 /* If SF does not know which stream root/leafdata is being computed on, it has to sync the device to 255 make sure the data is ready for packing. 256 */ 257 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncDeviceBeforePackData(PetscSF sf,PetscSFLink link) 258 { 259 PetscErrorCode ierr; 260 PetscFunctionBegin; 261 if (sf->use_default_stream) PetscFunctionReturn(0); 262 if (link->rootmtype == PETSC_MEMTYPE_DEVICE || link->leafmtype == PETSC_MEMTYPE_DEVICE) {ierr = (*link->d_SyncDevice)(link);CHKERRQ(ierr);} 263 PetscFunctionReturn(0); 264 } 265 266 /* PetscSFLinkSyncStreamAfterPackXxxData routines make sure root/leafbuf for the remote is ready for MPI */ 267 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterPackRootData(PetscSF sf,PetscSFLink link) 268 { 269 PetscErrorCode ierr; 270 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 271 272 PetscFunctionBegin; 273 /* Do nothing if we use stream aware mpi || has nothing for remote */ 274 if (sf->use_stream_aware_mpi || link->rootmtype != PETSC_MEMTYPE_DEVICE || !bas->rootbuflen[PETSCSF_REMOTE]) PetscFunctionReturn(0); 275 /* If we called a packing kernel || we async-copied rootdata from device to host || No cudaDeviceSynchronize was called (since default stream is assumed) */ 276 if (!link->rootdirect[PETSCSF_REMOTE] || !sf->use_gpu_aware_mpi || sf->use_default_stream) {ierr = (*link->d_SyncStream)(link);CHKERRQ(ierr);} 277 PetscFunctionReturn(0); 278 } 279 280 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterPackLeafData(PetscSF sf,PetscSFLink link) 281 { 282 PetscErrorCode ierr; 283 PetscFunctionBegin; 284 /* See comments above */ 285 if (sf->use_stream_aware_mpi || link->leafmtype != PETSC_MEMTYPE_DEVICE || !sf->leafbuflen[PETSCSF_REMOTE]) PetscFunctionReturn(0); 286 if (!link->leafdirect[PETSCSF_REMOTE] || !sf->use_gpu_aware_mpi || sf->use_default_stream) {ierr = (*link->d_SyncStream)(link);CHKERRQ(ierr);} 287 PetscFunctionReturn(0); 288 } 289 290 /* PetscSFLinkSyncStreamAfterUnpackXxx routines make sure root/leafdata (local & remote) is ready to use for SF callers, when SF 291 does not know which stream the callers will use. 292 */ 293 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterUnpackRootData(PetscSF sf,PetscSFLink link) 294 { 295 PetscErrorCode ierr; 296 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 297 PetscBool host2host = (link->rootmtype == PETSC_MEMTYPE_HOST) && (link->leafmtype == PETSC_MEMTYPE_HOST) ? PETSC_TRUE : PETSC_FALSE; 298 299 PetscFunctionBegin; 300 /* Do nothing if host2host OR we are allowed to asynchronously put rootdata on device through the default stream */ 301 if (host2host || (link->rootmtype == PETSC_MEMTYPE_DEVICE && sf->use_default_stream)) PetscFunctionReturn(0); 302 303 /* If rootmtype is HOST or DEVICE: 304 If we have data from local, then we called a scatter kernel (on link->stream), then we must sync it; 305 If we have data from remote && no rootdirect(i.e., we called an unpack kernel), then we must also sycn it (if rootdirect, 306 i.e., no unpack kernel after MPI, MPI guarentees rootbuf is ready to use so that we do not need the sync). 307 308 Note a tricky case is when leafmtype=DEVICE, rootmtype=HOST on uni-processor, we must sync the stream otherwise 309 CPU thread might use the yet-to-be-updated rootdata pending in the stream. 310 */ 311 if (bas->rootbuflen[PETSCSF_LOCAL] || (bas->rootbuflen[PETSCSF_REMOTE] && !link->rootdirect[PETSCSF_REMOTE])) {ierr = (*link->d_SyncStream)(link);CHKERRQ(ierr);} 312 PetscFunctionReturn(0); 313 } 314 315 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterUnpackLeafData(PetscSF sf,PetscSFLink link) 316 { 317 PetscErrorCode ierr; 318 PetscBool host2host = (link->rootmtype == PETSC_MEMTYPE_HOST) && (link->leafmtype == PETSC_MEMTYPE_HOST) ? PETSC_TRUE : PETSC_FALSE; 319 320 PetscFunctionBegin; 321 /* See comments in PetscSFLinkSyncStreamAfterUnpackRootData*/ 322 if (host2host || (link->leafmtype == PETSC_MEMTYPE_DEVICE && sf->use_default_stream)) PetscFunctionReturn(0); 323 if (sf->leafbuflen[PETSCSF_LOCAL] || (sf->leafbuflen[PETSCSF_REMOTE] && !link->leafdirect[PETSCSF_REMOTE])) {ierr = (*link->d_SyncStream)(link);CHKERRQ(ierr);} 324 PetscFunctionReturn(0); 325 } 326 327 /* PetscSFLinkCopyXxxxBufferInCaseNotUseGpuAwareMPI routines are simple: if not use_gpu_aware_mpi, we need 328 to copy the buffer from GPU to CPU before MPI calls, and from CPU to GPU after MPI calls. 329 */ 330 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(PetscSF sf,PetscSFLink link,PetscBool device2host) 331 { 332 PetscErrorCode ierr; 333 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 334 335 PetscFunctionBegin; 336 if (link->rootmtype == PETSC_MEMTYPE_DEVICE && (link->rootmtype_mpi != link->rootmtype) && bas->rootbuflen[PETSCSF_REMOTE]) { 337 void *h_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]; 338 void *d_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE]; 339 size_t count = bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes; 340 if (device2host) { 341 ierr = (*link->Memcpy)(link,PETSC_MEMTYPE_HOST,h_buf,PETSC_MEMTYPE_DEVICE,d_buf,count);CHKERRQ(ierr); 342 ierr = PetscLogGpuToCpu(count);CHKERRQ(ierr); 343 } else { 344 ierr = (*link->Memcpy)(link,PETSC_MEMTYPE_DEVICE,d_buf,PETSC_MEMTYPE_HOST,h_buf,count);CHKERRQ(ierr); 345 ierr = PetscLogCpuToGpu(count);CHKERRQ(ierr); 346 } 347 } 348 PetscFunctionReturn(0); 349 } 350 351 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(PetscSF sf,PetscSFLink link,PetscBool device2host) 352 { 353 PetscErrorCode ierr; 354 355 PetscFunctionBegin; 356 if (link->leafmtype == PETSC_MEMTYPE_DEVICE && (link->leafmtype_mpi != link->leafmtype) && sf->leafbuflen[PETSCSF_REMOTE]) { 357 void *h_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]; 358 void *d_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE]; 359 size_t count = sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes; 360 if (device2host) { 361 ierr = (*link->Memcpy)(link,PETSC_MEMTYPE_HOST,h_buf,PETSC_MEMTYPE_DEVICE,d_buf,count);CHKERRQ(ierr); 362 ierr = PetscLogGpuToCpu(count);CHKERRQ(ierr); 363 } else { 364 ierr = (*link->Memcpy)(link,PETSC_MEMTYPE_DEVICE,d_buf,PETSC_MEMTYPE_HOST,h_buf,count);CHKERRQ(ierr); 365 ierr = PetscLogCpuToGpu(count);CHKERRQ(ierr); 366 } 367 } 368 PetscFunctionReturn(0); 369 } 370 371 #else /* Host only */ 372 #define PetscSFLinkSyncDeviceBeforePackData(a,b) 0 373 #define PetscSFLinkSyncStreamAfterPackRootData(a,b) 0 374 #define PetscSFLinkSyncStreamAfterPackLeafData(a,b) 0 375 #define PetscSFLinkSyncStreamAfterUnpackRootData(a,b) 0 376 #define PetscSFLinkSyncStreamAfterUnpackLeafData(a,b) 0 377 #define PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(a,b,c) 0 378 #define PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(a,b,c) 0 379 #endif 380 381 /* Get root indices used for pack/unpack 382 383 Input arguments: 384 +sf - StarForest 385 .link - The link, which provides the stream for the async memcpy (In SF, we make all GPU operations asynchronous to avoid unexpected pipeline stalls) 386 .scope - Which part of the indices? (PETSCSF_LOCAL or PETSCSF_REMOTE) 387 .mtype - In what type of memory? (PETSC_MEMTYPE_DEVICE or PETSC_MEMTYPE_HOST) 388 389 Output arguments: 390 +count - Count of indices 391 .start - The first index (only useful when indices is NULL) 392 -indices - indices of roots for pack/unpack. NULL means indices are contiguous 393 */ 394 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkGetRootPackOptAndIndices(PetscSF sf,PetscSFLink link,PetscMemType mtype,PetscSFScope scope,PetscInt *count,PetscInt *start,PetscSFPackOpt *opt,const PetscInt **indices) 395 { 396 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 397 PetscInt offset; 398 399 PetscFunctionBegin; 400 *count = bas->rootbuflen[scope]; 401 *start = bas->rootstart[scope]; 402 *opt = NULL; 403 *indices = NULL; 404 405 /* We have these rules: 406 1) opt == NULL && indices == NULL ==> indices are contiguous. 407 2) opt != NULL ==> indices are in 3D but not contiguous. On host, indices != NULL since indices are already available and we do not 408 want to enforce all operations to use opt; but on device, indices = NULL since we do not want to copy indices to device. 409 */ 410 if (!bas->rootcontig[scope]) { 411 offset = (scope == PETSCSF_LOCAL)? 0 : bas->ioffset[bas->ndiranks]; 412 if (mtype == PETSC_MEMTYPE_HOST) {*opt = bas->rootpackopt[scope]; *indices = bas->irootloc + offset;} 413 #if defined(PETSC_HAVE_DEVICE) 414 else { 415 PetscErrorCode ierr; 416 size_t size; 417 if (bas->rootpackopt[scope]) { 418 if (!bas->rootpackopt_d[scope]) { 419 ierr = PetscMalloc1(1,&bas->rootpackopt_d[scope]);CHKERRQ(ierr); 420 ierr = PetscArraycpy(bas->rootpackopt_d[scope],bas->rootpackopt[scope],1);CHKERRQ(ierr); /* Make pointers in bas->rootpackopt_d[] still work on host */ 421 size = (bas->rootpackopt[scope]->n*7+2)*sizeof(PetscInt); /* See comments at struct _n_PetscSFPackOpt*/ 422 ierr = PetscSFMalloc(sf,PETSC_MEMTYPE_DEVICE,size,(void **)&bas->rootpackopt_d[scope]->array);CHKERRQ(ierr); 423 ierr = (*link->Memcpy)(link,PETSC_MEMTYPE_DEVICE,bas->rootpackopt_d[scope]->array,PETSC_MEMTYPE_HOST,bas->rootpackopt[scope]->array,size);CHKERRQ(ierr); 424 } 425 *opt = bas->rootpackopt_d[scope]; 426 } else { /* On device, we only provide indices when there is no optimization. We're reluctant to copy indices to device. */ 427 if (!bas->irootloc_d[scope]) { 428 size = bas->rootbuflen[scope]*sizeof(PetscInt); 429 ierr = PetscSFMalloc(sf,PETSC_MEMTYPE_DEVICE,size,(void **)&bas->irootloc_d[scope]);CHKERRQ(ierr); 430 ierr = (*link->Memcpy)(link,PETSC_MEMTYPE_DEVICE,bas->irootloc_d[scope],PETSC_MEMTYPE_HOST,bas->irootloc+offset,size);CHKERRQ(ierr); 431 } 432 *indices = bas->irootloc_d[scope]; 433 } 434 } 435 #endif 436 } 437 PetscFunctionReturn(0); 438 } 439 440 /* Get leaf indices used for pack/unpack 441 442 See also PetscSFLinkGetRootPackOptAndIndices() 443 */ 444 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkGetLeafPackOptAndIndices(PetscSF sf,PetscSFLink link,PetscMemType mtype,PetscSFScope scope,PetscInt *count,PetscInt *start,PetscSFPackOpt *opt,const PetscInt **indices) 445 { 446 PetscInt offset; 447 448 PetscFunctionBegin; 449 *count = sf->leafbuflen[scope]; 450 *start = sf->leafstart[scope]; 451 *opt = NULL; 452 *indices = NULL; 453 if (!sf->leafcontig[scope]) { 454 offset = (scope == PETSCSF_LOCAL)? 0 : sf->roffset[sf->ndranks]; 455 if (mtype == PETSC_MEMTYPE_HOST) {*opt = sf->leafpackopt[scope]; *indices = sf->rmine + offset;} 456 #if defined(PETSC_HAVE_DEVICE) 457 else { 458 PetscErrorCode ierr; 459 size_t size; 460 if (sf->leafpackopt[scope]) { 461 if (!sf->leafpackopt_d[scope]) { 462 ierr = PetscMalloc1(1,&sf->leafpackopt_d[scope]);CHKERRQ(ierr); 463 ierr = PetscArraycpy(sf->leafpackopt_d[scope],sf->leafpackopt[scope],1);CHKERRQ(ierr); 464 size = (sf->leafpackopt[scope]->n*7+2)*sizeof(PetscInt); /* See comments at struct _n_PetscSFPackOpt*/ 465 ierr = PetscSFMalloc(sf,PETSC_MEMTYPE_DEVICE,size,(void **)&sf->leafpackopt_d[scope]->array);CHKERRQ(ierr); /* Change ->array to a device pointer */ 466 ierr = (*link->Memcpy)(link,PETSC_MEMTYPE_DEVICE,sf->leafpackopt_d[scope]->array,PETSC_MEMTYPE_HOST,sf->leafpackopt[scope]->array,size);CHKERRQ(ierr); 467 } 468 *opt = sf->leafpackopt_d[scope]; 469 } else { 470 if (!sf->rmine_d[scope]) { 471 size = sf->leafbuflen[scope]*sizeof(PetscInt); 472 ierr = PetscSFMalloc(sf,PETSC_MEMTYPE_DEVICE,size,(void **)&sf->rmine_d[scope]);CHKERRQ(ierr); 473 ierr = (*link->Memcpy)(link,PETSC_MEMTYPE_DEVICE,sf->rmine_d[scope],PETSC_MEMTYPE_HOST,sf->rmine+offset,size);CHKERRQ(ierr); 474 } 475 *indices = sf->rmine_d[scope]; 476 } 477 } 478 #endif 479 } 480 PetscFunctionReturn(0); 481 } 482 #endif 483