1 #if !defined(__SFPACK_H) 2 #define __SFPACK_H 3 4 #include <../src/vec/is/sf/impls/basic/sfbasic.h> 5 #if defined(PETSC_HAVE_CUDA) 6 #include <cuda_runtime.h> /* For cudaStream_t */ 7 #include <petsccublas.h> /* For CHKERRCUDA */ 8 #endif 9 10 #if defined(PETSC_HAVE_HIP) 11 #include <hip/hip_runtime.h> /* For hipStream_t */ 12 #include <petschipblas.h> /* For CHKERRHIP */ 13 #endif 14 15 /* In terms of function overloading, long long int is a different type than int64_t, which PetscInt might be defined to. 16 We perfer long long int over PetscInt (int64_t), since CUDA atomics are built around (unsigned) long long int. 17 */ 18 typedef long long int llint; 19 typedef unsigned long long int ullint; 20 21 /* We separate SF communications for SFBasic and SFNeighbor in two parts: local (self,intra-rank) and remote (inter-rank) */ 22 typedef enum {PETSCSF_LOCAL=0, PETSCSF_REMOTE} PetscSFScope; 23 24 /* Optimizations in packing & unpacking for destination ranks. 25 26 Suppose there are m indices stored in idx[], and two addresses u, p. We want to do packing: 27 p[i] = u[idx[i]], for i in [0,m) 28 29 Indices are associated with n ranks and each rank's indices are stored consecutively in idx[]. 30 We go through indices for each rank and see if they are indices of a 3D submatrix of size [dx,dy,dz] in 31 a parent matrix of size [X,Y,Z], with the submatrix's first index being <start>. 32 33 E.g., for indices 1,2,3, 6,7,8, 11,12,13, the submatrix size is [3,3,1] with start=1, and the parent matrix's size 34 is [5,3,1]. For simplicity, if any destination rank does not have this pattern, we give up the optimization. 35 36 Note before using this per-rank optimization, one should check leafcontig[], rootcontig[], which say 37 indices in whole are contiguous, and therefore much more useful than this one when true. 38 */ 39 struct _n_PetscSFPackOpt { 40 PetscInt *array; /* [7*n+2] Memory pool for other fields in this struct. Used to easily copy this struct to GPU */ 41 PetscInt n; /* Number of destination ranks */ 42 PetscInt *offset; /* [n+1] Offsets of indices for each rank. offset[0]=0, offset[i+1]=offset[i]+dx[i]*dy[i]*dz[i] */ 43 PetscInt *start; /* [n] First index */ 44 PetscInt *dx,*dy,*dz; /* [n] Lengths of the submatrix in X, Y, Z dimension. */ 45 PetscInt *X,*Y; /* [n] Lengths of the outer matrix in X, Y. We do not care Z. */ 46 }; 47 48 /* An abstract class that defines a communication link, which includes how to pack/unpack data and send/recv buffers 49 */ 50 struct _n_PetscSFLink { 51 PetscErrorCode (*Memcpy) (PetscSFLink,PetscMemType,void*,PetscMemType,const void*,size_t); /* Asynchronous copy might use stream in the link */ 52 53 PetscErrorCode (*h_Pack) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*); 54 PetscErrorCode (*h_UnpackAndInsert) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 55 PetscErrorCode (*h_UnpackAndAdd) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 56 PetscErrorCode (*h_UnpackAndMin) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 57 PetscErrorCode (*h_UnpackAndMax) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 58 PetscErrorCode (*h_UnpackAndMinloc) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 59 PetscErrorCode (*h_UnpackAndMaxloc) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 60 PetscErrorCode (*h_UnpackAndMult) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 61 PetscErrorCode (*h_UnpackAndLAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 62 PetscErrorCode (*h_UnpackAndBAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 63 PetscErrorCode (*h_UnpackAndLOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 64 PetscErrorCode (*h_UnpackAndBOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 65 PetscErrorCode (*h_UnpackAndLXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 66 PetscErrorCode (*h_UnpackAndBXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 67 PetscErrorCode (*h_FetchAndAdd) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*, void*); 68 69 PetscErrorCode (*h_ScatterAndInsert)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 70 PetscErrorCode (*h_ScatterAndAdd) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 71 PetscErrorCode (*h_ScatterAndMin) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 72 PetscErrorCode (*h_ScatterAndMax) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 73 PetscErrorCode (*h_ScatterAndMinloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 74 PetscErrorCode (*h_ScatterAndMaxloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 75 PetscErrorCode (*h_ScatterAndMult) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 76 PetscErrorCode (*h_ScatterAndLAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 77 PetscErrorCode (*h_ScatterAndBAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 78 PetscErrorCode (*h_ScatterAndLOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 79 PetscErrorCode (*h_ScatterAndBOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 80 PetscErrorCode (*h_ScatterAndLXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 81 PetscErrorCode (*h_ScatterAndBXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 82 83 PetscErrorCode (*h_FetchAndAddLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*); 84 85 PetscBool deviceinited; /* Are device related fields initialized? */ 86 #if defined(PETSC_HAVE_DEVICE) 87 /* These fields are lazily initialized in a sense that only when device pointers are passed to an SF, the SF 88 will set them, otherwise it just leaves them alone. Packing routines using regular ops when there are no data race chances. 89 */ 90 PetscErrorCode (*d_SyncDevice) (PetscSFLink); 91 PetscErrorCode (*d_SyncStream) (PetscSFLink); 92 93 PetscErrorCode (*d_Pack) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*); 94 PetscErrorCode (*d_UnpackAndInsert) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 95 PetscErrorCode (*d_UnpackAndAdd) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 96 PetscErrorCode (*d_UnpackAndMin) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 97 PetscErrorCode (*d_UnpackAndMax) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 98 PetscErrorCode (*d_UnpackAndMinloc) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 99 PetscErrorCode (*d_UnpackAndMaxloc) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 100 PetscErrorCode (*d_UnpackAndMult) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 101 PetscErrorCode (*d_UnpackAndLAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 102 PetscErrorCode (*d_UnpackAndBAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 103 PetscErrorCode (*d_UnpackAndLOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 104 PetscErrorCode (*d_UnpackAndBOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 105 PetscErrorCode (*d_UnpackAndLXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 106 PetscErrorCode (*d_UnpackAndBXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 107 PetscErrorCode (*d_FetchAndAdd) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*, void*); 108 109 PetscErrorCode (*d_ScatterAndInsert)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 110 PetscErrorCode (*d_ScatterAndAdd) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 111 PetscErrorCode (*d_ScatterAndMin) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 112 PetscErrorCode (*d_ScatterAndMax) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 113 PetscErrorCode (*d_ScatterAndMinloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 114 PetscErrorCode (*d_ScatterAndMaxloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 115 PetscErrorCode (*d_ScatterAndMult) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 116 PetscErrorCode (*d_ScatterAndLAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 117 PetscErrorCode (*d_ScatterAndBAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 118 PetscErrorCode (*d_ScatterAndLOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 119 PetscErrorCode (*d_ScatterAndBOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 120 PetscErrorCode (*d_ScatterAndLXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 121 PetscErrorCode (*d_ScatterAndBXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 122 PetscErrorCode (*d_FetchAndAddLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*); 123 124 /* Packing routines using atomics when there are data race chances */ 125 PetscErrorCode (*da_UnpackAndInsert)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 126 PetscErrorCode (*da_UnpackAndAdd) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 127 PetscErrorCode (*da_UnpackAndMin) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 128 PetscErrorCode (*da_UnpackAndMax) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 129 PetscErrorCode (*da_UnpackAndMinloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 130 PetscErrorCode (*da_UnpackAndMaxloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 131 PetscErrorCode (*da_UnpackAndMult) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 132 PetscErrorCode (*da_UnpackAndLAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 133 PetscErrorCode (*da_UnpackAndBAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 134 PetscErrorCode (*da_UnpackAndLOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 135 PetscErrorCode (*da_UnpackAndBOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 136 PetscErrorCode (*da_UnpackAndLXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 137 PetscErrorCode (*da_UnpackAndBXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 138 PetscErrorCode (*da_FetchAndAdd) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*, void*); 139 140 PetscErrorCode (*da_ScatterAndInsert)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 141 PetscErrorCode (*da_ScatterAndAdd) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 142 PetscErrorCode (*da_ScatterAndMin) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 143 PetscErrorCode (*da_ScatterAndMax) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 144 PetscErrorCode (*da_ScatterAndMinloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 145 PetscErrorCode (*da_ScatterAndMaxloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 146 PetscErrorCode (*da_ScatterAndMult) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 147 PetscErrorCode (*da_ScatterAndLAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 148 PetscErrorCode (*da_ScatterAndBAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 149 PetscErrorCode (*da_ScatterAndLOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 150 PetscErrorCode (*da_ScatterAndBOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 151 PetscErrorCode (*da_ScatterAndLXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 152 PetscErrorCode (*da_ScatterAndBXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 153 PetscErrorCode (*da_FetchAndAddLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*); 154 /* TODO: Make runtime */ 155 #if defined (PETSC_HAVE_CUDA) 156 PetscInt maxResidentThreadsPerGPU; /* It is a copy from SF for convenience */ 157 cudaStream_t stream; /* Stream to launch pack/unapck kernels if not using the default stream */ 158 #elif defined (PETSC_HAVE_HIP) 159 PetscInt maxResidentThreadsPerGPU; /* It is a copy from SF for convenience */ 160 hipStream_t stream; 161 #endif 162 163 PetscErrorCode (*Destroy)(PetscSFLink); /* These two fields are meant to be used by SF_Kokkos, with spptr pointing to an execution space object */ 164 void *spptr; /* for a given stream, but unused now due to a Kokkos bug, so that SF_Kokkos only supports null stream. */ 165 #endif 166 PetscMPIInt tag; /* Each link has a tag so we can perform multiple SF ops at the same time */ 167 MPI_Datatype unit; /* The MPI datatype this PetscSFLink is built for */ 168 MPI_Datatype basicunit; /* unit is made of MPI builtin dataype basicunit */ 169 PetscBool isbuiltin; /* Is unit an MPI/PETSc builtin datatype? If it is true, then bs=1 and basicunit is equivalent to unit */ 170 size_t unitbytes; /* Number of bytes in a unit */ 171 PetscInt bs; /* Number of basic units in a unit */ 172 const void *rootdata,*leafdata; /* rootdata and leafdata the link is working on. They are used as keys for pending links. */ 173 PetscMemType rootmtype,leafmtype; /* root/leafdata's memory type */ 174 175 /* For local and remote communication */ 176 PetscMemType rootmtype_mpi,leafmtype_mpi; /* Mtypes of buffers passed to MPI. If use_gpu_aware_mpi, they are same as root/leafmtype. Otherwise they are PETSC_MEMTYPE_HOST */ 177 PetscBool rootdirect[2],leafdirect[2]; /* Can root/leafdata be directly passed to SF (i.e., without buffering). In layout of [PETSCSF_LOCAL/REMOTE]. See more in PetscSFLinkCreate() */ 178 PetscInt rootdirect_mpi,leafdirect_mpi;/* Can root/leafdata for remote be directly passed to MPI? 1: yes, 0: no. See more in PetscSFLinkCreate() */ 179 const void *rootdatadirect[2][2]; /* The root/leafdata used to init root/leaf requests, in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE]. */ 180 const void *leafdatadirect[2][2]; /* ... We need them to look up links when root/leafdirect_mpi are true */ 181 char *rootbuf[2][2]; /* Buffers for packed roots, in layout of [PETSCSF_LOCAL/REMOTE][PETSC_MEMTYPE] */ 182 char *rootbuf_alloc[2][2]; /* Log memory allocated by petsc. We need it since rootbuf[][] may point to rootdata given by user */ 183 char *leafbuf[2][2]; /* Buffers for packed leaves, in layout of [PETSCSF_LOCAL/REMOTE][PETSC_MEMTYPE] */ 184 char *leafbuf_alloc[2][2]; 185 MPI_Request *rootreqs[2][2][2]; /* Root requests in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][rootdirect_mpi] */ 186 MPI_Request *leafreqs[2][2][2]; /* Leaf requests in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][leafdirect_mpi] */ 187 PetscBool rootreqsinited[2][2][2]; /* Are root requests initialized? Also in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][rootdirect_mpi]*/ 188 PetscBool leafreqsinited[2][2][2]; /* Are leaf requests initialized? Also in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][leafdirect_mpi]*/ 189 MPI_Request *reqs; /* An array of length (nrootreqs+nleafreqs)*8. Pointers in rootreqs[][][] and leafreqs[][][] point here */ 190 PetscSFLink next; 191 }; 192 193 PETSC_INTERN PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF,MPI_Datatype,const void*,const void*); 194 195 /* Create/setup/retrieve/destroy a link */ 196 PETSC_INTERN PetscErrorCode PetscSFLinkCreate(PetscSF,MPI_Datatype,PetscMemType,const void*,PetscMemType,const void*,MPI_Op,PetscSFOperation,PetscSFLink*); 197 PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_Host(PetscSF,PetscSFLink,MPI_Datatype); 198 PETSC_INTERN PetscErrorCode PetscSFLinkGetInUse(PetscSF,MPI_Datatype,const void*,const void*,PetscCopyMode,PetscSFLink*); 199 PETSC_INTERN PetscErrorCode PetscSFLinkReclaim(PetscSF,PetscSFLink*); 200 PETSC_INTERN PetscErrorCode PetscSFLinkDestroy(PetscSF,PetscSFLink*); 201 202 /* Get pack/unpack function pointers from a link */ 203 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkGetPack(PetscSFLink link,PetscMemType mtype,PetscErrorCode (**Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*)) 204 { 205 PetscFunctionBegin; 206 if (mtype == PETSC_MEMTYPE_HOST) *Pack = link->h_Pack; 207 #if defined(PETSC_HAVE_DEVICE) 208 else *Pack = link->d_Pack; 209 #endif 210 PetscFunctionReturn(0); 211 } 212 213 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkMPIWaitall(PetscSF sf,PetscSFLink link,PetscSFDirection direction) 214 { 215 PetscErrorCode ierr; 216 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 217 const PetscMemType rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; 218 const PetscInt rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi; 219 220 PetscFunctionBegin; 221 ierr = MPI_Waitall(bas->nrootreqs,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi],MPI_STATUSES_IGNORE);CHKERRQ(ierr); 222 ierr = MPI_Waitall(sf->nleafreqs, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi],MPI_STATUSES_IGNORE);CHKERRQ(ierr); 223 PetscFunctionReturn(0); 224 } 225 226 PETSC_INTERN PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*)); 227 PETSC_INTERN PetscErrorCode PetscSFLinkGetFetchAndOp (PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**FetchAndOp) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*)); 228 PETSC_INTERN PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*)); 229 PETSC_INTERN PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*)); 230 PETSC_INTERN PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF,PetscSFLink,PetscSFDirection,void**,void**,MPI_Request**,MPI_Request**); 231 232 /* Do Pack/Unpack/Fetch/Scatter with the link */ 233 PETSC_INTERN PetscErrorCode PetscSFLinkPackRootData (PetscSF,PetscSFLink,PetscSFScope,const void*); 234 PETSC_INTERN PetscErrorCode PetscSFLinkPackLeafData (PetscSF,PetscSFLink,PetscSFScope,const void*); 235 PETSC_INTERN PetscErrorCode PetscSFLinkUnpackRootData(PetscSF,PetscSFLink,PetscSFScope,void*,MPI_Op); 236 PETSC_INTERN PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF,PetscSFLink,PetscSFScope,void*,MPI_Op); 237 PETSC_INTERN PetscErrorCode PetscSFLinkFetchRootData (PetscSF,PetscSFLink,PetscSFScope,void*,MPI_Op); 238 239 PETSC_INTERN PetscErrorCode PetscSFLinkBcastAndOpLocal(PetscSF,PetscSFLink,const void*,void*,MPI_Op); 240 PETSC_INTERN PetscErrorCode PetscSFLinkReduceLocal(PetscSF,PetscSFLink,const void*,void*,MPI_Op); 241 PETSC_INTERN PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF,PetscSFLink,void*,const void*,void*,MPI_Op); 242 243 PETSC_INTERN PetscErrorCode PetscSFSetUpPackFields(PetscSF); 244 PETSC_INTERN PetscErrorCode PetscSFResetPackFields(PetscSF); 245 246 #if defined(PETSC_HAVE_CUDA) 247 PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_Cuda(PetscSF,PetscSFLink,MPI_Datatype); 248 #endif 249 250 #if defined(PETSC_HAVE_HIP) 251 PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_Hip(PetscSF,PetscSFLink,MPI_Datatype); 252 #endif 253 254 #if defined(PETSC_HAVE_KOKKOS) 255 PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_Kokkos(PetscSF,PetscSFLink,MPI_Datatype); 256 #endif 257 258 /* A set of helper routines for Pack/Unpack/Scatter on GPUs */ 259 #if defined(PETSC_HAVE_DEVICE) 260 /* If SF does not know which stream root/leafdata is being computed on, it has to sync the device to 261 make sure the data is ready for packing. 262 */ 263 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncDeviceBeforePackData(PetscSF sf,PetscSFLink link) 264 { 265 PetscErrorCode ierr; 266 PetscFunctionBegin; 267 if (sf->use_default_stream) PetscFunctionReturn(0); 268 if (link->rootmtype == PETSC_MEMTYPE_DEVICE || link->leafmtype == PETSC_MEMTYPE_DEVICE) {ierr = (*link->d_SyncDevice)(link);CHKERRQ(ierr);} 269 PetscFunctionReturn(0); 270 } 271 272 /* PetscSFLinkSyncStreamAfterPackXxxData routines make sure root/leafbuf for the remote is ready for MPI */ 273 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterPackRootData(PetscSF sf,PetscSFLink link) 274 { 275 PetscErrorCode ierr; 276 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 277 278 PetscFunctionBegin; 279 /* Do nothing if we use stream aware mpi || has nothing for remote */ 280 if (sf->use_stream_aware_mpi || link->rootmtype != PETSC_MEMTYPE_DEVICE || !bas->rootbuflen[PETSCSF_REMOTE]) PetscFunctionReturn(0); 281 /* If we called a packing kernel || we async-copied rootdata from device to host || No cudaDeviceSynchronize was called (since default stream is assumed) */ 282 if (!link->rootdirect[PETSCSF_REMOTE] || !sf->use_gpu_aware_mpi || sf->use_default_stream) {ierr = (*link->d_SyncStream)(link);CHKERRQ(ierr);} 283 PetscFunctionReturn(0); 284 } 285 286 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterPackLeafData(PetscSF sf,PetscSFLink link) 287 { 288 PetscErrorCode ierr; 289 PetscFunctionBegin; 290 /* See comments above */ 291 if (sf->use_stream_aware_mpi || link->leafmtype != PETSC_MEMTYPE_DEVICE || !sf->leafbuflen[PETSCSF_REMOTE]) PetscFunctionReturn(0); 292 if (!link->leafdirect[PETSCSF_REMOTE] || !sf->use_gpu_aware_mpi || sf->use_default_stream) {ierr = (*link->d_SyncStream)(link);CHKERRQ(ierr);} 293 PetscFunctionReturn(0); 294 } 295 296 /* PetscSFLinkSyncStreamAfterUnpackXxx routines make sure root/leafdata (local & remote) is ready to use for SF callers, when SF 297 does not know which stream the callers will use. 298 */ 299 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterUnpackRootData(PetscSF sf,PetscSFLink link) 300 { 301 PetscErrorCode ierr; 302 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 303 PetscBool host2host = (link->rootmtype == PETSC_MEMTYPE_HOST) && (link->leafmtype == PETSC_MEMTYPE_HOST) ? PETSC_TRUE : PETSC_FALSE; 304 305 PetscFunctionBegin; 306 /* Do nothing if host2host OR we are allowed to asynchronously put rootdata on device through the default stream */ 307 if (host2host || (link->rootmtype == PETSC_MEMTYPE_DEVICE && sf->use_default_stream)) PetscFunctionReturn(0); 308 309 /* If rootmtype is HOST or DEVICE: 310 If we have data from local, then we called a scatter kernel (on link->stream), then we must sync it; 311 If we have data from remote && no rootdirect(i.e., we called an unpack kernel), then we must also sycn it (if rootdirect, 312 i.e., no unpack kernel after MPI, MPI guarentees rootbuf is ready to use so that we do not need the sync). 313 314 Note a tricky case is when leafmtype=DEVICE, rootmtype=HOST on uni-processor, we must sync the stream otherwise 315 CPU thread might use the yet-to-be-updated rootdata pending in the stream. 316 */ 317 if (bas->rootbuflen[PETSCSF_LOCAL] || (bas->rootbuflen[PETSCSF_REMOTE] && !link->rootdirect[PETSCSF_REMOTE])) {ierr = (*link->d_SyncStream)(link);CHKERRQ(ierr);} 318 PetscFunctionReturn(0); 319 } 320 321 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterUnpackLeafData(PetscSF sf,PetscSFLink link) 322 { 323 PetscErrorCode ierr; 324 PetscBool host2host = (link->rootmtype == PETSC_MEMTYPE_HOST) && (link->leafmtype == PETSC_MEMTYPE_HOST) ? PETSC_TRUE : PETSC_FALSE; 325 326 PetscFunctionBegin; 327 /* See comments in PetscSFLinkSyncStreamAfterUnpackRootData*/ 328 if (host2host || (link->leafmtype == PETSC_MEMTYPE_DEVICE && sf->use_default_stream)) PetscFunctionReturn(0); 329 if (sf->leafbuflen[PETSCSF_LOCAL] || (sf->leafbuflen[PETSCSF_REMOTE] && !link->leafdirect[PETSCSF_REMOTE])) {ierr = (*link->d_SyncStream)(link);CHKERRQ(ierr);} 330 PetscFunctionReturn(0); 331 } 332 333 /* PetscSFLinkCopyXxxxBufferInCaseNotUseGpuAwareMPI routines are simple: if not use_gpu_aware_mpi, we need 334 to copy the buffer from GPU to CPU before MPI calls, and from CPU to GPU after MPI calls. 335 */ 336 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(PetscSF sf,PetscSFLink link,PetscBool device2host) 337 { 338 PetscErrorCode ierr; 339 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 340 341 PetscFunctionBegin; 342 if (link->rootmtype == PETSC_MEMTYPE_DEVICE && (link->rootmtype_mpi != link->rootmtype) && bas->rootbuflen[PETSCSF_REMOTE]) { 343 void *h_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]; 344 void *d_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE]; 345 size_t count = bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes; 346 if (device2host) { 347 ierr = (*link->Memcpy)(link,PETSC_MEMTYPE_HOST,h_buf,PETSC_MEMTYPE_DEVICE,d_buf,count);CHKERRQ(ierr); 348 ierr = PetscLogGpuToCpu(count);CHKERRQ(ierr); 349 } else { 350 ierr = (*link->Memcpy)(link,PETSC_MEMTYPE_DEVICE,d_buf,PETSC_MEMTYPE_HOST,h_buf,count);CHKERRQ(ierr); 351 ierr = PetscLogCpuToGpu(count);CHKERRQ(ierr); 352 } 353 } 354 PetscFunctionReturn(0); 355 } 356 357 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(PetscSF sf,PetscSFLink link,PetscBool device2host) 358 { 359 PetscErrorCode ierr; 360 361 PetscFunctionBegin; 362 if (link->leafmtype == PETSC_MEMTYPE_DEVICE && (link->leafmtype_mpi != link->leafmtype) && sf->leafbuflen[PETSCSF_REMOTE]) { 363 void *h_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]; 364 void *d_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE]; 365 size_t count = sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes; 366 if (device2host) { 367 ierr = (*link->Memcpy)(link,PETSC_MEMTYPE_HOST,h_buf,PETSC_MEMTYPE_DEVICE,d_buf,count);CHKERRQ(ierr); 368 ierr = PetscLogGpuToCpu(count);CHKERRQ(ierr); 369 } else { 370 ierr = (*link->Memcpy)(link,PETSC_MEMTYPE_DEVICE,d_buf,PETSC_MEMTYPE_HOST,h_buf,count);CHKERRQ(ierr); 371 ierr = PetscLogCpuToGpu(count);CHKERRQ(ierr); 372 } 373 } 374 PetscFunctionReturn(0); 375 } 376 377 #else /* Host only */ 378 #define PetscSFLinkSyncDeviceBeforePackData(a,b) 0 379 #define PetscSFLinkSyncStreamAfterPackRootData(a,b) 0 380 #define PetscSFLinkSyncStreamAfterPackLeafData(a,b) 0 381 #define PetscSFLinkSyncStreamAfterUnpackRootData(a,b) 0 382 #define PetscSFLinkSyncStreamAfterUnpackLeafData(a,b) 0 383 #define PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(a,b,c) 0 384 #define PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(a,b,c) 0 385 #endif 386 387 /* Get root indices used for pack/unpack 388 389 Input arguments: 390 +sf - StarForest 391 .link - The link, which provides the stream for the async memcpy (In SF, we make all GPU operations asynchronous to avoid unexpected pipeline stalls) 392 .scope - Which part of the indices? (PETSCSF_LOCAL or PETSCSF_REMOTE) 393 .mtype - In what type of memory? (PETSC_MEMTYPE_DEVICE or PETSC_MEMTYPE_HOST) 394 395 Output arguments: 396 +count - Count of indices 397 .start - The first index (only useful when indices is NULL) 398 -indices - indices of roots for pack/unpack. NULL means indices are contiguous 399 */ 400 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkGetRootPackOptAndIndices(PetscSF sf,PetscSFLink link,PetscMemType mtype,PetscSFScope scope,PetscInt *count,PetscInt *start,PetscSFPackOpt *opt,const PetscInt **indices) 401 { 402 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 403 PetscInt offset; 404 405 PetscFunctionBegin; 406 *count = bas->rootbuflen[scope]; 407 *start = bas->rootstart[scope]; 408 *opt = NULL; 409 *indices = NULL; 410 411 /* We have these rules: 412 1) opt == NULL && indices == NULL ==> indices are contiguous. 413 2) opt != NULL ==> indices are in 3D but not contiguous. On host, indices != NULL since indices are already available and we do not 414 want to enforce all operations to use opt; but on device, indices = NULL since we do not want to copy indices to device. 415 */ 416 if (!bas->rootcontig[scope]) { 417 offset = (scope == PETSCSF_LOCAL)? 0 : bas->ioffset[bas->ndiranks]; 418 if (mtype == PETSC_MEMTYPE_HOST) {*opt = bas->rootpackopt[scope]; *indices = bas->irootloc + offset;} 419 #if defined(PETSC_HAVE_DEVICE) 420 else { 421 PetscErrorCode ierr; 422 size_t size; 423 if (bas->rootpackopt[scope]) { 424 if (!bas->rootpackopt_d[scope]) { 425 ierr = PetscMalloc1(1,&bas->rootpackopt_d[scope]);CHKERRQ(ierr); 426 ierr = PetscArraycpy(bas->rootpackopt_d[scope],bas->rootpackopt[scope],1);CHKERRQ(ierr); /* Make pointers in bas->rootpackopt_d[] still work on host */ 427 size = (bas->rootpackopt[scope]->n*7+2)*sizeof(PetscInt); /* See comments at struct _n_PetscSFPackOpt*/ 428 ierr = PetscSFMalloc(sf,PETSC_MEMTYPE_DEVICE,size,(void **)&bas->rootpackopt_d[scope]->array);CHKERRQ(ierr); 429 ierr = (*link->Memcpy)(link,PETSC_MEMTYPE_DEVICE,bas->rootpackopt_d[scope]->array,PETSC_MEMTYPE_HOST,bas->rootpackopt[scope]->array,size);CHKERRQ(ierr); 430 } 431 *opt = bas->rootpackopt_d[scope]; 432 } else { /* On device, we only provide indices when there is no optimization. We're reluctant to copy indices to device. */ 433 if (!bas->irootloc_d[scope]) { 434 size = bas->rootbuflen[scope]*sizeof(PetscInt); 435 ierr = PetscSFMalloc(sf,PETSC_MEMTYPE_DEVICE,size,(void **)&bas->irootloc_d[scope]);CHKERRQ(ierr); 436 ierr = (*link->Memcpy)(link,PETSC_MEMTYPE_DEVICE,bas->irootloc_d[scope],PETSC_MEMTYPE_HOST,bas->irootloc+offset,size);CHKERRQ(ierr); 437 } 438 *indices = bas->irootloc_d[scope]; 439 } 440 } 441 #endif 442 } 443 PetscFunctionReturn(0); 444 } 445 446 /* Get leaf indices used for pack/unpack 447 448 See also PetscSFLinkGetRootPackOptAndIndices() 449 */ 450 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkGetLeafPackOptAndIndices(PetscSF sf,PetscSFLink link,PetscMemType mtype,PetscSFScope scope,PetscInt *count,PetscInt *start,PetscSFPackOpt *opt,const PetscInt **indices) 451 { 452 PetscInt offset; 453 454 PetscFunctionBegin; 455 *count = sf->leafbuflen[scope]; 456 *start = sf->leafstart[scope]; 457 *opt = NULL; 458 *indices = NULL; 459 if (!sf->leafcontig[scope]) { 460 offset = (scope == PETSCSF_LOCAL)? 0 : sf->roffset[sf->ndranks]; 461 if (mtype == PETSC_MEMTYPE_HOST) {*opt = sf->leafpackopt[scope]; *indices = sf->rmine + offset;} 462 #if defined(PETSC_HAVE_DEVICE) 463 else { 464 PetscErrorCode ierr; 465 size_t size; 466 if (sf->leafpackopt[scope]) { 467 if (!sf->leafpackopt_d[scope]) { 468 ierr = PetscMalloc1(1,&sf->leafpackopt_d[scope]);CHKERRQ(ierr); 469 ierr = PetscArraycpy(sf->leafpackopt_d[scope],sf->leafpackopt[scope],1);CHKERRQ(ierr); 470 size = (sf->leafpackopt[scope]->n*7+2)*sizeof(PetscInt); /* See comments at struct _n_PetscSFPackOpt*/ 471 ierr = PetscSFMalloc(sf,PETSC_MEMTYPE_DEVICE,size,(void **)&sf->leafpackopt_d[scope]->array);CHKERRQ(ierr); /* Change ->array to a device pointer */ 472 ierr = (*link->Memcpy)(link,PETSC_MEMTYPE_DEVICE,sf->leafpackopt_d[scope]->array,PETSC_MEMTYPE_HOST,sf->leafpackopt[scope]->array,size);CHKERRQ(ierr); 473 } 474 *opt = sf->leafpackopt_d[scope]; 475 } else { 476 if (!sf->rmine_d[scope]) { 477 size = sf->leafbuflen[scope]*sizeof(PetscInt); 478 ierr = PetscSFMalloc(sf,PETSC_MEMTYPE_DEVICE,size,(void **)&sf->rmine_d[scope]);CHKERRQ(ierr); 479 ierr = (*link->Memcpy)(link,PETSC_MEMTYPE_DEVICE,sf->rmine_d[scope],PETSC_MEMTYPE_HOST,sf->rmine+offset,size);CHKERRQ(ierr); 480 } 481 *indices = sf->rmine_d[scope]; 482 } 483 } 484 #endif 485 } 486 PetscFunctionReturn(0); 487 } 488 #endif 489