1 #pragma once 2 3 #include <../src/vec/is/sf/impls/basic/sfbasic.h> 4 #if defined(PETSC_HAVE_CUDA) 5 #include <petscdevice_cuda.h> 6 typedef cudaStream_t cupmStream_t; 7 typedef cudaEvent_t cupmEvent_t; 8 #endif 9 10 #if defined(PETSC_HAVE_HIP) 11 #include <petscdevice_hip.h> 12 typedef hipStream_t cupmStream_t; 13 typedef hipEvent_t cupmEvent_t; 14 #endif 15 16 /* In terms of function overloading, long long int is a different type than int64_t, which PetscInt might be defined to. 17 We prefer long long int over PetscInt (int64_t), since CUDA atomics are built around (unsigned) long long int. 18 */ 19 typedef long long int llint; 20 typedef unsigned long long int ullint; 21 22 /* We separate SF communications for SFBasic and SFNeighbor in two parts: local (self,intra-rank) and remote (inter-rank) */ 23 typedef enum { 24 PETSCSF_LOCAL = 0, 25 PETSCSF_REMOTE 26 } PetscSFScope; 27 28 /* Optimizations in packing & unpacking for destination ranks. 29 30 Suppose there are m indices stored in idx[], and two addresses u, p. We want to do packing: 31 p[i] = u[idx[i]], for i in [0,m) 32 33 Indices are associated with n ranks and each rank's indices are stored consecutively in idx[]. 34 We go through indices for each rank and see if they are indices of a 3D submatrix of size [dx,dy,dz] in 35 a parent matrix of size [X,Y,Z], with the submatrix's first index being <start>. 36 37 E.g., for indices 1,2,3, 6,7,8, 11,12,13, the submatrix size is [3,3,1] with start=1, and the parent matrix's size 38 is [5,3,1]. For simplicity, if any destination rank does not have this pattern, we give up the optimization. 39 40 Note before using this per-rank optimization, one should check leafcontig[], rootcontig[], which say 41 indices in whole are contiguous, and therefore much more useful than this one when true. 42 */ 43 struct _n_PetscSFPackOpt { 44 PetscInt *array; /* [7*n+2] Memory pool for other fields in this struct. Used to easily copy this struct to GPU */ 45 PetscInt n; /* Number of destination ranks */ 46 PetscInt *offset; /* [n+1] Offsets of indices for each rank. offset[0]=0, offset[i+1]=offset[i]+dx[i]*dy[i]*dz[i] */ 47 PetscInt *start; /* [n] First index */ 48 PetscInt *dx, *dy, *dz; /* [n] Lengths of the submatrix in X, Y, Z dimension. */ 49 PetscInt *X, *Y; /* [n] Lengths of the outer matrix in X, Y. We do not care Z. */ 50 }; 51 52 /* An abstract class that defines a communication link, which includes how to pack/unpack data and send/recv buffers 53 */ 54 struct _n_PetscSFLink { 55 PetscErrorCode (*Memcpy)(PetscSFLink, PetscMemType, void *, PetscMemType, const void *, size_t); /* Async device memcopy might use stream in the link */ 56 PetscErrorCode (*PrePack)(PetscSF, PetscSFLink, PetscSFDirection); 57 PetscErrorCode (*PostUnpack)(PetscSF, PetscSFLink, PetscSFDirection); 58 PetscErrorCode (*InitMPIRequests)(PetscSF, PetscSFLink, PetscSFDirection); // init (persistent) MPI requests 59 PetscErrorCode (*StartCommunication)(PetscSF, PetscSFLink, PetscSFDirection); 60 PetscErrorCode (*FinishCommunication)(PetscSF, PetscSFLink, PetscSFDirection); 61 PetscErrorCode (*SyncDevice)(PetscSFLink); 62 PetscErrorCode (*SyncStream)(PetscSFLink); 63 PetscErrorCode (*Destroy)(PetscSF, PetscSFLink); 64 65 PetscErrorCode (*BuildDependenceBegin)(PetscSF, PetscSFLink, PetscSFDirection); 66 PetscErrorCode (*BuildDependenceEnd)(PetscSF, PetscSFLink, PetscSFDirection); 67 68 PetscErrorCode (*h_Pack)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *); 69 PetscErrorCode (*h_UnpackAndInsert)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 70 PetscErrorCode (*h_UnpackAndAdd)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 71 PetscErrorCode (*h_UnpackAndMin)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 72 PetscErrorCode (*h_UnpackAndMax)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 73 PetscErrorCode (*h_UnpackAndMinloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 74 PetscErrorCode (*h_UnpackAndMaxloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 75 PetscErrorCode (*h_UnpackAndMult)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 76 PetscErrorCode (*h_UnpackAndLAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 77 PetscErrorCode (*h_UnpackAndBAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 78 PetscErrorCode (*h_UnpackAndLOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 79 PetscErrorCode (*h_UnpackAndBOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 80 PetscErrorCode (*h_UnpackAndLXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 81 PetscErrorCode (*h_UnpackAndBXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 82 PetscErrorCode (*h_FetchAndAdd)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, void *); 83 84 PetscErrorCode (*h_ScatterAndInsert)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 85 PetscErrorCode (*h_ScatterAndAdd)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 86 PetscErrorCode (*h_ScatterAndMin)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 87 PetscErrorCode (*h_ScatterAndMax)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 88 PetscErrorCode (*h_ScatterAndMinloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 89 PetscErrorCode (*h_ScatterAndMaxloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 90 PetscErrorCode (*h_ScatterAndMult)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 91 PetscErrorCode (*h_ScatterAndLAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 92 PetscErrorCode (*h_ScatterAndBAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 93 PetscErrorCode (*h_ScatterAndLOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 94 PetscErrorCode (*h_ScatterAndBOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 95 PetscErrorCode (*h_ScatterAndLXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 96 PetscErrorCode (*h_ScatterAndBXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 97 98 PetscErrorCode (*h_FetchAndAddLocal)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *); 99 100 PetscBool deviceinited; /* Are device related fields initialized? */ 101 #if defined(PETSC_HAVE_DEVICE) 102 /* These fields are lazily initialized in a sense that only when device pointers are passed to an SF, the SF 103 will set them, otherwise it just leaves them alone. Packing routines using regular ops when there are no data race chances. 104 */ 105 PetscErrorCode (*d_Pack)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *); 106 PetscErrorCode (*d_UnpackAndInsert)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 107 PetscErrorCode (*d_UnpackAndAdd)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 108 PetscErrorCode (*d_UnpackAndMin)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 109 PetscErrorCode (*d_UnpackAndMax)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 110 PetscErrorCode (*d_UnpackAndMinloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 111 PetscErrorCode (*d_UnpackAndMaxloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 112 PetscErrorCode (*d_UnpackAndMult)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 113 PetscErrorCode (*d_UnpackAndLAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 114 PetscErrorCode (*d_UnpackAndBAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 115 PetscErrorCode (*d_UnpackAndLOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 116 PetscErrorCode (*d_UnpackAndBOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 117 PetscErrorCode (*d_UnpackAndLXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 118 PetscErrorCode (*d_UnpackAndBXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 119 PetscErrorCode (*d_FetchAndAdd)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, void *); 120 121 PetscErrorCode (*d_ScatterAndInsert)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 122 PetscErrorCode (*d_ScatterAndAdd)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 123 PetscErrorCode (*d_ScatterAndMin)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 124 PetscErrorCode (*d_ScatterAndMax)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 125 PetscErrorCode (*d_ScatterAndMinloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 126 PetscErrorCode (*d_ScatterAndMaxloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 127 PetscErrorCode (*d_ScatterAndMult)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 128 PetscErrorCode (*d_ScatterAndLAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 129 PetscErrorCode (*d_ScatterAndBAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 130 PetscErrorCode (*d_ScatterAndLOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 131 PetscErrorCode (*d_ScatterAndBOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 132 PetscErrorCode (*d_ScatterAndLXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 133 PetscErrorCode (*d_ScatterAndBXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 134 PetscErrorCode (*d_FetchAndAddLocal)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *); 135 136 /* Packing routines using atomics when there are data race chances */ 137 PetscErrorCode (*da_UnpackAndInsert)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 138 PetscErrorCode (*da_UnpackAndAdd)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 139 PetscErrorCode (*da_UnpackAndMin)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 140 PetscErrorCode (*da_UnpackAndMax)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 141 PetscErrorCode (*da_UnpackAndMinloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 142 PetscErrorCode (*da_UnpackAndMaxloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 143 PetscErrorCode (*da_UnpackAndMult)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 144 PetscErrorCode (*da_UnpackAndLAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 145 PetscErrorCode (*da_UnpackAndBAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 146 PetscErrorCode (*da_UnpackAndLOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 147 PetscErrorCode (*da_UnpackAndBOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 148 PetscErrorCode (*da_UnpackAndLXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 149 PetscErrorCode (*da_UnpackAndBXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *); 150 PetscErrorCode (*da_FetchAndAdd)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, void *); 151 152 PetscErrorCode (*da_ScatterAndInsert)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 153 PetscErrorCode (*da_ScatterAndAdd)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 154 PetscErrorCode (*da_ScatterAndMin)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 155 PetscErrorCode (*da_ScatterAndMax)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 156 PetscErrorCode (*da_ScatterAndMinloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 157 PetscErrorCode (*da_ScatterAndMaxloc)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 158 PetscErrorCode (*da_ScatterAndMult)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 159 PetscErrorCode (*da_ScatterAndLAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 160 PetscErrorCode (*da_ScatterAndBAND)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 161 PetscErrorCode (*da_ScatterAndLOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 162 PetscErrorCode (*da_ScatterAndBOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 163 PetscErrorCode (*da_ScatterAndLXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 164 PetscErrorCode (*da_ScatterAndBXOR)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *); 165 PetscErrorCode (*da_FetchAndAddLocal)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *); 166 #if defined(PETSC_HAVE_CUDA) || defined(PETSC_HAVE_HIP) 167 PetscInt maxResidentThreadsPerGPU; /* It is a copy from SF for convenience */ 168 cupmStream_t stream; /* stream on which input/output root/leafdata is computed on (default is PetscDefaultCudaStream) */ 169 #endif 170 #endif 171 PetscMPIInt tag; /* Each link has a tag so we can perform multiple SF ops at the same time */ 172 MPI_Datatype unit; /* The MPI datatype this PetscSFLink is built for */ 173 MPI_Datatype basicunit; /* unit is made of MPI builtin dataype basicunit */ 174 PetscBool isbuiltin; /* Is unit an MPI/PETSc builtin datatype? If it is true, then bs=1 and basicunit is equivalent to unit */ 175 size_t unitbytes; /* Number of bytes in a unit */ 176 PetscInt bs; /* Number of basic units in a unit */ 177 const void *rootdata, *leafdata; /* rootdata and leafdata the link is working on. They are used as keys for pending links. */ 178 PetscMemType rootmtype, leafmtype; /* root/leafdata's memory type */ 179 180 /* For local and remote communication */ 181 PetscMemType rootmtype_mpi, leafmtype_mpi; /* Mtypes of buffers passed to MPI. If use_gpu_aware_mpi, they are same as root/leafmtype. Otherwise they are PETSC_MEMTYPE_HOST */ 182 PetscBool rootdirect[2], leafdirect[2]; /* Can root/leafdata be directly passed to SF (i.e., without buffering). In layout of [PETSCSF_LOCAL/REMOTE]. See more in PetscSFLinkCreate() */ 183 PetscInt rootdirect_mpi, leafdirect_mpi; /* Can root/leafdata for remote be directly passed to MPI? 1: yes, 0: no. See more in PetscSFLinkCreate() */ 184 const void *rootdatadirect[2][2]; /* The root/leafdata used to init root/leaf requests, in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE]. */ 185 const void *leafdatadirect[2][2]; /* ... We need them to look up links when root/leafdirect_mpi are true */ 186 char *rootbuf[2][2]; /* Buffers for packed roots, in layout of [PETSCSF_LOCAL/REMOTE][PETSC_MEMTYPE]. PETSCSF_LOCAL does not need MPI, .. */ 187 /* .. but in case rootmtype is different from leafmtype, we still need to pack local roots and then copy them to memory of leafmtype */ 188 char *rootbuf_alloc[2][2]; /* Log memory allocated by petsc. We need it since rootbuf[][] may point to rootdata given by user */ 189 char *leafbuf[2][2]; /* Buffers for packed leaves, in layout of [PETSCSF_LOCAL/REMOTE][PETSC_MEMTYPE] */ 190 char *leafbuf_alloc[2][2]; 191 MPI_Request *rootreqs[2][2][2]; /* Root requests in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][rootdirect_mpi] */ 192 MPI_Request *leafreqs[2][2][2]; /* Leaf requests in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][leafdirect_mpi] */ 193 PetscBool rootreqsinited[2][2][2]; /* Are root requests initialized? Also in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][rootdirect_mpi]*/ 194 PetscBool leafreqsinited[2][2][2]; /* Are leaf requests initialized? Also in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][leafdirect_mpi]*/ 195 MPI_Request *reqs; /* An array of length (nrootreqs+nleafreqs)*8. Pointers in rootreqs[][][] and leafreqs[][][] point here */ 196 PetscSFLink next; 197 198 PetscBool use_nvshmem; /* Does this link use nvshem (vs. MPI) for communication? */ 199 #if defined(PETSC_HAVE_NVSHMEM) 200 cupmEvent_t dataReady; /* Events to mark readiness of root/leafdata */ 201 cupmEvent_t endRemoteComm; /* Events to mark end of local/remote communication */ 202 cupmStream_t remoteCommStream; /* Streams for remote (i.e., inter-rank) communication */ 203 204 /* The buffers are allocated in device symmetric heap. Their length is the maximal length over all ranks in the comm, and therefore is the same. */ 205 uint64_t *rootSendSig, *rootRecvSig; /* [max{niranks-ndiranks}], signals used when rootbuf works as send/recv buf */ 206 uint64_t *leafSendSig, *leafRecvSig; /* [max{nranks-ndranks}], signals used when leafbuf works as send/recv buf */ 207 #endif 208 }; 209 210 PETSC_INTERN PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF, MPI_Datatype, const void *, const void *); 211 212 /* Create/setup/retrieve/destroy a link */ 213 PETSC_INTERN PetscErrorCode PetscSFLinkCreate(PetscSF, MPI_Datatype, PetscMemType, const void *, PetscMemType, const void *, MPI_Op, PetscSFOperation, PetscSFLink *); 214 PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_Host(PetscSF, PetscSFLink, MPI_Datatype); 215 PETSC_INTERN PetscErrorCode PetscSFLinkGetInUse(PetscSF, MPI_Datatype, const void *, const void *, PetscCopyMode, PetscSFLink *); 216 PETSC_INTERN PetscErrorCode PetscSFLinkReclaim(PetscSF, PetscSFLink *); 217 PETSC_INTERN PetscErrorCode PetscSFLinkDestroy(PetscSF, PetscSFLink); 218 219 /* Get pack/unpack function pointers from a link */ 220 static inline PetscErrorCode PetscSFLinkGetPack(PetscSFLink link, PetscMemType mtype, PetscErrorCode (**Pack)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *)) 221 { 222 PetscFunctionBegin; 223 if (PetscMemTypeHost(mtype)) *Pack = link->h_Pack; 224 #if defined(PETSC_HAVE_DEVICE) 225 else *Pack = link->d_Pack; 226 #endif 227 PetscFunctionReturn(PETSC_SUCCESS); 228 } 229 230 PETSC_INTERN PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink, PetscMemType, MPI_Op, PetscBool, PetscErrorCode (**UnpackAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, const void *)); 231 PETSC_INTERN PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink, PetscMemType, MPI_Op, PetscBool, PetscErrorCode (**FetchAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, void *)); 232 PETSC_INTERN PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink, PetscMemType, MPI_Op, PetscBool, PetscErrorCode (**ScatterAndOp)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, PetscInt, PetscSFPackOpt, const PetscInt *, void *)); 233 PETSC_INTERN PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink, PetscMemType, MPI_Op, PetscBool, PetscErrorCode (**FetchAndOpLocal)(PetscSFLink, PetscInt, PetscInt, PetscSFPackOpt, const PetscInt *, void *, PetscInt, PetscSFPackOpt, const PetscInt *, const void *, void *)); 234 235 /* Do Pack/Unpack/Fetch/Scatter with the link */ 236 PETSC_INTERN PetscErrorCode PetscSFLinkPackRootData(PetscSF, PetscSFLink, PetscSFScope, const void *); 237 PETSC_INTERN PetscErrorCode PetscSFLinkPackLeafData(PetscSF, PetscSFLink, PetscSFScope, const void *); 238 PETSC_INTERN PetscErrorCode PetscSFLinkUnpackRootData(PetscSF, PetscSFLink, PetscSFScope, void *, MPI_Op); 239 PETSC_INTERN PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF, PetscSFLink, PetscSFScope, void *, MPI_Op); 240 PETSC_INTERN PetscErrorCode PetscSFLinkFetchAndOpRemote(PetscSF, PetscSFLink, void *, MPI_Op); 241 242 PETSC_INTERN PetscErrorCode PetscSFLinkScatterLocal(PetscSF, PetscSFLink, PetscSFDirection, void *, void *, MPI_Op); 243 PETSC_INTERN PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF, PetscSFLink, void *, const void *, void *, MPI_Op); 244 245 PETSC_INTERN PetscErrorCode PetscSFSetUpPackFields(PetscSF); 246 PETSC_INTERN PetscErrorCode PetscSFResetPackFields(PetscSF); 247 PETSC_INTERN PetscErrorCode PetscSFLinkCreate_MPI(PetscSF, MPI_Datatype, PetscMemType, const void *, PetscMemType, const void *, MPI_Op, PetscSFOperation, PetscSFLink *); 248 249 #if defined(PETSC_HAVE_CUDA) 250 PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_CUDA(PetscSF, PetscSFLink, MPI_Datatype); 251 #endif 252 253 #if defined(PETSC_HAVE_HIP) 254 PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_HIP(PetscSF, PetscSFLink, MPI_Datatype); 255 #endif 256 257 #if defined(PETSC_HAVE_KOKKOS) 258 PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_Kokkos(PetscSF, PetscSFLink, MPI_Datatype); 259 #endif 260 261 #if defined(PETSC_HAVE_NVSHMEM) 262 PETSC_INTERN PetscErrorCode PetscSFLinkCreate_NVSHMEM(PetscSF, MPI_Datatype, PetscMemType, const void *, PetscMemType, const void *, MPI_Op, PetscSFOperation, PetscSFLink *); 263 PETSC_INTERN PetscErrorCode PetscSFLinkNvshmemCheck(PetscSF, PetscMemType, const void *, PetscMemType, const void *, PetscBool *); 264 #endif 265 266 static inline PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf, PetscSFLink link, PetscSFDirection direction, void **rootbuf, void **leafbuf, MPI_Request **rootreqs, MPI_Request **leafreqs) 267 { 268 const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi; /* memtype of buffers passed to MPI */ 269 const PetscInt rootdirect_mpi = link->rootdirect_mpi, leafdirect_mpi = link->leafdirect_mpi; 270 271 PetscFunctionBegin; 272 if (link->InitMPIRequests) PetscCall((*link->InitMPIRequests)(sf, link, direction)); // init (persistent) MPI requests 273 274 if (rootbuf) *rootbuf = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]; 275 if (leafbuf) *leafbuf = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]; 276 if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]; 277 if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]; 278 PetscFunctionReturn(PETSC_SUCCESS); 279 } 280 281 static inline PetscErrorCode PetscSFLinkStartCommunication(PetscSF sf, PetscSFLink link, PetscSFDirection direction) 282 { 283 PetscFunctionBegin; 284 if (link->StartCommunication) PetscCall((*link->StartCommunication)(sf, link, direction)); 285 PetscFunctionReturn(PETSC_SUCCESS); 286 } 287 288 static inline PetscErrorCode PetscSFLinkFinishCommunication(PetscSF sf, PetscSFLink link, PetscSFDirection direction) 289 { 290 PetscFunctionBegin; 291 if (link->FinishCommunication) PetscCall((*link->FinishCommunication)(sf, link, direction)); 292 PetscFunctionReturn(PETSC_SUCCESS); 293 } 294 295 /* A set of helper routines for Pack/Unpack/Scatter on GPUs */ 296 #if defined(PETSC_HAVE_CUDA) || defined(PETSC_HAVE_HIP) || defined(PETSC_HAVE_SYCL) 297 /* PetscSFLinkCopyXxxxBufferInCaseNotUseGpuAwareMPI routines are simple: if not use_gpu_aware_mpi, we need 298 to copy the buffer from GPU to CPU before MPI calls, and from CPU to GPU after MPI calls. 299 */ 300 static inline PetscErrorCode PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(PetscSF sf, PetscSFLink link, PetscBool device2host) 301 { 302 PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 303 304 PetscFunctionBegin; 305 /* rootdata is on device but we use regular MPI for communication */ 306 if (PetscMemTypeDevice(link->rootmtype) && PetscMemTypeHost(link->rootmtype_mpi) && bas->rootbuflen[PETSCSF_REMOTE]) { 307 void *h_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]; 308 void *d_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE]; 309 size_t count = bas->rootbuflen[PETSCSF_REMOTE] * link->unitbytes; 310 if (device2host) { 311 PetscCall((*link->Memcpy)(link, PETSC_MEMTYPE_HOST, h_buf, PETSC_MEMTYPE_DEVICE, d_buf, count)); 312 PetscCall(PetscLogGpuToCpu(count)); 313 } else { 314 PetscCall((*link->Memcpy)(link, PETSC_MEMTYPE_DEVICE, d_buf, PETSC_MEMTYPE_HOST, h_buf, count)); 315 PetscCall(PetscLogCpuToGpu(count)); 316 } 317 } 318 PetscFunctionReturn(PETSC_SUCCESS); 319 } 320 321 static inline PetscErrorCode PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(PetscSF sf, PetscSFLink link, PetscBool device2host) 322 { 323 PetscFunctionBegin; 324 if (PetscMemTypeDevice(link->leafmtype) && PetscMemTypeHost(link->leafmtype_mpi) && sf->leafbuflen[PETSCSF_REMOTE]) { 325 void *h_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]; 326 void *d_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE]; 327 size_t count = sf->leafbuflen[PETSCSF_REMOTE] * link->unitbytes; 328 if (device2host) { 329 PetscCall((*link->Memcpy)(link, PETSC_MEMTYPE_HOST, h_buf, PETSC_MEMTYPE_DEVICE, d_buf, count)); 330 PetscCall(PetscLogGpuToCpu(count)); 331 } else { 332 PetscCall((*link->Memcpy)(link, PETSC_MEMTYPE_DEVICE, d_buf, PETSC_MEMTYPE_HOST, h_buf, count)); 333 PetscCall(PetscLogCpuToGpu(count)); 334 } 335 } 336 PetscFunctionReturn(PETSC_SUCCESS); 337 } 338 339 /* Make sure root/leafbuf for the remote is ready for MPI */ 340 static inline PetscErrorCode PetscSFLinkSyncStreamBeforeCallMPI(PetscSF sf, PetscSFLink link, PetscSFDirection direction) 341 { 342 PetscSF_Basic *bas; 343 PetscInt buflen; 344 PetscMemType mtype; 345 346 PetscFunctionBegin; 347 if (direction == PETSCSF_ROOT2LEAF) { 348 bas = (PetscSF_Basic *)sf->data; 349 mtype = link->rootmtype; 350 buflen = bas->rootbuflen[PETSCSF_REMOTE]; 351 } else { 352 mtype = link->leafmtype; 353 buflen = sf->leafbuflen[PETSCSF_REMOTE]; 354 } 355 356 if (PetscMemTypeDevice(mtype) && buflen) PetscCall((*link->SyncStream)(link)); 357 PetscFunctionReturn(PETSC_SUCCESS); 358 } 359 #else /* Host only */ 360 #define PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(a, b, c) PETSC_SUCCESS 361 #define PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(a, b, c) PETSC_SUCCESS 362 #define PetscSFLinkSyncStreamBeforeCallMPI(a, b, c) PETSC_SUCCESS 363 #endif 364 365 /* Get root indices used for pack/unpack 366 367 Input arguments: 368 +sf - StarForest 369 .link - The link, which provides the stream for the async memcpy (In SF, we make all GPU operations asynchronous to avoid unexpected pipeline stalls) 370 .mtype - In what type of memory? (PETSC_MEMTYPE_DEVICE or PETSC_MEMTYPE_HOST) 371 -scope - Which part of the indices? (PETSCSF_LOCAL or PETSCSF_REMOTE) 372 373 Output arguments: 374 +count - Count of indices 375 .start - The first index (only useful when indices is NULL) 376 .opt - Packing optimizations 377 -indices - Indices of roots for pack/unpack. NULL means indices are contiguous 378 */ 379 static inline PetscErrorCode PetscSFLinkGetRootPackOptAndIndices(PetscSF sf, PetscSFLink link, PetscMemType mtype, PetscSFScope scope, PetscInt *count, PetscInt *start, PetscSFPackOpt *opt, const PetscInt **indices) 380 { 381 PetscSF_Basic *bas = (PetscSF_Basic *)sf->data; 382 PetscInt offset; 383 384 PetscFunctionBegin; 385 *count = bas->rootbuflen[scope]; 386 *start = bas->rootstart[scope]; 387 *opt = NULL; 388 *indices = NULL; 389 390 /* We have these rules: 391 1) opt == NULL && indices == NULL ==> indices are contiguous. 392 2) opt != NULL ==> indices are in 3D but not contiguous. On host, indices != NULL since indices are already available and we do not 393 want to enforce all operations to use opt; but on device, indices = NULL since we do not want to copy indices to device. 394 */ 395 if (!bas->rootcontig[scope]) { 396 offset = (scope == PETSCSF_LOCAL) ? 0 : bas->ioffset[bas->ndiranks]; 397 if (PetscMemTypeHost(mtype)) { 398 *opt = bas->rootpackopt[scope]; 399 *indices = bas->irootloc + offset; 400 } else { 401 size_t size; 402 if (bas->rootpackopt[scope]) { 403 if (!bas->rootpackopt_d[scope]) { 404 PetscCall(PetscMalloc1(1, &bas->rootpackopt_d[scope])); 405 PetscCall(PetscArraycpy(bas->rootpackopt_d[scope], bas->rootpackopt[scope], 1)); /* Make pointers in bas->rootpackopt_d[] still work on host */ 406 size = (bas->rootpackopt[scope]->n * 7 + 2) * sizeof(PetscInt); /* See comments at struct _n_PetscSFPackOpt*/ 407 PetscCall(PetscSFMalloc(sf, PETSC_MEMTYPE_DEVICE, size, (void **)&bas->rootpackopt_d[scope]->array)); 408 PetscCall((*link->Memcpy)(link, PETSC_MEMTYPE_DEVICE, bas->rootpackopt_d[scope]->array, PETSC_MEMTYPE_HOST, bas->rootpackopt[scope]->array, size)); 409 } 410 *opt = bas->rootpackopt_d[scope]; 411 } else { /* On device, we only provide indices when there is no optimization. We're reluctant to copy indices to device. */ 412 if (!bas->irootloc_d[scope]) { 413 size = bas->rootbuflen[scope] * sizeof(PetscInt); 414 PetscCall(PetscSFMalloc(sf, PETSC_MEMTYPE_DEVICE, size, (void **)&bas->irootloc_d[scope])); 415 PetscCall((*link->Memcpy)(link, PETSC_MEMTYPE_DEVICE, bas->irootloc_d[scope], PETSC_MEMTYPE_HOST, bas->irootloc + offset, size)); 416 } 417 *indices = bas->irootloc_d[scope]; 418 } 419 } 420 } 421 PetscFunctionReturn(PETSC_SUCCESS); 422 } 423 424 /* Get leaf indices used for pack/unpack 425 426 See also PetscSFLinkGetRootPackOptAndIndices() 427 */ 428 static inline PetscErrorCode PetscSFLinkGetLeafPackOptAndIndices(PetscSF sf, PetscSFLink link, PetscMemType mtype, PetscSFScope scope, PetscInt *count, PetscInt *start, PetscSFPackOpt *opt, const PetscInt **indices) 429 { 430 PetscInt offset; 431 432 PetscFunctionBegin; 433 *count = sf->leafbuflen[scope]; 434 *start = sf->leafstart[scope]; 435 *opt = NULL; 436 *indices = NULL; 437 if (!sf->leafcontig[scope]) { 438 offset = (scope == PETSCSF_LOCAL) ? 0 : sf->roffset[sf->ndranks]; 439 if (PetscMemTypeHost(mtype)) { 440 *opt = sf->leafpackopt[scope]; 441 *indices = sf->rmine + offset; 442 } else { 443 size_t size; 444 if (sf->leafpackopt[scope]) { 445 if (!sf->leafpackopt_d[scope]) { 446 PetscCall(PetscMalloc1(1, &sf->leafpackopt_d[scope])); 447 PetscCall(PetscArraycpy(sf->leafpackopt_d[scope], sf->leafpackopt[scope], 1)); 448 size = (sf->leafpackopt[scope]->n * 7 + 2) * sizeof(PetscInt); /* See comments at struct _n_PetscSFPackOpt*/ 449 PetscCall(PetscSFMalloc(sf, PETSC_MEMTYPE_DEVICE, size, (void **)&sf->leafpackopt_d[scope]->array)); /* Change ->array to a device pointer */ 450 PetscCall((*link->Memcpy)(link, PETSC_MEMTYPE_DEVICE, sf->leafpackopt_d[scope]->array, PETSC_MEMTYPE_HOST, sf->leafpackopt[scope]->array, size)); 451 } 452 *opt = sf->leafpackopt_d[scope]; 453 } else { 454 if (!sf->rmine_d[scope]) { 455 size = sf->leafbuflen[scope] * sizeof(PetscInt); 456 PetscCall(PetscSFMalloc(sf, PETSC_MEMTYPE_DEVICE, size, (void **)&sf->rmine_d[scope])); 457 PetscCall((*link->Memcpy)(link, PETSC_MEMTYPE_DEVICE, sf->rmine_d[scope], PETSC_MEMTYPE_HOST, sf->rmine + offset, size)); 458 } 459 *indices = sf->rmine_d[scope]; 460 } 461 } 462 } 463 PetscFunctionReturn(PETSC_SUCCESS); 464 } 465