1 #if !defined(__SFPACK_H) 2 #define __SFPACK_H 3 4 #include <../src/vec/is/sf/impls/basic/sfbasic.h> 5 6 /* We separate SF communications for SFBasic and SFNeighbor in two parts: local (self,intra-rank) and remote (inter-rank) */ 7 typedef enum {PETSCSF_LOCAL=0, PETSCSF_REMOTE} PetscSFScope; 8 9 /* Optimization plans in packing & unpacking for destination ranks. 10 11 Suppose there are count indices stored in idx[], and two addresses u, p. We want to do packing: 12 p[i] = u[idx[i]], for i in [0,count) 13 14 Often, the indices are associated with n ranks. Each rank's indices are stored consecutively in idx[]. 15 We analyze indices for each rank and see if they are patterns that can be used to optimize the packing. 16 The result is stored in PetscSFPackOpt. Packing for a rank might be non-optimizable, or optimized into 17 a small number of contiguous memory copies or one strided memory copy. 18 19 Note before using this per-rank optimization, one should check rleafloccontig, irootloccontig, which say 20 indices in whole start from 0 and are contiguous, and therefore much useful when true. 21 */ 22 typedef enum {PETSCSF_PACKOPT_NONE=0, PETSCSF_PACKOPT_MULTICOPY, PETSCSF_PACKOPT_STRIDE} PetscSFPackOptType; 23 struct _n_PetscSFPackOpt { 24 PetscInt n; /* Number of destination ranks */ 25 PetscSFPackOptType *type; /* [n] Optimization types for the n ranks */ 26 PetscInt *offset; /* [n+1] Indices for i-th rank are in [offset[i],offset[i+1]) of idx[] */ 27 PetscInt *copy_offset; /* [n+1] If type[i] = PETSCSF_PACKOPT_MULTICOPY, packing for i-th rank is optimized into copies numbered between [copy_offset[i],copy_offset[i+1]) */ 28 PetscInt *copy_start; /* [*] j-th copy starts at copy_start[j] in idx[]. In other words, there are copy_length[j] contiguous indices */ 29 PetscInt *copy_length; /* [*] starting at idx[copy_start[j]] */ 30 PetscInt *stride_step; /* [n] If type[i] = PETSCSF_PACKOPT_STRIDE, then packing for i-th rank is strided, with first index being idx[offset[i]] and step stride_step[i], */ 31 PetscInt *stride_n; /* [n] and total stride_n[i] steps */ 32 }; 33 34 /* An abstract class that defines a communication link, which includes how to pack/unpack data and send/recv buffers 35 */ 36 struct _n_PetscSFLink { /* link, count, start, indices, pack plan, root/leafdata, buffer*/ 37 PetscErrorCode (*h_Pack) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,const void*,void*); 38 PetscErrorCode (*h_UnpackAndInsert) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 39 PetscErrorCode (*h_UnpackAndAdd) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 40 PetscErrorCode (*h_UnpackAndMin) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 41 PetscErrorCode (*h_UnpackAndMax) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 42 PetscErrorCode (*h_UnpackAndMinloc) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 43 PetscErrorCode (*h_UnpackAndMaxloc) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 44 PetscErrorCode (*h_UnpackAndMult) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 45 PetscErrorCode (*h_UnpackAndLAND) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 46 PetscErrorCode (*h_UnpackAndBAND) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 47 PetscErrorCode (*h_UnpackAndLOR) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 48 PetscErrorCode (*h_UnpackAndBOR) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 49 PetscErrorCode (*h_UnpackAndLXOR) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 50 PetscErrorCode (*h_UnpackAndBXOR) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 51 PetscErrorCode (*h_FetchAndAdd) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*, void*); 52 /* link, count, startx, idx, xdata, starty, idy, ydata */ 53 PetscErrorCode (*h_ScatterAndInsert)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 54 PetscErrorCode (*h_ScatterAndAdd) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 55 PetscErrorCode (*h_ScatterAndMin) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 56 PetscErrorCode (*h_ScatterAndMax) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 57 PetscErrorCode (*h_ScatterAndMinloc)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 58 PetscErrorCode (*h_ScatterAndMaxloc)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 59 PetscErrorCode (*h_ScatterAndMult) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 60 PetscErrorCode (*h_ScatterAndLAND) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 61 PetscErrorCode (*h_ScatterAndBAND) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 62 PetscErrorCode (*h_ScatterAndLOR) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 63 PetscErrorCode (*h_ScatterAndBOR) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 64 PetscErrorCode (*h_ScatterAndLXOR) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 65 PetscErrorCode (*h_ScatterAndBXOR) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 66 /* link, count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,leafupdate*/ 67 PetscErrorCode (*h_FetchAndAddLocal)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,void*,PetscInt,const PetscInt*,const void*,void*); 68 69 PetscBool deviceinited; /* Are device related fields initialized? */ 70 #if defined(PETSC_HAVE_CUDA) 71 /* These fields are lazily initialized in a sense that only when device pointers are passed to an SF, the SF 72 will set them, otherwise it just leaves them alone even though PETSC_HAVE_CUDA. Packing routines using 73 regular ops when there are no data race chances. 74 */ 75 PetscErrorCode (*d_Pack) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,const void*,void*); 76 77 PetscErrorCode (*d_UnpackAndInsert) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 78 PetscErrorCode (*d_UnpackAndAdd) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 79 PetscErrorCode (*d_UnpackAndMin) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 80 PetscErrorCode (*d_UnpackAndMax) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 81 PetscErrorCode (*d_UnpackAndMinloc) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 82 PetscErrorCode (*d_UnpackAndMaxloc) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 83 PetscErrorCode (*d_UnpackAndMult) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 84 PetscErrorCode (*d_UnpackAndLAND) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 85 PetscErrorCode (*d_UnpackAndBAND) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 86 PetscErrorCode (*d_UnpackAndLOR) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 87 PetscErrorCode (*d_UnpackAndBOR) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 88 PetscErrorCode (*d_UnpackAndLXOR) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 89 PetscErrorCode (*d_UnpackAndBXOR) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 90 PetscErrorCode (*d_FetchAndAdd) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*, void*); 91 92 PetscErrorCode (*d_ScatterAndInsert)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 93 PetscErrorCode (*d_ScatterAndAdd) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 94 PetscErrorCode (*d_ScatterAndMin) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 95 PetscErrorCode (*d_ScatterAndMax) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 96 PetscErrorCode (*d_ScatterAndMinloc)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 97 PetscErrorCode (*d_ScatterAndMaxloc)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 98 PetscErrorCode (*d_ScatterAndMult) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 99 PetscErrorCode (*d_ScatterAndLAND) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 100 PetscErrorCode (*d_ScatterAndBAND) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 101 PetscErrorCode (*d_ScatterAndLOR) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 102 PetscErrorCode (*d_ScatterAndBOR) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 103 PetscErrorCode (*d_ScatterAndLXOR) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 104 PetscErrorCode (*d_ScatterAndBXOR) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 105 PetscErrorCode (*d_FetchAndAddLocal)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,void*,PetscInt,const PetscInt*,const void*,void*); 106 107 /* Packing routines using atomics when there are data race chances */ 108 PetscErrorCode (*da_UnpackAndInsert)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 109 PetscErrorCode (*da_UnpackAndAdd) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 110 PetscErrorCode (*da_UnpackAndMin) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 111 PetscErrorCode (*da_UnpackAndMax) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 112 PetscErrorCode (*da_UnpackAndMinloc)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 113 PetscErrorCode (*da_UnpackAndMaxloc)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 114 PetscErrorCode (*da_UnpackAndMult) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 115 PetscErrorCode (*da_UnpackAndLAND) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 116 PetscErrorCode (*da_UnpackAndBAND) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 117 PetscErrorCode (*da_UnpackAndLOR) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 118 PetscErrorCode (*da_UnpackAndBOR) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 119 PetscErrorCode (*da_UnpackAndLXOR) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 120 PetscErrorCode (*da_UnpackAndBXOR) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 121 PetscErrorCode (*da_FetchAndAdd) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*, void*); 122 123 PetscErrorCode (*da_ScatterAndInsert)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 124 PetscErrorCode (*da_ScatterAndAdd) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 125 PetscErrorCode (*da_ScatterAndMin) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 126 PetscErrorCode (*da_ScatterAndMax) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 127 PetscErrorCode (*da_ScatterAndMinloc)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 128 PetscErrorCode (*da_ScatterAndMaxloc)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 129 PetscErrorCode (*da_ScatterAndMult) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 130 PetscErrorCode (*da_ScatterAndLAND) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 131 PetscErrorCode (*da_ScatterAndBAND) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 132 PetscErrorCode (*da_ScatterAndLOR) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 133 PetscErrorCode (*da_ScatterAndBOR) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 134 PetscErrorCode (*da_ScatterAndLXOR) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 135 PetscErrorCode (*da_ScatterAndBXOR) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 136 PetscErrorCode (*da_FetchAndAddLocal)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,void*,PetscInt,const PetscInt*,const void*,void*); 137 138 PetscInt maxResidentThreadsPerGPU; /* It is a copy from SF for convenience */ 139 cudaStream_t stream; /* Stream to launch pack/unapck kernels if not using the default stream */ 140 #endif 141 PetscMPIInt tag; /* Each link has a tag so we can perform multiple SF ops at the same time */ 142 MPI_Datatype unit; /* The MPI datatype this PetscSFLink is built for */ 143 MPI_Datatype basicunit; /* unit is made of MPI builtin dataype basicunit */ 144 PetscBool isbuiltin; /* Is unit an MPI/PETSc builtin datatype? If it is true, then bs=1 and basicunit is equivalent to unit */ 145 size_t unitbytes; /* Number of bytes in a unit */ 146 PetscInt bs; /* Number of basic units in a unit */ 147 const void *rootdata,*leafdata; /* rootdata and leafdata the link is working on. They are used as keys for pending links. */ 148 PetscMemType rootmtype,leafmtype; /* root/leafdata's memory type */ 149 150 /* For local and remote communication */ 151 PetscMemType rootmtype_mpi,leafmtype_mpi; /* Mtypes of buffers passed to MPI. If use_gpu_aware_mpi, they are same as root/leafmtype. Otherwise they are PETSC_MEMTYPE_HOST */ 152 PetscBool rootdirect[2],leafdirect[2]; /* Can root/leafdata be directly passed to SF (i.e., without buffering). In layout of [PETSCSF_LOCAL/REMOTE]. See more in PetscSFLinkCreate() */ 153 PetscInt rootdirect_mpi,leafdirect_mpi;/* Can root/leafdata for remote be directly passed to MPI? 1: yes, 0: no. See more in PetscSFLinkCreate() */ 154 const void *rootdatadirect[2][2]; /* The root/leafdata used to init root/leaf requests, in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE]. */ 155 const void *leafdatadirect[2][2]; /* ... We need them to look up links when root/leafdirect_mpi are true */ 156 char *rootbuf[2][2]; /* Buffers for packed roots, in layout of [PETSCSF_LOCAL/REMOTE][PETSC_MEMTYPE] */ 157 char *rootbuf_alloc[2][2]; /* Log memory allocated by petsc. We need it since rootbuf[][] may point to rootdata given by user */ 158 char *leafbuf[2][2]; /* Buffers for packed leaves, in layout of [PETSCSF_LOCAL/REMOTE][PETSC_MEMTYPE] */ 159 char *leafbuf_alloc[2][2]; 160 MPI_Request *rootreqs[2][2][2]; /* Root requests in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][rootdirect_mpi] */ 161 MPI_Request *leafreqs[2][2][2]; /* Leaf requests in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][leafdirect_mpi] */ 162 PetscBool rootreqsinited[2][2][2]; /* Are root requests initialized? Also in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][rootdirect_mpi]*/ 163 PetscBool leafreqsinited[2][2][2]; /* Are leaf requests initialized? Also in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][leafdirect_mpi]*/ 164 MPI_Request *reqs; /* An array of length (nrootreqs+nleafreqs)*8. Pointers in rootreqs[][][] and leafreqs[][][] point here */ 165 PetscSFLink next; 166 }; 167 168 #if defined(PETSC_USE_DEBUG) 169 PETSC_INTERN PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF,MPI_Datatype,const void*,const void*); 170 #else 171 #define PetscSFSetErrorOnUnsupportedOverlap(a,b,c,d) 0 172 #endif 173 174 /* Create/setup/retrieve/destroy a link */ 175 PETSC_INTERN PetscErrorCode PetscSFLinkCreate(PetscSF,MPI_Datatype,PetscMemType,const void*,PetscMemType,const void*,MPI_Op,PetscSFOperation,PetscSFLink*); 176 PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_Host(PetscSF,PetscSFLink,MPI_Datatype); 177 #if defined(PETSC_HAVE_CUDA) 178 PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_Device(PetscSF,PetscSFLink,MPI_Datatype); 179 #else 180 #define PetscSFLinkSetUp_Device(a,b,c) 0 181 #endif 182 PETSC_INTERN PetscErrorCode PetscSFLinkGetInUse(PetscSF,MPI_Datatype,const void*,const void*,PetscCopyMode,PetscSFLink*); 183 PETSC_INTERN PetscErrorCode PetscSFLinkReclaim(PetscSF,PetscSFLink*); 184 PETSC_INTERN PetscErrorCode PetscSFLinkDestroy(PetscSF,PetscSFLink*); 185 186 /* Get pack/unpack function pointers from a link */ 187 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkGetPack(PetscSFLink link,PetscMemType mtype,PetscErrorCode (**Pack)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,const void*,void*)) 188 { 189 PetscFunctionBegin; 190 if (mtype == PETSC_MEMTYPE_HOST) *Pack = link->h_Pack; 191 #if defined(PETSC_HAVE_CUDA) 192 else *Pack = link->d_Pack; 193 #endif 194 PetscFunctionReturn(0); 195 } 196 PETSC_INTERN PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*)); 197 PETSC_INTERN PetscErrorCode PetscSFLinkGetFetchAndOp (PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**FetchAndOp) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,void*)); 198 PETSC_INTERN PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*)); 199 PETSC_INTERN PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,void*,PetscInt,const PetscInt*,const void*,void*)); 200 PETSC_INTERN PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF,PetscSFLink,PetscSFDirection,void**,void**,MPI_Request**,MPI_Request**); 201 202 /* Do Pack/Unpack/Fetch/Scatter with the link */ 203 PETSC_INTERN PetscErrorCode PetscSFLinkPackRootData (PetscSF,PetscSFLink,PetscSFScope,const void*); 204 PETSC_INTERN PetscErrorCode PetscSFLinkPackLeafData (PetscSF,PetscSFLink,PetscSFScope,const void*); 205 PETSC_INTERN PetscErrorCode PetscSFLinkUnpackRootData(PetscSF,PetscSFLink,PetscSFScope,void*,MPI_Op); 206 PETSC_INTERN PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF,PetscSFLink,PetscSFScope,void*,MPI_Op); 207 PETSC_INTERN PetscErrorCode PetscSFLinkFetchRootData (PetscSF,PetscSFLink,PetscSFScope,void*,MPI_Op); 208 209 PETSC_INTERN PetscErrorCode PetscSFLinkBcastAndOpLocal(PetscSF,PetscSFLink,const void*,void*,MPI_Op); 210 PETSC_INTERN PetscErrorCode PetscSFLinkReduceLocal(PetscSF,PetscSFLink,const void*,void*,MPI_Op); 211 PETSC_INTERN PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF,PetscSFLink,void*,const void*,void*,MPI_Op); 212 213 PETSC_INTERN PetscErrorCode PetscSFSetUpPackFields(PetscSF sf); 214 PETSC_INTERN PetscErrorCode PetscSFResetPackFields(PetscSF sf); 215 PETSC_INTERN PetscErrorCode PetscSFCreatePackOpt(PetscInt,const PetscInt*,const PetscInt*,PetscSFPackOpt*); 216 PETSC_INTERN PetscErrorCode PetscSFDestroyPackOpt(PetscSFPackOpt *out); 217 218 219 /* Get root indices used for pack/unpack 220 221 Input arguments: 222 +sf - StarForest 223 .link - The link, which provides the stream for the async memcpy (In SF, we make all GPU operations asynchronous to avoid unexpected pipeline stalls) 224 .scope - Which part of the indices? (PETSCSF_LOCAL or PETSCSF_REMOTE) 225 .mtype - In what type of memory? (PETSC_MEMTYPE_DEVICE or PETSC_MEMTYPE_HOST) 226 227 Output arguments: 228 +count - Count of indices 229 .start - The first index (only useful when indices is NULL) 230 -indices - indices of roots for pack/unpack. NULL means indices are contiguous 231 */ 232 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkGetRootIndices(PetscSF sf,PetscSFLink link,PetscMemType mtype,PetscSFScope scope,PetscInt *count,PetscInt *start,const PetscInt **indices) 233 { 234 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 235 PetscInt offset; 236 237 PetscFunctionBegin; 238 if (count) *count = bas->rootbuflen[scope]; 239 if (start) *start = bas->rootstart[scope]; 240 if (indices) { 241 if (bas->rootcontig[scope]) *indices = NULL; 242 else { 243 offset = (scope == PETSCSF_LOCAL)? 0 : bas->ioffset[bas->ndiranks]; 244 if (mtype == PETSC_MEMTYPE_HOST) {*indices = bas->irootloc + offset;} 245 #if defined(PETSC_HAVE_CUDA) 246 else { 247 if (!bas->irootloc_d[scope]) { 248 cudaError_t cerr; 249 size_t size = bas->rootbuflen[scope]*sizeof(PetscInt); 250 cerr = cudaMalloc((void **)&bas->irootloc_d[scope],size);CHKERRCUDA(cerr); 251 cerr = cudaMemcpyAsync(bas->irootloc_d[scope],bas->irootloc+offset,size,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr); 252 } 253 *indices = bas->irootloc_d[scope]; 254 } 255 #endif 256 } 257 } 258 PetscFunctionReturn(0); 259 } 260 261 /* Get leaf indices used for pack/unpack 262 263 See also PetscSFLinkGetRootIndices() 264 */ 265 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkGetLeafIndices(PetscSF sf,PetscSFLink link,PetscMemType mtype,PetscSFScope scope,PetscInt *count,PetscInt *start,const PetscInt **indices) 266 { 267 PetscInt offset; 268 269 PetscFunctionBegin; 270 if (count) *count = sf->leafbuflen[scope]; 271 if (start) *start = sf->leafstart[scope]; 272 if (indices) { 273 if (sf->leafcontig[scope]) *indices = NULL; 274 else { 275 offset = (scope == PETSCSF_LOCAL)? 0 : sf->roffset[sf->ndranks]; 276 if (mtype == PETSC_MEMTYPE_HOST) {*indices = sf->rmine + offset;} 277 #if defined(PETSC_HAVE_CUDA) 278 else { 279 if (!sf->rmine_d[scope]) { 280 cudaError_t cerr; 281 size_t size = sf->leafbuflen[scope]*sizeof(PetscInt); 282 cerr = cudaMalloc((void **)&sf->rmine_d[scope],size);CHKERRCUDA(cerr); 283 cerr = cudaMemcpyAsync(sf->rmine_d[scope],sf->rmine+offset,size,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr); 284 } 285 *indices = sf->rmine_d[scope]; 286 } 287 #endif 288 } 289 } 290 PetscFunctionReturn(0); 291 } 292 293 PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkMPIWaitall(PetscSF sf,PetscSFLink link,PetscSFDirection direction) 294 { 295 PetscErrorCode ierr; 296 PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 297 const PetscMemType rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; 298 const PetscInt rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi; 299 300 PetscFunctionBegin; 301 ierr = MPI_Waitall(bas->nrootreqs,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi],MPI_STATUSES_IGNORE);CHKERRQ(ierr); 302 ierr = MPI_Waitall(sf->nleafreqs, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi],MPI_STATUSES_IGNORE);CHKERRQ(ierr); 303 PetscFunctionReturn(0); 304 } 305 306 #endif 307