#include "petsc/private/sfimpl.h" #include <../src/vec/is/sf/impls/basic/sfpack.h> #include <../src/vec/is/sf/impls/basic/sfbasic.h> /* This is a C file that contains packing facilities, with dispatches to device if enabled. */ #if defined(PETSC_HAVE_CUDA) #include #include #endif #if defined(PETSC_HAVE_HIP) #include #include #endif /* * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction", * therefore we pack data types manually. This file defines packing routines for the standard data types. */ #define CPPJoin4(a,b,c,d) a##_##b##_##c##_##d /* Operations working like s += t */ #define OP_BINARY(op,s,t) do {(s) = (s) op (t); } while (0) /* binary ops in the middle such as +, *, && etc. */ #define OP_FUNCTION(op,s,t) do {(s) = op((s),(t)); } while (0) /* ops like a function, such as PetscMax, PetscMin */ #define OP_LXOR(op,s,t) do {(s) = (!(s)) != (!(t));} while (0) /* logical exclusive OR */ #define OP_ASSIGN(op,s,t) do {(s) = (t);} while (0) /* Ref MPI MAXLOC */ #define OP_XLOC(op,s,t) \ do { \ if ((s).u == (t).u) (s).i = PetscMin((s).i,(t).i); \ else if (!((s).u op (t).u)) s = t; \ } while (0) /* DEF_PackFunc - macro defining a Pack routine Arguments of the macro: +Type Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry. .BS Block size for vectorization. It is a factor of bsz. -EQ (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below. Arguments of the Pack routine: +count Number of indices in idx[]. .start When opt and idx are NULL, it means indices are contiguous & start is the first index; otherwise, not used. .opt Per-pack optimization plan. NULL means no such plan. .idx Indices of entries to packed. .link Provide a context for the current call, such as link->bs, number of basic types in an entry. Ex. if unit is MPI_2INT, then bs=2 and the basic type is int. .unpacked Address of the unpacked data. The entries will be packed are unpacked[idx[i]],for i in [0,count). -packed Address of the packed data. */ #define DEF_PackFunc(Type,BS,EQ) \ static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *unpacked,void *packed) \ { \ PetscErrorCode ierr; \ const Type *u = (const Type*)unpacked,*u2; \ Type *p = (Type*)packed,*p2; \ PetscInt i,j,k,X,Y,r,bs=link->bs; \ const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \ PetscFunctionBegin; \ if (!idx) {ierr = PetscArraycpy(p,u+start*MBS,MBS*count);CHKERRQ(ierr);}/* idx[] are contiguous */ \ else if (opt) { /* has optimizations available */ \ p2 = p; \ for (r=0; rn; r++) { \ u2 = u + opt->start[r]*MBS; \ X = opt->X[r]; \ Y = opt->Y[r]; \ for (k=0; kdz[r]; k++) \ for (j=0; jdy[r]; j++) { \ ierr = PetscArraycpy(p2,u2+(X*Y*k+X*j)*MBS,opt->dx[r]*MBS);CHKERRQ(ierr); \ p2 += opt->dx[r]*MBS; \ } \ } \ } else { \ for (i=0; ibs; \ const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \ const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. 
/* DEF_UnpackFunc - macro defining an UnpackAndInsert routine. See DEF_PackFunc for the meaning of the arguments. */
#define DEF_UnpackFunc(Type,BS,EQ) \
  static PetscErrorCode CPPJoin4(UnpackAndInsert,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
  { \
    PetscErrorCode ierr; \
    Type           *u = (Type*)unpacked,*u2; \
    const Type     *p = (const Type*)packed; \
    PetscInt       i,j,k,X,Y,r,bs = link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
    const PetscInt MBS = M*BS;             /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
    PetscFunctionBegin; \
    if (!idx) { \
      u += start*MBS; \
      if (u != p) {ierr = PetscArraycpy(u,p,count*MBS);CHKERRQ(ierr);} \
    } else if (opt) { /* has optimizations available */ \
      for (r=0; r<opt->n; r++) { \
        u2 = u + opt->start[r]*MBS; \
        X  = opt->X[r]; \
        Y  = opt->Y[r]; \
        for (k=0; k<opt->dz[r]; k++) \
          for (j=0; j<opt->dy[r]; j++) { \
            ierr = PetscArraycpy(u2+(X*Y*k+X*j)*MBS,p,opt->dx[r]*MBS);CHKERRQ(ierr); \
            p += opt->dx[r]*MBS; \
          } \
      } \
    } else { \
      for (i=0; i<count; i++) \
        for (j=0; j<M; j++) \
          for (k=0; k<BS; k++) u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k]; \
    } \
    PetscFunctionReturn(0); \
  }

/* DEF_UnpackAndOp - macro defining an UnpackAndOp routine, where Op is a reduction such as Add */
#define DEF_UnpackAndOp(Type,BS,EQ,Opname,Op,OpApply) \
  static PetscErrorCode CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
  { \
    Type           *u = (Type*)unpacked,*u2; \
    const Type     *p = (const Type*)packed; \
    PetscInt       i,j,k,X,Y,r,bs = link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
    const PetscInt MBS = M*BS;             /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
    PetscFunctionBegin; \
    if (!idx) { \
      u += start*MBS; \
      for (i=0; i<count; i++) \
        for (j=0; j<M; j++) \
          for (k=0; k<BS; k++) OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]); \
    } else if (opt) { /* has optimizations available */ \
      for (r=0; r<opt->n; r++) { \
        u2 = u + opt->start[r]*MBS; \
        X  = opt->X[r]; \
        Y  = opt->Y[r]; \
        for (k=0; k<opt->dz[r]; k++) \
          for (j=0; j<opt->dy[r]; j++) { \
            for (i=0; i<opt->dx[r]*MBS; i++) OpApply(Op,u2[(X*Y*k+X*j)*MBS+i],p[i]); \
            p += opt->dx[r]*MBS; \
          } \
      } \
    } else { \
      for (i=0; i<count; i++) \
        for (j=0; j<M; j++) \
          for (k=0; k<BS; k++) OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]); \
    } \
    PetscFunctionReturn(0); \
  }

/* DEF_FetchAndOp - macro defining a FetchAndOp routine: return the old values of the entries in packed,
   and reduce packed into the entries */
#define DEF_FetchAndOp(Type,BS,EQ,Opname,Op,OpApply) \
  static PetscErrorCode CPPJoin4(FetchAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,void *packed) \
  { \
    Type           *u = (Type*)unpacked,*p = (Type*)packed,tmp; \
    PetscInt       i,j,k,r,l,bs = link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs/BS; \
    const PetscInt MBS = M*BS; \
    PetscFunctionBegin; \
    for (i=0; i<count; i++) { \
      r = (!idx ? start+i : idx[i])*MBS; \
      l = i*MBS; \
      for (j=0; j<M; j++) \
        for (k=0; k<BS; k++) { \
          tmp = u[r+j*BS+k]; \
          OpApply(Op,u[r+j*BS+k],p[l+j*BS+k]); \
          p[l+j*BS+k] = tmp; \
        } \
    } \
    PetscFunctionReturn(0); \
  }

/* DEF_ScatterAndOp - macro defining a ScatterAndOp routine that reduces src entries into dst entries,
   with neither side necessarily packed */
#define DEF_ScatterAndOp(Type,BS,EQ,Opname,Op,OpApply) \
  static PetscErrorCode CPPJoin4(ScatterAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst) \
  { \
    PetscErrorCode ierr; \
    const Type     *u = (const Type*)src; \
    Type           *v = (Type*)dst; \
    PetscInt       i,j,k,X,Y,bs = link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs/BS; \
    const PetscInt MBS = M*BS; \
    PetscFunctionBegin; \
    if (!srcIdx) { /* src is contiguous */ \
      u += srcStart*MBS; \
      ierr = CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(link,count,dstStart,dstOpt,dstIdx,dst,u);CHKERRQ(ierr); \
    } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */ \
      u += srcOpt->start[0]*MBS; \
      v += dstStart*MBS; \
      X  = srcOpt->X[0]; Y = srcOpt->Y[0]; \
      for (k=0; k<srcOpt->dz[0]; k++) \
        for (j=0; j<srcOpt->dy[0]; j++) { \
          for (i=0; i<srcOpt->dx[0]*MBS; i++) OpApply(Op,v[i],u[(X*Y*k+X*j)*MBS+i]); \
          v += srcOpt->dx[0]*MBS; \
        } \
    } else { /* all other cases */ \
      for (i=0; i<count; i++) { \
        const Type *u2 = u + (srcIdx ? srcIdx[i] : srcStart+i)*MBS; \
        Type       *v2 = v + (dstIdx ? dstIdx[i] : dstStart+i)*MBS; \
        for (j=0; j<M; j++) \
          for (k=0; k<BS; k++) OpApply(Op,v2[j*BS+k],u2[j*BS+k]); \
      } \
    } \
    PetscFunctionReturn(0); \
  }

/* DEF_FetchAndOpLocal - macro defining a FetchAndOpLocal routine: leafupdate gets the old root values,
   and leafdata is reduced into rootdata */
#define DEF_FetchAndOpLocal(Type,BS,EQ,Opname,Op,OpApply) \
  static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt *rootidx,void *rootdata,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt *leafidx,const void *leafdata,void *leafupdate) \
  { \
    Type           *rdata = (Type*)rootdata,*lupdate = (Type*)leafupdate; \
    const Type     *ldata = (const Type*)leafdata; \
    PetscInt       i,j,k,r,l,bs = link->bs; \
    const PetscInt M   = (EQ) ? 1 : bs/BS; \
    const PetscInt MBS = M*BS; \
    PetscFunctionBegin; \
    for (i=0; i<count; i++) { \
      r = (rootidx ? rootidx[i] : rootstart+i)*MBS; \
      l = (leafidx ? leafidx[i] : leafstart+i)*MBS; \
      for (j=0; j<M; j++) \
        for (k=0; k<BS; k++) { \
          lupdate[l+j*BS+k] = rdata[r+j*BS+k]; \
          OpApply(Op,rdata[r+j*BS+k],ldata[l+j*BS+k]); \
        } \
    } \
    PetscFunctionReturn(0); \
  }

/* Pack, Unpack-insert and Scatter-insert routines */
#define DEF_Pack(Type,BS,EQ) \
  DEF_PackFunc(Type,BS,EQ) \
  DEF_UnpackFunc(Type,BS,EQ) \
  DEF_ScatterAndOp(Type,BS,EQ,Insert,=,OP_ASSIGN) \
  static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFLink link) { \
    link->h_Pack            = CPPJoin4(Pack,            Type,BS,EQ); \
    link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert, Type,BS,EQ); \
    link->h_ScatterAndInsert= CPPJoin4(ScatterAndInsert,Type,BS,EQ); \
  }

/* Add, Mult ops */
#define DEF_Add(Type,BS,EQ) \
  DEF_UnpackAndOp    (Type,BS,EQ,Add, +,OP_BINARY) \
  DEF_UnpackAndOp    (Type,BS,EQ,Mult,*,OP_BINARY) \
  DEF_FetchAndOp     (Type,BS,EQ,Add, +,OP_BINARY) \
  DEF_ScatterAndOp   (Type,BS,EQ,Add, +,OP_BINARY) \
  DEF_ScatterAndOp   (Type,BS,EQ,Mult,*,OP_BINARY) \
  DEF_FetchAndOpLocal(Type,BS,EQ,Add, +,OP_BINARY) \
  static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFLink link) { \
    link->h_UnpackAndAdd     = CPPJoin4(UnpackAndAdd,    Type,BS,EQ); \
    link->h_UnpackAndMult    = CPPJoin4(UnpackAndMult,   Type,BS,EQ); \
    link->h_FetchAndAdd      = CPPJoin4(FetchAndAdd,     Type,BS,EQ); \
    link->h_ScatterAndAdd    = CPPJoin4(ScatterAndAdd,   Type,BS,EQ); \
    link->h_ScatterAndMult   = CPPJoin4(ScatterAndMult,  Type,BS,EQ); \
    link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal,Type,BS,EQ); \
  }

/* Max, Min ops */
#define DEF_Cmp(Type,BS,EQ) \
  DEF_UnpackAndOp (Type,BS,EQ,Max,PetscMax,OP_FUNCTION) \
  DEF_UnpackAndOp (Type,BS,EQ,Min,PetscMin,OP_FUNCTION) \
  DEF_ScatterAndOp(Type,BS,EQ,Max,PetscMax,OP_FUNCTION) \
  DEF_ScatterAndOp(Type,BS,EQ,Min,PetscMin,OP_FUNCTION) \
  static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFLink link) { \
    link->h_UnpackAndMax  = CPPJoin4(UnpackAndMax, Type,BS,EQ); \
    link->h_UnpackAndMin  = CPPJoin4(UnpackAndMin, Type,BS,EQ); \
    link->h_ScatterAndMax = CPPJoin4(ScatterAndMax,Type,BS,EQ); \
    link->h_ScatterAndMin = CPPJoin4(ScatterAndMin,Type,BS,EQ); \
  }
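/* For instance, DEF_Add(PetscInt,1,1) above generates UnpackAndAdd_PetscInt_1_1(), whose inner loop is
   effectively (a sketch for bs = BS = 1):

     for (i=0; i<count; i++) u[idx[i]] += p[i];   -- OP_BINARY(+,s,t) expands to s = s + t

   and PackInit_Add_PetscInt_1_1() stores these routines in the link's h_UnpackAndAdd, h_FetchAndAdd,
   etc. function-pointer slots for host execution. */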
/* Logical ops. The operator in OP_LXOR should be empty but is ||. It is not used;
   it is put there to avoid the compilation warning "empty macro arguments are undefined in ISO C90" */
#define DEF_Log(Type,BS,EQ) \
  DEF_UnpackAndOp (Type,BS,EQ,LAND,&&,OP_BINARY) \
  DEF_UnpackAndOp (Type,BS,EQ,LOR, ||,OP_BINARY) \
  DEF_UnpackAndOp (Type,BS,EQ,LXOR,||, OP_LXOR) \
  DEF_ScatterAndOp(Type,BS,EQ,LAND,&&,OP_BINARY) \
  DEF_ScatterAndOp(Type,BS,EQ,LOR, ||,OP_BINARY) \
  DEF_ScatterAndOp(Type,BS,EQ,LXOR,||, OP_LXOR) \
  static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFLink link) { \
    link->h_UnpackAndLAND  = CPPJoin4(UnpackAndLAND, Type,BS,EQ); \
    link->h_UnpackAndLOR   = CPPJoin4(UnpackAndLOR,  Type,BS,EQ); \
    link->h_UnpackAndLXOR  = CPPJoin4(UnpackAndLXOR, Type,BS,EQ); \
    link->h_ScatterAndLAND = CPPJoin4(ScatterAndLAND,Type,BS,EQ); \
    link->h_ScatterAndLOR  = CPPJoin4(ScatterAndLOR, Type,BS,EQ); \
    link->h_ScatterAndLXOR = CPPJoin4(ScatterAndLXOR,Type,BS,EQ); \
  }

/* Bitwise ops */
#define DEF_Bit(Type,BS,EQ) \
  DEF_UnpackAndOp (Type,BS,EQ,BAND,&,OP_BINARY) \
  DEF_UnpackAndOp (Type,BS,EQ,BOR, |,OP_BINARY) \
  DEF_UnpackAndOp (Type,BS,EQ,BXOR,^,OP_BINARY) \
  DEF_ScatterAndOp(Type,BS,EQ,BAND,&,OP_BINARY) \
  DEF_ScatterAndOp(Type,BS,EQ,BOR, |,OP_BINARY) \
  DEF_ScatterAndOp(Type,BS,EQ,BXOR,^,OP_BINARY) \
  static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFLink link) { \
    link->h_UnpackAndBAND  = CPPJoin4(UnpackAndBAND, Type,BS,EQ); \
    link->h_UnpackAndBOR   = CPPJoin4(UnpackAndBOR,  Type,BS,EQ); \
    link->h_UnpackAndBXOR  = CPPJoin4(UnpackAndBXOR, Type,BS,EQ); \
    link->h_ScatterAndBAND = CPPJoin4(ScatterAndBAND,Type,BS,EQ); \
    link->h_ScatterAndBOR  = CPPJoin4(ScatterAndBOR, Type,BS,EQ); \
    link->h_ScatterAndBXOR = CPPJoin4(ScatterAndBXOR,Type,BS,EQ); \
  }

/* Maxloc, Minloc ops */
#define DEF_Xloc(Type,BS,EQ) \
  DEF_UnpackAndOp (Type,BS,EQ,Max,>,OP_XLOC) \
  DEF_UnpackAndOp (Type,BS,EQ,Min,<,OP_XLOC) \
  DEF_ScatterAndOp(Type,BS,EQ,Max,>,OP_XLOC) \
  DEF_ScatterAndOp(Type,BS,EQ,Min,<,OP_XLOC) \
  static void CPPJoin4(PackInit_Xloc,Type,BS,EQ)(PetscSFLink link) { \
    link->h_UnpackAndMaxloc  = CPPJoin4(UnpackAndMax, Type,BS,EQ); \
    link->h_UnpackAndMinloc  = CPPJoin4(UnpackAndMin, Type,BS,EQ); \
    link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax,Type,BS,EQ); \
    link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin,Type,BS,EQ); \
  }

#define DEF_IntegerType(Type,BS,EQ) \
  DEF_Pack(Type,BS,EQ) \
  DEF_Add(Type,BS,EQ) \
  DEF_Cmp(Type,BS,EQ) \
  DEF_Log(Type,BS,EQ) \
  DEF_Bit(Type,BS,EQ) \
  static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFLink link) { \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Logical,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link); \
  }

#define DEF_RealType(Type,BS,EQ) \
  DEF_Pack(Type,BS,EQ) \
  DEF_Add(Type,BS,EQ) \
  DEF_Cmp(Type,BS,EQ) \
  static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFLink link) { \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \
  }

#if defined(PETSC_HAVE_COMPLEX)
#define DEF_ComplexType(Type,BS,EQ) \
  DEF_Pack(Type,BS,EQ) \
  DEF_Add(Type,BS,EQ) \
  static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFLink link) { \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \
  }
#endif

#define DEF_DumbType(Type,BS,EQ) \
  DEF_Pack(Type,BS,EQ) \
  static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFLink link) { \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
  }
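/* A sketch of the pair semantics used by DEF_Xloc below: with Op being '>', OP_XLOC(>,s,t) implements
   MPI_MAXLOC. If s.u == t.u, keep the smaller index s.i; otherwise, if !(s.u > t.u), replace s by t.
   E.g., for PairType(int,int) values s = {5,2} and t = {5,0}, the result is {5,0}. */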
/* Maxloc, Minloc */
#define DEF_PairType(Type,BS,EQ) \
  DEF_Pack(Type,BS,EQ) \
  DEF_Xloc(Type,BS,EQ) \
  static void CPPJoin4(PackInit_PairType,Type,BS,EQ)(PetscSFLink link) { \
    CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
    CPPJoin4(PackInit_Xloc,Type,BS,EQ)(link); \
  }

DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT  */
DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */
DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */
DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */
DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */
DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */
DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */
DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */

#if defined(PETSC_USE_64BIT_INDICES) /* No need (though it is OK) to generate redundant functions if PetscInt is just int */
DEF_IntegerType(int,1,1)
DEF_IntegerType(int,2,1)
DEF_IntegerType(int,4,1)
DEF_IntegerType(int,8,1)
DEF_IntegerType(int,1,0)
DEF_IntegerType(int,2,0)
DEF_IntegerType(int,4,0)
DEF_IntegerType(int,8,0)
#endif

/* The typedefs are used to get a typename without spaces that CPPJoin4 can handle */
typedef signed char SignedChar;
DEF_IntegerType(SignedChar,1,1)
DEF_IntegerType(SignedChar,2,1)
DEF_IntegerType(SignedChar,4,1)
DEF_IntegerType(SignedChar,8,1)
DEF_IntegerType(SignedChar,1,0)
DEF_IntegerType(SignedChar,2,0)
DEF_IntegerType(SignedChar,4,0)
DEF_IntegerType(SignedChar,8,0)

typedef unsigned char UnsignedChar;
DEF_IntegerType(UnsignedChar,1,1)
DEF_IntegerType(UnsignedChar,2,1)
DEF_IntegerType(UnsignedChar,4,1)
DEF_IntegerType(UnsignedChar,8,1)
DEF_IntegerType(UnsignedChar,1,0)
DEF_IntegerType(UnsignedChar,2,0)
DEF_IntegerType(UnsignedChar,4,0)
DEF_IntegerType(UnsignedChar,8,0)

DEF_RealType(PetscReal,1,1)
DEF_RealType(PetscReal,2,1)
DEF_RealType(PetscReal,4,1)
DEF_RealType(PetscReal,8,1)
DEF_RealType(PetscReal,1,0)
DEF_RealType(PetscReal,2,0)
DEF_RealType(PetscReal,4,0)
DEF_RealType(PetscReal,8,0)

#if defined(PETSC_HAVE_COMPLEX)
DEF_ComplexType(PetscComplex,1,1)
DEF_ComplexType(PetscComplex,2,1)
DEF_ComplexType(PetscComplex,4,1)
DEF_ComplexType(PetscComplex,8,1)
DEF_ComplexType(PetscComplex,1,0)
DEF_ComplexType(PetscComplex,2,0)
DEF_ComplexType(PetscComplex,4,0)
DEF_ComplexType(PetscComplex,8,0)
#endif

#define PairType(Type1,Type2) Type1##_##Type2
typedef struct {int      u; int      i;} PairType(int,int);
typedef struct {PetscInt u; PetscInt i;} PairType(PetscInt,PetscInt);
DEF_PairType(PairType(int,int),1,1)
DEF_PairType(PairType(PetscInt,PetscInt),1,1)

/* If we don't know the basic type, we treat it as a stream of chars or ints */
DEF_DumbType(char,1,1)
DEF_DumbType(char,2,1)
DEF_DumbType(char,4,1)
DEF_DumbType(char,1,0)
DEF_DumbType(char,2,0)
DEF_DumbType(char,4,0)

typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */
DEF_DumbType(DumbInt,1,1)
DEF_DumbType(DumbInt,2,1)
DEF_DumbType(DumbInt,4,1)
DEF_DumbType(DumbInt,8,1)
DEF_DumbType(DumbInt,1,0)
DEF_DumbType(DumbInt,2,0)
DEF_DumbType(DumbInt,4,0)
DEF_DumbType(DumbInt,8,0)

#if !defined(PETSC_HAVE_MPI_TYPE_DUP)
PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype)
{
  int ierr;
  ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr;
  ierr = MPI_Type_commit(newtype); if (ierr) return ierr;
  return MPI_SUCCESS;
}
#endif
/* The routine creates a communication link for the given operation. It first looks up its link cache; if
   there is a free & suitable one, it uses it, otherwise it creates a new one.

   A link contains buffers and MPI requests for send/recv. It also contains pack/unpack routines to pack/unpack
   root/leafdata to/from these buffers. Buffers are allocated at our discretion: when we find root/leafdata can
   be directly passed to MPI, we won't allocate them, and even when we allocate buffers, we only allocate those
   needed by the given `sfop` and `op`. In other words, we do lazy memory allocation. The routine also allocates
   buffers on the CPU when one does not use gpu-aware MPI but the data is on the GPU.

   In SFBasic, MPI requests are persistent. They are not init'ed until we try to get requests from a link.

   The routine is shared by SFBasic and SFNeighbor, based on the fact that both deal with sparse graphs and
   need to pack/unpack data.
*/
PetscErrorCode PetscSFLinkCreate(PetscSF sf,MPI_Datatype unit,PetscMemType xrootmtype,const void *rootdata,PetscMemType xleafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink)
{
  PetscErrorCode   ierr;
  PetscSF_Basic    *bas = (PetscSF_Basic*)sf->data;
  PetscInt         i,j,k,nrootreqs,nleafreqs,nreqs;
  PetscSFLink      *p,link;
  PetscSFDirection direction;
  MPI_Request      *reqs = NULL;
  PetscBool        match,rootdirect[2],leafdirect[2];
  PetscMemType     rootmtype = PetscMemTypeHost(xrootmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE; /* Convert to 0/1 */
  PetscMemType     leafmtype = PetscMemTypeHost(xleafmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE;
  PetscMemType     rootmtype_mpi,leafmtype_mpi;  /* mtypes seen by MPI */
  PetscInt         rootdirect_mpi,leafdirect_mpi; /* root/leafdirect seen by MPI */

  PetscFunctionBegin;
  ierr = PetscSFSetErrorOnUnsupportedOverlap(sf,unit,rootdata,leafdata);CHKERRQ(ierr);

  /* Can we directly use root/leafdata with the given sf, sfop and op? */
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    if (sfop == PETSCSF_BCAST) {
      rootdirect[i] = bas->rootcontig[i]; /* Pack roots */
      leafdirect[i] = (sf->leafcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack leaves */
    } else if (sfop == PETSCSF_REDUCE) {
      leafdirect[i] = sf->leafcontig[i]; /* Pack leaves */
      rootdirect[i] = (bas->rootcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */
    } else { /* PETSCSF_FETCH */
      rootdirect[i] = PETSC_FALSE; /* FETCH always needs a separate rootbuf */
      leafdirect[i] = PETSC_FALSE; /* We also force a separate leafbuf so that leafdata and leafupdate can share MPI requests */
    }
  }

  if (sf->use_gpu_aware_mpi) {
    rootmtype_mpi = rootmtype;
    leafmtype_mpi = leafmtype;
  } else {
    rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST;
  }
  /* Will root/leafdata be directly accessed by MPI? Without use_gpu_aware_mpi, device data is buffered on host and then passed to MPI */
  rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype) ? 1 : 0;
  leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype) ? 1 : 0;

  direction = (sfop == PETSCSF_BCAST) ? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT;
  nrootreqs = bas->nrootreqs;
  nleafreqs = sf->nleafreqs;

  /* Look for free links in cache */
  for (p=&bas->avail; (link=*p); p=&link->next) {
    ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
    if (match) {
      /* If root/leafdata will be directly passed to MPI, test if the data used to initialize the MPI requests
         matches the current data. If not, free the old requests. New requests will be lazily init'ed until one
         calls PetscSFLinkGetMPIBuffersAndRequests(). */
      if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) {
        reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */
        for (i=0; i<nrootreqs; i++) {
          if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRMPI(ierr);}
        }
        link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE;
      }
      if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) {
        reqs = link->leafreqs[direction][leafmtype][1];
        for (i=0; i<nleafreqs; i++) {
          if (reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&reqs[i]);CHKERRMPI(ierr);}
        }
        link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE;
      }
      *p = link->next; /* Remove from available list */
      goto found;
    }
  }

  ierr = PetscNew(&link);CHKERRQ(ierr);
  ierr = PetscSFLinkSetUp_Host(sf,link,unit);CHKERRQ(ierr);
  ierr = PetscCommGetNewTag(PetscObjectComm((PetscObject)sf),&link->tag);CHKERRQ(ierr); /* One tag per link */

  nreqs = (nrootreqs+nleafreqs)*8;
  ierr  = PetscMalloc1(nreqs,&link->reqs);CHKERRQ(ierr);
  for (i=0; i<nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialized to NULL so that we know which need to be freed in Destroy */

  for (i=0; i<2; i++) {       /* Two communication directions */
    for (j=0; j<2; j++) {     /* Two memory types */
      for (k=0; k<2; k++) {   /* root/leafdirect 0 or 1 */
        link->rootreqs[i][j][k] = link->reqs + nrootreqs*(4*i+2*j+k);
        link->leafreqs[i][j][k] = link->reqs + nrootreqs*8 + nleafreqs*(4*i+2*j+k);
      }
    }
  }

found:
#if defined(PETSC_HAVE_DEVICE)
  if ((PetscMemTypeDevice(xrootmtype) || PetscMemTypeDevice(xleafmtype)) && !link->deviceinited) {
#if defined(PETSC_HAVE_CUDA)
    if (sf->backend == PETSCSF_BACKEND_CUDA)   {ierr = PetscSFLinkSetUp_Cuda(sf,link,unit);CHKERRQ(ierr);}
#endif
#if defined(PETSC_HAVE_HIP)
    if (sf->backend == PETSCSF_BACKEND_HIP)    {ierr = PetscSFLinkSetUp_Hip(sf,link,unit);CHKERRQ(ierr);}
#endif
#if defined(PETSC_HAVE_KOKKOS)
    if (sf->backend == PETSCSF_BACKEND_KOKKOS) {ierr = PetscSFLinkSetUp_Kokkos(sf,link,unit);CHKERRQ(ierr);}
#endif
  }
#endif

  /* Allocate buffers along root/leafdata */
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    /* For local communication, buffers are only needed when roots and leaves have different mtypes */
    if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue;
    if (bas->rootbuflen[i]) {
      if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */
        link->rootbuf[i][rootmtype] = (char*)rootdata + bas->rootstart[i]*link->unitbytes;
      } else { /* Have to have a separate rootbuf */
        if (!link->rootbuf_alloc[i][rootmtype]) {
          ierr = PetscSFMalloc(sf,rootmtype,bas->rootbuflen[i]*link->unitbytes,(void**)&link->rootbuf_alloc[i][rootmtype]);CHKERRQ(ierr);
        }
        link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype];
      }
    }
    if (sf->leafbuflen[i]) {
      if (leafdirect[i]) {
        link->leafbuf[i][leafmtype] = (char*)leafdata + sf->leafstart[i]*link->unitbytes;
      } else {
        if (!link->leafbuf_alloc[i][leafmtype]) {
          ierr = PetscSFMalloc(sf,leafmtype,sf->leafbuflen[i]*link->unitbytes,(void**)&link->leafbuf_alloc[i][leafmtype]);CHKERRQ(ierr);
        }
        link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype];
      }
    }
  }

#if defined(PETSC_HAVE_DEVICE)
  /* Allocate buffers on host for buffering data on device in case one does not use gpu-aware MPI */
  if (rootmtype == PETSC_MEMTYPE_DEVICE && rootmtype_mpi == PETSC_MEMTYPE_HOST) {
    if (!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
      ierr = PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
    }
    link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
  if (leafmtype == PETSC_MEMTYPE_DEVICE && leafmtype_mpi == PETSC_MEMTYPE_HOST) {
    if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
      ierr = PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
    }
    link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
  }
#endif

  /* Set the `current` state of the link. It may change between different SF invocations with the same link */
  if (sf->persistent) {
    /* If data is directly passed to MPI and inits MPI requests, record the data for comparison on future invocations */
    if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata;
    if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata;
  }
  link->rootdata = rootdata; /* root/leafdata are keys to look up links in PetscSFXxxEnd */
  link->leafdata = leafdata;
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    link->rootdirect[i] = rootdirect[i];
    link->leafdirect[i] = leafdirect[i];
  }
  link->rootdirect_mpi = rootdirect_mpi;
  link->leafdirect_mpi = leafdirect_mpi;
  link->rootmtype      = rootmtype;
  link->leafmtype      = leafmtype;
  link->rootmtype_mpi  = rootmtype_mpi;
  link->leafmtype_mpi  = leafmtype_mpi;

  link->next = bas->inuse;
  bas->inuse = link;
  *mylink    = link;
  PetscFunctionReturn(0);
}
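/* A typical SFBasic call sequence during a bcast's Begin phase (a sketch; the names are the routines in
   this file):

     PetscSFLinkCreate(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op,PETSCSF_BCAST,&link);
     PetscSFLinkPackRootData(sf,link,PETSCSF_REMOTE,rootdata);
     PetscSFLinkGetMPIBuffersAndRequests(sf,link,PETSCSF_ROOT2LEAF,&rootbuf,&leafbuf,&rootreqs,&leafreqs);
     ...start/wait on the requests, then unpack on the leaf side and reclaim the link in the End phase...
*/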
/* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction.
   If the SF uses persistent requests and the requests have not been initialized, then initialize them.
   Currently only SFBasic uses persistent MPI requests.
*/
PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void **rootbuf,void **leafbuf,MPI_Request **rootreqs,MPI_Request **leafreqs)
{
  PetscErrorCode     ierr;
  PetscSF_Basic      *bas = (PetscSF_Basic*)sf->data;
  PetscInt           i,j,nrootranks,ndrootranks,nleafranks,ndleafranks;
  const PetscInt     *rootoffset,*leafoffset;
  PetscMPIInt        n;
  MPI_Aint           disp;
  MPI_Comm           comm = PetscObjectComm((PetscObject)sf);
  MPI_Datatype       unit = link->unit;
  const PetscMemType rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
  const PetscInt     rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi;

  PetscFunctionBegin;
  /* Init persistent MPI requests if not yet */
  if (sf->persistent) {
    if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
      ierr = PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL);CHKERRQ(ierr);
      if (direction == PETSCSF_LEAF2ROOT) {
        for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
          disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
          ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRMPI(ierr);
        }
      } else { /* PETSCSF_ROOT2LEAF */
        for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
          disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
          ierr = PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);CHKERRMPI(ierr);
        }
      }
      link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
    }
    if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) {
      ierr = PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL);CHKERRQ(ierr);
      if (direction == PETSCSF_LEAF2ROOT) {
        for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
          disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
          ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRMPI(ierr);
        }
      } else { /* PETSCSF_ROOT2LEAF */
        for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
          disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
          ierr = PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);CHKERRQ(ierr);
          ierr = MPI_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);CHKERRMPI(ierr);
        }
      }
      link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE;
    }
  }
  if (rootbuf)  *rootbuf  = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi];
  if (leafbuf)  *leafbuf  = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
  if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
  if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi];
  PetscFunctionReturn(0);
}

PetscErrorCode PetscSFLinkGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFLink *mylink)
{
  PetscErrorCode ierr;
  PetscSFLink    link,*p;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;

  PetscFunctionBegin;
  /* Look for types in cache */
  for (p=&bas->inuse; (link=*p); p=&link->next) {
    PetscBool match;
    ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
    if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) {
      switch (cmode) {
      case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */
      case PETSC_USE_POINTER: break;
      default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode");
      }
      *mylink = link;
      PetscFunctionReturn(0);
    }
  }
  SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack");
}

PetscErrorCode PetscSFLinkReclaim(PetscSF sf,PetscSFLink *link)
{
  PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;

  PetscFunctionBegin;
  (*link)->rootdata = NULL;
  (*link)->leafdata = NULL;
  (*link)->next     = bas->avail;
  bas->avail        = *link;
  *link             = NULL;
  PetscFunctionReturn(0);
}
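/* A sketch of how the inuse/avail lists pair up across a Begin/End pair:

     Begin: PetscSFLinkCreate(...,&link);    -- link goes on bas->inuse, keyed by rootdata/leafdata
     End:   PetscSFLinkGetInUse(sf,unit,rootdata,leafdata,PETSC_OWN_POINTER,&link); -- look the link up again
            ...finish communication, unpack...
            PetscSFLinkReclaim(sf,&link);    -- link returns to bas->avail for reuse
*/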
/* Destroy all links chained in 'avail' */
PetscErrorCode PetscSFLinkDestroy(PetscSF sf,PetscSFLink *avail)
{
  PetscErrorCode ierr;
  PetscSFLink    link = *avail,next;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscInt       i,nreqs = (bas->nrootreqs+sf->nleafreqs)*8;

  PetscFunctionBegin;
  for (; link; link=next) {
    next = link->next;
    if (!link->isbuiltin) {ierr = MPI_Type_free(&link->unit);CHKERRMPI(ierr);}
    for (i=0; i<nreqs; i++) { /* Persistent requests that were init'ed must be freed */
      if (link->reqs[i] != MPI_REQUEST_NULL) {ierr = MPI_Request_free(&link->reqs[i]);CHKERRMPI(ierr);}
    }
    ierr = PetscFree(link->reqs);CHKERRQ(ierr);
    for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
      ierr = PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
      ierr = PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]);CHKERRQ(ierr);
#if defined(PETSC_HAVE_DEVICE)
      ierr = PetscSFFree(sf,PETSC_MEMTYPE_DEVICE,link->rootbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
      ierr = PetscSFFree(sf,PETSC_MEMTYPE_DEVICE,link->leafbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);CHKERRQ(ierr);
#endif
    }
#if defined(PETSC_HAVE_DEVICE)
    if (link->Destroy) {ierr = (*link->Destroy)(link);CHKERRQ(ierr);}
    /* TODO: Make runtime */
#if defined(PETSC_HAVE_CUDA)
    if (link->stream) {cudaError_t cerr = cudaStreamDestroy(link->stream);CHKERRCUDA(cerr); link->stream = NULL;}
#elif defined(PETSC_HAVE_HIP)
    if (link->stream) {hipError_t cerr = hipStreamDestroy(link->stream);CHKERRHIP(cerr); link->stream = NULL;}
#endif
#endif
    ierr = PetscFree(link);CHKERRQ(ierr);
  }
  *avail = NULL;
  PetscFunctionReturn(0);
}
/* Error out on unsupported overlapped communications */
PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata)
{
  PetscErrorCode ierr;
  PetscSFLink    link,*p;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscBool      match;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, we
       ignore the potential overlap since this process does not participate in the communication; such an
       overlap is harmless. */
    if (rootdata || leafdata) {
      for (p=&bas->inuse; (link=*p); p=&link->next) {
        ierr = MPIPetsc_Type_compare(unit,link->unit,&match);CHKERRQ(ierr);
        if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.",rootdata,leafdata);
      }
    }
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode PetscSFLinkMemcpy_Host(PetscSFLink link,PetscMemType dstmtype,void *dst,PetscMemType srcmtype,const void *src,size_t n)
{
  PetscFunctionBegin;
  if (n) {PetscErrorCode ierr = PetscMemcpy(dst,src,n);CHKERRQ(ierr);}
  PetscFunctionReturn(0);
}

PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf,PetscSFLink link,MPI_Datatype unit)
{
  PetscErrorCode ierr;
  PetscInt       nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
  PetscBool      is2Int,is2PetscInt;
  PetscMPIInt    ni,na,nd,combiner;
#if defined(PETSC_HAVE_COMPLEX)
  PetscInt       nPetscComplex=0;
#endif

  PetscFunctionBegin;
  ierr = MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR,  &nSignedChar);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);CHKERRQ(ierr);
  /* MPI_CHAR is treated below as a dumb type that does not support reduction according to the MPI standard */
  ierr = MPIPetsc_Type_compare_contig(unit,MPI_INT,  &nInt);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);CHKERRQ(ierr);
#if defined(PETSC_HAVE_COMPLEX)
  ierr = MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);CHKERRQ(ierr);
#endif
  ierr = MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);CHKERRQ(ierr);
  ierr = MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);CHKERRQ(ierr);
  /* TODO: shall we also handle Fortran MPI_2REAL? */
  ierr = MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);CHKERRMPI(ierr);
  link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is an MPI builtin */
  link->bs = 1; /* default */

  if (is2Int) {
    PackInit_PairType_int_int_1_1(link);
    link->bs        = 1;
    link->unitbytes = 2*sizeof(int);
    link->isbuiltin = PETSC_TRUE; /* unit is a PETSc builtin */
    link->basicunit = MPI_2INT;
    link->unit      = MPI_2INT;
  } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
    PackInit_PairType_PetscInt_PetscInt_1_1(link);
    link->bs        = 1;
    link->unitbytes = 2*sizeof(PetscInt);
    link->basicunit = MPIU_2INT;
    link->isbuiltin = PETSC_TRUE; /* unit is a PETSc builtin */
    link->unit      = MPIU_2INT;
  } else if (nPetscReal) {
    if      (nPetscReal == 8)   PackInit_RealType_PetscReal_8_1(link);
    else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link);
    else if (nPetscReal == 4)   PackInit_RealType_PetscReal_4_1(link);
    else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link);
    else if (nPetscReal == 2)   PackInit_RealType_PetscReal_2_1(link);
    else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link);
    else if (nPetscReal == 1)   PackInit_RealType_PetscReal_1_1(link);
    else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link);
    link->bs        = nPetscReal;
    link->unitbytes = nPetscReal*sizeof(PetscReal);
    link->basicunit = MPIU_REAL;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;}
  } else if (nPetscInt) {
    if      (nPetscInt == 8)   PackInit_IntegerType_PetscInt_8_1(link);
    else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
    else if (nPetscInt == 4)   PackInit_IntegerType_PetscInt_4_1(link);
    else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
    else if (nPetscInt == 2)   PackInit_IntegerType_PetscInt_2_1(link);
    else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
    else if (nPetscInt == 1)   PackInit_IntegerType_PetscInt_1_1(link);
    else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
    link->bs        = nPetscInt;
    link->unitbytes = nPetscInt*sizeof(PetscInt);
    link->basicunit = MPIU_INT;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;}
#if defined(PETSC_USE_64BIT_INDICES)
  } else if (nInt) {
    if      (nInt == 8)   PackInit_IntegerType_int_8_1(link);
    else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link);
    else if (nInt == 4)   PackInit_IntegerType_int_4_1(link);
    else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link);
    else if (nInt == 2)   PackInit_IntegerType_int_2_1(link);
    else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link);
    else if (nInt == 1)   PackInit_IntegerType_int_1_1(link);
    else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link);
    link->bs        = nInt;
    link->unitbytes = nInt*sizeof(int);
    link->basicunit = MPI_INT;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;}
#endif
  } else if (nSignedChar) {
    if      (nSignedChar == 8)   PackInit_IntegerType_SignedChar_8_1(link);
    else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
    else if (nSignedChar == 4)   PackInit_IntegerType_SignedChar_4_1(link);
    else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
    else if (nSignedChar == 2)   PackInit_IntegerType_SignedChar_2_1(link);
    else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
    else if (nSignedChar == 1)   PackInit_IntegerType_SignedChar_1_1(link);
    else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
    link->bs        = nSignedChar;
    link->unitbytes = nSignedChar*sizeof(SignedChar);
    link->basicunit = MPI_SIGNED_CHAR;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;}
  } else if (nUnsignedChar) {
    if      (nUnsignedChar == 8)   PackInit_IntegerType_UnsignedChar_8_1(link);
    else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
    else if (nUnsignedChar == 4)   PackInit_IntegerType_UnsignedChar_4_1(link);
    else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
    else if (nUnsignedChar == 2)   PackInit_IntegerType_UnsignedChar_2_1(link);
    else if (nUnsignedChar%2 == 0)
                                   PackInit_IntegerType_UnsignedChar_2_0(link);
    else if (nUnsignedChar == 1)   PackInit_IntegerType_UnsignedChar_1_1(link);
    else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
    link->bs        = nUnsignedChar;
    link->unitbytes = nUnsignedChar*sizeof(UnsignedChar);
    link->basicunit = MPI_UNSIGNED_CHAR;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;}
#if defined(PETSC_HAVE_COMPLEX)
  } else if (nPetscComplex) {
    if      (nPetscComplex == 8)   PackInit_ComplexType_PetscComplex_8_1(link);
    else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
    else if (nPetscComplex == 4)   PackInit_ComplexType_PetscComplex_4_1(link);
    else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
    else if (nPetscComplex == 2)   PackInit_ComplexType_PetscComplex_2_1(link);
    else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
    else if (nPetscComplex == 1)   PackInit_ComplexType_PetscComplex_1_1(link);
    else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
    link->bs        = nPetscComplex;
    link->unitbytes = nPetscComplex*sizeof(PetscComplex);
    link->basicunit = MPIU_COMPLEX;
    if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;}
#endif
  } else {
    MPI_Aint lb,nbyte;
    ierr = MPI_Type_get_extent(unit,&lb,&nbyte);CHKERRMPI(ierr);
    if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
    if (nbyte % sizeof(int)) { /* If the type size is not a multiple of int */
      if      (nbyte == 4)   PackInit_DumbType_char_4_1(link);
      else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link);
      else if (nbyte == 2)   PackInit_DumbType_char_2_1(link);
      else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link);
      else if (nbyte == 1)   PackInit_DumbType_char_1_1(link);
      else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link);
      link->bs        = nbyte;
      link->unitbytes = nbyte;
      link->basicunit = MPI_BYTE;
    } else {
      nInt = nbyte / sizeof(int);
      if      (nInt == 8)   PackInit_DumbType_DumbInt_8_1(link);
      else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link);
      else if (nInt == 4)   PackInit_DumbType_DumbInt_4_1(link);
      else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link);
      else if (nInt == 2)   PackInit_DumbType_DumbInt_2_1(link);
      else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link);
      else if (nInt == 1)   PackInit_DumbType_DumbInt_1_1(link);
      else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link);
      link->bs        = nInt;
      link->unitbytes = nbyte;
      link->basicunit = MPI_INT;
    }
    if (link->isbuiltin) link->unit = unit;
  }

  if (!link->isbuiltin) {ierr = MPI_Type_dup(unit,&link->unit);CHKERRMPI(ierr);}
  link->Memcpy = PetscSFLinkMemcpy_Host;
  PetscFunctionReturn(0);
}
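/* Dispatch example: for a unit of 6 MPIU_REALs, nPetscReal = 6 above, so the chain picks
   PackInit_RealType_PetscReal_2_0 (6 is not 8, 4 or 2 and is not divisible by 8 or 4, but 6%2 == 0),
   i.e., routines with bs = 6, BS = 2, EQ = 0 that vectorize in chunks of 2 reals. */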
PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*))
{
  PetscFunctionBegin;
  *UnpackAndOp = NULL;
  if (mtype == PETSC_MEMTYPE_HOST) {
    if      (op == MPIU_REPLACE)              *UnpackAndOp = link->h_UnpackAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
    else if (op == MPI_PROD)                  *UnpackAndOp = link->h_UnpackAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin;
    else if (op == MPI_LAND)                  *UnpackAndOp = link->h_UnpackAndLAND;
    else if (op == MPI_BAND)                  *UnpackAndOp = link->h_UnpackAndBAND;
    else if (op == MPI_LOR)                   *UnpackAndOp = link->h_UnpackAndLOR;
    else if (op == MPI_BOR)                   *UnpackAndOp = link->h_UnpackAndBOR;
    else if (op == MPI_LXOR)                  *UnpackAndOp = link->h_UnpackAndLXOR;
    else if (op == MPI_BXOR)                  *UnpackAndOp = link->h_UnpackAndBXOR;
    else if (op == MPI_MAXLOC)                *UnpackAndOp = link->h_UnpackAndMaxloc;
    else if (op == MPI_MINLOC)                *UnpackAndOp = link->h_UnpackAndMinloc;
  }
#if defined(PETSC_HAVE_DEVICE)
  else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
    if      (op == MPIU_REPLACE)              *UnpackAndOp = link->d_UnpackAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd;
    else if (op == MPI_PROD)                  *UnpackAndOp = link->d_UnpackAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin;
    else if (op == MPI_LAND)                  *UnpackAndOp = link->d_UnpackAndLAND;
    else if (op == MPI_BAND)                  *UnpackAndOp = link->d_UnpackAndBAND;
    else if (op == MPI_LOR)                   *UnpackAndOp = link->d_UnpackAndLOR;
    else if (op == MPI_BOR)                   *UnpackAndOp = link->d_UnpackAndBOR;
    else if (op == MPI_LXOR)                  *UnpackAndOp = link->d_UnpackAndLXOR;
    else if (op == MPI_BXOR)                  *UnpackAndOp = link->d_UnpackAndBXOR;
    else if (op == MPI_MAXLOC)                *UnpackAndOp = link->d_UnpackAndMaxloc;
    else if (op == MPI_MINLOC)                *UnpackAndOp = link->d_UnpackAndMinloc;
  } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
    if      (op == MPIU_REPLACE)              *UnpackAndOp = link->da_UnpackAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd;
    else if (op == MPI_PROD)                  *UnpackAndOp = link->da_UnpackAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin;
    else if (op == MPI_LAND)                  *UnpackAndOp = link->da_UnpackAndLAND;
    else if (op == MPI_BAND)                  *UnpackAndOp = link->da_UnpackAndBAND;
    else if (op == MPI_LOR)                   *UnpackAndOp = link->da_UnpackAndLOR;
    else if (op == MPI_BOR)                   *UnpackAndOp = link->da_UnpackAndBOR;
    else if (op == MPI_LXOR)                  *UnpackAndOp = link->da_UnpackAndLXOR;
    else if (op == MPI_BXOR)                  *UnpackAndOp = link->da_UnpackAndBXOR;
    else if (op == MPI_MAXLOC)                *UnpackAndOp = link->da_UnpackAndMaxloc;
    else if (op == MPI_MINLOC)                *UnpackAndOp = link->da_UnpackAndMinloc;
  }
#endif
  PetscFunctionReturn(0);
}

PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*))
{
  PetscFunctionBegin;
  *ScatterAndOp = NULL;
  if (mtype == PETSC_MEMTYPE_HOST) {
    if      (op == MPIU_REPLACE)              *ScatterAndOp = link->h_ScatterAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd;
    else if (op == MPI_PROD)                  *ScatterAndOp = link->h_ScatterAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin;
    else if (op == MPI_LAND)                  *ScatterAndOp = link->h_ScatterAndLAND;
    else if (op == MPI_BAND)                  *ScatterAndOp = link->h_ScatterAndBAND;
    else if (op == MPI_LOR)                   *ScatterAndOp = link->h_ScatterAndLOR;
    else if (op == MPI_BOR)                   *ScatterAndOp = link->h_ScatterAndBOR;
    else if (op == MPI_LXOR)                  *ScatterAndOp = link->h_ScatterAndLXOR;
    else if (op == MPI_BXOR)                  *ScatterAndOp = link->h_ScatterAndBXOR;
    else if (op == MPI_MAXLOC)                *ScatterAndOp = link->h_ScatterAndMaxloc;
    else if (op == MPI_MINLOC)                *ScatterAndOp = link->h_ScatterAndMinloc;
  }
#if defined(PETSC_HAVE_DEVICE)
  else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
    if      (op == MPIU_REPLACE)              *ScatterAndOp = link->d_ScatterAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd;
    else if (op == MPI_PROD)                  *ScatterAndOp = link->d_ScatterAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin;
    else if (op == MPI_LAND)                  *ScatterAndOp = link->d_ScatterAndLAND;
    else if (op == MPI_BAND)                  *ScatterAndOp = link->d_ScatterAndBAND;
    else if (op == MPI_LOR)                   *ScatterAndOp = link->d_ScatterAndLOR;
    else if (op == MPI_BOR)                   *ScatterAndOp = link->d_ScatterAndBOR;
    else if (op == MPI_LXOR)                  *ScatterAndOp = link->d_ScatterAndLXOR;
    else if (op == MPI_BXOR)                  *ScatterAndOp = link->d_ScatterAndBXOR;
    else if (op == MPI_MAXLOC)                *ScatterAndOp = link->d_ScatterAndMaxloc;
    else if (op == MPI_MINLOC)                *ScatterAndOp = link->d_ScatterAndMinloc;
  } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
    if      (op == MPIU_REPLACE)              *ScatterAndOp = link->da_ScatterAndInsert;
    else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd;
    else if (op == MPI_PROD)                  *ScatterAndOp = link->da_ScatterAndMult;
    else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax;
    else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin;
    else if (op == MPI_LAND)                  *ScatterAndOp = link->da_ScatterAndLAND;
    else if (op == MPI_BAND)                  *ScatterAndOp = link->da_ScatterAndBAND;
    else if (op == MPI_LOR)                   *ScatterAndOp = link->da_ScatterAndLOR;
    else if (op == MPI_BOR)                   *ScatterAndOp = link->da_ScatterAndBOR;
    else if (op == MPI_LXOR)                  *ScatterAndOp = link->da_ScatterAndLXOR;
    else if (op == MPI_BXOR)                  *ScatterAndOp = link->da_ScatterAndBXOR;
    else if (op == MPI_MAXLOC)                *ScatterAndOp = link->da_ScatterAndMaxloc;
    else if (op == MPI_MINLOC)                *ScatterAndOp = link->da_ScatterAndMinloc;
  }
#endif
  PetscFunctionReturn(0);
}
PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*))
{
  PetscFunctionBegin;
  *FetchAndOp = NULL;
  if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
  if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOp = link->h_FetchAndAdd;
#if defined(PETSC_HAVE_DEVICE)
  else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOp = link->d_FetchAndAdd;
  else if (mtype == PETSC_MEMTYPE_DEVICE && atomic)  *FetchAndOp = link->da_FetchAndAdd;
#endif
  PetscFunctionReturn(0);
}

PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*))
{
  PetscFunctionBegin;
  *FetchAndOpLocal = NULL;
  if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
  if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOpLocal = link->h_FetchAndAddLocal;
#if defined(PETSC_HAVE_DEVICE)
  else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal;
  else if (mtype == PETSC_MEMTYPE_DEVICE && atomic)  *FetchAndOpLocal = link->da_FetchAndAddLocal;
#endif
  PetscFunctionReturn(0);
}
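/* Note: FetchAndOp is only wired up for addition, so e.g. a PetscSF fetch-and-op with MPI_SUM maps to
   link->h_FetchAndAdd on the host, while any other MPI_Op errors out above. */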
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
{
  PetscErrorCode ierr;
  PetscLogDouble flops;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;

  PetscFunctionBegin;
  if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
    flops = bas->rootbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
#if defined(PETSC_HAVE_DEVICE)
    if (link->rootmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
#endif
    {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
  }
  PetscFunctionReturn(0);
}

PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
{
  PetscLogDouble flops;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
    flops = sf->leafbuflen[scope]*link->bs; /* # of leaves in buffer x # of scalars in unit */
#if defined(PETSC_HAVE_DEVICE)
    if (link->leafmtype == PETSC_MEMTYPE_DEVICE) {ierr = PetscLogGpuFlops(flops);CHKERRQ(ierr);} else
#endif
    {ierr = PetscLogFlops(flops);CHKERRQ(ierr);}
  }
  PetscFunctionReturn(0);
}

/* When SF could not find a proper UnpackAndOp() from the link, it falls back to MPI_Reduce_local.

   Input Arguments:
+  sf      - The StarForest
.  link    - The link
.  count   - Number of entries to unpack
.  start   - The first index, significant when indices=NULL
.  indices - Indices of entries in data[]. If NULL, indices are contiguous and the first is given in start
.  buf     - A contiguous buffer to unpack from
-  op      - Operation after unpack

   Output Arguments:
.  data    - The data to unpack to
*/
PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *indices,void *data,const void *buf,MPI_Op op)
{
  PetscFunctionBegin;
#if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
  {
    PetscErrorCode ierr;
    PetscInt       i;
    PetscMPIInt    n;
    if (indices) {
      /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc., it operates
         on basic units of a root/leaf element-wise; otherwise, it is meant to operate on a whole root/leaf. */
      for (i=0; i<count; i++) {ierr = MPI_Reduce_local((const char*)buf+i*link->unitbytes,(char*)data+indices[i]*link->unitbytes,1,link->unit,op);CHKERRMPI(ierr);}
    } else {
      ierr = PetscMPIIntCast(count,&n);CHKERRQ(ierr);
      ierr = MPI_Reduce_local(buf,(char*)data+start*link->unitbytes,n,link->unit,op);CHKERRMPI(ierr);
    }
  }
  PetscFunctionReturn(0);
#else
  SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
#endif
}

PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt srcStart,const PetscInt *srcIdx,const void *src,PetscInt dstStart,const PetscInt *dstIdx,void *dst,MPI_Op op)
{
  PetscFunctionBegin;
#if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
  {
    PetscErrorCode ierr;
    PetscInt       i,disp;
    if (!srcIdx) {
      ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,dstStart,dstIdx,dst,(const char*)src+srcStart*link->unitbytes,op);CHKERRQ(ierr);
    } else {
      for (i=0; i<count; i++) {
        disp = dstIdx ? dstIdx[i] : dstStart + i;
        ierr = MPI_Reduce_local((const char*)src+srcIdx[i]*link->unitbytes,(char*)dst+disp*link->unitbytes,1,link->unit,op);CHKERRMPI(ierr);
      }
    }
  }
  PetscFunctionReturn(0);
#else
  SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
#endif
}
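/* For example, unpacking with a user-defined MPI_Op 'myop' on a 3-int unit reduces whole units: the call
   MPI_Reduce_local(buf + i*link->unitbytes, data + indices[i]*link->unitbytes, 1, link->unit, myop)
   combines one entire unit at a time, which is why link->unit (not link->basicunit) is passed. */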
/*=============================================================================
              Pack/Unpack/Fetch/Scatter routines
 ============================================================================*/

/* Pack rootdata to rootbuf

   Input Arguments:
+  sf       - The SF this packing works on.
.  link     - It gives the memtype of the roots and also provides the root buffer.
.  scope    - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately.
-  rootdata - Where to read the roots.

   Notes:
   When rootdata can be directly used as the root buffer, the routine is almost a no-op. After the call, root
   data is in a place the underlying MPI is ready to access (use_gpu_aware_mpi or not).
*/
PetscErrorCode PetscSFLinkPackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata)
{
  PetscErrorCode ierr;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  const PetscInt *rootindices = NULL;
  PetscInt       count,start;
  PetscErrorCode (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
  PetscMemType   rootmtype = link->rootmtype;
  PetscSFPackOpt opt = NULL;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);}
  if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip packing */
    ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
    ierr = PetscSFLinkGetPack(link,rootmtype,&Pack);CHKERRQ(ierr);
    ierr = (*Pack)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
  }
  if (scope == PETSCSF_REMOTE) {
    ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);CHKERRQ(ierr);
    ierr = PetscSFLinkSyncStreamAfterPackRootData(sf,link);CHKERRQ(ierr);
  }
  ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Pack leafdata to leafbuf */
PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata)
{
  PetscErrorCode ierr;
  const PetscInt *leafindices = NULL;
  PetscInt       count,start;
  PetscErrorCode (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
  PetscMemType   leafmtype = link->leafmtype;
  PetscSFPackOpt opt = NULL;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncDeviceBeforePackData(sf,link);CHKERRQ(ierr);}
  if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as leafbuf, skip packing */
    ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
    ierr = PetscSFLinkGetPack(link,leafmtype,&Pack);CHKERRQ(ierr);
    ierr = (*Pack)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
  }
  if (scope == PETSCSF_REMOTE) {
    ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);CHKERRQ(ierr);
    ierr = PetscSFLinkSyncStreamAfterPackLeafData(sf,link);CHKERRQ(ierr);
  }
  ierr = PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
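/* Packing flow for device data without gpu-aware MPI (a sketch): Pack() fills
   link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE] on the GPU, then the
   CopyRootBufferInCaseNotUseGpuAwareMPI(...,PETSC_TRUE) call above copies it device-to-host into
   link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST], which is the buffer MPI later sends. */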
/* Unpack rootbuf to rootdata */
PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
{
  PetscErrorCode ierr;
  const PetscInt *rootindices = NULL;
  PetscInt       count,start;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
  PetscMemType   rootmtype = link->rootmtype;
  PetscSFPackOpt opt = NULL;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
  if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */
    ierr = PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[scope],&UnpackAndOp);CHKERRQ(ierr);
    if (UnpackAndOp) {
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
      ierr = (*UnpackAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
    } else {
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,rootindices,rootdata,link->rootbuf[scope][rootmtype],op);CHKERRQ(ierr);
    }
  }
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);CHKERRQ(ierr);}
  ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Unpack leafbuf to leafdata */
PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op)
{
  PetscErrorCode ierr;
  const PetscInt *leafindices = NULL;
  PetscInt       count,start;
  PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
  PetscMemType   leafmtype = link->leafmtype;
  PetscSFPackOpt opt = NULL;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
  if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as leafbuf, skip unpacking */
    ierr = PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[scope],&UnpackAndOp);CHKERRQ(ierr);
    if (UnpackAndOp) {
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
      ierr = (*UnpackAndOp)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);CHKERRQ(ierr);
    } else {
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&leafindices);CHKERRQ(ierr);
      ierr = PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,leafindices,leafdata,link->leafbuf[scope][leafmtype],op);CHKERRQ(ierr);
    }
  }
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkSyncStreamAfterUnpackLeafData(sf,link);CHKERRQ(ierr);}
  ierr = PetscSFLinkLogFlopsAfterUnpackLeafData(sf,link,scope,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/* FetchAndOp rootdata with rootbuf */
PetscErrorCode PetscSFLinkFetchRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
{
  PetscErrorCode ierr;
  const PetscInt *rootindices = NULL;
  PetscInt       count,start;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode (*FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*) = NULL;
  PetscMemType   rootmtype = link->rootmtype;
  PetscSFPackOpt opt = NULL;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  if (scope == PETSCSF_REMOTE) {ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);CHKERRQ(ierr);}
  if (bas->rootbuflen[scope]) { /* Do FetchAndOp on rootdata with rootbuf */
    ierr = PetscSFLinkGetFetchAndOp(link,rootmtype,op,bas->rootdups[scope],&FetchAndOp);CHKERRQ(ierr);
    ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);CHKERRQ(ierr);
    ierr = (*FetchAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);CHKERRQ(ierr);
  }
  if (scope == PETSCSF_REMOTE) {
    ierr = PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE);CHKERRQ(ierr);
    ierr = PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);CHKERRQ(ierr);
  }
  ierr = PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Bcast rootdata to leafdata locally (i.e., only for local communication - PETSCSF_LOCAL) */
PetscErrorCode PetscSFLinkBcastAndOpLocal(PetscSF sf,PetscSFLink link,const void *rootdata,void *leafdata,MPI_Op op)
{
  PetscErrorCode     ierr;
  const PetscInt     *rootindices = NULL,*leafindices = NULL;
  PetscInt           count,rootstart,leafstart;
  PetscSF_Basic      *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode     (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL;
  const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype;
  PetscSFPackOpt     leafopt = NULL,rootopt = NULL;

  PetscFunctionBegin;
  if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
  if (rootmtype != leafmtype) { /* Uncommon case: the local communication has to go through pack and unpack */
    ierr = PetscSFLinkPackRootData(sf,link,PETSCSF_LOCAL,rootdata);CHKERRQ(ierr);
    ierr = (*link->Memcpy)(link,leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],sf->leafbuflen[PETSCSF_LOCAL]*link->unitbytes);CHKERRQ(ierr);
    ierr = PetscSFLinkUnpackLeafData(sf,link,PETSCSF_LOCAL,leafdata,op);CHKERRQ(ierr);
  } else {
    ierr = PetscSFLinkGetScatterAndOp(link,leafmtype,op,sf->leafdups[PETSCSF_LOCAL],&ScatterAndOp);CHKERRQ(ierr);
    if (ScatterAndOp) {
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
      ierr = (*ScatterAndOp)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata);CHKERRQ(ierr);
    } else {
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
      ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,op);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}
  if (rootmtype != leafmtype) {
    /* The local communication has to go through pack and unpack */
    ierr = PetscSFLinkPackLeafData(sf,link,PETSCSF_LOCAL,leafdata);CHKERRQ(ierr);
    ierr = (*link->Memcpy)(link,rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],bas->rootbuflen[PETSCSF_LOCAL]*link->unitbytes);CHKERRQ(ierr);
    ierr = PetscSFLinkUnpackRootData(sf,link,PETSCSF_LOCAL,rootdata,op);CHKERRQ(ierr);
  } else {
    ierr = PetscSFLinkGetScatterAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&ScatterAndOp);CHKERRQ(ierr);
    if (ScatterAndOp) {
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
      ierr = (*ScatterAndOp)(link,count,leafstart,leafopt,leafindices,leafdata,rootstart,rootopt,rootindices,rootdata);CHKERRQ(ierr);
    } else { /* No fused scatter kernel for this op; fall back to MPI_Reduce_local on the host */
      ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
      ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
      ierr = PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,leafstart,leafindices,leafdata,rootstart,rootindices,rootdata,op);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}

/* Fetch rootdata into leafupdate and reduce leafdata into rootdata, all locally */
PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf,PetscSFLink link,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
{
  PetscErrorCode     ierr;
  const PetscInt     *rootindices = NULL,*leafindices = NULL;
  PetscInt           count,rootstart,leafstart;
  PetscSF_Basic      *bas = (PetscSF_Basic*)sf->data;
  PetscErrorCode     (*FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
  const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype;
  PetscSFPackOpt     leafopt = NULL,rootopt = NULL;

  PetscFunctionBegin;
  if (!bas->rootbuflen[PETSCSF_LOCAL]) PetscFunctionReturn(0);
  if (rootmtype != leafmtype) {
    /* The local communication would have to go through pack and unpack, which is unsupported here */
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Doing PetscSFFetchAndOp with rootdata and leafdata on opposite sides of CPU and GPU");
  } else {
    ierr = PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);CHKERRQ(ierr);
    ierr = PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);CHKERRQ(ierr);
    ierr = PetscSFLinkGetFetchAndOpLocal(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&FetchAndOpLocal);CHKERRQ(ierr);
    ierr = (*FetchAndOpLocal)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata,leafupdate);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Create per-rank pack/unpack optimizations based on index patterns

   Input Parameters:
+  n      - Number of destination ranks
.  offset - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] need not be 0.
-  idx    - [*] Array storing indices

   Output Parameters:
+  opt    - Pack optimizations. NULL if no optimizations.
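
   Notes:
   The analysis below looks for a 3D strided pattern: the indices of a rank are optimizable
   when they can be written as start + X*Y*k + X*j + i with 0 <= i < dx, 0 <= j < dy,
   0 <= k < dz. As an illustrative example (made up, not taken from a real SF),
   idx[] = {2,3,12,13,22,23} gives start=2, dx=2, dy=3, dz=1, X=10, so packing these six
   entries becomes three contiguous copies of length 2 instead of six indexed loads.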
*/
PetscErrorCode PetscSFCreatePackOpt(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out)
{
  PetscErrorCode ierr;
  PetscInt       r,p,start,i,j,k,dx,dy,dz,dydz,m,X,Y;
  PetscBool      optimizable = PETSC_TRUE;
  PetscSFPackOpt opt;

  PetscFunctionBegin;
  ierr        = PetscMalloc1(1,&opt);CHKERRQ(ierr);
  ierr        = PetscMalloc1(7*n+2,&opt->array);CHKERRQ(ierr);
  opt->n      = opt->array[0] = n;
  opt->offset = opt->array + 1;
  opt->start  = opt->array + n   + 2;
  opt->dx     = opt->array + 2*n + 2;
  opt->dy     = opt->array + 3*n + 2;
  opt->dz     = opt->array + 4*n + 2;
  opt->X      = opt->array + 5*n + 2;
  opt->Y      = opt->array + 6*n + 2;

  for (r=0; r<n; r++) { /* For each destination rank */
    m     = offset[r+1] - offset[r]; /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */
    p     = offset[r];
    start = idx[p]; /* First index for this rank */
    p++;

    /* Search in the X dimension: count the leading run of consecutive indices */
    for (dx=1; dx<m; dx++,p++) {
      if (start+dx != idx[p]) break;
    }
    dydz = m/dx;
    X    = dydz > 1 ? (idx[p]-start) : dx; /* X-dim of the submatrix */
    /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */
    if (m%dx || X <= 0) {optimizable = PETSC_FALSE; goto finish;}

    for (dy=1; dy<dydz; dy++) { /* Search in the Y dimension: count x-walks strided by X */
      for (i=0; i<dx; i++,p++) {
        if (start+X*dy+i != idx[p]) {
          if (i) {optimizable = PETSC_FALSE; goto finish;} /* The pattern is violated in the middle of an x-walk */
          else goto Z_dimension; /* Otherwise a new Z-slice may start here */
        }
      }
    }

Z_dimension:
    dz = m/(dx*dy);
    Y  = dz > 1 ? (idx[p]-start)/X : dy; /* Y-dim of the submatrix */
    /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */
    if (m%(dx*dy) || Y <= 0) {optimizable = PETSC_FALSE; goto finish;}

    /* Go through the Z dimension to see if the remaining indices follow start + X*Y*k + X*j + i */
    for (k=1; k<dz; k++) {
      for (j=0; j<dy; j++) {
        for (i=0; i<dx; i++,p++) {
          if (start+X*Y*k+X*j+i != idx[p]) {optimizable = PETSC_FALSE; goto finish;}
        }
      }
    }
    opt->start[r] = start;
    opt->dx[r]    = dx;
    opt->dy[r]    = dy;
    opt->dz[r]    = dz;
    opt->X[r]     = X;
    opt->Y[r]     = Y;
  }

finish:
  /* If not optimizable, free arrays to save memory */
  if (!n || !optimizable) {
    ierr = PetscFree(opt->array);CHKERRQ(ierr);
    ierr = PetscFree(opt);CHKERRQ(ierr);
    *out = NULL;
  } else {
    opt->offset[0] = 0;
    for (r=0; r<n; r++) opt->offset[r+1] = opt->offset[r] + opt->dx[r]*opt->dy[r]*opt->dz[r];
    *out = opt;
  }
  PetscFunctionReturn(0);
}

PETSC_STATIC_INLINE PetscErrorCode PetscSFDestroyPackOpt(PetscSF sf,PetscMemType mtype,PetscSFPackOpt *out)
{
  PetscErrorCode ierr;
  PetscSFPackOpt opt = *out;

  PetscFunctionBegin;
  if (opt) {
    ierr = PetscSFFree(sf,mtype,opt->array);CHKERRQ(ierr);
    ierr = PetscFree(opt);CHKERRQ(ierr);
    *out = NULL;
  }
  PetscFunctionReturn(0);
}

PetscErrorCode PetscSFSetUpPackFields(PetscSF sf)
{
  PetscErrorCode ierr;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscInt       i,j;

  PetscFunctionBegin;
  /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */
  for (i=0; i<2; i++) { /* Set defaults */
    sf->leafstart[i]   = 0;
    sf->leafcontig[i]  = PETSC_TRUE;
    sf->leafdups[i]    = PETSC_FALSE;
    bas->rootstart[i]  = 0;
    bas->rootcontig[i] = PETSC_TRUE;
    bas->rootdups[i]   = PETSC_FALSE;
  }

  sf->leafbuflen[0] = sf->roffset[sf->ndranks];
  sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks];
  if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0];
  if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]];

  /* Are leaf indices for self and remote contiguous? If yes, it is best for pack/unpack */
  for (i=0; i<sf->roffset[sf->ndranks]; i++) { /* self */
    if (sf->rmine[i] != sf->leafstart[0]+i) {sf->leafcontig[0] = PETSC_FALSE; break;}
  }
  for (i=sf->roffset[sf->ndranks],j=0; i<sf->roffset[sf->nranks]; i++,j++) { /* remote */
    if (sf->rmine[i] != sf->leafstart[1]+j) {sf->leafcontig[1] = PETSC_FALSE; break;}
  }

  /* If not, see if we can have per-rank optimizations by doing index analysis */
  if (!sf->leafcontig[0]) {ierr = PetscSFCreatePackOpt(sf->ndranks,            sf->roffset,             sf->rmine, &sf->leafpackopt[0]);CHKERRQ(ierr);}
  if (!sf->leafcontig[1]) {ierr = PetscSFCreatePackOpt(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine, &sf->leafpackopt[1]);CHKERRQ(ierr);}

  /* Are root indices for self and remote contiguous?
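     If yes, pack/unpack for them degenerates into a single PetscArraycpy via the !idx branch
     of the Pack routines, e.g. irootloc[] = {5,6,7,...} yields rootstart = 5 and no index
     array at all.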
  */
  bas->rootbuflen[0] = bas->ioffset[bas->ndiranks];
  bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks];
  if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0];
  if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]];
  for (i=0; i<bas->ioffset[bas->ndiranks]; i++) { /* self */
    if (bas->irootloc[i] != bas->rootstart[0]+i) {bas->rootcontig[0] = PETSC_FALSE; break;}
  }
  for (i=bas->ioffset[bas->ndiranks],j=0; i<bas->ioffset[bas->niranks]; i++,j++) { /* remote */
    if (bas->irootloc[i] != bas->rootstart[1]+j) {bas->rootcontig[1] = PETSC_FALSE; break;}
  }
  if (!bas->rootcontig[0]) {ierr = PetscSFCreatePackOpt(bas->ndiranks,              bas->ioffset,               bas->irootloc, &bas->rootpackopt[0]);CHKERRQ(ierr);}
  if (!bas->rootcontig[1]) {ierr = PetscSFCreatePackOpt(bas->niranks-bas->ndiranks, bas->ioffset+bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]);CHKERRQ(ierr);}

#if defined(PETSC_HAVE_DEVICE)
  /* Check for dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there are no data races */
  if (PetscDefined(HAVE_DEVICE)) {
    PetscBool ismulti = (sf->multi == sf) ? PETSC_TRUE : PETSC_FALSE;
    if (!sf->leafcontig[0]  && !ismulti) {ierr = PetscCheckDupsInt(sf->leafbuflen[0],  sf->rmine,                                 &sf->leafdups[0]);CHKERRQ(ierr);}
    if (!sf->leafcontig[1]  && !ismulti) {ierr = PetscCheckDupsInt(sf->leafbuflen[1],  sf->rmine+sf->roffset[sf->ndranks],        &sf->leafdups[1]);CHKERRQ(ierr);}
    if (!bas->rootcontig[0] && !ismulti) {ierr = PetscCheckDupsInt(bas->rootbuflen[0], bas->irootloc,                             &bas->rootdups[0]);CHKERRQ(ierr);}
    if (!bas->rootcontig[1] && !ismulti) {ierr = PetscCheckDupsInt(bas->rootbuflen[1], bas->irootloc+bas->ioffset[bas->ndiranks], &bas->rootdups[1]);CHKERRQ(ierr);}
  }
#endif
  PetscFunctionReturn(0);
}

PetscErrorCode PetscSFResetPackFields(PetscSF sf)
{
  PetscErrorCode ierr;
  PetscSF_Basic  *bas = (PetscSF_Basic*)sf->data;
  PetscInt       i;

  PetscFunctionBegin;
  for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
    ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&sf->leafpackopt[i]);CHKERRQ(ierr);
    ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&bas->rootpackopt[i]);CHKERRQ(ierr);
#if defined(PETSC_HAVE_DEVICE)
    ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&sf->leafpackopt_d[i]);CHKERRQ(ierr);
    ierr = PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&bas->rootpackopt_d[i]);CHKERRQ(ierr);
#endif
  }
  PetscFunctionReturn(0);
}
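
#if 0
/* Illustrative sketch (not compiled): how PetscSFCreatePackOpt behaves on a small strided
   index pattern. The function name PetscSFPackOptDemo and the index values are assumptions
   made for demonstration only; they do not come from PETSc. The frees mirror the internal
   PetscMalloc1 allocations of PetscSFCreatePackOpt. */
static PetscErrorCode PetscSFPackOptDemo(void)
{
  PetscErrorCode ierr;
  PetscInt       offset[2] = {0,6};
  PetscInt       idx[6]    = {2,3,12,13,22,23}; /* start + X*j + i with start=2, X=10, dx=2, dy=3, dz=1 */
  PetscSFPackOpt opt       = NULL;

  PetscFunctionBegin;
  ierr = PetscSFCreatePackOpt(1,offset,idx,&opt);CHKERRQ(ierr);
  /* opt is non-NULL here: packing these six indices becomes three contiguous copies of length dx=2 */
  if (opt) {
    ierr = PetscFree(opt->array);CHKERRQ(ierr);
    ierr = PetscFree(opt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
#endif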