140e23c03SJunchao Zhang #if !defined(__SFPACK_H) 240e23c03SJunchao Zhang #define __SFPACK_H 340e23c03SJunchao Zhang 440e23c03SJunchao Zhang #include <petsc/private/sfimpl.h> /*I "petscsf.h" I*/ 540e23c03SJunchao Zhang 640e23c03SJunchao Zhang /* Optimization plans in packing(unpacking) for target processors. 740e23c03SJunchao Zhang 840e23c03SJunchao Zhang Indirect accesses in packing like p[i] = u[idx[i]] are expensive and are not vectorization friendly. We 940e23c03SJunchao Zhang try to optimize them if we found cenrtain patterns among indices in idx[]. As a result, a pack might be 1040e23c03SJunchao Zhang optimized into 1) a small number of contiguous memory copies; OR 2) one strided memory copy. 1140e23c03SJunchao Zhang 1240e23c03SJunchao Zhang Each target has its own plan. n, the number of target processors, is nranks or niranks depending on the context. 1340e23c03SJunchao Zhang */ 1440e23c03SJunchao Zhang struct _n_PetscSFPackOpt { 1540e23c03SJunchao Zhang PetscBool *optimized; /* [n] Is the packing to i-th target optimized? If yes, other fields give the opt plan */ 1640e23c03SJunchao Zhang PetscInt *copy_offset; /* [n+1] We number all memory copies. Packing for target i is optimized into copies in [copy_offset[i],copy_offset[i+1]) */ 1740e23c03SJunchao Zhang PetscInt *copy_start; /* [*] j-th copy starts at index copy_start[j] */ 1840e23c03SJunchao Zhang PetscInt *copy_length; /* [*] with length copy_length[j] in unit of the <unit> used in for example, PetscSFReduceBegin(sf,unit,...) */ 1940e23c03SJunchao Zhang PetscInt *stride_first; /* [n] If optimized[i] is TRUE but copy_offset[i] == copy_offset[i+1], then packing for remote i is strided. The first */ 2040e23c03SJunchao Zhang PetscInt *stride_step; /* [n] index is stride_first[i], step is stride_step[i], */ 2140e23c03SJunchao Zhang PetscInt *stride_n; /* [n] and total stride_n[i] steps */ 2240e23c03SJunchao Zhang }; 2340e23c03SJunchao Zhang 2440e23c03SJunchao Zhang typedef struct _n_PetscSFPack* PetscSFPack; 2540e23c03SJunchao Zhang 2640e23c03SJunchao Zhang #define SFPACKHEADER \ 2740e23c03SJunchao Zhang PetscErrorCode (*Pack) (PetscInt,PetscInt,const PetscInt*,PetscInt,PetscSFPackOpt,const void*,void*); \ 2840e23c03SJunchao Zhang PetscErrorCode (*UnpackAndInsert)(PetscInt,PetscInt,const PetscInt*,PetscInt,PetscSFPackOpt,void*,const void*); \ 2940e23c03SJunchao Zhang PetscErrorCode (*UnpackAndAdd) (PetscInt,PetscInt,const PetscInt*,PetscInt,PetscSFPackOpt,void*,const void*); \ 3040e23c03SJunchao Zhang PetscErrorCode (*UnpackAndMin) (PetscInt,PetscInt,const PetscInt*,PetscInt,PetscSFPackOpt,void*,const void*); \ 3140e23c03SJunchao Zhang PetscErrorCode (*UnpackAndMax) (PetscInt,PetscInt,const PetscInt*,PetscInt,PetscSFPackOpt,void*,const void*); \ 3240e23c03SJunchao Zhang PetscErrorCode (*UnpackAndMinloc)(PetscInt,PetscInt,const PetscInt*,PetscInt,PetscSFPackOpt,void*,const void*); \ 3340e23c03SJunchao Zhang PetscErrorCode (*UnpackAndMaxloc)(PetscInt,PetscInt,const PetscInt*,PetscInt,PetscSFPackOpt,void*,const void*); \ 3440e23c03SJunchao Zhang PetscErrorCode (*UnpackAndMult) (PetscInt,PetscInt,const PetscInt*,PetscInt,PetscSFPackOpt,void*,const void*); \ 3540e23c03SJunchao Zhang PetscErrorCode (*UnpackAndLAND) (PetscInt,PetscInt,const PetscInt*,PetscInt,PetscSFPackOpt,void*,const void*); \ 3640e23c03SJunchao Zhang PetscErrorCode (*UnpackAndBAND) (PetscInt,PetscInt,const PetscInt*,PetscInt,PetscSFPackOpt,void*,const void*); \ 3740e23c03SJunchao Zhang PetscErrorCode (*UnpackAndLOR) (PetscInt,PetscInt,const PetscInt*,PetscInt,PetscSFPackOpt,void*,const void*); \ 3840e23c03SJunchao Zhang PetscErrorCode (*UnpackAndBOR) (PetscInt,PetscInt,const PetscInt*,PetscInt,PetscSFPackOpt,void*,const void*); \ 3940e23c03SJunchao Zhang PetscErrorCode (*UnpackAndLXOR) (PetscInt,PetscInt,const PetscInt*,PetscInt,PetscSFPackOpt,void*,const void*); \ 4040e23c03SJunchao Zhang PetscErrorCode (*UnpackAndBXOR) (PetscInt,PetscInt,const PetscInt*,PetscInt,PetscSFPackOpt,void*,const void*); \ 4140e23c03SJunchao Zhang PetscErrorCode (*FetchAndInsert) (PetscInt,PetscInt,const PetscInt*,PetscInt,PetscSFPackOpt,void*,void*); \ 4240e23c03SJunchao Zhang PetscErrorCode (*FetchAndAdd) (PetscInt,PetscInt,const PetscInt*,PetscInt,PetscSFPackOpt,void*,void*); \ 4340e23c03SJunchao Zhang PetscErrorCode (*FetchAndMin) (PetscInt,PetscInt,const PetscInt*,PetscInt,PetscSFPackOpt,void*,void*); \ 4440e23c03SJunchao Zhang PetscErrorCode (*FetchAndMax) (PetscInt,PetscInt,const PetscInt*,PetscInt,PetscSFPackOpt,void*,void*); \ 4540e23c03SJunchao Zhang PetscErrorCode (*FetchAndMinloc) (PetscInt,PetscInt,const PetscInt*,PetscInt,PetscSFPackOpt,void*,void*); \ 4640e23c03SJunchao Zhang PetscErrorCode (*FetchAndMaxloc) (PetscInt,PetscInt,const PetscInt*,PetscInt,PetscSFPackOpt,void*,void*); \ 4740e23c03SJunchao Zhang PetscErrorCode (*FetchAndMult) (PetscInt,PetscInt,const PetscInt*,PetscInt,PetscSFPackOpt,void*,void*); \ 4840e23c03SJunchao Zhang PetscErrorCode (*FetchAndLAND) (PetscInt,PetscInt,const PetscInt*,PetscInt,PetscSFPackOpt,void*,void*); \ 4940e23c03SJunchao Zhang PetscErrorCode (*FetchAndBAND) (PetscInt,PetscInt,const PetscInt*,PetscInt,PetscSFPackOpt,void*,void*); \ 5040e23c03SJunchao Zhang PetscErrorCode (*FetchAndLOR) (PetscInt,PetscInt,const PetscInt*,PetscInt,PetscSFPackOpt,void*,void*); \ 5140e23c03SJunchao Zhang PetscErrorCode (*FetchAndBOR) (PetscInt,PetscInt,const PetscInt*,PetscInt,PetscSFPackOpt,void*,void*); \ 5240e23c03SJunchao Zhang PetscErrorCode (*FetchAndLXOR) (PetscInt,PetscInt,const PetscInt*,PetscInt,PetscSFPackOpt,void*,void*); \ 5340e23c03SJunchao Zhang PetscErrorCode (*FetchAndBXOR) (PetscInt,PetscInt,const PetscInt*,PetscInt,PetscSFPackOpt,void*,void*); \ 5440e23c03SJunchao Zhang PetscMPIInt tag; /* Each link has a tag so we can perform multiple SF ops at the same time */ \ 5540e23c03SJunchao Zhang MPI_Datatype unit; \ 5640e23c03SJunchao Zhang MPI_Datatype basicunit; /* unit is made of MPI builtin dataype basicunit */ \ 5740e23c03SJunchao Zhang PetscBool isbuiltin; /* Is unit an MPI builtin datatype? If it is true, basicunit=unit, bs=1 */ \ 5840e23c03SJunchao Zhang size_t unitbytes; /* Number of bytes in a unit */ \ 5940e23c03SJunchao Zhang PetscInt bs; /* Number of basic units in a unit */ \ 60*9d1c8addSJunchao Zhang const void *rkey,*lkey; /* rootdata and leafdata used as keys for operation */ \ 6140e23c03SJunchao Zhang PetscSFPack next 6240e23c03SJunchao Zhang 6340e23c03SJunchao Zhang /* An abstract class that defines a communication link, which includes how to 6440e23c03SJunchao Zhang pack/unpack data. Subclasses may further contain fields for send/recv buffers, 6540e23c03SJunchao Zhang MPI_Requests etc used in communication. 6640e23c03SJunchao Zhang */ 6740e23c03SJunchao Zhang struct _n_PetscSFPack { 6840e23c03SJunchao Zhang SFPACKHEADER; 6940e23c03SJunchao Zhang }; 7040e23c03SJunchao Zhang 71*9d1c8addSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFPackGetInUse(PetscSF,MPI_Datatype,const void*,const void*,PetscCopyMode,PetscSFPack*); 7240e23c03SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFPackReclaim(PetscSF,PetscSFPack*); 7340e23c03SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFPackSetupType(PetscSFPack,MPI_Datatype); 7440e23c03SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFPackGetUnpackAndOp(PetscSF,PetscSFPack,MPI_Op,PetscErrorCode (**UnpackAndOp)(PetscInt,PetscInt,const PetscInt*,PetscInt,PetscSFPackOpt,void*,const void*)); 7540e23c03SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFPackGetFetchAndOp(PetscSF,PetscSFPack,MPI_Op,PetscErrorCode (**FetchAndOp)(PetscInt,PetscInt,const PetscInt*,PetscInt,PetscSFPackOpt,void*,void*)); 7640e23c03SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFPackSetupOptimization(PetscInt,const PetscInt*,const PetscInt*,PetscSFPackOpt*); 7740e23c03SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFPackDestoryOptimization(PetscSFPackOpt *out); 78*9d1c8addSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFPackSetErrorOnUnsupportedOverlap(PetscSF,MPI_Datatype,const void*,const void*); 79*9d1c8addSJunchao Zhang 8040e23c03SJunchao Zhang #endif 81