140e23c03SJunchao Zhang #if !defined(__SFPACK_H) 240e23c03SJunchao Zhang #define __SFPACK_H 340e23c03SJunchao Zhang 4cd620004SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfbasic.h> 5cd620004SJunchao Zhang 6cd620004SJunchao Zhang /* We separate SF communications for SFBasic and SFNeighbor in two parts: local (self,intra-rank) and remote (inter-rank) */ 7cd620004SJunchao Zhang typedef enum {PETSCSF_LOCAL=0, PETSCSF_REMOTE} PetscSFScope; 840e23c03SJunchao Zhang 9b23bfdefSJunchao Zhang /* Optimization plans in packing & unpacking for destination ranks. 1040e23c03SJunchao Zhang 11b23bfdefSJunchao Zhang Suppose there are count indices stored in idx[], and two addresses u, p. We want to do packing: 12b23bfdefSJunchao Zhang p[i] = u[idx[i]], for i in [0,count) 1340e23c03SJunchao Zhang 14b23bfdefSJunchao Zhang Often, the indices are associated with n ranks. Each rank's indices are stored consecutively in idx[]. 15b23bfdefSJunchao Zhang We analyze indices for each rank and see if they are patterns that can be used to optimize the packing. 16eb02082bSJunchao Zhang The result is stored in PetscSFPackOpt. Packing for a rank might be non-optimizable, or optimized into 17eb02082bSJunchao Zhang a small number of contiguous memory copies or one strided memory copy. 18cd620004SJunchao Zhang 19cd620004SJunchao Zhang Note before using this per-rank optimization, one should check rleafloccontig, irootloccontig, which say 20cd620004SJunchao Zhang indices in whole start from 0 and are contiguous, and therefore much useful when true. 2140e23c03SJunchao Zhang */ 22b23bfdefSJunchao Zhang typedef enum {PETSCSF_PACKOPT_NONE=0, PETSCSF_PACKOPT_MULTICOPY, PETSCSF_PACKOPT_STRIDE} PetscSFPackOptType; 2340e23c03SJunchao Zhang struct _n_PetscSFPackOpt { 24b23bfdefSJunchao Zhang PetscInt n; /* Number of destination ranks */ 25b23bfdefSJunchao Zhang PetscSFPackOptType *type; /* [n] Optimization types for the n ranks */ 26b23bfdefSJunchao Zhang PetscInt *offset; /* [n+1] Indices for i-th rank are in [offset[i],offset[i+1]) of idx[] */ 27b23bfdefSJunchao Zhang PetscInt *copy_offset; /* [n+1] If type[i] = PETSCSF_PACKOPT_MULTICOPY, packing for i-th rank is optimized into copies numbered between [copy_offset[i],copy_offset[i+1]) */ 28b23bfdefSJunchao Zhang PetscInt *copy_start; /* [*] j-th copy starts at copy_start[j] in idx[]. In other words, there are copy_length[j] contiguous indices */ 29eb02082bSJunchao Zhang PetscInt *copy_length; /* [*] starting at idx[copy_start[j]] */ 30b23bfdefSJunchao Zhang PetscInt *stride_step; /* [n] If type[i] = PETSCSF_PACKOPT_STRIDE, then packing for i-th rank is strided, with first index being idx[offset[i]] and step stride_step[i], */ 3140e23c03SJunchao Zhang PetscInt *stride_n; /* [n] and total stride_n[i] steps */ 3240e23c03SJunchao Zhang }; 3340e23c03SJunchao Zhang 34eb02082bSJunchao Zhang /* An abstract class that defines a communication link, which includes how to pack/unpack data and send/recv buffers 3540e23c03SJunchao Zhang */ 36cd620004SJunchao Zhang struct _n_PetscSFLink { /* link, count, start, indices, pack plan, root/leafdata, buffer*/ 37cd620004SJunchao Zhang PetscErrorCode (*h_Pack) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,const void*,void*); 38cd620004SJunchao Zhang PetscErrorCode (*h_UnpackAndInsert) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 39cd620004SJunchao Zhang PetscErrorCode (*h_UnpackAndAdd) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 40cd620004SJunchao Zhang PetscErrorCode (*h_UnpackAndMin) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 41cd620004SJunchao Zhang PetscErrorCode (*h_UnpackAndMax) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 42cd620004SJunchao Zhang PetscErrorCode (*h_UnpackAndMinloc) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 43cd620004SJunchao Zhang PetscErrorCode (*h_UnpackAndMaxloc) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 44cd620004SJunchao Zhang PetscErrorCode (*h_UnpackAndMult) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 45cd620004SJunchao Zhang PetscErrorCode (*h_UnpackAndLAND) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 46cd620004SJunchao Zhang PetscErrorCode (*h_UnpackAndBAND) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 47cd620004SJunchao Zhang PetscErrorCode (*h_UnpackAndLOR) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 48cd620004SJunchao Zhang PetscErrorCode (*h_UnpackAndBOR) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 49cd620004SJunchao Zhang PetscErrorCode (*h_UnpackAndLXOR) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 50cd620004SJunchao Zhang PetscErrorCode (*h_UnpackAndBXOR) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 51cd620004SJunchao Zhang PetscErrorCode (*h_FetchAndAdd) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*, void*); 52cd620004SJunchao Zhang /* link, count, startx, idx, xdata, starty, idy, ydata */ 53cd620004SJunchao Zhang PetscErrorCode (*h_ScatterAndInsert)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 54cd620004SJunchao Zhang PetscErrorCode (*h_ScatterAndAdd) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 55cd620004SJunchao Zhang PetscErrorCode (*h_ScatterAndMin) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 56cd620004SJunchao Zhang PetscErrorCode (*h_ScatterAndMax) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 57cd620004SJunchao Zhang PetscErrorCode (*h_ScatterAndMinloc)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 58cd620004SJunchao Zhang PetscErrorCode (*h_ScatterAndMaxloc)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 59cd620004SJunchao Zhang PetscErrorCode (*h_ScatterAndMult) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 60cd620004SJunchao Zhang PetscErrorCode (*h_ScatterAndLAND) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 61cd620004SJunchao Zhang PetscErrorCode (*h_ScatterAndBAND) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 62cd620004SJunchao Zhang PetscErrorCode (*h_ScatterAndLOR) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 63cd620004SJunchao Zhang PetscErrorCode (*h_ScatterAndBOR) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 64cd620004SJunchao Zhang PetscErrorCode (*h_ScatterAndLXOR) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 65cd620004SJunchao Zhang PetscErrorCode (*h_ScatterAndBXOR) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 66cd620004SJunchao Zhang /* link, count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,leafupdate*/ 67cd620004SJunchao Zhang PetscErrorCode (*h_FetchAndAddLocal)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,void*,PetscInt,const PetscInt*,const void*,void*); 68cd620004SJunchao Zhang 69cd620004SJunchao Zhang PetscBool deviceinited; /* Are device related fields initialized? */ 70eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA) 71eb02082bSJunchao Zhang /* These fields are lazily initialized in a sense that only when device pointers are passed to an SF, the SF 72eb02082bSJunchao Zhang will set them, otherwise it just leaves them alone even though PETSC_HAVE_CUDA. Packing routines using 73eb02082bSJunchao Zhang regular ops when there are no data race chances. 74eb02082bSJunchao Zhang */ 75cd620004SJunchao Zhang PetscErrorCode (*d_Pack) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,const void*,void*); 76eb02082bSJunchao Zhang 77cd620004SJunchao Zhang PetscErrorCode (*d_UnpackAndInsert) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 78cd620004SJunchao Zhang PetscErrorCode (*d_UnpackAndAdd) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 79cd620004SJunchao Zhang PetscErrorCode (*d_UnpackAndMin) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 80cd620004SJunchao Zhang PetscErrorCode (*d_UnpackAndMax) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 81cd620004SJunchao Zhang PetscErrorCode (*d_UnpackAndMinloc) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 82cd620004SJunchao Zhang PetscErrorCode (*d_UnpackAndMaxloc) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 83cd620004SJunchao Zhang PetscErrorCode (*d_UnpackAndMult) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 84cd620004SJunchao Zhang PetscErrorCode (*d_UnpackAndLAND) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 85cd620004SJunchao Zhang PetscErrorCode (*d_UnpackAndBAND) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 86cd620004SJunchao Zhang PetscErrorCode (*d_UnpackAndLOR) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 87cd620004SJunchao Zhang PetscErrorCode (*d_UnpackAndBOR) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 88cd620004SJunchao Zhang PetscErrorCode (*d_UnpackAndLXOR) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 89cd620004SJunchao Zhang PetscErrorCode (*d_UnpackAndBXOR) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 90cd620004SJunchao Zhang PetscErrorCode (*d_FetchAndAdd) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*, void*); 91cd620004SJunchao Zhang 92cd620004SJunchao Zhang PetscErrorCode (*d_ScatterAndInsert)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 93cd620004SJunchao Zhang PetscErrorCode (*d_ScatterAndAdd) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 94cd620004SJunchao Zhang PetscErrorCode (*d_ScatterAndMin) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 95cd620004SJunchao Zhang PetscErrorCode (*d_ScatterAndMax) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 96cd620004SJunchao Zhang PetscErrorCode (*d_ScatterAndMinloc)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 97cd620004SJunchao Zhang PetscErrorCode (*d_ScatterAndMaxloc)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 98cd620004SJunchao Zhang PetscErrorCode (*d_ScatterAndMult) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 99cd620004SJunchao Zhang PetscErrorCode (*d_ScatterAndLAND) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 100cd620004SJunchao Zhang PetscErrorCode (*d_ScatterAndBAND) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 101cd620004SJunchao Zhang PetscErrorCode (*d_ScatterAndLOR) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 102cd620004SJunchao Zhang PetscErrorCode (*d_ScatterAndBOR) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 103cd620004SJunchao Zhang PetscErrorCode (*d_ScatterAndLXOR) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 104cd620004SJunchao Zhang PetscErrorCode (*d_ScatterAndBXOR) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 105cd620004SJunchao Zhang PetscErrorCode (*d_FetchAndAddLocal)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,void*,PetscInt,const PetscInt*,const void*,void*); 106eb02082bSJunchao Zhang 107eb02082bSJunchao Zhang /* Packing routines using atomics when there are data race chances */ 108cd620004SJunchao Zhang PetscErrorCode (*da_UnpackAndInsert)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 109cd620004SJunchao Zhang PetscErrorCode (*da_UnpackAndAdd) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 110cd620004SJunchao Zhang PetscErrorCode (*da_UnpackAndMin) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 111cd620004SJunchao Zhang PetscErrorCode (*da_UnpackAndMax) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 112cd620004SJunchao Zhang PetscErrorCode (*da_UnpackAndMinloc)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 113cd620004SJunchao Zhang PetscErrorCode (*da_UnpackAndMaxloc)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 114cd620004SJunchao Zhang PetscErrorCode (*da_UnpackAndMult) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 115cd620004SJunchao Zhang PetscErrorCode (*da_UnpackAndLAND) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 116cd620004SJunchao Zhang PetscErrorCode (*da_UnpackAndBAND) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 117cd620004SJunchao Zhang PetscErrorCode (*da_UnpackAndLOR) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 118cd620004SJunchao Zhang PetscErrorCode (*da_UnpackAndBOR) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 119cd620004SJunchao Zhang PetscErrorCode (*da_UnpackAndLXOR) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 120cd620004SJunchao Zhang PetscErrorCode (*da_UnpackAndBXOR) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*); 121cd620004SJunchao Zhang PetscErrorCode (*da_FetchAndAdd) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*, void*); 122cd620004SJunchao Zhang 123cd620004SJunchao Zhang PetscErrorCode (*da_ScatterAndInsert)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 124cd620004SJunchao Zhang PetscErrorCode (*da_ScatterAndAdd) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 125cd620004SJunchao Zhang PetscErrorCode (*da_ScatterAndMin) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 126cd620004SJunchao Zhang PetscErrorCode (*da_ScatterAndMax) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 127cd620004SJunchao Zhang PetscErrorCode (*da_ScatterAndMinloc)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 128cd620004SJunchao Zhang PetscErrorCode (*da_ScatterAndMaxloc)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 129cd620004SJunchao Zhang PetscErrorCode (*da_ScatterAndMult) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 130cd620004SJunchao Zhang PetscErrorCode (*da_ScatterAndLAND) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 131cd620004SJunchao Zhang PetscErrorCode (*da_ScatterAndBAND) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 132cd620004SJunchao Zhang PetscErrorCode (*da_ScatterAndLOR) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 133cd620004SJunchao Zhang PetscErrorCode (*da_ScatterAndBOR) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 134cd620004SJunchao Zhang PetscErrorCode (*da_ScatterAndLXOR) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 135cd620004SJunchao Zhang PetscErrorCode (*da_ScatterAndBXOR) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*); 136cd620004SJunchao Zhang PetscErrorCode (*da_FetchAndAddLocal)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,void*,PetscInt,const PetscInt*,const void*,void*); 137eb02082bSJunchao Zhang 138e315309dSJunchao Zhang PetscInt maxResidentThreadsPerGPU; /* It is a copy from SF for convenience */ 139eb02082bSJunchao Zhang cudaStream_t stream; /* Stream to launch pack/unapck kernels if not using the default stream */ 140eb02082bSJunchao Zhang #endif 141eb02082bSJunchao Zhang PetscMPIInt tag; /* Each link has a tag so we can perform multiple SF ops at the same time */ 142cd620004SJunchao Zhang MPI_Datatype unit; /* The MPI datatype this PetscSFLink is built for */ 143eb02082bSJunchao Zhang MPI_Datatype basicunit; /* unit is made of MPI builtin dataype basicunit */ 144e07844bfSJunchao Zhang PetscBool isbuiltin; /* Is unit an MPI/PETSc builtin datatype? If it is true, then bs=1 and basicunit is equivalent to unit */ 145eb02082bSJunchao Zhang size_t unitbytes; /* Number of bytes in a unit */ 146eb02082bSJunchao Zhang PetscInt bs; /* Number of basic units in a unit */ 147cd620004SJunchao Zhang const void *rootdata,*leafdata; /* rootdata and leafdata the link is working on. They are used as keys for pending links. */ 148cd620004SJunchao Zhang PetscMemType rootmtype,leafmtype; /* root/leafdata's memory type */ 149cd620004SJunchao Zhang 150cd620004SJunchao Zhang /* For local and remote communication */ 151cd620004SJunchao Zhang PetscMemType rootmtype_mpi,leafmtype_mpi; /* Mtypes of buffers passed to MPI. If use_gpu_aware_mpi, they are same as root/leafmtype. Otherwise they are PETSC_MEMTYPE_HOST */ 152cd620004SJunchao Zhang PetscBool rootdirect[2],leafdirect[2]; /* Can root/leafdata be directly passed to SF (i.e., without buffering). In layout of [PETSCSF_LOCAL/REMOTE]. See more in PetscSFLinkCreate() */ 153cd620004SJunchao Zhang PetscInt rootdirect_mpi,leafdirect_mpi;/* Can root/leafdata for remote be directly passed to MPI? 1: yes, 0: no. See more in PetscSFLinkCreate() */ 154cd620004SJunchao Zhang const void *rootdatadirect[2][2]; /* The root/leafdata used to init root/leaf requests, in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE]. */ 155cd620004SJunchao Zhang const void *leafdatadirect[2][2]; /* ... We need them to look up links when root/leafdirect_mpi are true */ 156cd620004SJunchao Zhang char *rootbuf[2][2]; /* Buffers for packed roots, in layout of [PETSCSF_LOCAL/REMOTE][PETSC_MEMTYPE] */ 157cd620004SJunchao Zhang char *rootbuf_alloc[2][2]; /* Log memory allocated by petsc. We need it since rootbuf[][] may point to rootdata given by user */ 158cd620004SJunchao Zhang char *leafbuf[2][2]; /* Buffers for packed leaves, in layout of [PETSCSF_LOCAL/REMOTE][PETSC_MEMTYPE] */ 159cd620004SJunchao Zhang char *leafbuf_alloc[2][2]; 160cd620004SJunchao Zhang MPI_Request *rootreqs[2][2][2]; /* Root requests in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][rootdirect_mpi] */ 161cd620004SJunchao Zhang MPI_Request *leafreqs[2][2][2]; /* Leaf requests in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][leafdirect_mpi] */ 162cd620004SJunchao Zhang PetscBool rootreqsinited[2][2][2]; /* Are root requests initialized? Also in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][rootdirect_mpi]*/ 163cd620004SJunchao Zhang PetscBool leafreqsinited[2][2][2]; /* Are leaf requests initialized? Also in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][leafdirect_mpi]*/ 164cd620004SJunchao Zhang MPI_Request *reqs; /* An array of length (nrootreqs+nleafreqs)*8. Pointers in rootreqs[][][] and leafreqs[][][] point here */ 165cd620004SJunchao Zhang PetscSFLink next; 16640e23c03SJunchao Zhang }; 16740e23c03SJunchao Zhang 168cd620004SJunchao Zhang #if defined(PETSC_USE_DEBUG) 169cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF,MPI_Datatype,const void*,const void*); 170cd620004SJunchao Zhang #else 171cd620004SJunchao Zhang #define PetscSFSetErrorOnUnsupportedOverlap(a,b,c,d) 0 172cd620004SJunchao Zhang #endif 173b7c0d12aSJunchao Zhang 174cd620004SJunchao Zhang /* Create/setup/retrieve/destroy a link */ 175cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkCreate(PetscSF,MPI_Datatype,PetscMemType,const void*,PetscMemType,const void*,MPI_Op,PetscSFOperation,PetscSFLink*); 176cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_Host(PetscSF,PetscSFLink,MPI_Datatype); 177cd620004SJunchao Zhang #if defined(PETSC_HAVE_CUDA) 178cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_Device(PetscSF,PetscSFLink,MPI_Datatype); 179cd620004SJunchao Zhang #else 180cd620004SJunchao Zhang #define PetscSFLinkSetUp_Device(a,b,c) 0 181cd620004SJunchao Zhang #endif 182cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetInUse(PetscSF,MPI_Datatype,const void*,const void*,PetscCopyMode,PetscSFLink*); 183cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkReclaim(PetscSF,PetscSFLink*); 184cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkDestroy(PetscSF,PetscSFLink*); 185cd620004SJunchao Zhang 186cd620004SJunchao Zhang /* Get pack/unpack function pointers from a link */ 187cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkGetPack(PetscSFLink link,PetscMemType mtype,PetscErrorCode (**Pack)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,const void*,void*)) 188eb02082bSJunchao Zhang { 189eb02082bSJunchao Zhang PetscFunctionBegin; 190eb02082bSJunchao Zhang if (mtype == PETSC_MEMTYPE_HOST) *Pack = link->h_Pack; 191eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA) 192cd620004SJunchao Zhang else *Pack = link->d_Pack; 193eb02082bSJunchao Zhang #endif 194eb02082bSJunchao Zhang PetscFunctionReturn(0); 195eb02082bSJunchao Zhang } 196cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,const void*)); 197cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetFetchAndOp (PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**FetchAndOp) (PetscSFLink,PetscInt,PetscInt,const PetscInt*,PetscSFPackOpt,void*,void*)); 198cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,const void*,PetscInt,const PetscInt*,void*)); 199cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,const PetscInt*,void*,PetscInt,const PetscInt*,const void*,void*)); 200cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF,PetscSFLink,PetscSFDirection,void**,void**,MPI_Request**,MPI_Request**); 201b7c0d12aSJunchao Zhang 202cd620004SJunchao Zhang /* Do Pack/Unpack/Fetch/Scatter with the link */ 203cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkPackRootData (PetscSF,PetscSFLink,PetscSFScope,const void*); 204cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkPackLeafData (PetscSF,PetscSFLink,PetscSFScope,const void*); 205cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkUnpackRootData(PetscSF,PetscSFLink,PetscSFScope,void*,MPI_Op); 206cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF,PetscSFLink,PetscSFScope,void*,MPI_Op); 207cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkFetchRootData (PetscSF,PetscSFLink,PetscSFScope,void*,MPI_Op); 208cd620004SJunchao Zhang 209cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkBcastAndOpLocal(PetscSF,PetscSFLink,const void*,void*,MPI_Op); 210cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkReduceLocal(PetscSF,PetscSFLink,const void*,void*,MPI_Op); 211cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF,PetscSFLink,void*,const void*,void*,MPI_Op); 212cd620004SJunchao Zhang 213cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFSetUpPackFields(PetscSF sf); 214cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFResetPackFields(PetscSF sf); 215cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFCreatePackOpt(PetscInt,const PetscInt*,const PetscInt*,PetscSFPackOpt*); 216cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFDestroyPackOpt(PetscSFPackOpt *out); 217cd620004SJunchao Zhang 218cd620004SJunchao Zhang 219cd620004SJunchao Zhang /* Get root indices used for pack/unpack 220cd620004SJunchao Zhang 221cd620004SJunchao Zhang Input arguments: 222cd620004SJunchao Zhang +sf - StarForest 223cd620004SJunchao Zhang .link - The link, which provides the stream for the async memcpy (In SF, we make all GPU operations asynchronous to avoid unexpected pipeline stalls) 224cd620004SJunchao Zhang .scope - Which part of the indices? (PETSCSF_LOCAL or PETSCSF_REMOTE) 225cd620004SJunchao Zhang .mtype - In what type of memory? (PETSC_MEMTYPE_DEVICE or PETSC_MEMTYPE_HOST) 226cd620004SJunchao Zhang 227cd620004SJunchao Zhang Output arguments: 228cd620004SJunchao Zhang +count - Count of indices 229cd620004SJunchao Zhang .start - The first index (only useful when indices is NULL) 230cd620004SJunchao Zhang -indices - indices of roots for pack/unpack. NULL means indices are contiguous 231cd620004SJunchao Zhang */ 232cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkGetRootIndices(PetscSF sf,PetscSFLink link,PetscMemType mtype,PetscSFScope scope,PetscInt *count,PetscInt *start,const PetscInt **indices) 233b7c0d12aSJunchao Zhang { 234cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 235cd620004SJunchao Zhang PetscInt offset; 236b7c0d12aSJunchao Zhang 237b7c0d12aSJunchao Zhang PetscFunctionBegin; 238cd620004SJunchao Zhang if (count) *count = bas->rootbuflen[scope]; 239cd620004SJunchao Zhang if (start) *start = bas->rootstart[scope]; 240cd620004SJunchao Zhang if (indices) { 241cd620004SJunchao Zhang if (bas->rootcontig[scope]) *indices = NULL; 242cd620004SJunchao Zhang else { 243cd620004SJunchao Zhang offset = (scope == PETSCSF_LOCAL)? 0 : bas->ioffset[bas->ndiranks]; 244cd620004SJunchao Zhang if (mtype == PETSC_MEMTYPE_HOST) {*indices = bas->irootloc + offset;} 245cd620004SJunchao Zhang #if defined(PETSC_HAVE_CUDA) 246cd620004SJunchao Zhang else { 247cd620004SJunchao Zhang if (!bas->irootloc_d[scope]) { 248cd620004SJunchao Zhang cudaError_t cerr; 249cd620004SJunchao Zhang size_t size = bas->rootbuflen[scope]*sizeof(PetscInt); 250cd620004SJunchao Zhang cerr = cudaMalloc((void **)&bas->irootloc_d[scope],size);CHKERRCUDA(cerr); 251cd620004SJunchao Zhang cerr = cudaMemcpyAsync(bas->irootloc_d[scope],bas->irootloc+offset,size,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr); 252b7c0d12aSJunchao Zhang } 253cd620004SJunchao Zhang *indices = bas->irootloc_d[scope]; 254cd620004SJunchao Zhang } 255cd620004SJunchao Zhang #endif 256cd620004SJunchao Zhang } 257cd620004SJunchao Zhang } 258b7c0d12aSJunchao Zhang PetscFunctionReturn(0); 259b7c0d12aSJunchao Zhang } 260b7c0d12aSJunchao Zhang 261cd620004SJunchao Zhang /* Get leaf indices used for pack/unpack 262cd620004SJunchao Zhang 263cd620004SJunchao Zhang See also PetscSFLinkGetRootIndices() 264cd620004SJunchao Zhang */ 265cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkGetLeafIndices(PetscSF sf,PetscSFLink link,PetscMemType mtype,PetscSFScope scope,PetscInt *count,PetscInt *start,const PetscInt **indices) 266cd620004SJunchao Zhang { 267cd620004SJunchao Zhang PetscInt offset; 268cd620004SJunchao Zhang 269cd620004SJunchao Zhang PetscFunctionBegin; 270cd620004SJunchao Zhang if (count) *count = sf->leafbuflen[scope]; 271cd620004SJunchao Zhang if (start) *start = sf->leafstart[scope]; 272cd620004SJunchao Zhang if (indices) { 273cd620004SJunchao Zhang if (sf->leafcontig[scope]) *indices = NULL; 274cd620004SJunchao Zhang else { 275cd620004SJunchao Zhang offset = (scope == PETSCSF_LOCAL)? 0 : sf->roffset[sf->ndranks]; 276cd620004SJunchao Zhang if (mtype == PETSC_MEMTYPE_HOST) {*indices = sf->rmine + offset;} 277eb02082bSJunchao Zhang #if defined(PETSC_HAVE_CUDA) 278cd620004SJunchao Zhang else { 279cd620004SJunchao Zhang if (!sf->rmine_d[scope]) { 280cd620004SJunchao Zhang cudaError_t cerr; 281cd620004SJunchao Zhang size_t size = sf->leafbuflen[scope]*sizeof(PetscInt); 282cd620004SJunchao Zhang cerr = cudaMalloc((void **)&sf->rmine_d[scope],size);CHKERRCUDA(cerr); 283cd620004SJunchao Zhang cerr = cudaMemcpyAsync(sf->rmine_d[scope],sf->rmine+offset,size,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr); 284cd620004SJunchao Zhang } 285cd620004SJunchao Zhang *indices = sf->rmine_d[scope]; 286cd620004SJunchao Zhang } 287eb02082bSJunchao Zhang #endif 288cd620004SJunchao Zhang } 289cd620004SJunchao Zhang } 290cd620004SJunchao Zhang PetscFunctionReturn(0); 291cd620004SJunchao Zhang } 292cd620004SJunchao Zhang 293cd620004SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkMPIWaitall(PetscSF sf,PetscSFLink link,PetscSFDirection direction) 294cd620004SJunchao Zhang { 295cd620004SJunchao Zhang PetscErrorCode ierr; 296cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 297cd620004SJunchao Zhang const PetscMemType rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; 298cd620004SJunchao Zhang const PetscInt rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi; 299cd620004SJunchao Zhang 300cd620004SJunchao Zhang PetscFunctionBegin; 301cd620004SJunchao Zhang ierr = MPI_Waitall(bas->nrootreqs,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi],MPI_STATUSES_IGNORE);CHKERRQ(ierr); 302cd620004SJunchao Zhang ierr = MPI_Waitall(sf->nleafreqs, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi],MPI_STATUSES_IGNORE);CHKERRQ(ierr); 303cd620004SJunchao Zhang PetscFunctionReturn(0); 304cd620004SJunchao Zhang } 305cd620004SJunchao Zhang 306*f01131f0SJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkMemcpy(PetscSF sf,PetscSFLink link,PetscMemType dstmtype,void* dst,PetscMemType srcmtype,const void*src,size_t n) 307*f01131f0SJunchao Zhang { 308*f01131f0SJunchao Zhang PetscFunctionBegin; 309*f01131f0SJunchao Zhang if (n) { 310*f01131f0SJunchao Zhang if (dstmtype == PETSC_MEMTYPE_HOST && srcmtype == PETSC_MEMTYPE_HOST) {PetscErrorCode ierr = PetscMemcpy(dst,src,n);CHKERRQ(ierr);} 311*f01131f0SJunchao Zhang #if defined(PETSC_HAVE_CUDA) 312*f01131f0SJunchao Zhang else if (dstmtype == PETSC_MEMTYPE_DEVICE && srcmtype == PETSC_MEMTYPE_HOST) { 313*f01131f0SJunchao Zhang cudaError_t err = cudaMemcpyAsync(dst,src,n,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(err); 314*f01131f0SJunchao Zhang PetscErrorCode ierr = PetscLogCpuToGpu(n);CHKERRQ(ierr); 315*f01131f0SJunchao Zhang } else if (dstmtype == PETSC_MEMTYPE_HOST && srcmtype == PETSC_MEMTYPE_DEVICE) { 316*f01131f0SJunchao Zhang cudaError_t err = cudaMemcpyAsync(dst,src,n,cudaMemcpyDeviceToHost,link->stream);CHKERRCUDA(err); 317*f01131f0SJunchao Zhang PetscErrorCode ierr = PetscLogGpuToCpu(n);CHKERRQ(ierr); 318*f01131f0SJunchao Zhang } else if (dstmtype == PETSC_MEMTYPE_DEVICE && srcmtype == PETSC_MEMTYPE_DEVICE) {cudaError_t err = cudaMemcpyAsync(dst,src,n,cudaMemcpyDeviceToDevice,link->stream);CHKERRCUDA(err);} 319*f01131f0SJunchao Zhang #endif 320*f01131f0SJunchao Zhang else SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong PetscMemType for dst %d and src %d",(int)dstmtype,(int)srcmtype); 321*f01131f0SJunchao Zhang } 322*f01131f0SJunchao Zhang PetscFunctionReturn(0); 323*f01131f0SJunchao Zhang } 32440e23c03SJunchao Zhang #endif 325