140e23c03SJunchao Zhang #if !defined(__SFPACK_H) 240e23c03SJunchao Zhang #define __SFPACK_H 340e23c03SJunchao Zhang 4cd620004SJunchao Zhang #include <../src/vec/is/sf/impls/basic/sfbasic.h> 57fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_CUDA) 67fd2d3dbSJunchao Zhang #include <cuda_runtime.h> /* For cudaStream_t */ 77fd2d3dbSJunchao Zhang #include <petsccublas.h> /* For CHKERRCUDA */ 87fd2d3dbSJunchao Zhang #endif 97fd2d3dbSJunchao Zhang 107fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_HIP) 117fd2d3dbSJunchao Zhang #include <hip/hip_runtime.h> /* For hipStream_t */ 127fd2d3dbSJunchao Zhang #endif 13cd620004SJunchao Zhang 14*874d28e3SJunchao Zhang /* In terms of function overloading, long long int is a different type than int64_t, which PetscInt might be defined to. 15*874d28e3SJunchao Zhang We perfer long long int over PetscInt (int64_t), since CUDA atomics are built around (unsigned) long long int. 16*874d28e3SJunchao Zhang */ 17*874d28e3SJunchao Zhang typedef long long int llint; 18*874d28e3SJunchao Zhang typedef unsigned long long int ullint; 19*874d28e3SJunchao Zhang 20cd620004SJunchao Zhang /* We separate SF communications for SFBasic and SFNeighbor in two parts: local (self,intra-rank) and remote (inter-rank) */ 21cd620004SJunchao Zhang typedef enum {PETSCSF_LOCAL=0, PETSCSF_REMOTE} PetscSFScope; 2240e23c03SJunchao Zhang 23fcc7397dSJunchao Zhang /* Optimizations in packing & unpacking for destination ranks. 2440e23c03SJunchao Zhang 25fcc7397dSJunchao Zhang Suppose there are m indices stored in idx[], and two addresses u, p. We want to do packing: 26fcc7397dSJunchao Zhang p[i] = u[idx[i]], for i in [0,m) 2740e23c03SJunchao Zhang 28fcc7397dSJunchao Zhang Indices are associated with n ranks and each rank's indices are stored consecutively in idx[]. 29fcc7397dSJunchao Zhang We go through indices for each rank and see if they are indices of a 3D submatrix of size [dx,dy,dz] in 30fcc7397dSJunchao Zhang a parent matrix of size [X,Y,Z], with the submatrix's first index being <start>. 31cd620004SJunchao Zhang 32fcc7397dSJunchao Zhang E.g., for indices 1,2,3, 6,7,8, 11,12,13, the submatrix size is [3,3,1] with start=1, and the parent matrix's size 33fcc7397dSJunchao Zhang is [5,3,1]. For simplicity, if any destination rank does not have this pattern, we give up the optimization. 34fcc7397dSJunchao Zhang 35fcc7397dSJunchao Zhang Note before using this per-rank optimization, one should check leafcontig[], rootcontig[], which say 36fcc7397dSJunchao Zhang indices in whole are contiguous, and therefore much more useful than this one when true. 3740e23c03SJunchao Zhang */ 3840e23c03SJunchao Zhang struct _n_PetscSFPackOpt { 39fcc7397dSJunchao Zhang PetscInt *array; /* [7*n+2] Memory pool for other fields in this struct. Used to easily copy this struct to GPU */ 40b23bfdefSJunchao Zhang PetscInt n; /* Number of destination ranks */ 41fcc7397dSJunchao Zhang PetscInt *offset; /* [n+1] Offsets of indices for each rank. offset[0]=0, offset[i+1]=offset[i]+dx[i]*dy[i]*dz[i] */ 42fcc7397dSJunchao Zhang PetscInt *start; /* [n] First index */ 43fcc7397dSJunchao Zhang PetscInt *dx,*dy,*dz; /* [n] Lengths of the submatrix in X, Y, Z dimension. */ 44fcc7397dSJunchao Zhang PetscInt *X,*Y; /* [n] Lengths of the outer matrix in X, Y. We do not care Z. */ 4540e23c03SJunchao Zhang }; 4640e23c03SJunchao Zhang 47eb02082bSJunchao Zhang /* An abstract class that defines a communication link, which includes how to pack/unpack data and send/recv buffers 4840e23c03SJunchao Zhang */ 49fcc7397dSJunchao Zhang struct _n_PetscSFLink { 5020c24465SJunchao Zhang PetscErrorCode (*Memcpy) (PetscSFLink,PetscMemType,void*,PetscMemType,const void*,size_t); /* Asynchronous copy might use stream in the link */ 5120c24465SJunchao Zhang 52fcc7397dSJunchao Zhang PetscErrorCode (*h_Pack) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*); 53fcc7397dSJunchao Zhang PetscErrorCode (*h_UnpackAndInsert) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 54fcc7397dSJunchao Zhang PetscErrorCode (*h_UnpackAndAdd) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 55fcc7397dSJunchao Zhang PetscErrorCode (*h_UnpackAndMin) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 56fcc7397dSJunchao Zhang PetscErrorCode (*h_UnpackAndMax) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 57fcc7397dSJunchao Zhang PetscErrorCode (*h_UnpackAndMinloc) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 58fcc7397dSJunchao Zhang PetscErrorCode (*h_UnpackAndMaxloc) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 59fcc7397dSJunchao Zhang PetscErrorCode (*h_UnpackAndMult) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 60fcc7397dSJunchao Zhang PetscErrorCode (*h_UnpackAndLAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 61fcc7397dSJunchao Zhang PetscErrorCode (*h_UnpackAndBAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 62fcc7397dSJunchao Zhang PetscErrorCode (*h_UnpackAndLOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 63fcc7397dSJunchao Zhang PetscErrorCode (*h_UnpackAndBOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 64fcc7397dSJunchao Zhang PetscErrorCode (*h_UnpackAndLXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 65fcc7397dSJunchao Zhang PetscErrorCode (*h_UnpackAndBXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 66fcc7397dSJunchao Zhang PetscErrorCode (*h_FetchAndAdd) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*, void*); 67fcc7397dSJunchao Zhang 68fcc7397dSJunchao Zhang PetscErrorCode (*h_ScatterAndInsert)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 69fcc7397dSJunchao Zhang PetscErrorCode (*h_ScatterAndAdd) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 70fcc7397dSJunchao Zhang PetscErrorCode (*h_ScatterAndMin) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 71fcc7397dSJunchao Zhang PetscErrorCode (*h_ScatterAndMax) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 72fcc7397dSJunchao Zhang PetscErrorCode (*h_ScatterAndMinloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 73fcc7397dSJunchao Zhang PetscErrorCode (*h_ScatterAndMaxloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 74fcc7397dSJunchao Zhang PetscErrorCode (*h_ScatterAndMult) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 75fcc7397dSJunchao Zhang PetscErrorCode (*h_ScatterAndLAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 76fcc7397dSJunchao Zhang PetscErrorCode (*h_ScatterAndBAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 77fcc7397dSJunchao Zhang PetscErrorCode (*h_ScatterAndLOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 78fcc7397dSJunchao Zhang PetscErrorCode (*h_ScatterAndBOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 79fcc7397dSJunchao Zhang PetscErrorCode (*h_ScatterAndLXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 80fcc7397dSJunchao Zhang PetscErrorCode (*h_ScatterAndBXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 81fcc7397dSJunchao Zhang 82fcc7397dSJunchao Zhang PetscErrorCode (*h_FetchAndAddLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*); 83cd620004SJunchao Zhang 84cd620004SJunchao Zhang PetscBool deviceinited; /* Are device related fields initialized? */ 857fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE) 86eb02082bSJunchao Zhang /* These fields are lazily initialized in a sense that only when device pointers are passed to an SF, the SF 877fd2d3dbSJunchao Zhang will set them, otherwise it just leaves them alone. Packing routines using regular ops when there are no data race chances. 88eb02082bSJunchao Zhang */ 8920c24465SJunchao Zhang PetscErrorCode (*d_SyncDevice) (PetscSFLink); 9020c24465SJunchao Zhang PetscErrorCode (*d_SyncStream) (PetscSFLink); 9120c24465SJunchao Zhang 92fcc7397dSJunchao Zhang PetscErrorCode (*d_Pack) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*); 93fcc7397dSJunchao Zhang PetscErrorCode (*d_UnpackAndInsert) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 94fcc7397dSJunchao Zhang PetscErrorCode (*d_UnpackAndAdd) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 95fcc7397dSJunchao Zhang PetscErrorCode (*d_UnpackAndMin) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 96fcc7397dSJunchao Zhang PetscErrorCode (*d_UnpackAndMax) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 97fcc7397dSJunchao Zhang PetscErrorCode (*d_UnpackAndMinloc) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 98fcc7397dSJunchao Zhang PetscErrorCode (*d_UnpackAndMaxloc) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 99fcc7397dSJunchao Zhang PetscErrorCode (*d_UnpackAndMult) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 100fcc7397dSJunchao Zhang PetscErrorCode (*d_UnpackAndLAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 101fcc7397dSJunchao Zhang PetscErrorCode (*d_UnpackAndBAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 102fcc7397dSJunchao Zhang PetscErrorCode (*d_UnpackAndLOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 103fcc7397dSJunchao Zhang PetscErrorCode (*d_UnpackAndBOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 104fcc7397dSJunchao Zhang PetscErrorCode (*d_UnpackAndLXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 105fcc7397dSJunchao Zhang PetscErrorCode (*d_UnpackAndBXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 106fcc7397dSJunchao Zhang PetscErrorCode (*d_FetchAndAdd) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*, void*); 107eb02082bSJunchao Zhang 108fcc7397dSJunchao Zhang PetscErrorCode (*d_ScatterAndInsert)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 109fcc7397dSJunchao Zhang PetscErrorCode (*d_ScatterAndAdd) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 110fcc7397dSJunchao Zhang PetscErrorCode (*d_ScatterAndMin) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 111fcc7397dSJunchao Zhang PetscErrorCode (*d_ScatterAndMax) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 112fcc7397dSJunchao Zhang PetscErrorCode (*d_ScatterAndMinloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 113fcc7397dSJunchao Zhang PetscErrorCode (*d_ScatterAndMaxloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 114fcc7397dSJunchao Zhang PetscErrorCode (*d_ScatterAndMult) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 115fcc7397dSJunchao Zhang PetscErrorCode (*d_ScatterAndLAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 116fcc7397dSJunchao Zhang PetscErrorCode (*d_ScatterAndBAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 117fcc7397dSJunchao Zhang PetscErrorCode (*d_ScatterAndLOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 118fcc7397dSJunchao Zhang PetscErrorCode (*d_ScatterAndBOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 119fcc7397dSJunchao Zhang PetscErrorCode (*d_ScatterAndLXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 120fcc7397dSJunchao Zhang PetscErrorCode (*d_ScatterAndBXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 121fcc7397dSJunchao Zhang PetscErrorCode (*d_FetchAndAddLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*); 122eb02082bSJunchao Zhang 123eb02082bSJunchao Zhang /* Packing routines using atomics when there are data race chances */ 124fcc7397dSJunchao Zhang PetscErrorCode (*da_UnpackAndInsert)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 125fcc7397dSJunchao Zhang PetscErrorCode (*da_UnpackAndAdd) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 126fcc7397dSJunchao Zhang PetscErrorCode (*da_UnpackAndMin) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 127fcc7397dSJunchao Zhang PetscErrorCode (*da_UnpackAndMax) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 128fcc7397dSJunchao Zhang PetscErrorCode (*da_UnpackAndMinloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 129fcc7397dSJunchao Zhang PetscErrorCode (*da_UnpackAndMaxloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 130fcc7397dSJunchao Zhang PetscErrorCode (*da_UnpackAndMult) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 131fcc7397dSJunchao Zhang PetscErrorCode (*da_UnpackAndLAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 132fcc7397dSJunchao Zhang PetscErrorCode (*da_UnpackAndBAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 133fcc7397dSJunchao Zhang PetscErrorCode (*da_UnpackAndLOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 134fcc7397dSJunchao Zhang PetscErrorCode (*da_UnpackAndBOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 135fcc7397dSJunchao Zhang PetscErrorCode (*da_UnpackAndLXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 136fcc7397dSJunchao Zhang PetscErrorCode (*da_UnpackAndBXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*); 137fcc7397dSJunchao Zhang PetscErrorCode (*da_FetchAndAdd) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*, void*); 138cd620004SJunchao Zhang 139fcc7397dSJunchao Zhang PetscErrorCode (*da_ScatterAndInsert)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 140fcc7397dSJunchao Zhang PetscErrorCode (*da_ScatterAndAdd) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 141fcc7397dSJunchao Zhang PetscErrorCode (*da_ScatterAndMin) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 142fcc7397dSJunchao Zhang PetscErrorCode (*da_ScatterAndMax) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 143fcc7397dSJunchao Zhang PetscErrorCode (*da_ScatterAndMinloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 144fcc7397dSJunchao Zhang PetscErrorCode (*da_ScatterAndMaxloc)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 145fcc7397dSJunchao Zhang PetscErrorCode (*da_ScatterAndMult) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 146fcc7397dSJunchao Zhang PetscErrorCode (*da_ScatterAndLAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 147fcc7397dSJunchao Zhang PetscErrorCode (*da_ScatterAndBAND) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 148fcc7397dSJunchao Zhang PetscErrorCode (*da_ScatterAndLOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 149fcc7397dSJunchao Zhang PetscErrorCode (*da_ScatterAndBOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 150fcc7397dSJunchao Zhang PetscErrorCode (*da_ScatterAndLXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 151fcc7397dSJunchao Zhang PetscErrorCode (*da_ScatterAndBXOR) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*); 152fcc7397dSJunchao Zhang PetscErrorCode (*da_FetchAndAddLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*); 1537fd2d3dbSJunchao Zhang #if defined (PETSC_HAVE_CUDA) 154e315309dSJunchao Zhang PetscInt maxResidentThreadsPerGPU; /* It is a copy from SF for convenience */ 155eb02082bSJunchao Zhang cudaStream_t stream; /* Stream to launch pack/unapck kernels if not using the default stream */ 1567fd2d3dbSJunchao Zhang #elif defined (PETSC_HAVE_HIP) 1577fd2d3dbSJunchao Zhang hipStream_t stream; 158eb02082bSJunchao Zhang #endif 1597fd2d3dbSJunchao Zhang 160f4af43b4SJunchao Zhang PetscErrorCode (*Destroy)(PetscSFLink); /* These two fields are meant to be used by SF_Kokkos, with spptr pointing to an execution space object */ 161f4af43b4SJunchao Zhang void *spptr; /* for a given stream, but unused now due to a Kokkos bug, so that SF_Kokkos only supports null stream. */ 1627fd2d3dbSJunchao Zhang #endif 1637fd2d3dbSJunchao Zhang 164eb02082bSJunchao Zhang PetscMPIInt tag; /* Each link has a tag so we can perform multiple SF ops at the same time */ 165cd620004SJunchao Zhang MPI_Datatype unit; /* The MPI datatype this PetscSFLink is built for */ 166eb02082bSJunchao Zhang MPI_Datatype basicunit; /* unit is made of MPI builtin dataype basicunit */ 167e07844bfSJunchao Zhang PetscBool isbuiltin; /* Is unit an MPI/PETSc builtin datatype? If it is true, then bs=1 and basicunit is equivalent to unit */ 168eb02082bSJunchao Zhang size_t unitbytes; /* Number of bytes in a unit */ 169eb02082bSJunchao Zhang PetscInt bs; /* Number of basic units in a unit */ 170cd620004SJunchao Zhang const void *rootdata,*leafdata; /* rootdata and leafdata the link is working on. They are used as keys for pending links. */ 171cd620004SJunchao Zhang PetscMemType rootmtype,leafmtype; /* root/leafdata's memory type */ 172cd620004SJunchao Zhang 173cd620004SJunchao Zhang /* For local and remote communication */ 174cd620004SJunchao Zhang PetscMemType rootmtype_mpi,leafmtype_mpi; /* Mtypes of buffers passed to MPI. If use_gpu_aware_mpi, they are same as root/leafmtype. Otherwise they are PETSC_MEMTYPE_HOST */ 175cd620004SJunchao Zhang PetscBool rootdirect[2],leafdirect[2]; /* Can root/leafdata be directly passed to SF (i.e., without buffering). In layout of [PETSCSF_LOCAL/REMOTE]. See more in PetscSFLinkCreate() */ 176cd620004SJunchao Zhang PetscInt rootdirect_mpi,leafdirect_mpi;/* Can root/leafdata for remote be directly passed to MPI? 1: yes, 0: no. See more in PetscSFLinkCreate() */ 177cd620004SJunchao Zhang const void *rootdatadirect[2][2]; /* The root/leafdata used to init root/leaf requests, in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE]. */ 178cd620004SJunchao Zhang const void *leafdatadirect[2][2]; /* ... We need them to look up links when root/leafdirect_mpi are true */ 179cd620004SJunchao Zhang char *rootbuf[2][2]; /* Buffers for packed roots, in layout of [PETSCSF_LOCAL/REMOTE][PETSC_MEMTYPE] */ 180cd620004SJunchao Zhang char *rootbuf_alloc[2][2]; /* Log memory allocated by petsc. We need it since rootbuf[][] may point to rootdata given by user */ 181cd620004SJunchao Zhang char *leafbuf[2][2]; /* Buffers for packed leaves, in layout of [PETSCSF_LOCAL/REMOTE][PETSC_MEMTYPE] */ 182cd620004SJunchao Zhang char *leafbuf_alloc[2][2]; 183cd620004SJunchao Zhang MPI_Request *rootreqs[2][2][2]; /* Root requests in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][rootdirect_mpi] */ 184cd620004SJunchao Zhang MPI_Request *leafreqs[2][2][2]; /* Leaf requests in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][leafdirect_mpi] */ 185cd620004SJunchao Zhang PetscBool rootreqsinited[2][2][2]; /* Are root requests initialized? Also in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][rootdirect_mpi]*/ 186cd620004SJunchao Zhang PetscBool leafreqsinited[2][2][2]; /* Are leaf requests initialized? Also in layout of [PETSCSF_DIRECTION][PETSC_MEMTYPE][leafdirect_mpi]*/ 187cd620004SJunchao Zhang MPI_Request *reqs; /* An array of length (nrootreqs+nleafreqs)*8. Pointers in rootreqs[][][] and leafreqs[][][] point here */ 188cd620004SJunchao Zhang PetscSFLink next; 18940e23c03SJunchao Zhang }; 19040e23c03SJunchao Zhang 191cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF,MPI_Datatype,const void*,const void*); 192b7c0d12aSJunchao Zhang 193cd620004SJunchao Zhang /* Create/setup/retrieve/destroy a link */ 194cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkCreate(PetscSF,MPI_Datatype,PetscMemType,const void*,PetscMemType,const void*,MPI_Op,PetscSFOperation,PetscSFLink*); 195cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_Host(PetscSF,PetscSFLink,MPI_Datatype); 196cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetInUse(PetscSF,MPI_Datatype,const void*,const void*,PetscCopyMode,PetscSFLink*); 197cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkReclaim(PetscSF,PetscSFLink*); 198cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkDestroy(PetscSF,PetscSFLink*); 199cd620004SJunchao Zhang 200cd620004SJunchao Zhang /* Get pack/unpack function pointers from a link */ 201fcc7397dSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkGetPack(PetscSFLink link,PetscMemType mtype,PetscErrorCode (**Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*)) 202eb02082bSJunchao Zhang { 203eb02082bSJunchao Zhang PetscFunctionBegin; 204eb02082bSJunchao Zhang if (mtype == PETSC_MEMTYPE_HOST) *Pack = link->h_Pack; 2057fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE) 206cd620004SJunchao Zhang else *Pack = link->d_Pack; 207eb02082bSJunchao Zhang #endif 208eb02082bSJunchao Zhang PetscFunctionReturn(0); 209eb02082bSJunchao Zhang } 2107fd2d3dbSJunchao Zhang 2117fd2d3dbSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkMPIWaitall(PetscSF sf,PetscSFLink link,PetscSFDirection direction) 2127fd2d3dbSJunchao Zhang { 2137fd2d3dbSJunchao Zhang PetscErrorCode ierr; 2147fd2d3dbSJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 2157fd2d3dbSJunchao Zhang const PetscMemType rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; 2167fd2d3dbSJunchao Zhang const PetscInt rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi; 2177fd2d3dbSJunchao Zhang 2187fd2d3dbSJunchao Zhang PetscFunctionBegin; 2197fd2d3dbSJunchao Zhang ierr = MPI_Waitall(bas->nrootreqs,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi],MPI_STATUSES_IGNORE);CHKERRQ(ierr); 2207fd2d3dbSJunchao Zhang ierr = MPI_Waitall(sf->nleafreqs, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi],MPI_STATUSES_IGNORE);CHKERRQ(ierr); 2217fd2d3dbSJunchao Zhang PetscFunctionReturn(0); 2227fd2d3dbSJunchao Zhang } 2237fd2d3dbSJunchao Zhang 224fcc7397dSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*)); 225fcc7397dSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetFetchAndOp (PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**FetchAndOp) (PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*)); 226fcc7397dSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*)); 227fcc7397dSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink,PetscMemType,MPI_Op,PetscBool,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*)); 228cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF,PetscSFLink,PetscSFDirection,void**,void**,MPI_Request**,MPI_Request**); 229b7c0d12aSJunchao Zhang 230cd620004SJunchao Zhang /* Do Pack/Unpack/Fetch/Scatter with the link */ 231cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkPackRootData (PetscSF,PetscSFLink,PetscSFScope,const void*); 232cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkPackLeafData (PetscSF,PetscSFLink,PetscSFScope,const void*); 233cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkUnpackRootData(PetscSF,PetscSFLink,PetscSFScope,void*,MPI_Op); 234cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF,PetscSFLink,PetscSFScope,void*,MPI_Op); 235cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkFetchRootData (PetscSF,PetscSFLink,PetscSFScope,void*,MPI_Op); 236cd620004SJunchao Zhang 237cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkBcastAndOpLocal(PetscSF,PetscSFLink,const void*,void*,MPI_Op); 238cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkReduceLocal(PetscSF,PetscSFLink,const void*,void*,MPI_Op); 239cd620004SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF,PetscSFLink,void*,const void*,void*,MPI_Op); 240cd620004SJunchao Zhang 2417fd2d3dbSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFSetUpPackFields(PetscSF); 2427fd2d3dbSJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFResetPackFields(PetscSF); 2437fd2d3dbSJunchao Zhang 24420c24465SJunchao Zhang #if defined(PETSC_HAVE_CUDA) 24520c24465SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_Cuda(PetscSF,PetscSFLink,MPI_Datatype); 24620c24465SJunchao Zhang #endif 24720c24465SJunchao Zhang 24820c24465SJunchao Zhang #if defined(PETSC_HAVE_KOKKOS) 24920c24465SJunchao Zhang PETSC_INTERN PetscErrorCode PetscSFLinkSetUp_Kokkos(PetscSF,PetscSFLink,MPI_Datatype); 25020c24465SJunchao Zhang #endif 25120c24465SJunchao Zhang 2527fd2d3dbSJunchao Zhang /* A set of helper routines for Pack/Unpack/Scatter on GPUs */ 2537fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE) 2547fd2d3dbSJunchao Zhang /* If SF does not know which stream root/leafdata is being computed on, it has to sync the device to 2557fd2d3dbSJunchao Zhang make sure the data is ready for packing. 2567fd2d3dbSJunchao Zhang */ 2577fd2d3dbSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncDeviceBeforePackData(PetscSF sf,PetscSFLink link) 2587fd2d3dbSJunchao Zhang { 2597fd2d3dbSJunchao Zhang PetscErrorCode ierr; 2607fd2d3dbSJunchao Zhang PetscFunctionBegin; 2617fd2d3dbSJunchao Zhang if (sf->use_default_stream) PetscFunctionReturn(0); 26220c24465SJunchao Zhang if (link->rootmtype == PETSC_MEMTYPE_DEVICE || link->leafmtype == PETSC_MEMTYPE_DEVICE) {ierr = (*link->d_SyncDevice)(link);CHKERRQ(ierr);} 2637fd2d3dbSJunchao Zhang PetscFunctionReturn(0); 2647fd2d3dbSJunchao Zhang } 2657fd2d3dbSJunchao Zhang 2667fd2d3dbSJunchao Zhang /* PetscSFLinkSyncStreamAfterPackXxxData routines make sure root/leafbuf for the remote is ready for MPI */ 2677fd2d3dbSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterPackRootData(PetscSF sf,PetscSFLink link) 2687fd2d3dbSJunchao Zhang { 2697fd2d3dbSJunchao Zhang PetscErrorCode ierr; 2707fd2d3dbSJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 2717fd2d3dbSJunchao Zhang 2727fd2d3dbSJunchao Zhang PetscFunctionBegin; 2737fd2d3dbSJunchao Zhang /* Do nothing if we use stream aware mpi || has nothing for remote */ 2747fd2d3dbSJunchao Zhang if (sf->use_stream_aware_mpi || link->rootmtype != PETSC_MEMTYPE_DEVICE || !bas->rootbuflen[PETSCSF_REMOTE]) PetscFunctionReturn(0); 2757fd2d3dbSJunchao Zhang /* If we called a packing kernel || we async-copied rootdata from device to host || No cudaDeviceSynchronize was called (since default stream is assumed) */ 27620c24465SJunchao Zhang if (!link->rootdirect[PETSCSF_REMOTE] || !sf->use_gpu_aware_mpi || sf->use_default_stream) {ierr = (*link->d_SyncStream)(link);CHKERRQ(ierr);} 2777fd2d3dbSJunchao Zhang PetscFunctionReturn(0); 2787fd2d3dbSJunchao Zhang } 2797fd2d3dbSJunchao Zhang 2807fd2d3dbSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterPackLeafData(PetscSF sf,PetscSFLink link) 2817fd2d3dbSJunchao Zhang { 2827fd2d3dbSJunchao Zhang PetscErrorCode ierr; 2837fd2d3dbSJunchao Zhang PetscFunctionBegin; 2847fd2d3dbSJunchao Zhang /* See comments above */ 2857fd2d3dbSJunchao Zhang if (sf->use_stream_aware_mpi || link->leafmtype != PETSC_MEMTYPE_DEVICE || !sf->leafbuflen[PETSCSF_REMOTE]) PetscFunctionReturn(0); 28620c24465SJunchao Zhang if (!link->leafdirect[PETSCSF_REMOTE] || !sf->use_gpu_aware_mpi || sf->use_default_stream) {ierr = (*link->d_SyncStream)(link);CHKERRQ(ierr);} 2877fd2d3dbSJunchao Zhang PetscFunctionReturn(0); 2887fd2d3dbSJunchao Zhang } 2897fd2d3dbSJunchao Zhang 2907fd2d3dbSJunchao Zhang /* PetscSFLinkSyncStreamAfterUnpackXxx routines make sure root/leafdata (local & remote) is ready to use for SF callers, when SF 2917fd2d3dbSJunchao Zhang does not know which stream the callers will use. 2927fd2d3dbSJunchao Zhang */ 2937fd2d3dbSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterUnpackRootData(PetscSF sf,PetscSFLink link) 2947fd2d3dbSJunchao Zhang { 2957fd2d3dbSJunchao Zhang PetscErrorCode ierr; 2967fd2d3dbSJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 2977fd2d3dbSJunchao Zhang PetscBool host2host = (link->rootmtype == PETSC_MEMTYPE_HOST) && (link->leafmtype == PETSC_MEMTYPE_HOST) ? PETSC_TRUE : PETSC_FALSE; 2987fd2d3dbSJunchao Zhang 2997fd2d3dbSJunchao Zhang PetscFunctionBegin; 3007fd2d3dbSJunchao Zhang /* Do nothing if host2host OR we are allowed to asynchronously put rootdata on device through the default stream */ 3017fd2d3dbSJunchao Zhang if (host2host || (link->rootmtype == PETSC_MEMTYPE_DEVICE && sf->use_default_stream)) PetscFunctionReturn(0); 3027fd2d3dbSJunchao Zhang 3037fd2d3dbSJunchao Zhang /* If rootmtype is HOST or DEVICE: 3047fd2d3dbSJunchao Zhang If we have data from local, then we called a scatter kernel (on link->stream), then we must sync it; 3057fd2d3dbSJunchao Zhang If we have data from remote && no rootdirect(i.e., we called an unpack kernel), then we must also sycn it (if rootdirect, 3067fd2d3dbSJunchao Zhang i.e., no unpack kernel after MPI, MPI guarentees rootbuf is ready to use so that we do not need the sync). 3077fd2d3dbSJunchao Zhang 3087fd2d3dbSJunchao Zhang Note a tricky case is when leafmtype=DEVICE, rootmtype=HOST on uni-processor, we must sync the stream otherwise 3097fd2d3dbSJunchao Zhang CPU thread might use the yet-to-be-updated rootdata pending in the stream. 3107fd2d3dbSJunchao Zhang */ 31120c24465SJunchao Zhang if (bas->rootbuflen[PETSCSF_LOCAL] || (bas->rootbuflen[PETSCSF_REMOTE] && !link->rootdirect[PETSCSF_REMOTE])) {ierr = (*link->d_SyncStream)(link);CHKERRQ(ierr);} 3127fd2d3dbSJunchao Zhang PetscFunctionReturn(0); 3137fd2d3dbSJunchao Zhang } 3147fd2d3dbSJunchao Zhang 3157fd2d3dbSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterUnpackLeafData(PetscSF sf,PetscSFLink link) 3167fd2d3dbSJunchao Zhang { 3177fd2d3dbSJunchao Zhang PetscErrorCode ierr; 3187fd2d3dbSJunchao Zhang PetscBool host2host = (link->rootmtype == PETSC_MEMTYPE_HOST) && (link->leafmtype == PETSC_MEMTYPE_HOST) ? PETSC_TRUE : PETSC_FALSE; 3197fd2d3dbSJunchao Zhang 3207fd2d3dbSJunchao Zhang PetscFunctionBegin; 3217fd2d3dbSJunchao Zhang /* See comments in PetscSFLinkSyncStreamAfterUnpackRootData*/ 3227fd2d3dbSJunchao Zhang if (host2host || (link->leafmtype == PETSC_MEMTYPE_DEVICE && sf->use_default_stream)) PetscFunctionReturn(0); 32320c24465SJunchao Zhang if (sf->leafbuflen[PETSCSF_LOCAL] || (sf->leafbuflen[PETSCSF_REMOTE] && !link->leafdirect[PETSCSF_REMOTE])) {ierr = (*link->d_SyncStream)(link);CHKERRQ(ierr);} 3247fd2d3dbSJunchao Zhang PetscFunctionReturn(0); 3257fd2d3dbSJunchao Zhang } 3267fd2d3dbSJunchao Zhang 3277fd2d3dbSJunchao Zhang /* PetscSFLinkCopyXxxxBufferInCaseNotUseGpuAwareMPI routines are simple: if not use_gpu_aware_mpi, we need 3287fd2d3dbSJunchao Zhang to copy the buffer from GPU to CPU before MPI calls, and from CPU to GPU after MPI calls. 3297fd2d3dbSJunchao Zhang */ 3307fd2d3dbSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(PetscSF sf,PetscSFLink link,PetscBool device2host) 3317fd2d3dbSJunchao Zhang { 3327fd2d3dbSJunchao Zhang PetscErrorCode ierr; 3337fd2d3dbSJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 3347fd2d3dbSJunchao Zhang 3357fd2d3dbSJunchao Zhang PetscFunctionBegin; 3367fd2d3dbSJunchao Zhang if (link->rootmtype == PETSC_MEMTYPE_DEVICE && (link->rootmtype_mpi != link->rootmtype) && bas->rootbuflen[PETSCSF_REMOTE]) { 3377fd2d3dbSJunchao Zhang void *h_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]; 3387fd2d3dbSJunchao Zhang void *d_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE]; 3397fd2d3dbSJunchao Zhang size_t count = bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes; 3407fd2d3dbSJunchao Zhang if (device2host) { 34120c24465SJunchao Zhang ierr = (*link->Memcpy)(link,PETSC_MEMTYPE_HOST,h_buf,PETSC_MEMTYPE_DEVICE,d_buf,count);CHKERRQ(ierr); 3427fd2d3dbSJunchao Zhang ierr = PetscLogGpuToCpu(count);CHKERRQ(ierr); 3437fd2d3dbSJunchao Zhang } else { 34420c24465SJunchao Zhang ierr = (*link->Memcpy)(link,PETSC_MEMTYPE_DEVICE,d_buf,PETSC_MEMTYPE_HOST,h_buf,count);CHKERRQ(ierr); 3457fd2d3dbSJunchao Zhang ierr = PetscLogCpuToGpu(count);CHKERRQ(ierr); 3467fd2d3dbSJunchao Zhang } 3477fd2d3dbSJunchao Zhang } 3487fd2d3dbSJunchao Zhang PetscFunctionReturn(0); 3497fd2d3dbSJunchao Zhang } 3507fd2d3dbSJunchao Zhang 3517fd2d3dbSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(PetscSF sf,PetscSFLink link,PetscBool device2host) 3527fd2d3dbSJunchao Zhang { 3537fd2d3dbSJunchao Zhang PetscErrorCode ierr; 3547fd2d3dbSJunchao Zhang 3557fd2d3dbSJunchao Zhang PetscFunctionBegin; 3567fd2d3dbSJunchao Zhang if (link->leafmtype == PETSC_MEMTYPE_DEVICE && (link->leafmtype_mpi != link->leafmtype) && sf->leafbuflen[PETSCSF_REMOTE]) { 3577fd2d3dbSJunchao Zhang void *h_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]; 3587fd2d3dbSJunchao Zhang void *d_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE]; 3597fd2d3dbSJunchao Zhang size_t count = sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes; 3607fd2d3dbSJunchao Zhang if (device2host) { 36120c24465SJunchao Zhang ierr = (*link->Memcpy)(link,PETSC_MEMTYPE_HOST,h_buf,PETSC_MEMTYPE_DEVICE,d_buf,count);CHKERRQ(ierr); 3627fd2d3dbSJunchao Zhang ierr = PetscLogGpuToCpu(count);CHKERRQ(ierr); 3637fd2d3dbSJunchao Zhang } else { 36420c24465SJunchao Zhang ierr = (*link->Memcpy)(link,PETSC_MEMTYPE_DEVICE,d_buf,PETSC_MEMTYPE_HOST,h_buf,count);CHKERRQ(ierr); 3657fd2d3dbSJunchao Zhang ierr = PetscLogCpuToGpu(count);CHKERRQ(ierr); 3667fd2d3dbSJunchao Zhang } 3677fd2d3dbSJunchao Zhang } 3687fd2d3dbSJunchao Zhang PetscFunctionReturn(0); 3697fd2d3dbSJunchao Zhang } 3707fd2d3dbSJunchao Zhang 3717fd2d3dbSJunchao Zhang #else /* Host only */ 3727fd2d3dbSJunchao Zhang #define PetscSFLinkSyncDeviceBeforePackData(a,b) 0 3737fd2d3dbSJunchao Zhang #define PetscSFLinkSyncStreamAfterPackRootData(a,b) 0 3747fd2d3dbSJunchao Zhang #define PetscSFLinkSyncStreamAfterPackLeafData(a,b) 0 3757fd2d3dbSJunchao Zhang #define PetscSFLinkSyncStreamAfterUnpackRootData(a,b) 0 3767fd2d3dbSJunchao Zhang #define PetscSFLinkSyncStreamAfterUnpackLeafData(a,b) 0 3777fd2d3dbSJunchao Zhang #define PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(a,b,c) 0 3787fd2d3dbSJunchao Zhang #define PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(a,b,c) 0 3797fd2d3dbSJunchao Zhang #endif 380cd620004SJunchao Zhang 381cd620004SJunchao Zhang /* Get root indices used for pack/unpack 382cd620004SJunchao Zhang 383cd620004SJunchao Zhang Input arguments: 384cd620004SJunchao Zhang +sf - StarForest 385cd620004SJunchao Zhang .link - The link, which provides the stream for the async memcpy (In SF, we make all GPU operations asynchronous to avoid unexpected pipeline stalls) 386cd620004SJunchao Zhang .scope - Which part of the indices? (PETSCSF_LOCAL or PETSCSF_REMOTE) 387cd620004SJunchao Zhang .mtype - In what type of memory? (PETSC_MEMTYPE_DEVICE or PETSC_MEMTYPE_HOST) 388cd620004SJunchao Zhang 389cd620004SJunchao Zhang Output arguments: 390cd620004SJunchao Zhang +count - Count of indices 391cd620004SJunchao Zhang .start - The first index (only useful when indices is NULL) 392cd620004SJunchao Zhang -indices - indices of roots for pack/unpack. NULL means indices are contiguous 393cd620004SJunchao Zhang */ 394fcc7397dSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkGetRootPackOptAndIndices(PetscSF sf,PetscSFLink link,PetscMemType mtype,PetscSFScope scope,PetscInt *count,PetscInt *start,PetscSFPackOpt *opt,const PetscInt **indices) 395b7c0d12aSJunchao Zhang { 396cd620004SJunchao Zhang PetscSF_Basic *bas = (PetscSF_Basic*)sf->data; 397cd620004SJunchao Zhang PetscInt offset; 398b7c0d12aSJunchao Zhang 399b7c0d12aSJunchao Zhang PetscFunctionBegin; 400fcc7397dSJunchao Zhang *count = bas->rootbuflen[scope]; 401fcc7397dSJunchao Zhang *start = bas->rootstart[scope]; 402fcc7397dSJunchao Zhang *opt = NULL; 403fcc7397dSJunchao Zhang *indices = NULL; 404fcc7397dSJunchao Zhang 405fcc7397dSJunchao Zhang /* We have these rules: 406fcc7397dSJunchao Zhang 1) opt == NULL && indices == NULL ==> indices are contiguous. 407fcc7397dSJunchao Zhang 2) opt != NULL ==> indices are in 3D but not contiguous. On host, indices != NULL since indices are already available and we do not 408fcc7397dSJunchao Zhang want to enforce all operations to use opt; but on device, indices = NULL since we do not want to copy indices to device. 409fcc7397dSJunchao Zhang */ 410fcc7397dSJunchao Zhang if (!bas->rootcontig[scope]) { 411cd620004SJunchao Zhang offset = (scope == PETSCSF_LOCAL)? 0 : bas->ioffset[bas->ndiranks]; 412fcc7397dSJunchao Zhang if (mtype == PETSC_MEMTYPE_HOST) {*opt = bas->rootpackopt[scope]; *indices = bas->irootloc + offset;} 4137fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE) 414cd620004SJunchao Zhang else { 415fcc7397dSJunchao Zhang PetscErrorCode ierr; 416fcc7397dSJunchao Zhang size_t size; 417fcc7397dSJunchao Zhang if (bas->rootpackopt[scope]) { 418fcc7397dSJunchao Zhang if (!bas->rootpackopt_d[scope]) { 419fcc7397dSJunchao Zhang ierr = PetscMalloc1(1,&bas->rootpackopt_d[scope]);CHKERRQ(ierr); 420fcc7397dSJunchao Zhang ierr = PetscArraycpy(bas->rootpackopt_d[scope],bas->rootpackopt[scope],1);CHKERRQ(ierr); /* Make pointers in bas->rootpackopt_d[] still work on host */ 421fcc7397dSJunchao Zhang size = (bas->rootpackopt[scope]->n*7+2)*sizeof(PetscInt); /* See comments at struct _n_PetscSFPackOpt*/ 42220c24465SJunchao Zhang ierr = PetscSFMalloc(sf,PETSC_MEMTYPE_DEVICE,size,(void **)&bas->rootpackopt_d[scope]->array);CHKERRQ(ierr); 42320c24465SJunchao Zhang ierr = (*link->Memcpy)(link,PETSC_MEMTYPE_DEVICE,bas->rootpackopt_d[scope]->array,PETSC_MEMTYPE_HOST,bas->rootpackopt[scope]->array,size);CHKERRQ(ierr); 424fcc7397dSJunchao Zhang } 425fcc7397dSJunchao Zhang *opt = bas->rootpackopt_d[scope]; 426fcc7397dSJunchao Zhang } else { /* On device, we only provide indices when there is no optimization. We're reluctant to copy indices to device. */ 427fcc7397dSJunchao Zhang if (!bas->irootloc_d[scope]) { 428fcc7397dSJunchao Zhang size = bas->rootbuflen[scope]*sizeof(PetscInt); 42920c24465SJunchao Zhang ierr = PetscSFMalloc(sf,PETSC_MEMTYPE_DEVICE,size,(void **)&bas->irootloc_d[scope]);CHKERRQ(ierr); 43020c24465SJunchao Zhang ierr = (*link->Memcpy)(link,PETSC_MEMTYPE_DEVICE,bas->irootloc_d[scope],PETSC_MEMTYPE_HOST,bas->irootloc+offset,size);CHKERRQ(ierr); 431b7c0d12aSJunchao Zhang } 432cd620004SJunchao Zhang *indices = bas->irootloc_d[scope]; 433cd620004SJunchao Zhang } 434cd620004SJunchao Zhang } 435fcc7397dSJunchao Zhang #endif 436cd620004SJunchao Zhang } 437b7c0d12aSJunchao Zhang PetscFunctionReturn(0); 438b7c0d12aSJunchao Zhang } 439b7c0d12aSJunchao Zhang 440cd620004SJunchao Zhang /* Get leaf indices used for pack/unpack 441cd620004SJunchao Zhang 442fcc7397dSJunchao Zhang See also PetscSFLinkGetRootPackOptAndIndices() 443cd620004SJunchao Zhang */ 444fcc7397dSJunchao Zhang PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkGetLeafPackOptAndIndices(PetscSF sf,PetscSFLink link,PetscMemType mtype,PetscSFScope scope,PetscInt *count,PetscInt *start,PetscSFPackOpt *opt,const PetscInt **indices) 445cd620004SJunchao Zhang { 446cd620004SJunchao Zhang PetscInt offset; 447cd620004SJunchao Zhang 448cd620004SJunchao Zhang PetscFunctionBegin; 449fcc7397dSJunchao Zhang *count = sf->leafbuflen[scope]; 450fcc7397dSJunchao Zhang *start = sf->leafstart[scope]; 451fcc7397dSJunchao Zhang *opt = NULL; 452fcc7397dSJunchao Zhang *indices = NULL; 453fcc7397dSJunchao Zhang if (!sf->leafcontig[scope]) { 454cd620004SJunchao Zhang offset = (scope == PETSCSF_LOCAL)? 0 : sf->roffset[sf->ndranks]; 455fcc7397dSJunchao Zhang if (mtype == PETSC_MEMTYPE_HOST) {*opt = sf->leafpackopt[scope]; *indices = sf->rmine + offset;} 4567fd2d3dbSJunchao Zhang #if defined(PETSC_HAVE_DEVICE) 457cd620004SJunchao Zhang else { 458fcc7397dSJunchao Zhang PetscErrorCode ierr; 459fcc7397dSJunchao Zhang size_t size; 460fcc7397dSJunchao Zhang if (sf->leafpackopt[scope]) { 461fcc7397dSJunchao Zhang if (!sf->leafpackopt_d[scope]) { 462fcc7397dSJunchao Zhang ierr = PetscMalloc1(1,&sf->leafpackopt_d[scope]);CHKERRQ(ierr); 463fcc7397dSJunchao Zhang ierr = PetscArraycpy(sf->leafpackopt_d[scope],sf->leafpackopt[scope],1);CHKERRQ(ierr); 464fcc7397dSJunchao Zhang size = (sf->leafpackopt[scope]->n*7+2)*sizeof(PetscInt); /* See comments at struct _n_PetscSFPackOpt*/ 46520c24465SJunchao Zhang ierr = PetscSFMalloc(sf,PETSC_MEMTYPE_DEVICE,size,(void **)&sf->leafpackopt_d[scope]->array);CHKERRQ(ierr); /* Change ->array to a device pointer */ 46620c24465SJunchao Zhang ierr = (*link->Memcpy)(link,PETSC_MEMTYPE_DEVICE,sf->leafpackopt_d[scope]->array,PETSC_MEMTYPE_HOST,sf->leafpackopt[scope]->array,size);CHKERRQ(ierr); 467fcc7397dSJunchao Zhang } 468fcc7397dSJunchao Zhang *opt = sf->leafpackopt_d[scope]; 469fcc7397dSJunchao Zhang } else { 470fcc7397dSJunchao Zhang if (!sf->rmine_d[scope]) { 471fcc7397dSJunchao Zhang size = sf->leafbuflen[scope]*sizeof(PetscInt); 47220c24465SJunchao Zhang ierr = PetscSFMalloc(sf,PETSC_MEMTYPE_DEVICE,size,(void **)&sf->rmine_d[scope]);CHKERRQ(ierr); 47320c24465SJunchao Zhang ierr = (*link->Memcpy)(link,PETSC_MEMTYPE_DEVICE,sf->rmine_d[scope],PETSC_MEMTYPE_HOST,sf->rmine+offset,size);CHKERRQ(ierr); 474cd620004SJunchao Zhang } 475cd620004SJunchao Zhang *indices = sf->rmine_d[scope]; 476cd620004SJunchao Zhang } 477cd620004SJunchao Zhang } 478fcc7397dSJunchao Zhang #endif 479cd620004SJunchao Zhang } 480cd620004SJunchao Zhang PetscFunctionReturn(0); 481cd620004SJunchao Zhang } 48240e23c03SJunchao Zhang #endif 483