1 static const char help[] = "PetscSF Ping-pong test to measure MPI latency\n\n"; 2 3 /* 4 This is a simple test to measure the latency of MPI communication. 5 The test is run with two processes. The first process sends a message 6 to the second process, and after having received the message, the second 7 process sends a message back to the first process once. The is repeated 8 a number of times. The latency is defined as half time of the round-trip. 9 10 It mimics osu_latency from the OSU microbenchmarks (https://mvapich.cse.ohio-state.edu/benchmarks/). 11 12 Usage: mpirun -n 2 ./ex18 -mtype <type> 13 Other arguments have a default value that is also used in osu_latency. 14 15 Examples: 16 17 On Summit at OLCF: 18 jsrun --smpiargs "-gpu" -n 2 -a 1 -c 7 -g 1 -r 2 -l GPU-GPU -d packed -b packed:7 ./ex18 -mtype cuda 19 20 On Crusher at OLCF: 21 srun -n2 -c32 --cpu-bind=map_cpu:0,1 --gpus-per-node=8 --gpu-bind=map_gpu:0,1 ./ex18 -mtype hip 22 */ 23 24 #include <petscsf.h> 25 #include <petscdevice.h> 26 #if defined(PETSC_HAVE_UNISTD_H) 27 #include <unistd.h> 28 #endif 29 30 /* Same values as OSU microbenchmarks */ 31 #define LAT_LOOP_SMALL 10000 32 #define LAT_SKIP_SMALL 100 33 #define LAT_LOOP_LARGE 1000 34 #define LAT_SKIP_LARGE 10 35 #define LARGE_MESSAGE_SIZE 8192 36 37 static inline PetscErrorCode PetscMallocWithMemType(PetscMemType mtype,size_t size,void** ptr) 38 { 39 PetscFunctionBegin; 40 if (PetscMemTypeHost(mtype)) { 41 #if defined(PETSC_HAVE_GETPAGESIZE) 42 PetscCall(posix_memalign(ptr,getpagesize(),size)); 43 #else 44 PetscCall(PetscMalloc(size,ptr)); 45 #endif 46 } 47 #if defined(PETSC_HAVE_CUDA) 48 else if (PetscMemTypeCUDA(mtype)) PetscCallCUDA(cudaMalloc(ptr,size)); 49 #elif defined(PETSC_HAVE_HIP) 50 else if (PetscMemTypeHIP(mtype)) PetscCallHIP(hipMalloc(ptr,size)); 51 #endif 52 PetscFunctionReturn(0); 53 } 54 55 static inline PetscErrorCode PetscFreeWithMemType_Private(PetscMemType mtype,void* ptr) 56 { 57 PetscFunctionBegin; 58 if (PetscMemTypeHost(mtype)) {free(ptr);} 59 #if defined(PETSC_HAVE_CUDA) 60 else if (PetscMemTypeCUDA(mtype)) PetscCallCUDA(cudaFree(ptr)); 61 #elif defined(PETSC_HAVE_HIP) 62 else if (PetscMemTypeHIP(mtype)) PetscCallHIP(hipFree(ptr)); 63 #endif 64 PetscFunctionReturn(0); 65 } 66 67 /* Free memory and set ptr to NULL when succeeded */ 68 #define PetscFreeWithMemType(t,p) ((p) && (PetscFreeWithMemType_Private((t),(p)) || ((p)=NULL,0))) 69 70 static inline PetscErrorCode PetscMemcpyFromHostWithMemType(PetscMemType mtype,void* dst, const void *src, size_t n) 71 { 72 PetscFunctionBegin; 73 if (PetscMemTypeHost(mtype)) PetscCall(PetscMemcpy(dst,src,n)); 74 #if defined(PETSC_HAVE_CUDA) 75 else if (PetscMemTypeCUDA(mtype)) PetscCallCUDA(cudaMemcpy(dst,src,n,cudaMemcpyHostToDevice)); 76 #elif defined(PETSC_HAVE_HIP) 77 else if (PetscMemTypeHIP(mtype)) PetscCallHIP(hipMemcpy(dst,src,n,hipMemcpyHostToDevice)); 78 #endif 79 PetscFunctionReturn(0); 80 } 81 82 int main(int argc,char **argv) 83 { 84 PetscSF sf[64]; 85 PetscLogDouble t_start=0,t_end=0,time[64]; 86 PetscInt i,j,n,nroots,nleaves,niter=100,nskip=10; 87 PetscInt maxn=512*1024; /* max 4M bytes messages */ 88 PetscSFNode *iremote; 89 PetscMPIInt rank,size; 90 PetscScalar *rootdata=NULL,*leafdata=NULL,*pbuf,*ebuf; 91 size_t msgsize; 92 PetscMemType mtype = PETSC_MEMTYPE_HOST; 93 char mstring[16]={0}; 94 PetscBool isCuda,isHip,isHost,set; 95 PetscInt skipSmall=-1,loopSmall=-1; 96 MPI_Op op = MPI_REPLACE; 97 98 PetscCall(PetscInitialize(&argc,&argv,NULL,help)); 99 /* Must init the device first if one wants to call PetscGetMemType() without creating PETSc device objects */ 100 #if defined(PETSC_HAVE_CUDA) 101 PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); 102 #elif defined(PETSC_HAVE_HIP) 103 PetscCall(PetscDeviceInitialize(PETSC_DEVICE_HIP)); 104 #endif 105 PetscCallMPI(MPI_Comm_size(PETSC_COMM_WORLD,&size)); 106 PetscCallMPI(MPI_Comm_rank(PETSC_COMM_WORLD,&rank)); 107 PetscCheck(size == 2,PETSC_COMM_WORLD,PETSC_ERR_WRONG_MPI_SIZE,"Must run with 2 processes"); 108 109 PetscCall(PetscOptionsGetInt(NULL,NULL,"-maxn",&maxn,NULL)); /* maxn PetscScalars */ 110 PetscCall(PetscOptionsGetInt(NULL,NULL,"-skipSmall",&skipSmall,NULL)); 111 PetscCall(PetscOptionsGetInt(NULL,NULL,"-loopSmall",&loopSmall,NULL)); 112 113 PetscCall(PetscMalloc1(maxn,&iremote)); 114 PetscCall(PetscOptionsGetString(NULL,NULL,"-mtype",mstring,16,&set)); 115 if (set) { 116 PetscCall(PetscStrcasecmp(mstring,"cuda",&isCuda)); 117 PetscCall(PetscStrcasecmp(mstring,"hip",&isHip)); 118 PetscCall(PetscStrcasecmp(mstring,"host",&isHost)); 119 120 if (isHost) mtype = PETSC_MEMTYPE_HOST; 121 else if (isCuda) mtype = PETSC_MEMTYPE_CUDA; 122 else if (isHip) mtype = PETSC_MEMTYPE_HIP; 123 else SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_ARG_WRONG,"Unknown memory type: %s",mstring); 124 } 125 126 PetscCall(PetscMallocWithMemType(mtype,sizeof(PetscScalar)*maxn,(void**)&rootdata)); 127 PetscCall(PetscMallocWithMemType(mtype,sizeof(PetscScalar)*maxn,(void**)&leafdata)); 128 129 PetscCall(PetscMalloc2(maxn,&pbuf,maxn,&ebuf)); 130 for (i=0; i<maxn; i++) { 131 pbuf[i] = 123.0; 132 ebuf[i] = 456.0; 133 } 134 135 for (n=1,i=0; n<=maxn; n*=2,i++) { 136 PetscCall(PetscSFCreate(PETSC_COMM_WORLD,&sf[i])); 137 PetscCall(PetscSFSetFromOptions(sf[i])); 138 if (rank == 0) { 139 nroots = n; 140 nleaves = 0; 141 } else { 142 nroots = 0; 143 nleaves = n; 144 for (j=0; j<nleaves; j++) { 145 iremote[j].rank = 0; 146 iremote[j].index = j; 147 } 148 } 149 PetscCall(PetscSFSetGraph(sf[i],nroots,nleaves,NULL,PETSC_COPY_VALUES,iremote,PETSC_COPY_VALUES)); 150 PetscCall(PetscSFSetUp(sf[i])); 151 } 152 153 if (loopSmall > 0) { 154 nskip = skipSmall; 155 niter = loopSmall; 156 } else { 157 nskip = LAT_SKIP_SMALL; 158 niter = LAT_LOOP_SMALL; 159 } 160 161 for (n=1,j=0; n<=maxn; n*=2,j++) { 162 msgsize = sizeof(PetscScalar)*n; 163 PetscCall(PetscMemcpyFromHostWithMemType(mtype,rootdata,pbuf,msgsize)); 164 PetscCall(PetscMemcpyFromHostWithMemType(mtype,leafdata,ebuf,msgsize)); 165 166 if (msgsize > LARGE_MESSAGE_SIZE) { 167 nskip = LAT_SKIP_LARGE; 168 niter = LAT_LOOP_LARGE; 169 } 170 PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD)); 171 172 for (i=0; i<niter + nskip; i++) { 173 if (i == nskip) { 174 #if defined(PETSC_HAVE_CUDA) 175 PetscCallCUDA(cudaDeviceSynchronize()); 176 #elif defined(PETSC_HAVE_HIP) 177 PetscCallHIP(hipDeviceSynchronize()); 178 #endif 179 PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD)); 180 t_start = MPI_Wtime(); 181 } 182 PetscCall(PetscSFBcastWithMemTypeBegin(sf[j],MPIU_SCALAR,mtype,rootdata,mtype,leafdata,op)); 183 PetscCall(PetscSFBcastEnd(sf[j],MPIU_SCALAR,rootdata,leafdata,op)); 184 PetscCall(PetscSFReduceWithMemTypeBegin(sf[j],MPIU_SCALAR,mtype,leafdata,mtype,rootdata,op)); 185 PetscCall(PetscSFReduceEnd(sf[j],MPIU_SCALAR,leafdata,rootdata,op)); 186 } 187 #if defined(PETSC_HAVE_CUDA) 188 PetscCallCUDA(cudaDeviceSynchronize()); 189 #elif defined(PETSC_HAVE_HIP) 190 PetscCallHIP(hipDeviceSynchronize()); 191 #endif 192 PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD)); 193 t_end = MPI_Wtime(); 194 time[j] = (t_end - t_start)*1e6 / (niter*2); 195 } 196 197 PetscCall(PetscPrintf(PETSC_COMM_WORLD,"\t## PetscSF Ping-pong test on %s ##\n Message(Bytes) \t\tLatency(us)\n", mtype==PETSC_MEMTYPE_HOST? "Host" : "Device")); 198 for (n=1,j=0; n<=maxn; n*=2,j++) { 199 PetscCall(PetscSFDestroy(&sf[j])); 200 PetscCall(PetscPrintf(PETSC_COMM_WORLD,"%16" PetscInt_FMT " \t %16.4f\n",sizeof(PetscScalar)*n,time[j])); 201 } 202 203 PetscCall(PetscFree2(pbuf,ebuf)); 204 PetscCall(PetscFreeWithMemType(mtype,rootdata)); 205 PetscCall(PetscFreeWithMemType(mtype,leafdata)); 206 PetscCall(PetscFree(iremote)); 207 PetscCall(PetscFinalize()); 208 return 0; 209 } 210 211 /**TEST 212 testset: 213 # use small numbers to make the test cheap 214 args: -maxn 4 -skipSmall 1 -loopSmall 1 215 filter: grep "DOES_NOT_EXIST" 216 output_file: output/empty.out 217 nsize: 2 218 219 test: 220 args: -mtype host 221 222 test: 223 requires: cuda 224 args: -mtype cuda 225 226 test: 227 requires: hip 228 args: -mtype hip 229 TEST**/ 230