1 static const char help[] = "PetscSF Ping-pong test to measure MPI latency\n\n"; 2 3 /* 4 This is a simple test to measure the latency of MPI communication. 5 The test is run with two processes. The first process sends a message 6 to the second process, and after having received the message, the second 7 process sends a message back to the first process once. The is repeated 8 a number of times. The latency is defined as half time of the round-trip. 9 10 It mimics osu_latency from the OSU microbenchmarks (https://mvapich.cse.ohio-state.edu/benchmarks/). 11 12 Usage: mpirun -n 2 ./ex18 -mtype <type> 13 Other arguments have a default value that is also used in osu_latency. 14 15 Examples: 16 17 On Summit at OLCF: 18 jsrun --smpiargs "-gpu" -n 2 -a 1 -c 7 -g 1 -r 2 -l GPU-GPU -d packed -b packed:7 ./ex18 -mtype cuda 19 20 On Crusher at OLCF: 21 srun -n2 -c32 --cpu-bind=map_cpu:0,1 --gpus-per-node=8 --gpu-bind=map_gpu:0,1 ./ex18 -mtype hip 22 */ 23 24 #include <petscsf.h> 25 #include <petscdevice.h> 26 #if defined(PETSC_HAVE_UNISTD_H) 27 #include <unistd.h> 28 #endif 29 30 /* Same values as OSU microbenchmarks */ 31 #define LAT_LOOP_SMALL 10000 32 #define LAT_SKIP_SMALL 100 33 #define LAT_LOOP_LARGE 1000 34 #define LAT_SKIP_LARGE 10 35 #define LARGE_MESSAGE_SIZE 8192 36 37 static inline PetscErrorCode PetscMallocWithMemType(PetscMemType mtype,size_t size,void** ptr) 38 { 39 PetscErrorCode ierr; 40 41 PetscFunctionBegin; 42 if (PetscMemTypeHost(mtype)) { 43 #if defined(PETSC_HAVE_GETPAGESIZE) 44 ierr = posix_memalign(ptr,getpagesize(),size);CHKERRQ(ierr); 45 #else 46 ierr = PetscMalloc(size,ptr);CHKERRQ(ierr); 47 #endif 48 } 49 #if defined(PETSC_HAVE_CUDA) 50 else if (PetscMemTypeCUDA(mtype)) {cudaError_t cerr = cudaMalloc(ptr,size);CHKERRCUDA(cerr);} 51 #elif defined(PETSC_HAVE_HIP) 52 else if (PetscMemTypeHIP(mtype)) {hipError_t cerr = hipMalloc(ptr,size);CHKERRHIP(cerr);} 53 #endif 54 PetscFunctionReturn(0); 55 } 56 57 static inline PetscErrorCode PetscFreeWithMemType_Private(PetscMemType mtype,void* ptr) 58 { 59 PetscFunctionBegin; 60 if (PetscMemTypeHost(mtype)) {free(ptr);} 61 #if defined(PETSC_HAVE_CUDA) 62 else if (PetscMemTypeCUDA(mtype)) {cudaError_t cerr = cudaFree(ptr);CHKERRCUDA(cerr);} 63 #elif defined(PETSC_HAVE_HIP) 64 else if (PetscMemTypeHIP(mtype)) {hipError_t cerr = hipFree(ptr);CHKERRHIP(cerr);} 65 #endif 66 PetscFunctionReturn(0); 67 } 68 69 /* Free memory and set ptr to NULL when succeeded */ 70 #define PetscFreeWithMemType(t,p) ((p) && (PetscFreeWithMemType_Private((t),(p)) || ((p)=NULL,0))) 71 72 static inline PetscErrorCode PetscMemcpyFromHostWithMemType(PetscMemType mtype,void* dst, const void *src, size_t n) 73 { 74 PetscErrorCode ierr; 75 76 PetscFunctionBegin; 77 if (PetscMemTypeHost(mtype)) {ierr = PetscMemcpy(dst,src,n);CHKERRQ(ierr);} 78 #if defined(PETSC_HAVE_CUDA) 79 else if (PetscMemTypeCUDA(mtype)) {cudaError_t cerr = cudaMemcpy(dst,src,n,cudaMemcpyHostToDevice);CHKERRCUDA(cerr);} 80 #elif defined(PETSC_HAVE_HIP) 81 else if (PetscMemTypeHIP(mtype)) {hipError_t cerr = hipMemcpy(dst,src,n,hipMemcpyHostToDevice);CHKERRHIP(cerr);} 82 #endif 83 PetscFunctionReturn(0); 84 } 85 86 int main(int argc,char **argv) 87 { 88 PetscErrorCode ierr; 89 PetscSF sf[64]; 90 PetscLogDouble t_start=0,t_end=0,time[64]; 91 PetscInt i,j,n,nroots,nleaves,niter=100,nskip=10; 92 PetscInt maxn=512*1024; /* max 4M bytes messages */ 93 PetscSFNode *iremote; 94 PetscMPIInt rank,size; 95 PetscScalar *rootdata=NULL,*leafdata=NULL,*pbuf,*ebuf; 96 size_t msgsize; 97 PetscMemType mtype = PETSC_MEMTYPE_HOST; 98 char mstring[16]={0}; 99 PetscBool isCuda,isHip,isHost,set; 100 PetscInt skipSmall=-1,loopSmall=-1; 101 MPI_Op op = MPI_REPLACE; 102 103 ierr = PetscInitialize(&argc,&argv,NULL,help);if (ierr) return ierr; 104 /* Must init the device first if one wants to call PetscGetMemType() without creating PETSc device objects */ 105 #if defined(PETSC_HAVE_CUDA) 106 ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr); 107 #elif defined(PETSC_HAVE_HIP) 108 ierr = PetscDeviceInitialize(PETSC_DEVICE_HIP);CHKERRQ(ierr); 109 #endif 110 ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRMPI(ierr); 111 ierr = MPI_Comm_rank(PETSC_COMM_WORLD,&rank);CHKERRMPI(ierr); 112 PetscCheck(size == 2,PETSC_COMM_WORLD,PETSC_ERR_WRONG_MPI_SIZE,"Must run with 2 processes"); 113 114 ierr = PetscOptionsGetInt(NULL,NULL,"-maxn",&maxn,NULL);CHKERRQ(ierr); /* maxn PetscScalars */ 115 ierr = PetscOptionsGetInt(NULL,NULL,"-skipSmall",&skipSmall,NULL);CHKERRQ(ierr); 116 ierr = PetscOptionsGetInt(NULL,NULL,"-loopSmall",&loopSmall,NULL);CHKERRQ(ierr); 117 118 ierr = PetscMalloc1(maxn,&iremote);CHKERRQ(ierr); 119 ierr = PetscOptionsGetString(NULL,NULL,"-mtype",mstring,16,&set);CHKERRQ(ierr); 120 if (set) { 121 ierr = PetscStrcasecmp(mstring,"cuda",&isCuda);CHKERRQ(ierr); 122 ierr = PetscStrcasecmp(mstring,"hip",&isHip);CHKERRQ(ierr); 123 ierr = PetscStrcasecmp(mstring,"host",&isHost);CHKERRQ(ierr); 124 125 if (isHost) mtype = PETSC_MEMTYPE_HOST; 126 else if (isCuda) mtype = PETSC_MEMTYPE_CUDA; 127 else if (isHip) mtype = PETSC_MEMTYPE_HIP; 128 else SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_ARG_WRONG,"Unknown memory type: %s",mstring); 129 } 130 131 ierr = PetscMallocWithMemType(mtype,sizeof(PetscScalar)*maxn,(void**)&rootdata);CHKERRQ(ierr); 132 ierr = PetscMallocWithMemType(mtype,sizeof(PetscScalar)*maxn,(void**)&leafdata);CHKERRQ(ierr); 133 134 ierr = PetscMalloc2(maxn,&pbuf,maxn,&ebuf);CHKERRQ(ierr); 135 for (i=0; i<maxn; i++) { 136 pbuf[i] = 123.0; 137 ebuf[i] = 456.0; 138 } 139 140 for (n=1,i=0; n<=maxn; n*=2,i++) { 141 ierr = PetscSFCreate(PETSC_COMM_WORLD,&sf[i]);CHKERRQ(ierr); 142 ierr = PetscSFSetFromOptions(sf[i]);CHKERRQ(ierr); 143 if (!rank) { 144 nroots = n; 145 nleaves = 0; 146 } else { 147 nroots = 0; 148 nleaves = n; 149 for (j=0; j<nleaves; j++) { 150 iremote[j].rank = 0; 151 iremote[j].index = j; 152 } 153 } 154 ierr = PetscSFSetGraph(sf[i],nroots,nleaves,NULL,PETSC_COPY_VALUES,iremote,PETSC_COPY_VALUES);CHKERRQ(ierr); 155 ierr = PetscSFSetUp(sf[i]);CHKERRQ(ierr); 156 } 157 158 if (loopSmall > 0) { 159 nskip = skipSmall; 160 niter = loopSmall; 161 } else { 162 nskip = LAT_SKIP_SMALL; 163 niter = LAT_LOOP_SMALL; 164 } 165 166 for (n=1,j=0; n<=maxn; n*=2,j++) { 167 msgsize = sizeof(PetscScalar)*n; 168 ierr = PetscMemcpyFromHostWithMemType(mtype,rootdata,pbuf,msgsize);CHKERRQ(ierr); 169 ierr = PetscMemcpyFromHostWithMemType(mtype,leafdata,ebuf,msgsize);CHKERRQ(ierr); 170 171 if (msgsize > LARGE_MESSAGE_SIZE) { 172 nskip = LAT_SKIP_LARGE; 173 niter = LAT_LOOP_LARGE; 174 } 175 ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRMPI(ierr); 176 177 for (i=0; i<niter + nskip; i++) { 178 if (i == nskip) { 179 #if defined(PETSC_HAVE_CUDA) 180 {cudaError_t cerr = cudaDeviceSynchronize();CHKERRCUDA(cerr);} 181 #elif defined(PETSC_HAVE_HIP) 182 {hipError_t cerr = hipDeviceSynchronize();CHKERRHIP(cerr);} 183 #endif 184 ierr = MPI_Barrier(PETSC_COMM_WORLD);CHKERRMPI(ierr); 185 t_start = MPI_Wtime(); 186 } 187 ierr = PetscSFBcastWithMemTypeBegin(sf[j],MPIU_SCALAR,mtype,rootdata,mtype,leafdata,op);CHKERRQ(ierr); 188 ierr = PetscSFBcastEnd(sf[j],MPIU_SCALAR,rootdata,leafdata,op);CHKERRQ(ierr); 189 ierr = PetscSFReduceWithMemTypeBegin(sf[j],MPIU_SCALAR,mtype,leafdata,mtype,rootdata,op);CHKERRQ(ierr); 190 ierr = PetscSFReduceEnd(sf[j],MPIU_SCALAR,leafdata,rootdata,op);CHKERRQ(ierr); 191 } 192 #if defined(PETSC_HAVE_CUDA) 193 {cudaError_t cerr = cudaDeviceSynchronize();CHKERRCUDA(cerr);} 194 #elif defined(PETSC_HAVE_HIP) 195 {hipError_t cerr = hipDeviceSynchronize();CHKERRHIP(cerr);} 196 #endif 197 ierr = MPI_Barrier(PETSC_COMM_WORLD);CHKERRMPI(ierr); 198 t_end = MPI_Wtime(); 199 time[j] = (t_end - t_start)*1e6 / (niter*2); 200 } 201 202 ierr = PetscPrintf(PETSC_COMM_WORLD,"\t## PetscSF Ping-pong test on %s ##\n Message(Bytes) \t\tLatency(us)\n", mtype==PETSC_MEMTYPE_HOST? "Host" : "Device");CHKERRQ(ierr); 203 for (n=1,j=0; n<=maxn; n*=2,j++) { 204 ierr = PetscSFDestroy(&sf[j]);CHKERRQ(ierr); 205 ierr = PetscPrintf(PETSC_COMM_WORLD,"%16D \t %16.4f\n",sizeof(PetscScalar)*n,time[j]);CHKERRQ(ierr); 206 } 207 208 ierr = PetscFree2(pbuf,ebuf);CHKERRQ(ierr); 209 ierr = PetscFreeWithMemType(mtype,rootdata);CHKERRQ(ierr); 210 ierr = PetscFreeWithMemType(mtype,leafdata);CHKERRQ(ierr); 211 ierr = PetscFree(iremote);CHKERRQ(ierr); 212 ierr = PetscFinalize(); 213 return ierr; 214 } 215 216 /**TEST 217 testset: 218 # use small numbers to make the test cheap 219 args: -maxn 4 -skipSmall 1 -loopSmall 1 220 filter: grep "DOES_NOT_EXIST" 221 output_file: output/empty.out 222 nsize: 2 223 224 test: 225 args: -mtype host 226 227 test: 228 requires: cuda 229 args: -mtype cuda 230 231 test: 232 requires: hip 233 args: -mtype hip 234 TEST**/ 235