xref: /petsc/src/vec/is/sf/tests/ex20.c (revision 76be6f4ff3bd4e251c19fc00ebbebfd58b6e7589)
1 static const char help[] = "PetscSF Ping-pong test to measure MPI latency\n\n";
2 
3 /*
4   This is a simple test to measure the latency of MPI communication.
5   The test is run with two processes.  The first process sends a message
6   to the second process, and after having received the message, the second
7   process sends a message back to the first process once.  This is repeated
8   a number of times.  The latency is defined as half time of the round-trip.
9 
10   It mimics osu_latency from the OSU microbenchmarks (https://mvapich.cse.ohio-state.edu/benchmarks/).
11 
12   Usage: mpirun -n 2 ./ex20 -mtype <type>
13   Other arguments have a default value that is also used in osu_latency.
14 
15   Examples:
16 
17   On Summit at OLCF:
18     jsrun --smpiargs "-gpu" -n 2 -a 1 -c 7 -g 1 -r 2 -l GPU-GPU -d packed -b packed:7 ./ex20  -mtype cuda
19 
20   On Crusher at OLCF:
21     srun -n2 -c32 --cpu-bind=map_cpu:0,1 --gpus-per-node=8 --gpu-bind=map_gpu:0,1 ./ex20 -mtype hip
22 */
23 
24 #include <petscsf.h>
25 #include <petscdevice.h>
26 #if defined(PETSC_HAVE_UNISTD_H)
27 #include <unistd.h>
28 #endif
29 
30 /* Same values as OSU microbenchmarks */
31 #define LAT_LOOP_SMALL 10000
32 #define LAT_SKIP_SMALL 100
33 #define LAT_LOOP_LARGE 1000
34 #define LAT_SKIP_LARGE 10
35 #define LARGE_MESSAGE_SIZE 8192
36 
37 static inline PetscErrorCode PetscMallocWithMemType(PetscMemType mtype,size_t size,void** ptr)
38 {
39   PetscFunctionBegin;
40   if (PetscMemTypeHost(mtype)) {
41     #if defined(PETSC_HAVE_GETPAGESIZE)
42       PetscCall(posix_memalign(ptr,getpagesize(),size));
43     #else
44       PetscCall(PetscMalloc(size,ptr));
45     #endif
46   }
47 #if defined(PETSC_HAVE_CUDA)
48   else if (PetscMemTypeCUDA(mtype)) PetscCallCUDA(cudaMalloc(ptr,size));
49 #elif defined(PETSC_HAVE_HIP)
50   else if (PetscMemTypeHIP(mtype))  PetscCallHIP(hipMalloc(ptr,size));
51 #endif
52   PetscFunctionReturn(0);
53 }
54 
55 static inline PetscErrorCode PetscFreeWithMemType_Private(PetscMemType mtype,void* ptr)
56 {
57   PetscFunctionBegin;
58   if (PetscMemTypeHost(mtype)) {free(ptr);}
59 #if defined(PETSC_HAVE_CUDA)
60   else if (PetscMemTypeCUDA(mtype)) PetscCallCUDA(cudaFree(ptr));
61 #elif defined(PETSC_HAVE_HIP)
62   else if (PetscMemTypeHIP(mtype))  PetscCallHIP(hipFree(ptr));
63 #endif
64   PetscFunctionReturn(0);
65 }
66 
/* Free memory of type t pointed to by p and set p to NULL on success.
   A NULL p is a no-op (the whole expression evaluates to 0). On failure the
   nonzero error code from PetscFreeWithMemType_Private() short-circuits the
   || (so p is left unchanged) and becomes the macro's value, which lets
   callers wrap this macro in PetscCall(). */
#define PetscFreeWithMemType(t,p) ((p) && (PetscFreeWithMemType_Private((t),(p)) || ((p)=NULL,0)))
69 
70 static inline PetscErrorCode PetscMemcpyFromHostWithMemType(PetscMemType mtype,void* dst, const void *src, size_t n)
71 {
72   PetscFunctionBegin;
73   if (PetscMemTypeHost(mtype)) PetscCall(PetscMemcpy(dst,src,n));
74 #if defined(PETSC_HAVE_CUDA)
75   else if (PetscMemTypeCUDA(mtype)) PetscCallCUDA(cudaMemcpy(dst,src,n,cudaMemcpyHostToDevice));
76 #elif defined(PETSC_HAVE_HIP)
77   else if (PetscMemTypeHIP(mtype))  PetscCallHIP(hipMemcpy(dst,src,n,hipMemcpyHostToDevice));
78 #endif
79   PetscFunctionReturn(0);
80 }
81 
82 int main(int argc,char **argv)
83 {
84   PetscSF           sf[64];
85   PetscLogDouble    t_start=0,t_end=0,time[64];
86   PetscInt          i,j,n,nroots,nleaves,niter=100,nskip=10;
87   PetscInt          maxn=512*1024; /* max 4M bytes messages */
88   PetscSFNode       *iremote;
89   PetscMPIInt       rank,size;
90   PetscScalar       *rootdata=NULL,*leafdata=NULL,*pbuf,*ebuf;
91   size_t            msgsize;
92   PetscMemType      mtype = PETSC_MEMTYPE_HOST;
93   char              mstring[16]={0};
94   PetscBool         isCuda,isHip,isHost,set;
95   PetscInt          skipSmall=-1,loopSmall=-1;
96   MPI_Op            op = MPI_REPLACE;
97 
98   PetscCall(PetscInitialize(&argc,&argv,NULL,help));
99   /* Must init the device first if one wants to call PetscGetMemType() without creating PETSc device objects */
100 #if defined(PETSC_HAVE_CUDA)
101   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
102 #elif defined(PETSC_HAVE_HIP)
103   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_HIP));
104 #endif
105   PetscCallMPI(MPI_Comm_size(PETSC_COMM_WORLD,&size));
106   PetscCallMPI(MPI_Comm_rank(PETSC_COMM_WORLD,&rank));
107   PetscCheck(size == 2,PETSC_COMM_WORLD,PETSC_ERR_WRONG_MPI_SIZE,"Must run with 2 processes");
108 
109   PetscCall(PetscOptionsGetInt(NULL,NULL,"-maxn",&maxn,NULL)); /* maxn PetscScalars */
110   PetscCall(PetscOptionsGetInt(NULL,NULL,"-skipSmall",&skipSmall,NULL));
111   PetscCall(PetscOptionsGetInt(NULL,NULL,"-loopSmall",&loopSmall,NULL));
112 
113   PetscCall(PetscMalloc1(maxn,&iremote));
114   PetscCall(PetscOptionsGetString(NULL,NULL,"-mtype",mstring,16,&set));
115   if (set) {
116     PetscCall(PetscStrcasecmp(mstring,"cuda",&isCuda));
117     PetscCall(PetscStrcasecmp(mstring,"hip",&isHip));
118     PetscCall(PetscStrcasecmp(mstring,"host",&isHost));
119 
120     if (isHost) mtype = PETSC_MEMTYPE_HOST;
121     else if (isCuda) mtype = PETSC_MEMTYPE_CUDA;
122     else if (isHip) mtype = PETSC_MEMTYPE_HIP;
123     else SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_ARG_WRONG,"Unknown memory type: %s",mstring);
124   }
125 
126   PetscCall(PetscMallocWithMemType(mtype,sizeof(PetscScalar)*maxn,(void**)&rootdata));
127   PetscCall(PetscMallocWithMemType(mtype,sizeof(PetscScalar)*maxn,(void**)&leafdata));
128 
129   PetscCall(PetscMalloc2(maxn,&pbuf,maxn,&ebuf));
130   for (i=0; i<maxn; i++) {
131     pbuf[i] = 123.0;
132     ebuf[i] = 456.0;
133   }
134 
135   for (n=1,i=0; n<=maxn; n*=2,i++) {
136     PetscCall(PetscSFCreate(PETSC_COMM_WORLD,&sf[i]));
137     PetscCall(PetscSFSetFromOptions(sf[i]));
138     if (rank == 0) {
139       nroots  = n;
140       nleaves = 0;
141     } else {
142       nroots  = 0;
143       nleaves = n;
144       for (j=0; j<nleaves; j++) {
145         iremote[j].rank  = 0;
146         iremote[j].index = j;
147       }
148     }
149     PetscCall(PetscSFSetGraph(sf[i],nroots,nleaves,NULL,PETSC_COPY_VALUES,iremote,PETSC_COPY_VALUES));
150     PetscCall(PetscSFSetUp(sf[i]));
151   }
152 
153   if (loopSmall > 0) {
154     nskip = skipSmall;
155     niter = loopSmall;
156   } else {
157     nskip = LAT_SKIP_SMALL;
158     niter = LAT_LOOP_SMALL;
159   }
160 
161   for (n=1,j=0; n<=maxn; n*=2,j++) {
162     msgsize = sizeof(PetscScalar)*n;
163     PetscCall(PetscMemcpyFromHostWithMemType(mtype,rootdata,pbuf,msgsize));
164     PetscCall(PetscMemcpyFromHostWithMemType(mtype,leafdata,ebuf,msgsize));
165 
166     if (msgsize > LARGE_MESSAGE_SIZE) {
167       nskip = LAT_SKIP_LARGE;
168       niter = LAT_LOOP_LARGE;
169     }
170     PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
171 
172     for (i=0; i<niter + nskip; i++) {
173       if (i == nskip) {
174        #if defined(PETSC_HAVE_CUDA)
175         PetscCallCUDA(cudaDeviceSynchronize());
176        #elif defined(PETSC_HAVE_HIP)
177         PetscCallHIP(hipDeviceSynchronize());
178        #endif
179         PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD));
180         t_start = MPI_Wtime();
181       }
182       PetscCall(PetscSFBcastWithMemTypeBegin(sf[j],MPIU_SCALAR,mtype,rootdata,mtype,leafdata,op));
183       PetscCall(PetscSFBcastEnd(sf[j],MPIU_SCALAR,rootdata,leafdata,op));
184       PetscCall(PetscSFReduceWithMemTypeBegin(sf[j],MPIU_SCALAR,mtype,leafdata,mtype,rootdata,op));
185       PetscCall(PetscSFReduceEnd(sf[j],MPIU_SCALAR,leafdata,rootdata,op));
186     }
187    #if defined(PETSC_HAVE_CUDA)
188     PetscCallCUDA(cudaDeviceSynchronize());
189    #elif defined(PETSC_HAVE_HIP)
190     PetscCallHIP(hipDeviceSynchronize());
191    #endif
192     PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD));
193     t_end   = MPI_Wtime();
194     time[j] = (t_end - t_start)*1e6 / (niter*2);
195   }
196 
197   PetscCall(PetscPrintf(PETSC_COMM_WORLD,"\t##  PetscSF Ping-pong test on %s ##\n  Message(Bytes) \t\tLatency(us)\n", mtype==PETSC_MEMTYPE_HOST? "Host" : "Device"));
198   for (n=1,j=0; n<=maxn; n*=2,j++) {
199     PetscCall(PetscSFDestroy(&sf[j]));
200     PetscCall(PetscPrintf(PETSC_COMM_WORLD,"%16" PetscInt_FMT " \t %16.4f\n",sizeof(PetscScalar)*n,time[j]));
201   }
202 
203   PetscCall(PetscFree2(pbuf,ebuf));
204   PetscCall(PetscFreeWithMemType(mtype,rootdata));
205   PetscCall(PetscFreeWithMemType(mtype,leafdata));
206   PetscCall(PetscFree(iremote));
207   PetscCall(PetscFinalize());
208   return 0;
209 }
210 
211 /**TEST
212   testset:
213     # use small numbers to make the test cheap
214     args: -maxn 4 -skipSmall 1 -loopSmall 1
215     filter: grep "DOES_NOT_EXIST"
216     output_file: output/empty.out
217     nsize: 2
218 
219     test:
220       args: -mtype host
221 
222     test:
223       requires: cuda
224       args: -mtype cuda
225 
226     test:
227       requires: hip
228       args: -mtype hip
229 TEST**/
230