xref: /petsc/src/vec/is/sf/tests/ex20.c (revision 503c0ea9b45bcfbcebbb1ea5341243bbc69f0bea)
/* Help string passed to PetscInitialize(); shown when the program is run with -help */
static const char help[] = "PetscSF Ping-pong test to measure MPI latency\n\n";
2 
3 /*
4   This is a simple test to measure the latency of MPI communication.
5   The test is run with two processes.  The first process sends a message
6   to the second process, and after having received the message, the second
  process sends a message back to the first process.  This is repeated
  a number of times.  The latency is defined as half the round-trip time.
9 
10   It mimics osu_latency from the OSU microbenchmarks (https://mvapich.cse.ohio-state.edu/benchmarks/).
11 
  Usage: mpirun -n 2 ./ex20 -mtype <type>
13   Other arguments have a default value that is also used in osu_latency.
14 
15   Examples:
16 
17   On Summit at OLCF:
    jsrun --smpiargs "-gpu" -n 2 -a 1 -c 7 -g 1 -r 2 -l GPU-GPU -d packed -b packed:7 ./ex20  -mtype cuda
19 
20   On Crusher at OLCF:
    srun -n2 -c32 --cpu-bind=map_cpu:0,1 --gpus-per-node=8 --gpu-bind=map_gpu:0,1 ./ex20 -mtype hip
22 */
23 
24 #include <petscsf.h>
25 #include <petscdevice.h>
26 #if defined(PETSC_HAVE_UNISTD_H)
27 #include <unistd.h>
28 #endif
29 
/* Same values as OSU microbenchmarks */
#define LAT_LOOP_SMALL 10000    /* timed iterations for messages <= LARGE_MESSAGE_SIZE bytes */
#define LAT_SKIP_SMALL 100      /* warm-up iterations excluded from timing (small messages) */
#define LAT_LOOP_LARGE 1000     /* timed iterations for messages > LARGE_MESSAGE_SIZE bytes */
#define LAT_SKIP_LARGE 10       /* warm-up iterations excluded from timing (large messages) */
#define LARGE_MESSAGE_SIZE 8192 /* threshold in bytes separating the small/large regimes */
36 
37 static inline PetscErrorCode PetscMallocWithMemType(PetscMemType mtype,size_t size,void** ptr)
38 {
39   PetscErrorCode ierr;
40 
41   PetscFunctionBegin;
42   if (PetscMemTypeHost(mtype)) {
43     #if defined(PETSC_HAVE_GETPAGESIZE)
44       PetscCall(posix_memalign(ptr,getpagesize(),size));
45     #else
46       PetscCall(PetscMalloc(size,ptr));
47     #endif
48   }
49 #if defined(PETSC_HAVE_CUDA)
50   else if (PetscMemTypeCUDA(mtype)) PetscCallCUDA(cudaMalloc(ptr,size));
51 #elif defined(PETSC_HAVE_HIP)
52   else if (PetscMemTypeHIP(mtype))  PetscCallHIP(hipMalloc(ptr,size));
53 #endif
54   PetscFunctionReturn(0);
55 }
56 
57 static inline PetscErrorCode PetscFreeWithMemType_Private(PetscMemType mtype,void* ptr)
58 {
59   PetscFunctionBegin;
60   if (PetscMemTypeHost(mtype)) {free(ptr);}
61 #if defined(PETSC_HAVE_CUDA)
62   else if (PetscMemTypeCUDA(mtype)) PetscCallCUDA(cudaFree(ptr));
63 #elif defined(PETSC_HAVE_HIP)
64   else if (PetscMemTypeHIP(mtype))  PetscCallHIP(hipFree(ptr));
65 #endif
66   PetscFunctionReturn(0);
67 }
68 
/* Free p (if non-NULL) with the memory type t via PetscFreeWithMemType_Private(), and set p
   to NULL on success; evaluates to a nonzero error code when the underlying free fails */
#define PetscFreeWithMemType(t,p) ((p) && (PetscFreeWithMemType_Private((t),(p)) || ((p)=NULL,0)))
71 
72 static inline PetscErrorCode PetscMemcpyFromHostWithMemType(PetscMemType mtype,void* dst, const void *src, size_t n)
73 {
74   PetscErrorCode ierr;
75 
76   PetscFunctionBegin;
77   if (PetscMemTypeHost(mtype)) PetscCall(PetscMemcpy(dst,src,n));
78 #if defined(PETSC_HAVE_CUDA)
79   else if (PetscMemTypeCUDA(mtype)) PetscCallCUDA(cudaMemcpy(dst,src,n,cudaMemcpyHostToDevice));
80 #elif defined(PETSC_HAVE_HIP)
81   else if (PetscMemTypeHIP(mtype))  PetscCallHIP(hipMemcpy(dst,src,n,hipMemcpyHostToDevice));
82 #endif
83   PetscFunctionReturn(0);
84 }
85 
86 int main(int argc,char **argv)
87 {
88   PetscErrorCode    ierr;
89   PetscSF           sf[64];
90   PetscLogDouble    t_start=0,t_end=0,time[64];
91   PetscInt          i,j,n,nroots,nleaves,niter=100,nskip=10;
92   PetscInt          maxn=512*1024; /* max 4M bytes messages */
93   PetscSFNode       *iremote;
94   PetscMPIInt       rank,size;
95   PetscScalar       *rootdata=NULL,*leafdata=NULL,*pbuf,*ebuf;
96   size_t            msgsize;
97   PetscMemType      mtype = PETSC_MEMTYPE_HOST;
98   char              mstring[16]={0};
99   PetscBool         isCuda,isHip,isHost,set;
100   PetscInt          skipSmall=-1,loopSmall=-1;
101   MPI_Op            op = MPI_REPLACE;
102 
103   PetscCall(PetscInitialize(&argc,&argv,NULL,help));
104   /* Must init the device first if one wants to call PetscGetMemType() without creating PETSc device objects */
105 #if defined(PETSC_HAVE_CUDA)
106   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
107 #elif defined(PETSC_HAVE_HIP)
108   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_HIP));
109 #endif
110   PetscCallMPI(MPI_Comm_size(PETSC_COMM_WORLD,&size));
111   PetscCallMPI(MPI_Comm_rank(PETSC_COMM_WORLD,&rank));
112   PetscCheck(size == 2,PETSC_COMM_WORLD,PETSC_ERR_WRONG_MPI_SIZE,"Must run with 2 processes");
113 
114   PetscCall(PetscOptionsGetInt(NULL,NULL,"-maxn",&maxn,NULL)); /* maxn PetscScalars */
115   PetscCall(PetscOptionsGetInt(NULL,NULL,"-skipSmall",&skipSmall,NULL));
116   PetscCall(PetscOptionsGetInt(NULL,NULL,"-loopSmall",&loopSmall,NULL));
117 
118   PetscCall(PetscMalloc1(maxn,&iremote));
119   PetscCall(PetscOptionsGetString(NULL,NULL,"-mtype",mstring,16,&set));
120   if (set) {
121     PetscCall(PetscStrcasecmp(mstring,"cuda",&isCuda));
122     PetscCall(PetscStrcasecmp(mstring,"hip",&isHip));
123     PetscCall(PetscStrcasecmp(mstring,"host",&isHost));
124 
125     if (isHost) mtype = PETSC_MEMTYPE_HOST;
126     else if (isCuda) mtype = PETSC_MEMTYPE_CUDA;
127     else if (isHip) mtype = PETSC_MEMTYPE_HIP;
128     else SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_ARG_WRONG,"Unknown memory type: %s",mstring);
129   }
130 
131   PetscCall(PetscMallocWithMemType(mtype,sizeof(PetscScalar)*maxn,(void**)&rootdata));
132   PetscCall(PetscMallocWithMemType(mtype,sizeof(PetscScalar)*maxn,(void**)&leafdata));
133 
134   PetscCall(PetscMalloc2(maxn,&pbuf,maxn,&ebuf));
135   for (i=0; i<maxn; i++) {
136     pbuf[i] = 123.0;
137     ebuf[i] = 456.0;
138   }
139 
140   for (n=1,i=0; n<=maxn; n*=2,i++) {
141     PetscCall(PetscSFCreate(PETSC_COMM_WORLD,&sf[i]));
142     PetscCall(PetscSFSetFromOptions(sf[i]));
143     if (!rank) {
144       nroots  = n;
145       nleaves = 0;
146     } else {
147       nroots  = 0;
148       nleaves = n;
149       for (j=0; j<nleaves; j++) {
150         iremote[j].rank  = 0;
151         iremote[j].index = j;
152       }
153     }
154     PetscCall(PetscSFSetGraph(sf[i],nroots,nleaves,NULL,PETSC_COPY_VALUES,iremote,PETSC_COPY_VALUES));
155     PetscCall(PetscSFSetUp(sf[i]));
156   }
157 
158   if (loopSmall > 0) {
159     nskip = skipSmall;
160     niter = loopSmall;
161   } else {
162     nskip = LAT_SKIP_SMALL;
163     niter = LAT_LOOP_SMALL;
164   }
165 
166   for (n=1,j=0; n<=maxn; n*=2,j++) {
167     msgsize = sizeof(PetscScalar)*n;
168     PetscCall(PetscMemcpyFromHostWithMemType(mtype,rootdata,pbuf,msgsize));
169     PetscCall(PetscMemcpyFromHostWithMemType(mtype,leafdata,ebuf,msgsize));
170 
171     if (msgsize > LARGE_MESSAGE_SIZE) {
172       nskip = LAT_SKIP_LARGE;
173       niter = LAT_LOOP_LARGE;
174     }
175     PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
176 
177     for (i=0; i<niter + nskip; i++) {
178       if (i == nskip) {
179        #if defined(PETSC_HAVE_CUDA)
180         PetscCallCUDA(cudaDeviceSynchronize());
181        #elif defined(PETSC_HAVE_HIP)
182         PetscCallHIP(hipDeviceSynchronize());
183        #endif
184         PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD));
185         t_start = MPI_Wtime();
186       }
187       PetscCall(PetscSFBcastWithMemTypeBegin(sf[j],MPIU_SCALAR,mtype,rootdata,mtype,leafdata,op));
188       PetscCall(PetscSFBcastEnd(sf[j],MPIU_SCALAR,rootdata,leafdata,op));
189       PetscCall(PetscSFReduceWithMemTypeBegin(sf[j],MPIU_SCALAR,mtype,leafdata,mtype,rootdata,op));
190       PetscCall(PetscSFReduceEnd(sf[j],MPIU_SCALAR,leafdata,rootdata,op));
191     }
192    #if defined(PETSC_HAVE_CUDA)
193     PetscCallCUDA(cudaDeviceSynchronize());
194    #elif defined(PETSC_HAVE_HIP)
195     PetscCallHIP(hipDeviceSynchronize());
196    #endif
197     PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD));
198     t_end   = MPI_Wtime();
199     time[j] = (t_end - t_start)*1e6 / (niter*2);
200   }
201 
202   PetscCall(PetscPrintf(PETSC_COMM_WORLD,"\t##  PetscSF Ping-pong test on %s ##\n  Message(Bytes) \t\tLatency(us)\n", mtype==PETSC_MEMTYPE_HOST? "Host" : "Device"));
203   for (n=1,j=0; n<=maxn; n*=2,j++) {
204     PetscCall(PetscSFDestroy(&sf[j]));
205     PetscCall(PetscPrintf(PETSC_COMM_WORLD,"%16D \t %16.4f\n",sizeof(PetscScalar)*n,time[j]));
206   }
207 
208   PetscCall(PetscFree2(pbuf,ebuf));
209   PetscCall(PetscFreeWithMemType(mtype,rootdata));
210   PetscCall(PetscFreeWithMemType(mtype,leafdata));
211   PetscCall(PetscFree(iremote));
212   PetscCall(PetscFinalize());
213   return 0;
214 }
215 
216 /**TEST
217   testset:
218     # use small numbers to make the test cheap
219     args: -maxn 4 -skipSmall 1 -loopSmall 1
220     filter: grep "DOES_NOT_EXIST"
221     output_file: output/empty.out
222     nsize: 2
223 
224     test:
225       args: -mtype host
226 
227     test:
228       requires: cuda
229       args: -mtype cuda
230 
231     test:
232       requires: hip
233       args: -mtype hip
234 TEST**/
235