xref: /petsc/src/benchmarks/streams/MPIVersion.c (revision a69119a591a03a9d906b29c0a4e9802e4d7c9795)
1 #include <stdio.h>
2 #include <math.h>
3 #include <limits.h>
4 #include <float.h>
5 #include <petscsys.h>
6 
7 /*
8   Program: Stream
9   Programmer: Joe R. Zagar
10   Revision: 4.0-BETA, October 24, 1995
11   Original code developed by John D. McCalpin
12 
13   This program measures memory transfer rates in MB/s for simple
14   computational kernels coded in C.  These numbers reveal the quality
15   of code generation for simple uncacheable kernels as well as showing
16   the cost of floating-point operations relative to memory accesses.
17 
18   INSTRUCTIONS:
19 
20         1) Stream requires a good bit of memory to run.  Adjust the
21            value of 'N' (below) to give a 'timing calibration' of
22            at least 20 clock-ticks.  This will provide rate estimates
23            that should be good to about 5% precision.
24 */
25 
#define N      2000000 /* doubles per work array; enlarge until each kernel spans >= 20 clock ticks (see instructions above) */
#define M      20      /* number of clock samples collected by checktick() */
#define NTIMES 50      /* repetitions of each kernel; only the best (minimum) time is kept */
#define OFFSET 0       /* extra padding elements appended to each array */
30 
31 /*
32        3) Compile the code with full optimization.  Many compilers
33           generate unreasonably bad code before the optimizer tightens
34           things up.  If the results are unreasonably good, on the
35           other hand, the optimizer might be too smart for me!
36 
37           Try compiling with:
38                 cc -O stream_d.c second.c -o stream_d -lm
39 
40           This is known to work on Cray, SGI, IBM, and Sun machines.
41 
42        4) Mail the results to mccalpin@cs.virginia.edu
43           Be sure to include:
44                a) computer hardware model number and software revision
45                b) the compiler flags
46                c) all of the output from the test case.
47   Thanks!
48 
49  */
50 
#define HLINE "-------------------------------------------------------------\n"

/* The three STREAM work arrays used by the four kernels:
   copy c=a, scale b=s*c, add c=a+b, triad a=b+s*c. */
static double a[N+OFFSET],b[N+OFFSET],c[N+OFFSET];

/* Best (smallest) time observed for each kernel; seeded with FLT_MAX so the
   first measurement always replaces the sentinel. */
static double mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
56 
57 static int checktick(void)
58 {
59   int    minDelta = 1000000;
60   double timesfound[M];
61 
62   /* Collect a sequence of M unique time values from the system. */
63 
64   for (int i = 0; i < M; ++i) {
65     const double t1 = MPI_Wtime();
66 
67     while (((timesfound[i] = MPI_Wtime()) - t1) < 1.0E-6) ;
68   }
69 
70   /*
71     Determine the minimum difference between these M values.
72     This result will be our estimate (in microseconds) for the
73     clock granularity.
74   */
75 
76   for (int i = 1; i < M; ++i) {
77     int Delta = (int)(1.0E6*(timesfound[i]-timesfound[i-1]));
78 
79     minDelta  = PetscMin(minDelta,PetscMax(Delta,0));
80   }
81   return minDelta;
82 }
83 
/* Bytes moved per kernel pass over all N elements: copy and scale read one
   array and write another (2x), add and triad read two and write one (3x). */
static double bytes[4] = {
  2 * sizeof(double) * N,
  2 * sizeof(double) * N,
  3 * sizeof(double) * N,
  3 * sizeof(double) * N
};
90 
91 int main(int argc,char **args)
92 {
93   const double scalar = 3.0;
94   double       t,times[4][NTIMES],irate[4],rate[4];
95   PetscMPIInt  rank,size,resultlen;
96   char         hostname[MPI_MAX_PROCESSOR_NAME] = {0};
97 
98   PetscCall(PetscInitialize(&argc,&args,NULL,NULL));
99   PetscCallMPI(MPI_Comm_rank(MPI_COMM_WORLD,&rank));
100   PetscCallMPI(MPI_Comm_size(MPI_COMM_WORLD,&size));
101 
102   PetscCallMPI(MPI_Get_processor_name(hostname,&resultlen));(void)resultlen;
103   if (rank) PetscCallMPI(MPI_Send(hostname,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,0,0,MPI_COMM_WORLD));
104   else {
105     for (int j = 1; j < size; ++j) {
106       PetscCallMPI(MPI_Recv(hostname,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,j,0,MPI_COMM_WORLD,MPI_STATUS_IGNORE));
107     }
108   }
109   PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
110 
111   /* --- SETUP --- determine precision and check timing --- */
112 
113   if (rank == 0) {
114     /*
115       printf(HLINE);
116       printf("Array size = %d, Offset = %d\n" , N, OFFSET);
117       printf("Total memory required = %.1f MB.\n", (3 * N * BytesPerWord) / 1048576.0);
118       printf("Each test is run %d times, but only\n", NTIMES);
119       printf("the *best* time for each is used.\n");
120       printf(HLINE);
121     */
122   }
123 
124   /* Get initial value for system clock. */
125   for (int j = 0; j < N; ++j) {
126     a[j] = 1.0;
127     b[j] = 2.0;
128     c[j] = 0.0;
129   }
130 
131   if (rank == 0) {
132     int quantum;
133     if  ((quantum = checktick()) >= 1) { } /* printf("Your clock granularity/precision appears to be %d microseconds.\n", quantum); */
134     else { } /* printf("Your clock granularity appears to be less than one microsecond.\n");*/
135   }
136 
137   t = MPI_Wtime();
138   for (int j = 0; j < N; ++j) a[j] *= 2.0;
139   t = 1.0E6 * (MPI_Wtime() - t);
140 
141   if (rank == 0) {
142     /*
143       printf("Each test below will take on the order of %d microseconds.\n", (int) t);
144       printf("   (= %d clock ticks)\n", (int) (t/quantum));
145       printf("Increase the size of the arrays if this shows that\n");
146       printf("you are not getting at least 20 clock ticks per test.\n");
147       printf(HLINE);
148     */
149   }
150 
151   /*   --- MAIN LOOP --- repeat test cases NTIMES times --- */
152 
153   for (int k = 0; k < NTIMES; ++k) {
154     PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
155     times[0][k] = MPI_Wtime();
156     /* should all these barriers be pulled outside of the time call? */
157     PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
158     PetscCall(PetscArraycpy(c,a,N));
159     PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
160     times[0][k] = MPI_Wtime() - times[0][k];
161 
162     times[1][k] = MPI_Wtime();
163     PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
164     for (int j = 0; j < N; ++j) b[j] = scalar*c[j];
165     PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
166     times[1][k] = MPI_Wtime() - times[1][k];
167 
168     times[2][k] = MPI_Wtime();
169     PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
170     for (int j = 0; j < N; ++j) c[j] = a[j]+b[j];
171     PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
172     times[2][k] = MPI_Wtime() - times[2][k];
173 
174     times[3][k] = MPI_Wtime();
175     PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
176     for (int j = 0; j < N; ++j) a[j] = b[j]+scalar*c[j];
177     PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
178     times[3][k] = MPI_Wtime() - times[3][k];
179   }
180 
181   /*   --- SUMMARY --- */
182 
183   for (int k = 0; k < NTIMES; ++k) {
184     for (int j = 0; j < 4; ++j) mintime[j] = PetscMin(mintime[j],times[j][k]);
185   }
186 
187   for (int j = 0; j < 4; ++j) irate[j] = 1.0E-06 * bytes[j]/mintime[j];
188   PetscCallMPI(MPI_Reduce(irate,rate,4,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD));
189 
190   if (rank == 0) {
191     FILE *fd;
192 
193     if (size != 1) {
194       double prate;
195 
196       fd = fopen("flops","r");
197       fscanf(fd,"%lg",&prate);
198       fclose(fd);
199       printf("%d %11.4f   Rate (MB/s) %g \n",size,rate[3],rate[3]/prate);
200     } else {
201       fd = fopen("flops","w");
202       fprintf(fd,"%g\n",rate[3]);
203       fclose(fd);
204       printf("%d %11.4f   Rate (MB/s)\n",size,rate[3]);
205     }
206   }
207   PetscCall(PetscFinalize());
208   return 0;
209 }
210