xref: /petsc/src/benchmarks/streams/MPIVersion.c (revision abbcd2d45932c6dbad0ca42a290b3a741dfdc5ac) !
1 
2 # include <stdio.h>
3 # include <math.h>
4 # include <limits.h>
5 # include <float.h>
6 #include <petscsys.h>
7 
8 /*
9   Program: Stream
10   Programmer: Joe R. Zagar
11   Revision: 4.0-BETA, October 24, 1995
12   Original code developed by John D. McCalpin
13 
14   This program measures memory transfer rates in MB/s for simple
15   computational kernels coded in C.  These numbers reveal the quality
16   of code generation for simple uncacheable kernels as well as showing
17   the cost of floating-point operations relative to memory accesses.
18 
19   INSTRUCTIONS:
20 
21         1) Stream requires a good bit of memory to run.  Adjust the
22            value of 'N' (below) to give a 'timing calibration' of
23            at least 20 clock-ticks.  This will provide rate estimates
24            that should be good to about 5% precision.
25 */
26 
/* Number of double-precision elements in each benchmark array. */
#define N      2000000
/* How many times the set of kernels is repeated. */
#define NTIMES 50
/* Extra elements added to the declared length of each benchmark array. */
#define OFFSET 0
30 
31 /*
32        3) Compile the code with full optimization.  Many compilers
33           generate unreasonably bad code before the optimizer tightens
34           things up.  If the results are unreasonably good, on the
35           other hand, the optimizer might be too smart for me!
36 
37           Try compiling with:
38                 cc -O stream_d.c second.c -o stream_d -lm
39 
40           This is known to work on Cray, SGI, IBM, and Sun machines.
41 
42 
43        4) Mail the results to mccalpin@cs.virginia.edu
44           Be sure to include:
45                a) computer hardware model number and software revision
46                b) the compiler flags
47                c) all of the output from the test case.
48   Thanks!
49 
50  */
51 
52 # define HLINE "-------------------------------------------------------------\n"
53 
54 # ifndef MIN
55 # define MIN(x,y) ((x)<(y) ? (x) : (y))
56 # endif
57 # ifndef MAX
58 # define MAX(x,y) ((x)>(y) ? (x) : (y))
59 # endif
60 
61 static double a[N+OFFSET],
62               b[N+OFFSET],
63               c[N+OFFSET];
64 /*double *a,*b,*c;*/
65 
66 static double mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
67 
68 static double bytes[4] = {
69   2 * sizeof(double) * N,
70   2 * sizeof(double) * N,
71   3 * sizeof(double) * N,
72   3 * sizeof(double) * N
73 };
74 
75 int main(int argc,char **args)
76 {
77   int            quantum, checktick(void);
78   register int   j, k;
79   double         scalar, t, times[4][NTIMES],irate[4],rate[4];
80   int            rank,size,resultlen;
81   char           hostname[MPI_MAX_PROCESSOR_NAME];
82   MPI_Status     status;
83   int            ierr;
84   FILE           *fd;
85 
86   ierr = PetscInitialize(&argc,&args,NULL,NULL);if (ierr) return ierr;
87   ierr = MPI_Comm_rank(MPI_COMM_WORLD,&rank);if (ierr) return ierr;
88   ierr = MPI_Comm_size(MPI_COMM_WORLD,&size);if (ierr) return ierr;
89 
90   for (j=0; j<MPI_MAX_PROCESSOR_NAME; j++) {
91     hostname[j] = 0;
92   }
93   ierr = MPI_Get_processor_name(hostname,&resultlen);if (ierr) return ierr;
94   if (!rank) {
95     for (j=1; j<size; j++) {
96       ierr = MPI_Recv(hostname,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,j,0,MPI_COMM_WORLD,&status);if (ierr) return ierr;
97     }
98  } else {
99    ierr = MPI_Send(hostname,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,0,0,MPI_COMM_WORLD);if (ierr) return ierr;
100  }
101  ierr = MPI_Barrier(MPI_COMM_WORLD);
102 
103   /* --- SETUP --- determine precision and check timing --- */
104 
105   if (!rank) {
106     /*printf(HLINE);
107     printf("Array size = %d, Offset = %d\n" , N, OFFSET);
108     printf("Total memory required = %.1f MB.\n", (3 * N * BytesPerWord) / 1048576.0);
109     printf("Each test is run %d times, but only\n", NTIMES);
110     printf("the *best* time for each is used.\n");
111     printf(HLINE); */
112   }
113 
114   /* Get initial value for system clock. */
115 
116   /*  a = malloc(N*sizeof(double));
117   b = malloc(N*sizeof(double));
118   c = malloc(N*sizeof(double));*/
119   for (j=0; j<N; j++) {
120     a[j] = 1.0;
121     b[j] = 2.0;
122     c[j] = 0.0;
123   }
124 
125   if (!rank) {
126     if  ((quantum = checktick()) >= 1) ; /* printf("Your clock granularity/precision appears to be %d microseconds.\n", quantum); */
127     else ; /* printf("Your clock granularity appears to be less than one microsecond.\n");*/
128   }
129 
130   t = MPI_Wtime();
131   for (j = 0; j < N; j++) a[j] = 2.0E0 * a[j];
132   t = 1.0E6 * (MPI_Wtime() - t);
133 
134   if (!rank) {
135     /*  printf("Each test below will take on the order of %d microseconds.\n", (int) t);
136     printf("   (= %d clock ticks)\n", (int) (t/quantum));
137     printf("Increase the size of the arrays if this shows that\n");
138     printf("you are not getting at least 20 clock ticks per test.\n");
139     printf(HLINE);*/
140   }
141 
142   /*   --- MAIN LOOP --- repeat test cases NTIMES times --- */
143 
144   scalar = 3.0;
145   for (k=0; k<NTIMES; k++)
146   {
147     ierr = MPI_Barrier(MPI_COMM_WORLD);
148     times[0][k] = MPI_Wtime();
149     /* should all these barriers be pulled outside of the time call? */
150     ierr = MPI_Barrier(MPI_COMM_WORLD);
151     for (j=0; j<N; j++) c[j] = a[j];
152     ierr = MPI_Barrier(MPI_COMM_WORLD);
153     times[0][k] = MPI_Wtime() - times[0][k];
154 
155     times[1][k] = MPI_Wtime();
156     ierr = MPI_Barrier(MPI_COMM_WORLD);
157     for (j=0; j<N; j++) b[j] = scalar*c[j];
158     ierr = MPI_Barrier(MPI_COMM_WORLD);
159     times[1][k] = MPI_Wtime() - times[1][k];
160 
161     times[2][k] = MPI_Wtime();
162     ierr = MPI_Barrier(MPI_COMM_WORLD);
163     for (j=0; j<N; j++) c[j] = a[j]+b[j];
164     ierr = MPI_Barrier(MPI_COMM_WORLD);
165     times[2][k] = MPI_Wtime() - times[2][k];
166 
167     times[3][k] = MPI_Wtime();
168     ierr = MPI_Barrier(MPI_COMM_WORLD);
169     for (j=0; j<N; j++) a[j] = b[j]+scalar*c[j];
170     ierr = MPI_Barrier(MPI_COMM_WORLD);
171     times[3][k] = MPI_Wtime() - times[3][k];
172   }
173 
174   /*   --- SUMMARY --- */
175 
176   for (k=0; k<NTIMES; k++)
177     for (j=0; j<4; j++) mintime[j] = MIN(mintime[j], times[j][k]);
178 
179   for (j=0; j<4; j++) irate[j] = 1.0E-06 * bytes[j]/mintime[j];
180   ierr = MPI_Reduce(irate,rate,4,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);
181   if (ierr) printf("Error calling MPI\n");
182 
183   if (!rank) {
184     if (size == 1) {
185       printf("%d %11.4f   Rate (MB/s)\n",size, rate[3]);
186       fd = fopen("flops","w");
187       fprintf(fd,"%g\n",rate[3]);
188       fclose(fd);
189     } else {
190       double prate;
191       fd = fopen("flops","r");
192       fscanf(fd,"%lg",&prate);
193       fclose(fd);
194       printf("%d %11.4f   Rate (MB/s) %g \n", size, rate[3],rate[3]/prate);
195     }
196   }
197   PetscFinalize();
198   return 0;
199 }
200 
/* Number of clock samples collected by checktick(). */
#define M 20

/*
  checktick - estimate the granularity of MPI_Wtime().

  Takes M clock readings, each at least one microsecond later than the
  reading that started its wait, and returns the smallest gap between
  consecutive readings, truncated to whole microseconds.  Negative gaps
  count as 0 and the result never exceeds 1000000.
*/
int checktick(void)
{
  double samples[M];
  int    idx, smallest;

  /* Collect M time values, each one taken after the clock has advanced. */
  for (idx = 0; idx < M; idx++) {
    const double start = MPI_Wtime();
    double       now;

    do {
      now = MPI_Wtime();
    } while ((now - start) < 1.0E-6);
    samples[idx] = now;
  }

  /* Smallest non-negative difference between neighbouring samples. */
  smallest = 1000000;
  for (idx = 1; idx < M; idx++) {
    int gap = (int)(1.0E6 * (samples[idx] - samples[idx-1]));

    if (gap < 0) gap = 0;
    if (gap < smallest) smallest = gap;
  }

  return smallest;
}
230 
231