xref: /petsc/src/benchmarks/streams/MPIVersion.c (revision dfd57a172ac9fa6c7b5fe6de6ab5df85cefc2996)
1 
2 # include <stdio.h>
3 # include <math.h>
4 # include <limits.h>
5 # include <float.h>
6 #include <petscsys.h>
7 
8 /*
9   Program: Stream
10   Programmer: Joe R. Zagar
11   Revision: 4.0-BETA, October 24, 1995
12   Original code developed by John D. McCalpin
13 
14   This program measures memory transfer rates in MB/s for simple
15   computational kernels coded in C.  These numbers reveal the quality
16   of code generation for simple uncacheable kernels as well as showing
17   the cost of floating-point operations relative to memory accesses.
18 
19   INSTRUCTIONS:
20 
21         1) Stream requires a good bit of memory to run.  Adjust the
22            value of 'N' (below) to give a 'timing calibration' of
23            at least 20 clock-ticks.  This will provide rate estimates
24            that should be good to about 5% precision.
25 */
26 
27 # define N      2000000
28 # define NTIMES 50
29 # define OFFSET 0
30 
31 /*
32        3) Compile the code with full optimization.  Many compilers
33           generate unreasonably bad code before the optimizer tightens
34           things up.  If the results are unreasonably good, on the
35           other hand, the optimizer might be too smart for me!
36 
37           Try compiling with:
38                 cc -O stream_d.c second.c -o stream_d -lm
39 
40           This is known to work on Cray, SGI, IBM, and Sun machines.
41 
42        4) Mail the results to mccalpin@cs.virginia.edu
43           Be sure to include:
44                a) computer hardware model number and software revision
45                b) the compiler flags
46                c) all of the output from the test case.
47   Thanks!
48 
49  */
50 
51 # define HLINE "-------------------------------------------------------------\n"
52 
53 # ifndef MIN
54 # define MIN(x,y) ((x)<(y) ? (x) : (y))
55 # endif
56 # ifndef MAX
57 # define MAX(x,y) ((x)>(y) ? (x) : (y))
58 # endif
59 
/* The three benchmark arrays: copy/scale/add/triad kernels stream through
   these.  Statically allocated so no malloc is timed; OFFSET can be used to
   stagger the arrays' alignment relative to each other. */
static double a[N+OFFSET],
              b[N+OFFSET],
              c[N+OFFSET];
/*double *a,*b,*c;*/

/* Best (smallest) observed time per kernel across all NTIMES repetitions;
   seeded with FLT_MAX so any real measurement replaces it. */
static double mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};

/* Bytes moved per kernel iteration over the whole array:
   copy and scale touch two arrays, add and triad touch three. */
static double bytes[4] = {
  2 * sizeof(double) * N,
  2 * sizeof(double) * N,
  3 * sizeof(double) * N,
  3 * sizeof(double) * N
};
73 
74 int main(int argc,char **args)
75 {
76   int            quantum, checktick(void);
77   register int   j, k;
78   double         scalar, t, times[4][NTIMES],irate[4],rate[4];
79   int            rank,size,resultlen;
80   char           hostname[MPI_MAX_PROCESSOR_NAME];
81   MPI_Status     status;
82   int            ierr;
83   FILE           *fd;
84 
85   ierr = PetscInitialize(&argc,&args,NULL,NULL);if (ierr) return ierr;
86   ierr = MPI_Comm_rank(MPI_COMM_WORLD,&rank);if (ierr) return ierr;
87   ierr = MPI_Comm_size(MPI_COMM_WORLD,&size);if (ierr) return ierr;
88 
89   for (j=0; j<MPI_MAX_PROCESSOR_NAME; j++) {
90     hostname[j] = 0;
91   }
92   ierr = MPI_Get_processor_name(hostname,&resultlen);if (ierr) return ierr;
93   if (!rank) {
94     for (j=1; j<size; j++) {
95       ierr = MPI_Recv(hostname,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,j,0,MPI_COMM_WORLD,&status);if (ierr) return ierr;
96     }
97  } else {
98    ierr = MPI_Send(hostname,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,0,0,MPI_COMM_WORLD);if (ierr) return ierr;
99  }
100  ierr = MPI_Barrier(MPI_COMM_WORLD);
101 
102   /* --- SETUP --- determine precision and check timing --- */
103 
104   if (!rank) {
105     /*printf(HLINE);
106     printf("Array size = %d, Offset = %d\n" , N, OFFSET);
107     printf("Total memory required = %.1f MB.\n", (3 * N * BytesPerWord) / 1048576.0);
108     printf("Each test is run %d times, but only\n", NTIMES);
109     printf("the *best* time for each is used.\n");
110     printf(HLINE); */
111   }
112 
113   /* Get initial value for system clock. */
114 
115   /*  a = malloc(N*sizeof(double));
116   b = malloc(N*sizeof(double));
117   c = malloc(N*sizeof(double));*/
118   for (j=0; j<N; j++) {
119     a[j] = 1.0;
120     b[j] = 2.0;
121     c[j] = 0.0;
122   }
123 
124   if (!rank) {
125     if  ((quantum = checktick()) >= 1) ; /* printf("Your clock granularity/precision appears to be %d microseconds.\n", quantum); */
126     else ; /* printf("Your clock granularity appears to be less than one microsecond.\n");*/
127   }
128 
129   t = MPI_Wtime();
130   for (j = 0; j < N; j++) a[j] = 2.0E0 * a[j];
131   t = 1.0E6 * (MPI_Wtime() - t);
132 
133   if (!rank) {
134     /*  printf("Each test below will take on the order of %d microseconds.\n", (int) t);
135     printf("   (= %d clock ticks)\n", (int) (t/quantum));
136     printf("Increase the size of the arrays if this shows that\n");
137     printf("you are not getting at least 20 clock ticks per test.\n");
138     printf(HLINE);*/
139   }
140 
141   /*   --- MAIN LOOP --- repeat test cases NTIMES times --- */
142 
143   scalar = 3.0;
144   for (k=0; k<NTIMES; k++)
145   {
146     ierr = MPI_Barrier(MPI_COMM_WORLD);
147     times[0][k] = MPI_Wtime();
148     /* should all these barriers be pulled outside of the time call? */
149     ierr = MPI_Barrier(MPI_COMM_WORLD);
150     for (j=0; j<N; j++) c[j] = a[j];
151     ierr = MPI_Barrier(MPI_COMM_WORLD);
152     times[0][k] = MPI_Wtime() - times[0][k];
153 
154     times[1][k] = MPI_Wtime();
155     ierr = MPI_Barrier(MPI_COMM_WORLD);
156     for (j=0; j<N; j++) b[j] = scalar*c[j];
157     ierr = MPI_Barrier(MPI_COMM_WORLD);
158     times[1][k] = MPI_Wtime() - times[1][k];
159 
160     times[2][k] = MPI_Wtime();
161     ierr = MPI_Barrier(MPI_COMM_WORLD);
162     for (j=0; j<N; j++) c[j] = a[j]+b[j];
163     ierr = MPI_Barrier(MPI_COMM_WORLD);
164     times[2][k] = MPI_Wtime() - times[2][k];
165 
166     times[3][k] = MPI_Wtime();
167     ierr = MPI_Barrier(MPI_COMM_WORLD);
168     for (j=0; j<N; j++) a[j] = b[j]+scalar*c[j];
169     ierr = MPI_Barrier(MPI_COMM_WORLD);
170     times[3][k] = MPI_Wtime() - times[3][k];
171   }
172 
173   /*   --- SUMMARY --- */
174 
175   for (k=0; k<NTIMES; k++)
176     for (j=0; j<4; j++) mintime[j] = MIN(mintime[j], times[j][k]);
177 
178   for (j=0; j<4; j++) irate[j] = 1.0E-06 * bytes[j]/mintime[j];
179   ierr = MPI_Reduce(irate,rate,4,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);
180   if (ierr) printf("Error calling MPI\n");
181 
182   if (!rank) {
183     if (size == 1) {
184       printf("%d %11.4f   Rate (MB/s)\n",size, rate[3]);
185       fd = fopen("flops","w");
186       fprintf(fd,"%g\n",rate[3]);
187       fclose(fd);
188     } else {
189       double prate;
190       fd = fopen("flops","r");
191       fscanf(fd,"%lg",&prate);
192       fclose(fd);
193       printf("%d %11.4f   Rate (MB/s) %g \n", size, rate[3],rate[3]/prate);
194     }
195   }
196   PetscFinalize();
197   return 0;
198 }
199 
200 # define        M        20
201 
202 int checktick(void)
203 {
204   int    i, minDelta, Delta;
205   double t1, t2, timesfound[M];
206 
207 /*  Collect a sequence of M unique time values from the system. */
208 
209   for (i = 0; i < M; i++) {
210     t1 = MPI_Wtime();
211     while (((t2=MPI_Wtime()) - t1) < 1.0E-6) ;
212     timesfound[i] = t1 = t2;
213   }
214 
215 /*
216   Determine the minimum difference between these M values.
217   This result will be our estimate (in microseconds) for the
218   clock granularity.
219  */
220 
221   minDelta = 1000000;
222   for (i = 1; i < M; i++) {
223     Delta    = (int)(1.0E6 * (timesfound[i]-timesfound[i-1]));
224     minDelta = MIN(minDelta, MAX(Delta,0));
225   }
226 
227   return(minDelta);
228 }
229 
230