xref: /petsc/src/benchmarks/streams/OpenMPVersion.c (revision 030f984af8d8bb4c203755d35bded3c05b3d83ce)
1 /*-----------------------------------------------------------------------*/
2 /* Program: Stream                                                       */
3 /* Revision: $Id: stream.c,v 5.9 2009/04/11 16:35:00 mccalpin Exp mccalpin $ */
4 /* Original code developed by John D. McCalpin                           */
5 /* Programmers: John D. McCalpin                                         */
6 /*              Joe R. Zagar                                             */
7 /*                                                                       */
8 /* This program measures memory transfer rates in MB/s for simple        */
9 /* computational kernels coded in C.                                     */
10 /*-----------------------------------------------------------------------*/
11 /* Copyright 1991-2005: John D. McCalpin                                 */
12 /*-----------------------------------------------------------------------*/
13 /* License:                                                              */
14 /*  1. You are free to use this program and/or to redistribute           */
15 /*     this program.                                                     */
16 /*  2. You are free to modify this program for your own use,             */
17 /*     including commercial use, subject to the publication              */
18 /*     restrictions in item 3.                                           */
19 /*  3. You are free to publish results obtained from running this        */
20 /*     program, or from works that you derive from this program,         */
21 /*     with the following limitations:                                   */
22 /*     3a. In order to be referred to as "STREAM benchmark results",     */
23 /*         published results must be in conformance to the STREAM        */
24 /*         Run Rules, (briefly reviewed below) published at              */
25 /*         http://www.cs.virginia.edu/stream/ref.html                    */
26 /*         and incorporated herein by reference.                         */
27 /*         As the copyright holder, John McCalpin retains the            */
28 /*         right to determine conformity with the Run Rules.             */
29 /*     3b. Results based on modified source code or on runs not in       */
30 /*         accordance with the STREAM Run Rules must be clearly          */
31 /*         labelled whenever they are published.  Examples of            */
32 /*         proper labelling include:                                     */
33 /*         "tuned STREAM benchmark results"                              */
34 /*         "based on a variant of the STREAM benchmark code"             */
35 /*         Other comparable, clear and reasonable labelling is           */
36 /*         acceptable.                                                   */
37 /*     3c. Submission of results to the STREAM benchmark web site        */
38 /*         is encouraged, but not required.                              */
39 /*  4. Use of this program or creation of derived works based on this    */
40 /*     program constitutes acceptance of these licensing restrictions.   */
41 /*  5. Absolutely no warranty is expressed or implied.                   */
42 /*-----------------------------------------------------------------------*/
43 # include <stdio.h>
44 # include <math.h>
45 # include <limits.h>
46 # include <float.h>
47 # include <sys/time.h>
48 #include <stdlib.h>
49 
50 /* INSTRUCTIONS:
51  *
52  *      1) Stream requires a good bit of memory to run.  Adjust the
53  *          value of 'N' (below) to give a 'timing calibration' of
54  *          at least 20 clock-ticks.  This will provide rate estimates
55  *          that should be good to about 5% precision.
56  */
57 
/*
 * Compile-time configuration.  Each value can be overridden on the compiler
 * command line (e.g. -DN=8000000) because it is only defined when not
 * already set.
 */

/* N: number of elements initialized and processed in each of the arrays a, b and c. */
#if !defined(N)
#   define N    2000000
#endif
/* NTIMES: number of times the timed kernel is repeated. */
#if !defined(NTIMES)
#   define NTIMES       50
#endif
/* OFFSET: extra elements added to the declared length of each array (N+OFFSET). */
#if !defined(OFFSET)
#   define OFFSET       0
#endif
67 
68 /*
69  *      3) Compile the code with full optimization.  Many compilers
70  *         generate unreasonably bad code before the optimizer tightens
71  *         things up.  If the results are unreasonably good, on the
72  *         other hand, the optimizer might be too smart for me!
73  *
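 *         Try compiling with:
 *               cc -O stream_omp.c -o stream_omp
 *         (add your compiler's OpenMP flag, e.g. -fopenmp for GCC or Clang;
 *          without it the "#pragma omp" directives are ignored and the
 *          loops run on a single thread)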
76  *
77  *         This is known to work on Cray, SGI, IBM, and Sun machines.
78  *
79  *
80  *      4) Mail the results to mccalpin@cs.virginia.edu
81  *         Be sure to include:
82  *              a) computer hardware model number and software revision
83  *              b) the compiler flags
84  *              c) all of the output from the test case.
85  * Thanks!
86  *
87  */
88 
/* Horizontal rule used to separate sections of the printed report. */
# define HLINE "-------------------------------------------------------------\n"

/*
 * MIN/MAX yield the smaller/larger of their two arguments.  They are only
 * defined here when the platform headers have not already provided them.
 * The selected argument is evaluated twice, so do not pass expressions
 * with side effects.
 */
# if !defined(MIN)
# define MIN(x,y) ((x)<(y) ? (x) : (y))
# endif
# if !defined(MAX)
# define MAX(x,y) ((x)>(y) ? (x) : (y))
# endif
97 
98 static double a[N+OFFSET],
99               b[N+OFFSET],
100               c[N+OFFSET];
101 
102 static double avgtime[4] = {0}, maxtime[4] = {0},
103               mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
104 
105 static double bytes[1] = {
106   3 * sizeof(double) * N
107 };
108 
109 extern double mysecond();
110 extern int omp_get_num_threads();
111 int main()
112 {
113   int          quantum, checktick();
114   register int j, k;
115   double       scalar, t, times[4][NTIMES],rate;
116   int          size;
117   char         *env;
118   FILE         *fd;
119 
120   env = getenv("OMP_NUM_THREADS");
121   sscanf(env,"%d",&size);
122   /* --- SETUP --- determine precision and check timing --- */
123 
124   /*printf(HLINE);
125   printf("STREAM version $Revision: 5.9 $\n");
126    printf(HLINE); */
127   /*    printf("This system uses %d bytes per DOUBLE PRECISION word.\n",
128    BytesPerWord);
129 
130    printf(HLINE);
131 #if defined(NO_LONG_LONG)
132   printf("Array size = %d, Offset = %d\n" , N, OFFSET);
133 #else
134   printf("Array size = %llu, Offset = %d\n", (unsigned long long) N, OFFSET);
135 #endif
136 
137   printf("Total memory required = %.1f MB.\n",
138       (3.0 * BytesPerWord) * ((double) N / 1048576.0));
139   printf("Each test is run %d times, but only\n", NTIMES);
140   printf("the *best* time for each is used.\n");
141 
142    printf(HLINE); */
143 
144   /* Get initial value for system clock. */
145 #pragma omp parallel for
146   for (j=0; j<N; j++) {
147     a[j] = 1.0;
148     b[j] = 2.0;
149     c[j] = 0.0;
150   }
151 
152   /*printf(HLINE);*/
153 
154   if  ((quantum = checktick()) >= 1) ; /*  printf("Your clock granularity/precision appears to be "
155         "%d microseconds.\n", quantum);*/
156   else {
157     ;  /*  printf("Your clock granularity appears to be "
158         "less than one microsecond.\n");*/
159     quantum = 1;
160   }
161 
162   t = mysecond();
163 #pragma omp parallel for
164   for (j = 0; j < N; j++) a[j] = 2.0E0 * a[j];
165   t = 1.0E6 * (mysecond() - t);
166 
167   /*printf("Each test below will take on the order"
168       " of %d microseconds.\n", (int) t);
169   printf("   (= %d clock ticks)\n", (int) (t/quantum));
170   printf("Increase the size of the arrays if this shows that\n");
171   printf("you are not getting at least 20 clock ticks per test.\n");
172 
173    printf(HLINE);*/
174 
175   /*  --- MAIN LOOP --- repeat test cases NTIMES times --- */
176 
177   scalar = 3.0;
178   for (k=0; k<NTIMES; k++)
179   {
180     times[0][k] = mysecond();
181 #pragma omp parallel for
182     for (j=0; j<N; j++) a[j] = b[j]+scalar*c[j];
183     times[0][k] = mysecond() - times[0][k];
184   }
185 
186   /*  --- SUMMARY --- */
187 
188   for (k=1; k<NTIMES; k++) {  /* note -- skip first iteration */
189     for (j=0; j<1; j++)
190     {
191       avgtime[j] = avgtime[j] + times[j][k];
192       mintime[j] = MIN(mintime[j], times[j][k]);
193       maxtime[j] = MAX(maxtime[j], times[j][k]);
194     }
195   }
196 
197   rate = 1.0E-06 * bytes[0]/mintime[0];
198 
199   if (size == 1) {
200     printf("%d %11.4f   Rate (MB/s)\n",size, rate);
201     fd = fopen("flops","w");
202     fprintf(fd,"%g\n",rate);
203     fclose(fd);
204   } else {
205     double prate;
206     fd = fopen("flops","r");
207     fscanf(fd,"%lg",&prate);
208     fclose(fd);
209     printf("%d %11.4f   Rate (MB/s) %g \n", size, rate,rate/prate);
210   }
211 
212   return 0;
213 }
214 
/* Number of timestamps sampled by checktick(). */
# define        M        20

/*
 * Estimates the timer granularity in microseconds.
 *
 * Collects M timestamps from mysecond(), each taken at least one
 * microsecond after the sample that started its wait, and returns the
 * smallest gap between consecutive timestamps.  Negative gaps count as 0
 * and the result never exceeds 1000000.
 */
int checktick()
{
  double stamps[M];
  double prev, now;
  int    idx, gap, smallest;

  /* Spin until the clock has visibly advanced, then record the new reading. */
  for (idx = 0; idx < M; idx++) {
    prev = mysecond();
    do {
      now = mysecond();
    } while ((now - prev) < 1.0E-6);
    stamps[idx] = now;
  }

  /* Smallest difference between neighbouring readings, in microseconds. */
  smallest = 1000000;
  for (idx = 1; idx < M; idx++) {
    gap = (int)(1.0E6 * (stamps[idx]-stamps[idx-1]));
    if (gap < 0) gap = 0;
    if (gap < smallest) smallest = gap;
  }

  return smallest;
}
244 
245 /* A gettimeofday routine to give access to the wall
246    clock timer on most UNIX-like systems.  */
247 
248 #include <sys/time.h>
249 
/*
 * Returns the current wall-clock time in seconds, with microsecond
 * resolution, as reported by gettimeofday().  Returns 0.0 if the call
 * fails.
 *
 * The timezone argument is passed as NULL: POSIX leaves the behaviour for
 * a non-null pointer unspecified, and struct timezone is not available
 * when compiling in strict ISO C mode.
 */
double mysecond(void)
{
  struct timeval tp;

  if (gettimeofday(&tp, NULL) != 0) return 0.0;
  return ((double) tp.tv_sec + (double) tp.tv_usec * 1.e-6);
}
258 
259