xref: /petsc/src/benchmarks/streams/BasicVersion.c (revision 9d47de495d3c23378050c1b4a410c12a375cb6c6)
#include <stddef.h>
#include <sys/time.h>
/* int gettimeofday(struct timeval *tp, struct timezone *tzp); */
35d28107eSBarry Smith 
/*
  second - returns the current wall-clock time in seconds.

  The value is built from the tv_sec and tv_usec fields filled in by
  gettimeofday(), so it carries microsecond resolution.  Callers take the
  difference of two readings to obtain an elapsed time.

  Returns 0.0 if gettimeofday() reports failure.
*/
double second(void)
{
  struct timeval tp = {0, 0};

  /* The timezone argument is obsolete; a null pointer asks for the time only. */
  if (gettimeofday(&tp, NULL) != 0) return 0.0;
  return (double)tp.tv_sec + (double)tp.tv_usec * 1.e-6;
}
195d28107eSBarry Smith #include <stdio.h>
205d28107eSBarry Smith #include <math.h>
215d28107eSBarry Smith #include <limits.h>
220d04baf8SBarry Smith #include <float.h>
235d28107eSBarry Smith #include <sys/time.h>
245d28107eSBarry Smith 
/*
  Program: Stream
  Programmer: Joe R. Zagar
  Revision: 4.0-BETA, October 24, 1995
  Original code developed by John D. McCalpin

  This program measures memory transfer rates in MB/s for simple
  computational kernels coded in C.  These numbers reveal the quality
  of code generation for simple uncacheable kernels as well as showing
  the cost of floating-point operations relative to memory accesses.

  INSTRUCTIONS:

        1) Stream requires a good bit of memory to run.  Adjust the
           value of 'N' (below) to give a 'timing calibration' of
           at least 20 clock-ticks.  This will provide rate estimates
           that should be good to about 5% precision.
 */
435d28107eSBarry Smith 
/* N:      number of elements each kernel loop processes per pass. */
/* NTIMES: number of times the set of four kernels is repeated and timed. */
/* OFFSET: extra elements added to the declared length of each array. */
#define N      200000
#define NTIMES 50
#define OFFSET 0
475d28107eSBarry Smith 
/*
       3) Compile the code with full optimization.  Many compilers
          generate unreasonably bad code before the optimizer tightens
          things up.  If the results are unreasonably good, on the
          other hand, the optimizer might be too smart for me!

          Try compiling with:
                cc -O stream_d.c second.c -o stream_d -lm

          This is known to work on Cray, SGI, IBM, and Sun machines.

       4) Mail the results to mccalpin@cs.virginia.edu
          Be sure to include:
                 a) computer hardware model number and software revision
                 b) the compiler flags
                 c) all of the output from the test case.
  Thanks!

*/
675d28107eSBarry Smith 
/* Horizontal rule for report output. */
#define HLINE "-------------------------------------------------------------\n"

/*
  MIN/MAX yield the smaller/larger of their two arguments.  Each argument may
  be evaluated twice, so do not pass expressions with side effects.  The
  guards keep any definition that already exists.
*/
#if !defined(MIN)
  #define MIN(x, y) ((x) < (y) ? (x) : (y))
#endif
#if !defined(MAX)
  #define MAX(x, y) ((x) > (y) ? (x) : (y))
#endif
765d28107eSBarry Smith 
7767595998SJunchao Zhang static double a[N + OFFSET], b[N + OFFSET], c[N + OFFSET];
785d28107eSBarry Smith /*double *a,*b,*c;*/
795d28107eSBarry Smith 
80df4a11deSBarry Smith static double mintime[4] = {FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX};
815d28107eSBarry Smith 
82df4a11deSBarry Smith static const char *label[4] = {"Copy:      ", "Scale:     ", "Add:       ", "Triad:     "};
835d28107eSBarry Smith 
8467595998SJunchao Zhang static double bytes[4] = {2 * sizeof(double) * N, 2 * sizeof(double) * N, 3 * sizeof(double) * N, 3 * sizeof(double) * N};
855d28107eSBarry Smith 
865d28107eSBarry Smith extern double second();
875d28107eSBarry Smith 
main(int argc,char ** args)8801a79839SBarry Smith int main(int argc, char **args)
895d28107eSBarry Smith {
90d1d3a73cSBarry Smith   int          checktick(void);
915d28107eSBarry Smith   register int j, k;
92d3ae85c4SBarry Smith   double       scalar, t, times[4][NTIMES], irate[4];
93df4a11deSBarry Smith 
945d28107eSBarry Smith   /* --- SETUP --- determine precision and check timing --- */
955d28107eSBarry Smith 
965d28107eSBarry Smith   for (j = 0; j < N; j++) {
975d28107eSBarry Smith     a[j] = 1.0;
985d28107eSBarry Smith     b[j] = 2.0;
995d28107eSBarry Smith     c[j] = 0.0;
1005d28107eSBarry Smith   }
1015d28107eSBarry Smith 
1025d28107eSBarry Smith   t = second();
1036f2b61bcSKarl Rupp   for (j = 0; j < N; j++) a[j] = 2.0E0 * a[j];
1045d28107eSBarry Smith   t = 1.0E6 * (second() - t);
1055d28107eSBarry Smith 
1065d28107eSBarry Smith   /*   --- MAIN LOOP --- repeat test cases NTIMES times --- */
1075d28107eSBarry Smith 
1085d28107eSBarry Smith   scalar = 3.0;
10967595998SJunchao Zhang   for (k = 0; k < NTIMES; k++) {
1105d28107eSBarry Smith     times[0][k] = second();
111df4a11deSBarry Smith     /* should all these barriers be pulled outside of the time call? */
112d3ae85c4SBarry Smith 
1136f2b61bcSKarl Rupp     for (j = 0; j < N; j++) c[j] = a[j];
1145d28107eSBarry Smith     times[0][k] = second() - times[0][k];
1155d28107eSBarry Smith 
1165d28107eSBarry Smith     times[1][k] = second();
117d3ae85c4SBarry Smith 
1186f2b61bcSKarl Rupp     for (j = 0; j < N; j++) b[j] = scalar * c[j];
1195d28107eSBarry Smith     times[1][k] = second() - times[1][k];
1205d28107eSBarry Smith 
1215d28107eSBarry Smith     times[2][k] = second();
1226f2b61bcSKarl Rupp     for (j = 0; j < N; j++) c[j] = a[j] + b[j];
1235d28107eSBarry Smith     times[2][k] = second() - times[2][k];
1245d28107eSBarry Smith 
1255d28107eSBarry Smith     times[3][k] = second();
1266f2b61bcSKarl Rupp     for (j = 0; j < N; j++) a[j] = b[j] + scalar * c[j];
1275d28107eSBarry Smith     times[3][k] = second() - times[3][k];
1285d28107eSBarry Smith   }
1295d28107eSBarry Smith 
1305d28107eSBarry Smith   /*   --- SUMMARY --- */
1315d28107eSBarry Smith 
1326f2b61bcSKarl Rupp   for (k = 0; k < NTIMES; k++)
1336f2b61bcSKarl Rupp     for (j = 0; j < 4; j++) mintime[j] = MIN(mintime[j], times[j][k]);
1345d28107eSBarry Smith 
1356f2b61bcSKarl Rupp   for (j = 0; j < 4; j++) irate[j] = 1.0E-06 * bytes[j] / mintime[j];
136df4a11deSBarry Smith 
137df4a11deSBarry Smith   printf("Function      Rate (MB/s)\n");
138d3ae85c4SBarry Smith   for (j = 0; j < 4; j++) printf("%s%11.4f\n", label[j], irate[j]);
1395d28107eSBarry Smith   return 0;
1405d28107eSBarry Smith }
1415d28107eSBarry Smith 
/* Number of timer samples checktick() collects. */
#define M 20

/*
  checktick - estimates the granularity of the second() timer.

  Gathers M timer readings, each taken once the clock has advanced by at
  least one microsecond from the reading that started the wait, and returns
  the smallest gap between consecutive readings in whole microseconds.
  Negative gaps are counted as 0 and the result never exceeds 1000000.
*/
int checktick(void)
{
  double stamps[M];
  int    idx, gap;
  int    smallest = 1000000;

  /* Collect a sequence of M distinct time values from the system. */
  for (idx = 0; idx < M; idx++) {
    const double start = second();
    double       now;

    /* Spin until the timer reports a value at least 1 microsecond past start. */
    do {
      now = second();
    } while (now - start < 1.0E-6);
    stamps[idx] = now;
  }

  /* The minimum spacing between neighbouring samples is the granularity estimate. */
  for (idx = 1; idx < M; idx++) {
    gap = (int)(1.0E6 * (stamps[idx] - stamps[idx - 1]));
    if (gap < 0) gap = 0;
    if (gap < smallest) smallest = gap;
  }

  return smallest;
}
171