15d28107eSBarry Smith #include <sys/time.h>
25d28107eSBarry Smith /* int gettimeofday(struct timeval *tp, struct timezone *tzp); */
35d28107eSBarry Smith
/*
  second - Returns the current time of day, in seconds, as a double.

  The reading is the tv_sec and tv_usec fields filled in by gettimeofday(),
  combined into one value. Callers time a region by subtracting two readings.

  Returns -1.0 if gettimeofday() reports failure, so that a caller never
  receives a value built from an uninitialized struct timeval.
*/
double second(void)
{
  struct timeval now;

  /* The timezone argument is obsolete, so a null pointer is passed for it. */
  if (gettimeofday(&now, (void *)0) != 0) return -1.0;

  return (double)now.tv_sec + (double)now.tv_usec * 1.e-6;
}
195d28107eSBarry Smith #include <stdio.h>
205d28107eSBarry Smith #include <math.h>
215d28107eSBarry Smith #include <limits.h>
220d04baf8SBarry Smith #include <float.h>
235d28107eSBarry Smith #include <sys/time.h>
245d28107eSBarry Smith
/*
  Program: Stream
  Programmer: Joe R. Zagar
  Revision: 4.0-BETA, October 24, 1995
  Original code developed by John D. McCalpin

  This program measures memory transfer rates in MB/s for simple
  computational kernels coded in C. These numbers reveal the quality
  of code generation for simple uncacheable kernels as well as showing
  the cost of floating-point operations relative to memory accesses.

  INSTRUCTIONS:

  1) Stream requires a good bit of memory to run. Adjust the
     value of 'N' (below) to give a 'timing calibration' of
     at least 20 clock-ticks. This will provide rate estimates
     that should be good to about 5% precision.
*/
435d28107eSBarry Smith
/*
  Benchmark parameters. Each one may be overridden on the compiler command
  line (for example -DN=2000000); the values below are the defaults.

  N      - number of elements processed in each array by every kernel
  NTIMES - number of times the set of kernels is repeated
  OFFSET - extra elements added to the declared length of each array
*/
#if !defined(N)
#define N 200000
#endif
#if !defined(NTIMES)
#define NTIMES 50
#endif
#if !defined(OFFSET)
#define OFFSET 0
#endif
475d28107eSBarry Smith
/*
  3) Compile the code with full optimization. Many compilers
     generate unreasonably bad code before the optimizer tightens
     things up. If the results are unreasonably good, on the
     other hand, the optimizer might be too smart for me!

     Try compiling with:
       cc -O stream_d.c second.c -o stream_d -lm

     This is known to work on Cray, SGI, IBM, and Sun machines.

  4) Mail the results to mccalpin@cs.virginia.edu
     Be sure to include:
       a) computer hardware model number and software revision
       b) the compiler flags
       c) all of the output from the test case.
     Thanks!

*/
675d28107eSBarry Smith
/* Separator line string. */
#define HLINE "-------------------------------------------------------------\n"

/*
  Two-argument minimum and maximum. Each is defined only when no earlier
  definition exists. An argument may be evaluated twice, so pass expressions
  without side effects.
*/
#ifndef MIN
#define MIN(lhs, rhs) (((lhs) < (rhs)) ? (lhs) : (rhs))
#endif
#ifndef MAX
#define MAX(lhs, rhs) (((lhs) > (rhs)) ? (lhs) : (rhs))
#endif
765d28107eSBarry Smith
7767595998SJunchao Zhang static double a[N + OFFSET], b[N + OFFSET], c[N + OFFSET];
785d28107eSBarry Smith /*double *a,*b,*c;*/
795d28107eSBarry Smith
80df4a11deSBarry Smith static double mintime[4] = {FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX};
815d28107eSBarry Smith
82df4a11deSBarry Smith static const char *label[4] = {"Copy: ", "Scale: ", "Add: ", "Triad: "};
835d28107eSBarry Smith
8467595998SJunchao Zhang static double bytes[4] = {2 * sizeof(double) * N, 2 * sizeof(double) * N, 3 * sizeof(double) * N, 3 * sizeof(double) * N};
855d28107eSBarry Smith
865d28107eSBarry Smith extern double second();
875d28107eSBarry Smith
main(int argc,char ** args)8801a79839SBarry Smith int main(int argc, char **args)
895d28107eSBarry Smith {
90d1d3a73cSBarry Smith int checktick(void);
915d28107eSBarry Smith register int j, k;
92d3ae85c4SBarry Smith double scalar, t, times[4][NTIMES], irate[4];
93df4a11deSBarry Smith
945d28107eSBarry Smith /* --- SETUP --- determine precision and check timing --- */
955d28107eSBarry Smith
965d28107eSBarry Smith for (j = 0; j < N; j++) {
975d28107eSBarry Smith a[j] = 1.0;
985d28107eSBarry Smith b[j] = 2.0;
995d28107eSBarry Smith c[j] = 0.0;
1005d28107eSBarry Smith }
1015d28107eSBarry Smith
1025d28107eSBarry Smith t = second();
1036f2b61bcSKarl Rupp for (j = 0; j < N; j++) a[j] = 2.0E0 * a[j];
1045d28107eSBarry Smith t = 1.0E6 * (second() - t);
1055d28107eSBarry Smith
1065d28107eSBarry Smith /* --- MAIN LOOP --- repeat test cases NTIMES times --- */
1075d28107eSBarry Smith
1085d28107eSBarry Smith scalar = 3.0;
10967595998SJunchao Zhang for (k = 0; k < NTIMES; k++) {
1105d28107eSBarry Smith times[0][k] = second();
111df4a11deSBarry Smith /* should all these barriers be pulled outside of the time call? */
112d3ae85c4SBarry Smith
1136f2b61bcSKarl Rupp for (j = 0; j < N; j++) c[j] = a[j];
1145d28107eSBarry Smith times[0][k] = second() - times[0][k];
1155d28107eSBarry Smith
1165d28107eSBarry Smith times[1][k] = second();
117d3ae85c4SBarry Smith
1186f2b61bcSKarl Rupp for (j = 0; j < N; j++) b[j] = scalar * c[j];
1195d28107eSBarry Smith times[1][k] = second() - times[1][k];
1205d28107eSBarry Smith
1215d28107eSBarry Smith times[2][k] = second();
1226f2b61bcSKarl Rupp for (j = 0; j < N; j++) c[j] = a[j] + b[j];
1235d28107eSBarry Smith times[2][k] = second() - times[2][k];
1245d28107eSBarry Smith
1255d28107eSBarry Smith times[3][k] = second();
1266f2b61bcSKarl Rupp for (j = 0; j < N; j++) a[j] = b[j] + scalar * c[j];
1275d28107eSBarry Smith times[3][k] = second() - times[3][k];
1285d28107eSBarry Smith }
1295d28107eSBarry Smith
1305d28107eSBarry Smith /* --- SUMMARY --- */
1315d28107eSBarry Smith
1326f2b61bcSKarl Rupp for (k = 0; k < NTIMES; k++)
1336f2b61bcSKarl Rupp for (j = 0; j < 4; j++) mintime[j] = MIN(mintime[j], times[j][k]);
1345d28107eSBarry Smith
1356f2b61bcSKarl Rupp for (j = 0; j < 4; j++) irate[j] = 1.0E-06 * bytes[j] / mintime[j];
136df4a11deSBarry Smith
137df4a11deSBarry Smith printf("Function Rate (MB/s)\n");
138d3ae85c4SBarry Smith for (j = 0; j < 4; j++) printf("%s%11.4f\n", label[j], irate[j]);
1395d28107eSBarry Smith return 0;
1405d28107eSBarry Smith }
1415d28107eSBarry Smith
1425d28107eSBarry Smith #define M 20
1435d28107eSBarry Smith
checktick(void)144d1d3a73cSBarry Smith int checktick(void)
1455d28107eSBarry Smith {
1465d28107eSBarry Smith int i, minDelta, Delta;
1475d28107eSBarry Smith double t1, t2, timesfound[M];
1485d28107eSBarry Smith
1495d28107eSBarry Smith /* Collect a sequence of M unique time values from the system. */
1505d28107eSBarry Smith
1515d28107eSBarry Smith for (i = 0; i < M; i++) {
1525d28107eSBarry Smith t1 = second();
1536f2b61bcSKarl Rupp while (((t2 = second()) - t1) < 1.0E-6);
1545d28107eSBarry Smith timesfound[i] = t1 = t2;
1555d28107eSBarry Smith }
1565d28107eSBarry Smith
1575d28107eSBarry Smith /*
1580e3d61c9SBarry Smith Determine the minimum difference between these M values.
1590e3d61c9SBarry Smith This result will be our estimate (in microseconds) for the
1600e3d61c9SBarry Smith clock granularity.
1615d28107eSBarry Smith */
1625d28107eSBarry Smith
1635d28107eSBarry Smith minDelta = 1000000;
1645d28107eSBarry Smith for (i = 1; i < M; i++) {
1655d28107eSBarry Smith Delta = (int)(1.0E6 * (timesfound[i] - timesfound[i - 1]));
1665d28107eSBarry Smith minDelta = MIN(minDelta, MAX(Delta, 0));
1675d28107eSBarry Smith }
1685d28107eSBarry Smith
16911cc89d2SBarry Smith return minDelta;
1705d28107eSBarry Smith }
171