/*-----------------------------------------------------------------------*/
/* Program: Stream                                                       */
/* Revision: $Id: stream.c,v 5.9 2009/04/11 16:35:00 mccalpin Exp mccalpin $ */
/* Original code developed by John D. McCalpin                           */
/* Programmers: John D. McCalpin                                         */
/*              Joe R. Zagar                                             */
/*                                                                       */
/* This program measures memory transfer rates in MB/s for simple        */
/* computational kernels coded in C.                                     */
/*-----------------------------------------------------------------------*/
/* Copyright 1991-2005: John D. McCalpin                                 */
/*-----------------------------------------------------------------------*/
/* License:                                                              */
/*  1. You are free to use this program and/or to redistribute           */
/*     this program.                                                     */
/*  2. You are free to modify this program for your own use,             */
/*     including commercial use, subject to the publication              */
/*     restrictions in item 3.                                           */
/*  3. You are free to publish results obtained from running this        */
/*     program, or from works that you derive from this program,         */
/*     with the following limitations:                                   */
/*     3a. In order to be referred to as "STREAM benchmark results",     */
/*         published results must be in conformance to the STREAM        */
/*         Run Rules, (briefly reviewed below) published at              */
/*         http://www.cs.virginia.edu/stream/ref.html                    */
/*         and incorporated herein by reference.                         */
/*         As the copyright holder, John McCalpin retains the            */
/*         right to determine conformity with the Run Rules.             */
/*     3b. Results based on modified source code or on runs not in       */
/*         accordance with the STREAM Run Rules must be clearly          */
/*         labelled whenever they are published.  Examples of            */
/*         proper labelling include:                                     */
/*         "tuned STREAM benchmark results"                              */
/*         "based on a variant of the STREAM benchmark code"             */
/*         Other comparable, clear and reasonable labelling is           */
/*         acceptable.                                                   */
/*     3c. Submission of results to the STREAM benchmark web site        */
/*         is encouraged, but not required.                              */
/*  4. Use of this program or creation of derived works based on this    */
/*     program constitutes acceptance of these licensing restrictions.   */
/*  5. Absolutely no warranty is expressed or implied.                   */
/*-----------------------------------------------------------------------*/
#include <stdio.h>
#include <math.h>
#include <limits.h>
#include <float.h>
#include <sys/time.h>
#include <stdlib.h>
#if defined(_OPENMP)
#include <omp.h>
#endif

/* INSTRUCTIONS:
 *
 *   1) Stream requires a good bit of memory to run.  Adjust the
 *      value of 'N' (below) to give a 'timing calibration' of
 *      at least 20 clock-ticks.  This will provide rate estimates
 *      that should be good to about 5% precision.
 *      (Build with -DSTREAM_VERBOSE to have the calibration printed.)
 */

#if !defined(N)
#define N 2000000
#endif
#if !defined(NTIMES)
#define NTIMES 50
#endif
#if !defined(OFFSET)
#define OFFSET 0
#endif

/* The first timed iteration is always discarded, so at least two are needed. */
#if NTIMES < 2
#error "NTIMES must be at least 2: the first iteration is excluded from the result"
#endif

/*
 *   3) Compile the code with full optimization.  Many compilers
 *      generate unreasonably bad code before the optimizer tightens
 *      things up.  If the results are unreasonably good, on the
 *      other hand, the optimizer might be too smart for me!
 *
 *      Try compiling with:
 *            cc -O stream_omp.c -o stream_omp
 *
 *      This is known to work on Cray, SGI, IBM, and Sun machines.
 *
 *
 *   4) Mail the results to mccalpin@cs.virginia.edu
 *      Be sure to include:
 *         a) computer hardware model number and software revision
 *         b) the compiler flags
 *         c) all of the output from the test case.
 *      Thanks!
 *
 */

#define HLINE "-------------------------------------------------------------\n"

#if !defined(MIN)
#define MIN(x,y) ((x)<(y) ? (x) : (y))
#endif
#if !defined(MAX)
#define MAX(x,y) ((x)>(y) ? (x) : (y))
#endif

/* Name of the file used to hand the single-thread rate to later runs. */
#define BASELINE_FILE "flops"

/* Operand arrays of the triad kernel a[j] = b[j] + scalar*c[j]. */
static double a[N+OFFSET],
              b[N+OFFSET],
              c[N+OFFSET];

/* Bytes moved by one pass of the triad kernel: three arrays of N doubles. */
static const double triad_bytes = 3.0 * sizeof(double) * N;

double mysecond(void);
int    checktick(void);

/*
 * Number of threads this run is labelled with.
 *
 * Taken from the leading integer of the OMP_NUM_THREADS environment
 * variable.  When the variable is unset or does not start with a positive
 * integer, falls back to omp_get_max_threads() in an OpenMP build and to 1
 * otherwise, instead of dereferencing a NULL pointer.
 */
static int thread_count_from_env(void)
{
  const char *env   = getenv("OMP_NUM_THREADS");
  int         count = 0;

  if (env != NULL && sscanf(env, "%d", &count) == 1 && count >= 1) return count;
#if defined(_OPENMP)
  return omp_get_max_threads();
#else
  return 1;
#endif
}

/*
 * Store the single-thread rate in BASELINE_FILE.
 * Returns 0 on success, -1 (after reporting on stderr) on any I/O failure.
 */
static int write_baseline(double rate)
{
  FILE *fd = fopen(BASELINE_FILE, "w");
  int   failed;

  if (fd == NULL) {
    perror(BASELINE_FILE);
    return -1;
  }
  failed = (fprintf(fd, "%g\n", rate) < 0);
  /* fclose flushes, so a write error may only surface here. */
  if (fclose(fd) != 0) failed = 1;
  if (failed) {
    fprintf(stderr, "%s: could not write baseline rate\n", BASELINE_FILE);
    return -1;
  }
  return 0;
}

/*
 * Load the single-thread rate previously stored by write_baseline().
 * Returns 0 and sets *prate on success; returns -1 (after reporting on
 * stderr) when the file is missing, unparsable, or not a positive rate.
 */
static int read_baseline(double *prate)
{
  FILE  *fd = fopen(BASELINE_FILE, "r");
  double value = 0.0;
  int    nread;

  if (fd == NULL) {
    perror(BASELINE_FILE);
    return -1;
  }
  nread = fscanf(fd, "%lg", &value);
  fclose(fd);
  if (nread != 1 || !(value > 0.0)) {
    fprintf(stderr, "%s: no usable baseline rate; run with OMP_NUM_THREADS=1 first\n", BASELINE_FILE);
    return -1;
  }
  *prate = value;
  return 0;
}

/*
 * Times NTIMES passes of the triad kernel and prints the best rate in MB/s
 * (first pass excluded) prefixed by the thread count.
 *
 * A run labelled with one thread also records its rate in BASELINE_FILE;
 * every other run reads that file and appends the ratio of its own rate to
 * the recorded one.  Returns 0 on success and EXIT_FAILURE when the baseline
 * file could not be written or read.
 */
int main(void)
{
  int    quantum, size;
  int    j, k;
  int    tick_resolved;
  double scalar, t, rate;
  double times[NTIMES];
  double best = DBL_MAX;

  size = thread_count_from_env();

  /* --- SETUP --- initialise data and check timing --- */

  /* Initialise the three arrays (in parallel when built with OpenMP). */
#pragma omp parallel for
  for (j=0; j<N; j++) {
    a[j] = 1.0;
    b[j] = 2.0;
    c[j] = 0.0;
  }

  /* Estimate the timer granularity; anything below 1 us is treated as 1 us. */
  quantum       = checktick();
  tick_resolved = (quantum >= 1);
  if (!tick_resolved) quantum = 1;

  /* Calibration pass: time one sweep over a[] to judge whether N is large enough. */
  t = mysecond();
#pragma omp parallel for
  for (j = 0; j < N; j++) a[j] = 2.0E0 * a[j];
  t = 1.0E6 * (mysecond() - t);

#if defined(STREAM_VERBOSE)
  printf(HLINE);
  printf("STREAM version $Revision: 5.9 $\n");
  printf(HLINE);
  printf("This system uses %d bytes per DOUBLE PRECISION word.\n", (int) sizeof(double));
  printf(HLINE);
#if defined(NO_LONG_LONG)
  printf("Array size = %d, Offset = %d\n", N, OFFSET);
#else
  printf("Array size = %llu, Offset = %d\n", (unsigned long long) N, OFFSET);
#endif
  printf("Total memory required = %.1f MB.\n", (3.0 * sizeof(double)) * ((double) N / 1048576.0));
  printf("Each test is run %d times, but only\n", NTIMES);
  printf("the *best* time for each is used.\n");
  printf(HLINE);
  if (tick_resolved) {
    printf("Your clock granularity/precision appears to be "
           "%d microseconds.\n", quantum);
  } else {
    printf("Your clock granularity appears to be "
           "less than one microsecond.\n");
  }
  printf("Each test below will take on the order"
         " of %d microseconds.\n", (int) t);
  printf(" (= %d clock ticks)\n", (int) (t/quantum));
  printf("Increase the size of the arrays if this shows that\n");
  printf("you are not getting at least 20 clock ticks per test.\n");
  printf(HLINE);
#else
  /* Diagnostics are compiled out; the values are intentionally unused. */
  (void) quantum;
  (void) t;
#endif

  /* --- MAIN LOOP --- repeat the triad kernel NTIMES times --- */

  scalar = 3.0;
  for (k=0; k<NTIMES; k++) {
    times[k] = mysecond();
#pragma omp parallel for
    for (j=0; j<N; j++) a[j] = b[j]+scalar*c[j];
    times[k] = mysecond() - times[k];
  }

  /* --- SUMMARY --- */

  /* note -- skip first iteration */
  for (k=1; k<NTIMES; k++) best = MIN(best, times[k]);

  rate = 1.0E-06 * triad_bytes/best;

  if (size == 1) {
    printf("%d %11.4f Rate (MB/s)\n", size, rate);
    if (write_baseline(rate) != 0) return EXIT_FAILURE;
  } else {
    double prate;

    if (read_baseline(&prate) != 0) {
      /* Still report the measurement; only the speed-up ratio is unavailable. */
      printf("%d %11.4f Rate (MB/s)\n", size, rate);
      return EXIT_FAILURE;
    }
    printf("%d %11.4f Rate (MB/s) %g \n", size, rate, rate/prate);
  }

  return 0;
}

#define M 20

/*
 * Estimate the granularity of mysecond() in whole microseconds.
 *
 * Collects M timestamps, each at least one microsecond after the previous
 * one, and returns the smallest gap between consecutive samples truncated
 * to an integer number of microseconds (never negative, at most 1000000).
 */
int checktick(void)
{
  int    i, minDelta, Delta;
  double t1, t2, timesfound[M];

  /* Collect a sequence of M unique time values from the system. */

  for (i = 0; i < M; i++) {
    t1 = mysecond();
    /* Spin until the clock has visibly advanced by at least 1 us. */
    while (((t2=mysecond()) - t1) < 1.0E-6) ;
    timesfound[i] = t1 = t2;
  }

  /*
   * Determine the minimum difference between these M values.
   * This result will be our estimate (in microseconds) for the
   * clock granularity.
   */

  minDelta = 1000000;
  for (i = 1; i < M; i++) {
    Delta    = (int)(1.0E6 * (timesfound[i]-timesfound[i-1]));
    minDelta = MIN(minDelta, MAX(Delta,0));
  }

  return minDelta;
}

/*
 * A gettimeofday routine to give access to the wall
 * clock timer on most UNIX-like systems.
 *
 * Returns the current time in seconds as a double, combining the seconds
 * and microseconds fields reported by gettimeofday().
 */
double mysecond(void)
{
  struct timeval tp;

  /* The timezone argument is obsolete; POSIX leaves a non-NULL value unspecified. */
  (void) gettimeofday(&tp, NULL);
  return ((double) tp.tv_sec + (double) tp.tv_usec * 1.e-6);
}