/*-----------------------------------------------------------------------*/
/* Program: Stream                                                       */
/* Revision: $Id: stream.c,v 5.9 2009/04/11 16:35:00 mccalpin Exp mccalpin $ */
/* Original code developed by John D. McCalpin                           */
/* Programmers: John D. McCalpin                                         */
/*              Joe R. Zagar                                             */
/*                                                                       */
/* This program measures memory transfer rates in MB/s for simple        */
/* computational kernels coded in C.                                     */
/*-----------------------------------------------------------------------*/
/* Copyright 1991-2005: John D. McCalpin                                 */
/*-----------------------------------------------------------------------*/
/* License:                                                              */
/*  1. You are free to use this program and/or to redistribute           */
/*     this program.                                                     */
/*  2. You are free to modify this program for your own use,             */
/*     including commercial use, subject to the publication              */
/*     restrictions in item 3.                                           */
/*  3. You are free to publish results obtained from running this        */
/*     program, or from works that you derive from this program,         */
/*     with the following limitations:                                   */
/*     3a. In order to be referred to as "STREAM benchmark results",     */
/*         published results must be in conformance to the STREAM        */
/*         Run Rules, (briefly reviewed below) published at              */
/*         http://www.cs.virginia.edu/stream/ref.html                    */
/*         and incorporated herein by reference.                         */
/*         As the copyright holder, John McCalpin retains the            */
/*         right to determine conformity with the Run Rules.             */
/*     3b. Results based on modified source code or on runs not in       */
/*         accordance with the STREAM Run Rules must be clearly          */
/*         labelled whenever they are published. Examples of             */
/*         proper labelling include:                                     */
/*         "tuned STREAM benchmark results"                              */
/*         "based on a variant of the STREAM benchmark code"             */
/*         Other comparable, clear and reasonable labelling is           */
/*         acceptable.                                                   */
/*     3c. Submission of results to the STREAM benchmark web site        */
/*         is encouraged, but not required.                              */
/*  4. Use of this program or creation of derived works based on this    */
/*     program constitutes acceptance of these licensing restrictions.   */
/*  5. Absolutely no warranty is expressed or implied.                   */
/*-----------------------------------------------------------------------*/
# include <stdio.h>
# include <math.h>
# include <limits.h>
# include <float.h>
# include <sys/time.h>
#if defined(_OPENMP)
# include <omp.h>
#endif

/* INSTRUCTIONS:
 *
 *      1) Stream requires a good bit of memory to run.  Adjust the
 *          value of 'N' (below) to give a 'timing calibration' of
 *          at least 20 clock-ticks.  This will provide rate estimates
 *          that should be good to about 5% precision.
 */

#if !defined(N)
#   define N    2000000
#endif
#if !defined(NTIMES)
#   define NTIMES       50
#endif
#if !defined(OFFSET)
#   define OFFSET       0
#endif

/* The summary skips the first (warm-up) iteration and divides by
 * NTIMES-1, so a single repetition would leave no samples at all. */
#if NTIMES < 2
#   error "NTIMES must be at least 2: the first iteration is excluded from the summary"
#endif

/*
 *      3) Compile the code with full optimization.  Many compilers
 *         generate unreasonably bad code before the optimizer tightens
 *         things up.  If the results are unreasonably good, on the
 *         other hand, the optimizer might be too smart for me!
 *
 *         Try compiling with:
 *               cc -O stream_omp.c -o stream_omp
 *
 *         This is known to work on Cray, SGI, IBM, and Sun machines.
 *
 *
 *      4) Mail the results to mccalpin@cs.virginia.edu
 *         Be sure to include:
 *              a) computer hardware model number and software revision
 *              b) the compiler flags
 *              c) all of the output from the test case.
 * Thanks!
 *
 */

# define HLINE "-------------------------------------------------------------\n"

# if !defined(MIN)
# define MIN(x,y) ((x)<(y) ? (x) : (y))
# endif
# if !defined(MAX)
# define MAX(x,y) ((x)>(y) ? (x) : (y))
# endif

/* Work arrays streamed by the four kernels. */
static double a[N+OFFSET],
              b[N+OFFSET],
              c[N+OFFSET];

/* Per-kernel timing statistics, indexed Copy, Scale, Add, Triad. */
static double avgtime[4] = {0}, maxtime[4] = {0},
              mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};

static const char *label[4] = {"Copy: ", "Scale: ","Add: ", "Triad: "};

/* Bytes moved per kernel invocation: two arrays touched by Copy and
 * Scale, three by Add and Triad. */
static double bytes[4] = {
    2 * sizeof(double) * N,
    2 * sizeof(double) * N,
    3 * sizeof(double) * N,
    3 * sizeof(double) * N
};

double mysecond(void);
int    checktick(void);
void   checkSTREAMresults(void);
void   tuned_STREAM_Copy(void);
void   tuned_STREAM_Scale(double scalar);
void   tuned_STREAM_Add(void);
void   tuned_STREAM_Triad(double scalar);

/* Size of the current OpenMP team, or 1 when the file is compiled
 * without OpenMP (so the program still links in a serial build). */
static int stream_num_threads(void)
{
#if defined(_OPENMP)
    return omp_get_num_threads();
#else
    return 1;
#endif
}

/* |expected - observed| / observed, as used by the validation check. */
static double relative_error(double expected, double observed)
{
    double diff = expected - observed;

    if (diff < 0) diff = -diff;
    return diff / observed;
}

/*
 * Runs the Copy, Scale, Add and Triad kernels NTIMES times each, prints
 * the best observed rate in MB/s for every kernel, then validates the
 * array contents.  Always returns 0.
 */
int main(void)
{
    int    j, k;
    double scalar, times[4][NTIMES];

    /* --- SETUP --- report the thread count and initialize the arrays --- */

#pragma omp parallel
    {
#pragma omp master
        {
            /* Must be queried inside the parallel region to see the team size. */
            printf(HLINE);
            printf ("Number of OpenMP Threads requested = %i\n",stream_num_threads());
        }
    }

#pragma omp parallel for
    for (j=0; j<N; j++) {
        a[j] = 1.0;
        b[j] = 2.0;
        c[j] = 0.0;
    }

    /* The clock-granularity probe is still executed, but its result is
     * not reported by this variant of the benchmark. */
    (void) checktick();

    /* Warm-up pass; checkSTREAMresults() accounts for this doubling of a[]. */
#pragma omp parallel for
    for (j = 0; j < N; j++) a[j] = 2.0E0 * a[j];

    /* --- MAIN LOOP --- repeat test cases NTIMES times --- */

    scalar = 3.0;
    for (k=0; k<NTIMES; k++) {
        times[0][k] = mysecond();
#if defined(TUNED)
        tuned_STREAM_Copy();
#else
#pragma omp parallel for
        for (j=0; j<N; j++) c[j] = a[j];
#endif
        times[0][k] = mysecond() - times[0][k];

        times[1][k] = mysecond();
#if defined(TUNED)
        tuned_STREAM_Scale(scalar);
#else
#pragma omp parallel for
        for (j=0; j<N; j++) b[j] = scalar*c[j];
#endif
        times[1][k] = mysecond() - times[1][k];

        times[2][k] = mysecond();
#if defined(TUNED)
        tuned_STREAM_Add();
#else
#pragma omp parallel for
        for (j=0; j<N; j++) c[j] = a[j]+b[j];
#endif
        times[2][k] = mysecond() - times[2][k];

        times[3][k] = mysecond();
#if defined(TUNED)
        tuned_STREAM_Triad(scalar);
#else
#pragma omp parallel for
        for (j=0; j<N; j++) a[j] = b[j]+scalar*c[j];
#endif
        times[3][k] = mysecond() - times[3][k];
    }

    /* --- SUMMARY --- */

    for (k=1; k<NTIMES; k++) { /* note -- skip first iteration */
        for (j=0; j<4; j++) {
            avgtime[j] = avgtime[j] + times[j][k];
            mintime[j] = MIN(mintime[j], times[j][k]);
            maxtime[j] = MAX(maxtime[j], times[j][k]);
        }
    }

    printf("Function Rate (MB/s) \n");
    for (j=0; j<4; j++) {
        avgtime[j] = avgtime[j]/(double)(NTIMES-1);

        /* Rate is based on the fastest repetition. */
        printf("%s%11.4f \n", label[j], 1.0E-06 * bytes[j]/mintime[j]);
    }

    /* --- Check Results --- */
    checkSTREAMresults();

    return 0;
}

# define M 20

/*
 * Estimates the timer granularity.  Collects M time stamps that are each
 * at least one microsecond apart and returns the smallest gap between
 * consecutive stamps, in whole microseconds (never negative).
 */
int checktick(void)
{
    int    i, minDelta, Delta;
    double t1, t2, timesfound[M];

    /* Collect a sequence of M unique time values from the system. */
    for (i = 0; i < M; i++) {
        t1 = mysecond();
        while (((t2=mysecond()) - t1) < 1.0E-6) ;
        timesfound[i] = t1 = t2;
    }

    /*
     * Determine the minimum difference between these M values.
     * This result will be our estimate (in microseconds) for the
     * clock granularity.
     */
    minDelta = 1000000;
    for (i = 1; i < M; i++) {
        Delta = (int)(1.0E6 * (timesfound[i]-timesfound[i-1]));
        if (Delta < 0) Delta = 0;
        if (Delta < minDelta) minDelta = Delta;
    }

    return(minDelta);
}

/*
 * A gettimeofday routine to give access to the wall
 * clock timer on most UNIX-like systems.  Returns the current time in
 * seconds, including the microsecond fraction.
 */
double mysecond(void)
{
    struct timeval tp = {0, 0};

    /* The timezone argument is obsolete, so NULL is passed.  POSIX
     * reserves no error return for gettimeofday(). */
    (void) gettimeofday(&tp, NULL);
    return ((double) tp.tv_sec + (double) tp.tv_usec * 1.e-6);
}

/*
 * Replays the benchmark's arithmetic on scalars to obtain the expected
 * sum of every array, compares it with the observed sums and prints a
 * message for each array whose relative error exceeds 1.e-8.
 */
void checkSTREAMresults(void)
{
    double aj,bj,cj,scalar;
    double asum,bsum,csum;
    double epsilon;
    int    j,k;

    /* reproduce initialization */
    aj = 1.0;
    bj = 2.0;
    cj = 0.0;
    /* a[] is modified during timing check */
    aj = 2.0E0 * aj;
    /* now execute timing loop */
    scalar = 3.0;
    for (k=0; k<NTIMES; k++) {
        cj = aj;
        bj = scalar*cj;
        cj = aj+bj;
        aj = bj+scalar*cj;
    }
    aj = aj * (double) (N);
    bj = bj * (double) (N);
    cj = cj * (double) (N);

    asum = 0.0;
    bsum = 0.0;
    csum = 0.0;
    for (j=0; j<N; j++) {
        asum += a[j];
        bsum += b[j];
        csum += c[j];
    }
#if defined(VERBOSE)
    printf ("Results Comparison: \n");
    printf (" Expected : %f %f %f \n",aj,bj,cj);
    printf (" Observed : %f %f %f \n",asum,bsum,csum);
#endif

    epsilon = 1.e-8;

    /* Each array is checked independently so every failure is reported. */
    if (relative_error(aj, asum) > epsilon) {
        printf ("Failed Validation on array a[]\n");
        printf (" Expected : %f \n",aj);
        printf (" Observed : %f \n",asum);
    }
    if (relative_error(bj, bsum) > epsilon) {
        printf ("Failed Validation on array b[]\n");
        printf (" Expected : %f \n",bj);
        printf (" Observed : %f \n",bsum);
    }
    if (relative_error(cj, csum) > epsilon) {
        printf ("Failed Validation on array c[]\n");
        printf (" Expected : %f \n",cj);
        printf (" Observed : %f \n",csum);
    }
}

/* Copy kernel: c[j] = a[j]. */
void tuned_STREAM_Copy(void)
{
    int j;
#pragma omp parallel for
    for (j=0; j<N; j++) c[j] = a[j];
}

/* Scale kernel: b[j] = scalar*c[j]. */
void tuned_STREAM_Scale(double scalar)
{
    int j;
#pragma omp parallel for
    for (j=0; j<N; j++) b[j] = scalar*c[j];
}

/* Add kernel: c[j] = a[j]+b[j]. */
void tuned_STREAM_Add(void)
{
    int j;
#pragma omp parallel for
    for (j=0; j<N; j++) c[j] = a[j]+b[j];
}

/* Triad kernel: a[j] = b[j]+scalar*c[j]. */
void tuned_STREAM_Triad(double scalar)
{
    int j;
#pragma omp parallel for
    for (j=0; j<N; j++) a[j] = b[j]+scalar*c[j];
}