/*-----------------------------------------------------------------------*/
/* Program: Stream                                                       */
/* Revision: $Id: stream.c,v 5.9 2009/04/11 16:35:00 mccalpin Exp mccalpin $ */
/* Original code developed by John D. McCalpin                           */
/* Programmers: John D. McCalpin                                         */
/*              Joe R. Zagar                                             */
/*                                                                       */
/* This program measures memory transfer rates in MB/s for simple        */
/* computational kernels coded in C.                                     */
/*-----------------------------------------------------------------------*/
/* Copyright 1991-2005: John D. McCalpin                                 */
/*-----------------------------------------------------------------------*/
/* License:                                                              */
/*  1. You are free to use this program and/or to redistribute           */
/*     this program.                                                     */
/*  2. You are free to modify this program for your own use,             */
/*     including commercial use, subject to the publication              */
/*     restrictions in item 3.                                           */
/*  3. You are free to publish results obtained from running this        */
/*     program, or from works that you derive from this program,         */
/*     with the following limitations:                                   */
/*     3a. In order to be referred to as "STREAM benchmark results",     */
/*         published results must be in conformance to the STREAM        */
/*         Run Rules, (briefly reviewed below) published at              */
/*         http://www.cs.virginia.edu/stream/ref.html                    */
/*         and incorporated herein by reference.                         */
/*         As the copyright holder, John McCalpin retains the            */
/*         right to determine conformity with the Run Rules.             */
/*     3b. Results based on modified source code or on runs not in       */
/*         accordance with the STREAM Run Rules must be clearly          */
/*         labelled whenever they are published.  Examples of            */
/*         proper labelling include:                                     */
/*         "tuned STREAM benchmark results"                              */
/*         "based on a variant of the STREAM benchmark code"             */
/*         Other comparable, clear and reasonable labelling is           */
/*         acceptable.                                                   */
/*     3c. Submission of results to the STREAM benchmark web site        */
/*         is encouraged, but not required.                              */
/*  4. Use of this program or creation of derived works based on this    */
/*     program constitutes acceptance of these licensing restrictions.   */
/*  5. Absolutely no warranty is expressed or implied.                   */
/*-----------------------------------------------------------------------*/
# include <stdio.h>
# include <math.h>
# include <float.h>
# include <limits.h>
# include <sys/time.h>

/* INSTRUCTIONS:
 *
 *      1) Stream requires a good bit of memory to run.  Adjust the
 *         value of 'N' (below) to give a 'timing calibration' of
 *         at least 20 clock-ticks.  This will provide rate estimates
 *         that should be good to about 5% precision.
 *
 *      2) Each kernel is run NTIMES times.  The first iteration is
 *         excluded from the reported statistics, so NTIMES must be
 *         at least 2.  OFFSET adds that many extra elements to the
 *         end of each array; the kernels only touch the first N.
 */

#ifndef N
#   define N        2000000
#endif
#ifndef NTIMES
#   define NTIMES   10
#endif
#ifndef OFFSET
#   define OFFSET   0
#endif

/*
 * The summary skips iteration 0 and divides by (NTIMES-1); with fewer
 * than two iterations no timing is ever recorded and the averages are
 * a division by zero, so reject that configuration at compile time.
 */
#if NTIMES < 2
#   error "NTIMES must be at least 2: the first iteration is not counted"
#endif

/*
 *      3) Compile the code with full optimization.  Many compilers
 *         generate unreasonably bad code before the optimizer tightens
 *         things up.  If the results are unreasonably good, on the
 *         other hand, the optimizer might be too smart for me!
 *
 *         Try compiling with:
 *               cc -O stream_omp.c -o stream_omp
 *
 *         This is known to work on Cray, SGI, IBM, and Sun machines.
 *
 *
 *      4) Mail the results to mccalpin@cs.virginia.edu
 *         Be sure to include:
 *              a) computer hardware model number and software revision
 *              b) the compiler flags
 *              c) all of the output from the test case.
 * Thanks!
 *
 */

# define HLINE "-------------------------------------------------------------\n"

# ifndef MIN
# define MIN(x,y) ((x)<(y)?(x):(y))
# endif
# ifndef MAX
# define MAX(x,y) ((x)>(y)?(x):(y))
# endif

/* The three work arrays operated on by the Copy/Scale/Add/Triad kernels. */
static double   a[N+OFFSET],
                b[N+OFFSET],
                c[N+OFFSET];

/* Per-kernel timing statistics, indexed 0..3 = Copy, Scale, Add, Triad. */
static double   avgtime[4] = {0}, maxtime[4] = {0},
                mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};

static char     *label[4] = {"Copy: ", "Scale: ",
    "Add: ", "Triad: "};

/* Bytes moved by one pass of each kernel (arrays touched * N doubles). */
static double   bytes[4] = {
    2 * sizeof(double) * N,
    2 * sizeof(double) * N,
    3 * sizeof(double) * N,
    3 * sizeof(double) * N
    };

extern double mysecond(void);
extern int checktick(void);
extern void checkSTREAMresults(void);
#ifdef TUNED
extern void tuned_STREAM_Copy(void);
extern void tuned_STREAM_Scale(double scalar);
extern void tuned_STREAM_Add(void);
extern void tuned_STREAM_Triad(double scalar);
#endif
#ifdef _OPENMP
extern int omp_get_num_threads(void);
#endif

/*
 * Program entry point.
 *
 * Prints the configuration, initialises the arrays, estimates the timer
 * granularity, runs the four kernels NTIMES times each while timing every
 * pass with mysecond(), prints the rate/avg/min/max table (ignoring the
 * first iteration) and finally calls checkSTREAMresults().
 *
 * Returns 0.
 */
int
main(void)
    {
    int                 quantum;
    int                 BytesPerWord;
    int                 j, k;
    double              scalar, t, times[4][NTIMES];

    /* --- SETUP --- determine precision and check timing --- */

    printf(HLINE);
    printf("STREAM version $Revision: 5.9 $\n");
    printf(HLINE);
    BytesPerWord = sizeof(double);
    printf("This system uses %d bytes per DOUBLE PRECISION word.\n",
        BytesPerWord);

    printf(HLINE);
#ifdef NO_LONG_LONG
    printf("Array size = %d, Offset = %d\n" , N, OFFSET);
#else
    printf("Array size = %llu, Offset = %d\n", (unsigned long long) N, OFFSET);
#endif

    printf("Total memory required = %.1f MB.\n",
        (3.0 * BytesPerWord) * ( (double) N / 1048576.0));
    printf("Each test is run %d times, but only\n", NTIMES);
    printf("the *best* time for each is used.\n");

#ifdef _OPENMP
    printf(HLINE);
#pragma omp parallel
    {
#pragma omp master
        {
            /* Only the master thread writes k and prints. */
            k = omp_get_num_threads();
            printf ("Number of Threads requested = %i\n",k);
        }
    }
#endif

    printf(HLINE);
#pragma omp parallel
    {
    printf ("Printing one line per active thread....\n");
    }

    /* Initialise the arrays; checkSTREAMresults() assumes these values. */
#pragma omp parallel for
    for (j=0; j<N; j++) {
        a[j] = 1.0;
        b[j] = 2.0;
        c[j] = 0.0;
        }

    printf(HLINE);

    if  ( (quantum = checktick()) >= 1)
        printf("Your clock granularity/precision appears to be "
            "%d microseconds.\n", quantum);
    else {
        printf("Your clock granularity appears to be "
            "less than one microsecond.\n");
        quantum = 1;
    }

    /*
     * Time one pass over a[] to estimate how long each kernel will take.
     * This doubles a[]; checkSTREAMresults() accounts for that.
     */
    t = mysecond();
#pragma omp parallel for
    for (j = 0; j < N; j++)
        a[j] = 2.0E0 * a[j];
    t = 1.0E6 * (mysecond() - t);

    printf("Each test below will take on the order"
        " of %d microseconds.\n", (int) t  );
    printf(" (= %d clock ticks)\n", (int) (t/quantum) );
    printf("Increase the size of the arrays if this shows that\n");
    printf("you are not getting at least 20 clock ticks per test.\n");

    printf(HLINE);

    printf("WARNING -- The above is only a rough guideline.\n");
    printf("For best results, please be sure you know the\n");
    printf("precision of your system timer.\n");
    printf(HLINE);

    /*  --- MAIN LOOP --- repeat test cases NTIMES times --- */

    scalar = 3.0;
    for (k=0; k<NTIMES; k++)
        {
        times[0][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Copy();
#else
#pragma omp parallel for
        for (j=0; j<N; j++)
            c[j] = a[j];
#endif
        times[0][k] = mysecond() - times[0][k];

        times[1][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Scale(scalar);
#else
#pragma omp parallel for
        for (j=0; j<N; j++)
            b[j] = scalar*c[j];
#endif
        times[1][k] = mysecond() - times[1][k];

        times[2][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Add();
#else
#pragma omp parallel for
        for (j=0; j<N; j++)
            c[j] = a[j]+b[j];
#endif
        times[2][k] = mysecond() - times[2][k];

        times[3][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Triad(scalar);
#else
#pragma omp parallel for
        for (j=0; j<N; j++)
            a[j] = b[j]+scalar*c[j];
#endif
        times[3][k] = mysecond() - times[3][k];
        }

    /*  --- SUMMARY --- */

    for (k=1; k<NTIMES; k++) /* note -- skip first iteration */
        {
        for (j=0; j<4; j++)
            {
            avgtime[j] = avgtime[j] + times[j][k];
            mintime[j] = MIN(mintime[j], times[j][k]);
            maxtime[j] = MAX(maxtime[j], times[j][k]);
            }
        }

    printf("Function Rate (MB/s) Avg time Min time Max time\n");
    for (j=0; j<4; j++) {
        avgtime[j] = avgtime[j]/(double)(NTIMES-1);

        /* The reported rate is based on the best (minimum) time. */
        printf("%s%11.4f %11.4f %11.4f %11.4f\n", label[j],
               1.0E-06 * bytes[j]/mintime[j],
               avgtime[j],
               mintime[j],
               maxtime[j]);
    }
    printf(HLINE);

    /* --- Check Results --- */
    checkSTREAMresults();
    printf(HLINE);

    return 0;
}

# define        M       20

/*
 * Estimate the granularity of mysecond().
 *
 * Collects M timer readings, each at least one microsecond later than the
 * previous reading, and returns the smallest difference between
 * consecutive readings, in whole microseconds (never negative).
 */
int
checktick(void)
    {
    int         i, minDelta, Delta;
    double      t1, t2, timesfound[M];

/*  Collect a sequence of M unique time values from the system. */

    for (i = 0; i < M; i++) {
        t1 = mysecond();
        /* Spin until the clock has advanced by at least 1 microsecond. */
        while( ((t2=mysecond()) - t1) < 1.0E-6 )
            ;
        timesfound[i] = t1 = t2;
        }

/*
 * Determine the minimum difference between these M values.
 * This result will be our estimate (in microseconds) for the
 * clock granularity.
 */

    minDelta = 1000000;
    for (i = 1; i < M; i++) {
        Delta = (int)( 1.0E6 * (timesfound[i]-timesfound[i-1]));
        minDelta = MIN(minDelta, MAX(Delta,0));
        }

    return(minDelta);
    }



/* A gettimeofday routine to give access to the wall
   clock timer on most UNIX-like systems.  */

#include <sys/time.h>

/*
 * Return the current wall-clock time in seconds (with microsecond
 * resolution) as reported by gettimeofday().
 *
 * The timezone argument is passed as NULL: it is obsolete, and POSIX
 * leaves the behaviour unspecified when it is non-NULL.  tp is
 * zero-initialised so a failed call yields 0.0 rather than garbage.
 */
double mysecond(void)
{
        struct timeval tp = {0, 0};

        (void) gettimeofday(&tp, NULL);
        return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 );
}

/*
 * Magnitude of the error of `observed` relative to itself,
 * |expected - observed| / |observed|.
 *
 * Exactly equal values (including two identical infinities) give 0.
 * The result is NaN when it cannot be computed, e.g. if either input
 * is NaN.  Written without libm so the program still links without -lm.
 */
static double
relative_error(double expected, double observed)
{
        double err;

        if (expected == observed)
                return 0.0;
        err = (expected - observed) / observed;
        return (err < 0.0) ? -err : err;
}

/*
 * Compare one array's observed sum against its expected sum.
 *
 * Prints a failure report and returns 1 when the relative error is
 * larger than epsilon or is not a number; returns 0 otherwise.
 */
static int
report_mismatch(const char *name, double expected, double observed,
                double epsilon)
{
        /* "err <= epsilon" is false for NaN, so NaN counts as a failure. */
        if (relative_error(expected, observed) <= epsilon)
                return 0;

        printf ("Failed Validation on array %s[]\n", name);
        printf (" Expected : %f \n", expected);
        printf (" Observed : %f \n", observed);
        return 1;
}

/*
 * Validate the contents of a[], b[] and c[] after the benchmark.
 *
 * Replays the initialisation, the timing-check doubling of a[] and the
 * NTIMES kernel iterations on three scalars, multiplies by N to get the
 * expected array sums, and compares them with the actual sums using a
 * relative tolerance of 1e-8.  Every failing array is reported; if none
 * fail, "Solution Validates" is printed.
 */
void checkSTREAMresults (void)
{
        double aj,bj,cj,scalar;
        double asum,bsum,csum;
        double epsilon;
        int     j,k;
        int     failures;

    /* reproduce initialization */
        aj = 1.0;
        bj = 2.0;
        cj = 0.0;
    /* a[] is modified during timing check */
        aj = 2.0E0 * aj;
    /* now execute timing loop */
        scalar = 3.0;
        for (k=0; k<NTIMES; k++)
        {
            cj = aj;
            bj = scalar*cj;
            cj = aj+bj;
            aj = bj+scalar*cj;
        }
        aj = aj * (double) (N);
        bj = bj * (double) (N);
        cj = cj * (double) (N);

        asum = 0.0;
        bsum = 0.0;
        csum = 0.0;
        for (j=0; j<N; j++) {
                asum += a[j];
                bsum += b[j];
                csum += c[j];
        }
#ifdef VERBOSE
        printf ("Results Comparison: \n");
        printf (" Expected : %f %f %f \n",aj,bj,cj);
        printf (" Observed : %f %f %f \n",asum,bsum,csum);
#endif

        epsilon = 1.e-8;

        failures  = report_mismatch("a", aj, asum, epsilon);
        failures += report_mismatch("b", bj, bsum, epsilon);
        failures += report_mismatch("c", cj, csum, epsilon);

        if (failures == 0) {
                printf ("Solution Validates\n");
        }
}

/* Copy kernel: c[j] = a[j] for the first N elements. */
void tuned_STREAM_Copy(void)
{
        int j;
#pragma omp parallel for
        for (j=0; j<N; j++)
            c[j] = a[j];
}

/* Scale kernel: b[j] = scalar * c[j] for the first N elements. */
void tuned_STREAM_Scale(double scalar)
{
        int j;
#pragma omp parallel for
        for (j=0; j<N; j++)
            b[j] = scalar*c[j];
}

/* Add kernel: c[j] = a[j] + b[j] for the first N elements. */
void tuned_STREAM_Add(void)
{
        int j;
#pragma omp parallel for
        for (j=0; j<N; j++)
            c[j] = a[j]+b[j];
}

/* Triad kernel: a[j] = b[j] + scalar * c[j] for the first N elements. */
void tuned_STREAM_Triad(double scalar)
{
        int j;
#pragma omp parallel for
        for (j=0; j<N; j++)
            a[j] = b[j]+scalar*c[j];
}