/*-----------------------------------------------------------------------*/
/* Program: Stream                                                       */
/* Revision: $Id: stream.c,v 5.9 2009/04/11 16:35:00 mccalpin Exp mccalpin $ */
/* Original code developed by John D. McCalpin                           */
/* Programmers: John D. McCalpin                                         */
/*              Joe R. Zagar                                             */
/*                                                                       */
/* This program measures memory transfer rates in MB/s for simple        */
/* computational kernels coded in C.                                     */
/*-----------------------------------------------------------------------*/
/* Copyright 1991-2005: John D. McCalpin                                 */
/*-----------------------------------------------------------------------*/
/* License:                                                              */
/*  1. You are free to use this program and/or to redistribute           */
/*     this program.                                                     */
/*  2. You are free to modify this program for your own use,             */
/*     including commercial use, subject to the publication              */
/*     restrictions in item 3.                                           */
/*  3. You are free to publish results obtained from running this        */
/*     program, or from works that you derive from this program,         */
/*     with the following limitations:                                   */
/*     3a. In order to be referred to as "STREAM benchmark results",     */
/*         published results must be in conformance to the STREAM        */
/*         Run Rules, (briefly reviewed below) published at              */
/*         http://www.cs.virginia.edu/stream/ref.html                    */
/*         and incorporated herein by reference.                         */
/*         As the copyright holder, John McCalpin retains the            */
/*         right to determine conformity with the Run Rules.             */
/*     3b. Results based on modified source code or on runs not in       */
/*         accordance with the STREAM Run Rules must be clearly          */
/*         labelled whenever they are published. Examples of             */
/*         proper labelling include:                                     */
/*         "tuned STREAM benchmark results"                              */
/*         "based on a variant of the STREAM benchmark code"             */
/*         Other comparable, clear and reasonable labelling is           */
/*         acceptable.                                                   */
/*     3c. Submission of results to the STREAM benchmark web site        */
/*         is encouraged, but not required.                              */
/*  4. Use of this program or creation of derived works based on this    */
/*     program constitutes acceptance of these licensing restrictions.   */
/*  5. Absolutely no warranty is expressed or implied.                   */
/*-----------------------------------------------------------------------*/
# include <stdio.h>
# include <math.h>
# include <float.h>
# include <limits.h>
# include <sys/time.h>

/* INSTRUCTIONS:
 *
 *      1) Stream requires a good bit of memory to run.  Adjust the
 *          value of 'N' (below) to give a 'timing calibration' of
 *          at least 20 clock-ticks.  This will provide rate estimates
 *          that should be good to about 5% precision.
 */

#ifndef N
#   define N        2000000
#endif
#ifndef NTIMES
#   define NTIMES   10
#endif
#ifndef OFFSET
#   define OFFSET   0
#endif

/*
 * The summary discards the first iteration and divides by NTIMES-1,
 * so a single-iteration build would divide by zero.
 */
#if NTIMES < 2
#   error "NTIMES must be at least 2 (the first iteration is discarded)"
#endif

/*
 *      3) Compile the code with full optimization.  Many compilers
 *         generate unreasonably bad code before the optimizer tightens
 *         things up.  If the results are unreasonably good, on the
 *         other hand, the optimizer might be too smart for me!
 *
 *         Try compiling with:
 *               cc -O stream_omp.c -o stream_omp
 *
 *         This is known to work on Cray, SGI, IBM, and Sun machines.
 *
 *
 *      4) Mail the results to mccalpin@cs.virginia.edu
 *         Be sure to include:
 *              a) computer hardware model number and software revision
 *              b) the compiler flags
 *              c) all of the output from the test case.
 * Thanks!
 *
 */

# define HLINE "-------------------------------------------------------------\n"

# ifndef MIN
# define MIN(x,y) ((x)<(y)?(x):(y))
# endif
# ifndef MAX
# define MAX(x,y) ((x)>(y)?(x):(y))
# endif

/* The three work arrays touched by the Copy/Scale/Add/Triad kernels. */
static double a[N+OFFSET],
              b[N+OFFSET],
              c[N+OFFSET];

/* Per-kernel timing statistics, indexed 0=Copy, 1=Scale, 2=Add, 3=Triad. */
static double avgtime[4] = {0}, maxtime[4] = {0},
              mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};

static const char *label[4] = {"Copy: ", "Scale: ",
                               "Add: ", "Triad: "};

/* Bytes moved by one pass of each kernel (arrays read + arrays written). */
static double bytes[4] = {
    2 * sizeof(double) * N,
    2 * sizeof(double) * N,
    3 * sizeof(double) * N,
    3 * sizeof(double) * N
    };

double mysecond(void);
int    checktick(void);
void   checkSTREAMresults(void);
void   tuned_STREAM_Copy(void);
void   tuned_STREAM_Scale(double scalar);
void   tuned_STREAM_Add(void);
void   tuned_STREAM_Triad(double scalar);
#ifdef _OPENMP
/* Only available (and only referenced) when built with OpenMP enabled. */
extern int omp_get_num_threads(void);
#endif

/*
 * Benchmark driver.
 *
 * Prints the configuration, initializes a[]=1.0, b[]=2.0, c[]=0.0,
 * estimates the timer granularity, then runs the four kernels NTIMES
 * times each.  The first iteration is excluded from the statistics; the
 * reported rate is 1.0E-06 * bytes / (minimum time) for each kernel.
 * Finally the array contents are validated by checkSTREAMresults().
 *
 * Returns 0.
 */
int main(void)
{
    int     quantum;
    int     BytesPerWord;
    int     j, k;
    double  scalar, t, times[4][NTIMES];

    /* --- SETUP --- determine precision and check timing --- */

    printf(HLINE);
    printf("STREAM version $Revision: 5.9 $\n");
    printf(HLINE);
    BytesPerWord = sizeof(double);
    printf("This system uses %d bytes per DOUBLE PRECISION word.\n",
        BytesPerWord);

    printf(HLINE);
#ifdef NO_LONG_LONG
    printf("Array size = %d, Offset = %d\n" , N, OFFSET);
#else
    printf("Array size = %llu, Offset = %d\n", (unsigned long long) N, OFFSET);
#endif

    printf("Total memory required = %.1f MB.\n",
        (3.0 * BytesPerWord) * ( (double) N / 1048576.0));
    printf("Each test is run %d times, but only\n", NTIMES);
    printf("the *best* time for each is used.\n");

#ifdef _OPENMP
    /* Guarded so that a build without OpenMP still links. */
    printf(HLINE);
#pragma omp parallel
    {
#pragma omp master
        {
            k = omp_get_num_threads();
            printf ("Number of Threads requested = %i\n",k);
        }
    }
#endif

    printf(HLINE);
#pragma omp parallel
    {
    printf ("Printing one line per active thread....\n");
    }

    /* Initialize the arrays. */
#pragma omp parallel for
    for (j=0; j<N; j++) {
        a[j] = 1.0;
        b[j] = 2.0;
        c[j] = 0.0;
        }

    printf(HLINE);

    if  ( (quantum = checktick()) >= 1)
        printf("Your clock granularity/precision appears to be "
            "%d microseconds.\n", quantum);
    else {
        printf("Your clock granularity appears to be "
            "less than one microsecond.\n");
        quantum = 1;
    }

    /*
     * Time one pass over a[] to estimate the duration of a test.  This
     * doubles a[], which checkSTREAMresults() accounts for.
     */
    t = mysecond();
#pragma omp parallel for
    for (j = 0; j < N; j++)
        a[j] = 2.0E0 * a[j];
    t = 1.0E6 * (mysecond() - t);

    printf("Each test below will take on the order"
        " of %d microseconds.\n", (int) t  );
    printf(" (= %d clock ticks)\n", (int) (t/quantum) );
    printf("Increase the size of the arrays if this shows that\n");
    printf("you are not getting at least 20 clock ticks per test.\n");

    printf(HLINE);

    /* --- MAIN LOOP --- repeat test cases NTIMES times --- */

    scalar = 3.0;
    for (k=0; k<NTIMES; k++)
        {
        times[0][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Copy();
#else
#pragma omp parallel for
        for (j=0; j<N; j++)
            c[j] = a[j];
#endif
        times[0][k] = mysecond() - times[0][k];

        times[1][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Scale(scalar);
#else
#pragma omp parallel for
        for (j=0; j<N; j++)
            b[j] = scalar*c[j];
#endif
        times[1][k] = mysecond() - times[1][k];

        times[2][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Add();
#else
#pragma omp parallel for
        for (j=0; j<N; j++)
            c[j] = a[j]+b[j];
#endif
        times[2][k] = mysecond() - times[2][k];

        times[3][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Triad(scalar);
#else
#pragma omp parallel for
        for (j=0; j<N; j++)
            a[j] = b[j]+scalar*c[j];
#endif
        times[3][k] = mysecond() - times[3][k];
        }

    /* --- SUMMARY --- */

    for (k=1; k<NTIMES; k++) /* note -- skip first iteration */
        {
        for (j=0; j<4; j++)
            {
            avgtime[j] = avgtime[j] + times[j][k];
            mintime[j] = MIN(mintime[j], times[j][k]);
            maxtime[j] = MAX(maxtime[j], times[j][k]);
            }
        }

    printf("Function Rate (MB/s) Avg time Min time Max time\n");
    for (j=0; j<4; j++) {
        avgtime[j] = avgtime[j]/(double)(NTIMES-1);

        printf("%s%11.4f %11.4f %11.4f %11.4f\n", label[j],
               1.0E-06 * bytes[j]/mintime[j],
               avgtime[j],
               mintime[j],
               maxtime[j]);
    }
    printf(HLINE);

    /* --- Check Results --- */
    checkSTREAMresults();
    printf(HLINE);

    return 0;
}

# define        M       20

/*
 * Estimate the timer granularity.
 *
 * Collects M timestamps that are each at least one microsecond later
 * than the previous sample, then returns the smallest gap between
 * consecutive timestamps, in whole microseconds (never negative).
 */
int
checktick(void)
    {
    int         i, minDelta, Delta;
    double      t1, t2, timesfound[M];

/*  Collect a sequence of M unique time values from the system. */

    for (i = 0; i < M; i++) {
        t1 = mysecond();
        while( ((t2=mysecond()) - t1) < 1.0E-6 )
            ;
        timesfound[i] = t1 = t2;
        }

/*
 * Determine the minimum difference between these M values.
 * This result will be our estimate (in microseconds) for the
 * clock granularity.
 */

    minDelta = 1000000;
    for (i = 1; i < M; i++) {
        Delta = (int)( 1.0E6 * (timesfound[i]-timesfound[i-1]));
        minDelta = MIN(minDelta, MAX(Delta,0));
        }

    return(minDelta);
    }



/* A gettimeofday routine to give access to the wall
   clock timer on most UNIX-like systems.
   Returns the current time in seconds as a double.  */

double mysecond(void)
{
    struct timeval tp = {0};

    /* The timezone argument is obsolete; pass NULL instead. */
    (void) gettimeofday(&tp, NULL);
    return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 );
}

/*
 * Returns nonzero when the relative difference |expected - observed| /
 * observed exceeds epsilon.  The comparison is written as !(x <= eps) so
 * that a NaN checksum is reported as a mismatch rather than slipping
 * through.  The absolute value is computed by hand so that no math
 * library function is needed.
 */
static int
sum_mismatch(double expected, double observed, double epsilon)
{
    double diff = expected - observed;

    if (diff < 0.0)
        diff = -diff;
    return !(diff / observed <= epsilon);
}

/*
 * Validate the contents of a[], b[] and c[].
 *
 * Replays the initialization, the timing-calibration doubling of a[] and
 * the NTIMES kernel iterations on three scalars, multiplies each by N and
 * compares it (relative tolerance 1.e-8) with the sum of the corresponding
 * array.  Prints the first array that fails, or "Solution Validates".
 */
void checkSTREAMresults (void)
{
    double aj,bj,cj,scalar;
    double asum,bsum,csum;
    double epsilon;
    int    j,k;

    /* reproduce initialization */
    aj = 1.0;
    bj = 2.0;
    cj = 0.0;
    /* a[] is modified during timing check */
    aj = 2.0E0 * aj;
    /* now execute timing loop */
    scalar = 3.0;
    for (k=0; k<NTIMES; k++)
        {
            cj = aj;
            bj = scalar*cj;
            cj = aj+bj;
            aj = bj+scalar*cj;
        }
    aj = aj * (double) (N);
    bj = bj * (double) (N);
    cj = cj * (double) (N);

    asum = 0.0;
    bsum = 0.0;
    csum = 0.0;
    for (j=0; j<N; j++) {
        asum += a[j];
        bsum += b[j];
        csum += c[j];
    }
#ifdef VERBOSE
    printf ("Results Comparison: \n");
    printf (" Expected : %f %f %f \n",aj,bj,cj);
    printf (" Observed : %f %f %f \n",asum,bsum,csum);
#endif

    epsilon = 1.e-8;

    if (sum_mismatch(aj, asum, epsilon)) {
        printf ("Failed Validation on array a[]\n");
        printf (" Expected : %f \n",aj);
        printf (" Observed : %f \n",asum);
    }
    else if (sum_mismatch(bj, bsum, epsilon)) {
        printf ("Failed Validation on array b[]\n");
        printf (" Expected : %f \n",bj);
        printf (" Observed : %f \n",bsum);
    }
    else if (sum_mismatch(cj, csum, epsilon)) {
        printf ("Failed Validation on array c[]\n");
        printf (" Expected : %f \n",cj);
        printf (" Observed : %f \n",csum);
    }
    else {
        printf ("Solution Validates\n");
    }
}

/* Copy kernel: c[j] = a[j] for j in [0, N). */
void tuned_STREAM_Copy(void)
{
    int j;
#pragma omp parallel for
    for (j=0; j<N; j++)
        c[j] = a[j];
}

/* Scale kernel: b[j] = scalar * c[j] for j in [0, N). */
void tuned_STREAM_Scale(double scalar)
{
    int j;
#pragma omp parallel for
    for (j=0; j<N; j++)
        b[j] = scalar*c[j];
}

/* Add kernel: c[j] = a[j] + b[j] for j in [0, N). */
void tuned_STREAM_Add(void)
{
    int j;
#pragma omp parallel for
    for (j=0; j<N; j++)
        c[j] = a[j]+b[j];
}

/* Triad kernel: a[j] = b[j] + scalar * c[j] for j in [0, N). */
void tuned_STREAM_Triad(double scalar)
{
    int j;
#pragma omp parallel for
    for (j=0; j<N; j++)
        a[j] = b[j]+scalar*c[j];
}