/*-----------------------------------------------------------------------*/
/* Program: Stream                                                       */
/* Revision: $Id: stream.c,v 5.9 2009/04/11 16:35:00 mccalpin Exp mccalpin $ */
/* Original code developed by John D. McCalpin                           */
/* Programmers: John D. McCalpin                                         */
/*              Joe R. Zagar                                             */
/*                                                                       */
/* This program measures memory transfer rates in MB/s for simple        */
/* computational kernels coded in C.                                     */
/*-----------------------------------------------------------------------*/
/* Copyright 1991-2005: John D. McCalpin                                 */
/*-----------------------------------------------------------------------*/
/* License:                                                              */
/*  1. You are free to use this program and/or to redistribute           */
/*     this program.                                                     */
/*  2. You are free to modify this program for your own use,             */
/*     including commercial use, subject to the publication              */
/*     restrictions in item 3.                                           */
/*  3. You are free to publish results obtained from running this        */
/*     program, or from works that you derive from this program,         */
/*     with the following limitations:                                   */
/*     3a. In order to be referred to as "STREAM benchmark results",     */
/*         published results must be in conformance to the STREAM        */
/*         Run Rules, (briefly reviewed below) published at              */
/*         http://www.cs.virginia.edu/stream/ref.html                    */
/*         and incorporated herein by reference.                         */
/*         As the copyright holder, John McCalpin retains the            */
/*         right to determine conformity with the Run Rules.             */
/*     3b. Results based on modified source code or on runs not in       */
/*         accordance with the STREAM Run Rules must be clearly          */
/*         labelled whenever they are published.  Examples of            */
/*         proper labelling include:                                     */
/*         "tuned STREAM benchmark results"                              */
/*         "based on a variant of the STREAM benchmark code"             */
/*         Other comparable, clear and reasonable labelling is           */
/*         acceptable.                                                   */
/*     3c. Submission of results to the STREAM benchmark web site        */
/*         is encouraged, but not required.                              */
/*  4. Use of this program or creation of derived works based on this    */
/*     program constitutes acceptance of these licensing restrictions.   */
/*  5. Absolutely no warranty is expressed or implied.                   */
/*-----------------------------------------------------------------------*/
# include <stdio.h>
# include <math.h>
# include <limits.h>
# include <float.h>
# include <sys/time.h>
#if defined(_OPENMP)
# include <omp.h>
#endif

/* INSTRUCTIONS:
 *
 *	1) Stream requires a good bit of memory to run.  Adjust the
 *	   value of 'N' (below) to give a 'timing calibration' of
 *	   at least 20 clock-ticks.  This will provide rate estimates
 *	   that should be good to about 5% precision.
 *	   (Build with -DVERBOSE to have the calibration printed.)
 */

#if !defined(N)
#   define N	2000000
#endif
#if !defined(NTIMES)
#   define NTIMES	50
#endif
#if !defined(OFFSET)
#   define OFFSET	0
#endif

/*
 *	2) Each kernel is run NTIMES times.  The first iteration is
 *	   discarded and the *best* (minimum) time of the remaining
 *	   iterations is reported, so NTIMES must be at least 2.
 */
#if NTIMES <= 1
#   undef  NTIMES
#   define NTIMES	10
#endif

/*
 *	3) Compile the code with full optimization.  Many compilers
 *	   generate unreasonably bad code before the optimizer tightens
 *	   things up.  If the results are unreasonably good, on the
 *	   other hand, the optimizer might be too smart for me!
 *
 *	   Try compiling with:
 *	       cc -O stream_omp.c -o stream_omp
 *	   (add your compiler's OpenMP flag to run the kernels threaded;
 *	   without it the program runs on a single thread).
 *
 *	   This is known to work on Cray, SGI, IBM, and Sun machines.
 *
 *
 *	4) Mail the results to mccalpin@cs.virginia.edu
 *	   Be sure to include:
 *		a) computer hardware model number and software revision
 *		b) the compiler flags
 *		c) all of the output from the test case.
 * Thanks!
 *
 */

# define HLINE "-------------------------------------------------------------\n"

# if !defined(MIN)
# define MIN(x,y) ((x)<(y)?(x):(y))
# endif
# if !defined(MAX)
# define MAX(x,y) ((x)>(y)?(x):(y))
# endif

/* The three work arrays streamed through by the kernels. */
static double	a[N+OFFSET],
		b[N+OFFSET],
		c[N+OFFSET];

static const char	*label[4] = {"Copy: ", "Scale: ", "Add: ", "Triad: "};

/* Bytes moved per kernel invocation: Copy and Scale touch two arrays,
 * Add and Triad touch three. */
static double	bytes[4] = {
    2 * sizeof(double) * N,
    2 * sizeof(double) * N,
    3 * sizeof(double) * N,
    3 * sizeof(double) * N
    };

double mysecond(void);
int checktick(void);
void checkSTREAMresults(void);
void tuned_STREAM_Copy(void);
void tuned_STREAM_Scale(double scalar);
void tuned_STREAM_Add(void);
void tuned_STREAM_Triad(double scalar);

/* Print the build configuration banner (VERBOSE builds only). */
static void
print_configuration(void)
{
#if defined(VERBOSE)
    int BytesPerWord = (int) sizeof(double);

    printf(HLINE);
    printf("STREAM version $Revision: 5.9 $\n");
    printf(HLINE);
    printf("This system uses %d bytes per DOUBLE PRECISION word.\n",
	BytesPerWord);

    printf(HLINE);
#if defined(NO_LONG_LONG)
    printf("Array size = %d, Offset = %d\n" , N, OFFSET);
#else
    printf("Array size = %llu, Offset = %d\n", (unsigned long long) N, OFFSET);
#endif

    printf("Total memory required = %.1f MB.\n",
	(3.0 * BytesPerWord) * ( (double) N / 1048576.0));
    printf("Each test is run %d times, but only\n", NTIMES);
    printf("the *best* time for each is used.\n");

    printf(HLINE);
#endif
}

/* Return the number of threads an OpenMP parallel region runs with,
 * or 1 when the program was compiled without OpenMP support. */
static int
count_openmp_threads(void)
{
    int nthreads = 1;

#if defined(_OPENMP)
#pragma omp parallel
    {
#pragma omp master
	{
	    nthreads = omp_get_num_threads();
	}
    }
#endif
    return nthreads;
}

/* Fill the work arrays with their starting values: a = 1, b = 2, c = 0. */
static void
initialize_arrays(void)
{
    int j;

#pragma omp parallel for
    for (j=0; j<N; j++) {
	a[j] = 1.0;
	b[j] = 2.0;
	c[j] = 0.0;
	}
}

/* Double every element of a[].  checkSTREAMresults() reproduces this
 * step, so the loop must always run; VERBOSE builds additionally time it
 * and report the timer granularity found by checktick(). */
static void
calibrate_timing(void)
{
    int j;
#if defined(VERBOSE)
    int quantum;
    double t;

    printf(HLINE);

    if ( (quantum = checktick()) >= 1)
	printf("Your clock granularity/precision appears to be "
	    "%d microseconds.\n", quantum);
    else {
	printf("Your clock granularity appears to be "
	    "less than one microsecond.\n");
	quantum = 1;
    }

    t = mysecond();
#endif

#pragma omp parallel for
    for (j = 0; j < N; j++)
	a[j] = 2.0E0 * a[j];

#if defined(VERBOSE)
    t = 1.0E6 * (mysecond() - t);

    printf("Each test below will take on the order"
	" of %d microseconds.\n", (int) t );
    printf(" (= %d clock ticks)\n", (int) (t/quantum) );
    printf("Increase the size of the arrays if this shows that\n");
    printf("you are not getting at least 20 clock ticks per test.\n");

    printf(HLINE);
#endif
}

/* Print the best rate (MB/s) of each kernel, taken over every recorded
 * iteration except the first. */
static void
report_rates(double times[4][NTIMES])
{
    int j, k;

    printf("Function Rate (MB/s) \n");
    for (j=0; j<4; j++) {
	double best = DBL_MAX;

	for (k=1; k<NTIMES; k++) /* note -- skip first iteration */
	    best = MIN(best, times[j][k]);

	printf("%s%11.4f \n", label[j], 1.0E-06 * bytes[j]/best);
    }
}

/* Run the four STREAM kernels NTIMES times, report the best rate of each
 * and validate the final array contents.  Always returns 0. */
int
main(void)
{
    int			j, k;
    double		scalar;
    double		times[4][NTIMES];

    /* --- SETUP --- report configuration, initialize, check timing --- */

    print_configuration();

    printf(HLINE);
    printf ("Number of OpenMP Threads requested = %i\n",
	count_openmp_threads());

    initialize_arrays();
    calibrate_timing();

    /*	--- MAIN LOOP --- repeat test cases NTIMES times --- */

    scalar = 3.0;
    for (k=0; k<NTIMES; k++)
	{
	times[0][k] = mysecond();
#if defined(TUNED)
	tuned_STREAM_Copy();
#else
#pragma omp parallel for
	for (j=0; j<N; j++)
	    c[j] = a[j];
#endif
	times[0][k] = mysecond() - times[0][k];

	times[1][k] = mysecond();
#if defined(TUNED)
	tuned_STREAM_Scale(scalar);
#else
#pragma omp parallel for
	for (j=0; j<N; j++)
	    b[j] = scalar*c[j];
#endif
	times[1][k] = mysecond() - times[1][k];

	times[2][k] = mysecond();
#if defined(TUNED)
	tuned_STREAM_Add();
#else
#pragma omp parallel for
	for (j=0; j<N; j++)
	    c[j] = a[j]+b[j];
#endif
	times[2][k] = mysecond() - times[2][k];

	times[3][k] = mysecond();
#if defined(TUNED)
	tuned_STREAM_Triad(scalar);
#else
#pragma omp parallel for
	for (j=0; j<N; j++)
	    a[j] = b[j]+scalar*c[j];
#endif
	times[3][k] = mysecond() - times[3][k];
	}

    /*	--- SUMMARY --- */

    report_rates(times);
#if defined(VERBOSE)
    printf(HLINE);
#endif

    /* --- Check Results --- */
    checkSTREAMresults();
#if defined(VERBOSE)
    printf(HLINE);
#endif

    return 0;
}

# define	M	20

/* Estimate the timer granularity in microseconds: the smallest gap
 * between M successive distinct readings of mysecond(). */
int
checktick(void)
{
    int		i, minDelta, Delta;
    double	t1, t2, timesfound[M];

    /* Collect a sequence of M unique time values from the system. */

    for (i = 0; i < M; i++) {
	t1 = mysecond();
	/* Spin until the clock has advanced by at least one microsecond. */
	while( ((t2=mysecond()) - t1) < 1.0E-6 )
	    ;
	timesfound[i] = t1 = t2;
	}

    /*
     * Determine the minimum difference between these M values.
     * This result will be our estimate (in microseconds) for the
     * clock granularity.
     */

    minDelta = 1000000;
    for (i = 1; i < M; i++) {
	Delta = (int)( 1.0E6 * (timesfound[i]-timesfound[i-1]));
	minDelta = MIN(minDelta, MAX(Delta,0));
	}

    return(minDelta);
}

/* A gettimeofday routine to give access to the wall
   clock timer on most UNIX-like systems.
   Returns the current time in seconds as a double.  */

double
mysecond(void)
{
    struct timeval tp;

    /* The timezone argument is obsolescent; POSIX leaves a non-null
     * value unspecified, so pass NULL. */
    (void) gettimeofday(&tp, NULL);
    return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 );
}

/* Compare one observed array sum with its expected value and print a
 * failure report when the relative error exceeds epsilon.  The test is
 * written as !(err <= epsilon) so that a NaN sum counts as a failure.
 * Returns 1 on failure, 0 otherwise. */
static int
validate_sum(const char *name, double expected, double observed,
	     double epsilon)
{
    double diff  = expected - observed;
    double scale = expected;

    if (diff < 0)
	diff = -diff;
    if (scale < 0)
	scale = -scale;

    if (!(diff/scale <= epsilon)) {
	printf ("Failed Validation on array %s[]\n", name);
	printf (" Expected : %f \n", expected);
	printf (" Observed : %f \n", observed);
	return 1;
    }
    return 0;
}

/* Recompute on scalars what every element of a[], b[] and c[] must hold
 * after the timing run, scale by N, and compare with the actual sums of
 * the arrays.  Each array is checked and reported independently. */
void
checkSTREAMresults(void)
{
    double aj,bj,cj,scalar;
    double asum,bsum,csum;
    double epsilon;
    int	j,k;
    int	failures;

    /* reproduce initialization */
    aj = 1.0;
    bj = 2.0;
    cj = 0.0;
    /* a[] is modified during timing check */
    aj = 2.0E0 * aj;
    /* now execute timing loop */
    scalar = 3.0;
    for (k=0; k<NTIMES; k++)
	{
	cj = aj;
	bj = scalar*cj;
	cj = aj+bj;
	aj = bj+scalar*cj;
	}
    aj = aj * (double) (N);
    bj = bj * (double) (N);
    cj = cj * (double) (N);

    asum = 0.0;
    bsum = 0.0;
    csum = 0.0;
    for (j=0; j<N; j++) {
	asum += a[j];
	bsum += b[j];
	csum += c[j];
	}
#if defined(VERBOSE)
    printf ("Results Comparison: \n");
    printf (" Expected : %f %f %f \n",aj,bj,cj);
    printf (" Observed : %f %f %f \n",asum,bsum,csum);
#endif

    epsilon = 1.e-8;

    failures  = validate_sum("a", aj, asum, epsilon);
    failures += validate_sum("b", bj, bsum, epsilon);
    failures += validate_sum("c", cj, csum, epsilon);

#if defined(VERBOSE)
    if (failures == 0)
	printf ("Solution Validates\n");
#else
    (void) failures;
#endif
}

/* Copy kernel: c = a. */
void tuned_STREAM_Copy(void)
{
    int j;
#pragma omp parallel for
    for (j=0; j<N; j++)
	c[j] = a[j];
}

/* Scale kernel: b = scalar * c. */
void tuned_STREAM_Scale(double scalar)
{
    int j;
#pragma omp parallel for
    for (j=0; j<N; j++)
	b[j] = scalar*c[j];
}

/* Add kernel: c = a + b. */
void tuned_STREAM_Add(void)
{
    int j;
#pragma omp parallel for
    for (j=0; j<N; j++)
	c[j] = a[j]+b[j];
}

/* Triad kernel: a = b + scalar * c. */
void tuned_STREAM_Triad(double scalar)
{
    int j;
#pragma omp parallel for
    for (j=0; j<N; j++)
	a[j] = b[j]+scalar*c[j];
}