/*-----------------------------------------------------------------------*/
/* Program: Stream                                                       */
/* Revision: $Id: stream.c,v 5.9 2009/04/11 16:35:00 mccalpin Exp mccalpin $ */
/* Original code developed by John D. McCalpin                           */
/* Programmers: John D. McCalpin                                         */
/*              Joe R. Zagar                                             */
/*                                                                       */
/* This program measures memory transfer rates in MB/s for simple        */
/* computational kernels coded in C.                                     */
/*-----------------------------------------------------------------------*/
/* Copyright 1991-2005: John D. McCalpin                                 */
/*-----------------------------------------------------------------------*/
/* License:                                                              */
/*  1. You are free to use this program and/or to redistribute           */
/*     this program.                                                     */
/*  2. You are free to modify this program for your own use,             */
/*     including commercial use, subject to the publication              */
/*     restrictions in item 3.                                           */
/*  3. You are free to publish results obtained from running this        */
/*     program, or from works that you derive from this program,         */
/*     with the following limitations:                                   */
/*     3a. In order to be referred to as "STREAM benchmark results",     */
/*         published results must be in conformance to the STREAM        */
/*         Run Rules, (briefly reviewed below) published at              */
/*         http://www.cs.virginia.edu/stream/ref.html                    */
/*         and incorporated herein by reference.                         */
/*         As the copyright holder, John McCalpin retains the            */
/*         right to determine conformity with the Run Rules.             */
/*     3b. Results based on modified source code or on runs not in       */
/*         accordance with the STREAM Run Rules must be clearly          */
/*         labelled whenever they are published. Examples of             */
/*         proper labelling include:                                     */
/*         "tuned STREAM benchmark results"                              */
/*         "based on a variant of the STREAM benchmark code"             */
/*         Other comparable, clear and reasonable labelling is           */
/*         acceptable.                                                   */
/*     3c. Submission of results to the STREAM benchmark web site        */
/*         is encouraged, but not required.                              */
/*  4. Use of this program or creation of derived works based on this    */
/*     program constitutes acceptance of these licensing restrictions.   */
/*  5. Absolutely no warranty is expressed or implied.                   */
/*-----------------------------------------------------------------------*/
# include <stdio.h>
# include <math.h>
# include <float.h>     /* DBL_MAX: initial sentinel for mintime[] */
# include <limits.h>
# include <sys/time.h>  /* gettimeofday(), struct timeval */

#ifdef _OPENMP
# include <omp.h>       /* omp_get_num_threads() */
#endif

/* INSTRUCTIONS:
 *
 *      1) Stream requires a good bit of memory to run.  Adjust the
 *         value of 'N' (below) to give a 'timing calibration' of
 *         at least 20 clock-ticks.  This will provide rate estimates
 *         that should be good to about 5% precision.  The timing
 *         calibration is printed only when compiled with -DVERBOSE.
 */

#ifndef N
#   define N 2000000
#endif
#ifndef NTIMES
#   define NTIMES 10
#endif
#ifndef OFFSET
#   define OFFSET 0
#endif

/*
 * The first pass of the main loop is excluded from the statistics, so at
 * least two passes are required for the reported rates to mean anything.
 */
#if NTIMES < 2
#   error "NTIMES must be at least 2: the first iteration is not counted"
#endif

/*
 *      2) Compile the code with full optimization.  Many compilers
 *         generate unreasonably bad code before the optimizer tightens
 *         things up.  If the results are unreasonably good, on the
 *         other hand, the optimizer might be too smart for me!
 *
 *         Try compiling with:
 *               cc -O stream_omp.c -o stream_omp
 *         (add your compiler's OpenMP option, e.g. -fopenmp, to run the
 *         kernels multi-threaded; without it one thread is reported)
 *
 *         This is known to work on Cray, SGI, IBM, and Sun machines.
 *
 *
 *      3) Mail the results to mccalpin@cs.virginia.edu
 *         Be sure to include:
 *              a) computer hardware model number and software revision
 *              b) the compiler flags
 *              c) all of the output from the test case.
 *         Thanks!
 *
 */

# define HLINE "-------------------------------------------------------------\n"

# ifndef MIN
# define MIN(x,y) ((x)<(y)?(x):(y))
# endif
# ifndef MAX
# define MAX(x,y) ((x)>(y)?(x):(y))
# endif

/* The three work arrays the kernels stream through. */
static double a[N+OFFSET],
              b[N+OFFSET],
              c[N+OFFSET];

/* Per-kernel timing statistics (seconds), indexed Copy/Scale/Add/Triad. */
static double avgtime[4] = {0}, maxtime[4] = {0},
              mintime[4] = {DBL_MAX, DBL_MAX, DBL_MAX, DBL_MAX};

static const char *label[4] = {"Copy: ", "Scale: ", "Add: ", "Triad: "};

/*
 * Bytes moved by one pass of each kernel: Copy and Scale touch two arrays,
 * Add and Triad touch three.  Computed in double so the product cannot wrap
 * in a narrow size_t.
 */
static double bytes[4] = {
    2.0 * sizeof(double) * N,
    2.0 * sizeof(double) * N,
    3.0 * sizeof(double) * N,
    3.0 * sizeof(double) * N
};

double mysecond(void);
int    checktick(void);
void   checkSTREAMresults(void);
void   tuned_STREAM_Copy(void);
void   tuned_STREAM_Scale(double scalar);
void   tuned_STREAM_Add(void);
void   tuned_STREAM_Triad(double scalar);

/*
 * Runs the four kernels (Copy, Scale, Add, Triad) NTIMES times each, prints
 * the best observed rate for each in MB/s, then validates the array
 * contents.  Extra diagnostics are printed when compiled with -DVERBOSE.
 * Always returns 0.
 */
int
main(void)
{
    double       times[4][NTIMES];
    const double scalar = 3.0;
    int          num_threads = 1;   /* reported as-is when built without OpenMP */
    int          j, k;
#ifdef VERBOSE
    int          quantum;
    double       t;
#endif

    /* --- SETUP --- determine precision and check timing --- */

#ifdef VERBOSE
    printf(HLINE);
    printf("STREAM version $Revision: 5.9 $\n");
    printf(HLINE);
    printf("This system uses %d bytes per DOUBLE PRECISION word.\n",
           (int) sizeof(double));

    printf(HLINE);
    printf("Array size = %lu, Offset = %d\n", (unsigned long) N, (int) OFFSET);
    printf("Total memory required = %.1f MB.\n",
           (3.0 * sizeof(double)) * ((double) N / 1048576.0));
    printf("Each test is run %d times, but only\n", (int) NTIMES);
    printf("the *best* time for each is used.\n");
#endif

#ifdef _OPENMP
#pragma omp parallel
    {
#pragma omp master
        num_threads = omp_get_num_threads();
    }
#endif
    printf(HLINE);
    printf("Number of OpenMP Threads requested = %i\n", num_threads);

    /* Initialise the arrays. */
#pragma omp parallel for
    for (j = 0; j < N; j++) {
        a[j] = 1.0;
        b[j] = 2.0;
        c[j] = 0.0;
    }

#ifdef VERBOSE
    printf(HLINE);
    quantum = checktick();
    if (quantum >= 1) {
        printf("Your clock granularity/precision appears to be "
               "%d microseconds.\n", quantum);
    } else {
        printf("Your clock granularity appears to be "
               "less than one microsecond.\n");
        quantum = 1;
    }

    t = mysecond();
#endif
    /*
     * Timing-calibration pass.  It runs unconditionally because
     * checkSTREAMresults() accounts for this doubling of a[].
     */
#pragma omp parallel for
    for (j = 0; j < N; j++)
        a[j] = 2.0E0 * a[j];
#ifdef VERBOSE
    t = 1.0E6 * (mysecond() - t);

    printf("Each test below will take on the order"
           " of %d microseconds.\n", (int) t);
    printf(" (= %d clock ticks)\n", (int) (t / quantum));
    printf("Increase the size of the arrays if this shows that\n");
    printf("you are not getting at least 20 clock ticks per test.\n");

    printf(HLINE);
#endif

    /* --- MAIN LOOP --- repeat test cases NTIMES times --- */

    for (k = 0; k < NTIMES; k++) {
        times[0][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Copy();
#else
#pragma omp parallel for
        for (j = 0; j < N; j++)
            c[j] = a[j];
#endif
        times[0][k] = mysecond() - times[0][k];

        times[1][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Scale(scalar);
#else
#pragma omp parallel for
        for (j = 0; j < N; j++)
            b[j] = scalar*c[j];
#endif
        times[1][k] = mysecond() - times[1][k];

        times[2][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Add();
#else
#pragma omp parallel for
        for (j = 0; j < N; j++)
            c[j] = a[j]+b[j];
#endif
        times[2][k] = mysecond() - times[2][k];

        times[3][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Triad(scalar);
#else
#pragma omp parallel for
        for (j = 0; j < N; j++)
            a[j] = b[j]+scalar*c[j];
#endif
        times[3][k] = mysecond() - times[3][k];
    }

    /* --- SUMMARY --- */

    for (k = 1; k < NTIMES; k++) {  /* note -- skip first iteration */
        for (j = 0; j < 4; j++) {
            avgtime[j] = avgtime[j] + times[j][k];
            mintime[j] = MIN(mintime[j], times[j][k]);
            maxtime[j] = MAX(maxtime[j], times[j][k]);
        }
    }

    printf("Function Rate (MB/s) \n");
    for (j = 0; j < 4; j++) {
        avgtime[j] = avgtime[j]/(double)(NTIMES-1);

        printf("%s%11.4f \n", label[j], 1.0E-06 * bytes[j]/mintime[j]);
    }

    /* --- Check Results --- */
    checkSTREAMresults();

    return 0;
}

# define M 20

/*
 * Estimates the timer granularity.  Collects M time stamps from mysecond(),
 * each at least one microsecond after the previous reading, and returns the
 * smallest gap between consecutive stamps in whole microseconds (never
 * negative; at most 1000000).
 */
int
checktick(void)
{
    double timesfound[M];
    int    i, min_delta;

    /* Collect a sequence of M unique time values from the system. */
    for (i = 0; i < M; i++) {
        double start = mysecond();
        double now;

        while (((now = mysecond()) - start) < 1.0E-6)
            ;
        timesfound[i] = now;
    }

    /*
     * Determine the minimum difference between these M values.
     * This result will be our estimate (in microseconds) for the
     * clock granularity.
     */
    min_delta = 1000000;
    for (i = 1; i < M; i++) {
        int delta = (int)(1.0E6 * (timesfound[i] - timesfound[i-1]));

        min_delta = MIN(min_delta, MAX(delta, 0));
    }

    return min_delta;
}

/*
 * A gettimeofday routine to give access to the wall
 * clock timer on most UNIX-like systems.
 *
 * Returns the current time of day in seconds, with the microsecond field
 * folded in as the fractional part.
 */
double
mysecond(void)
{
    struct timeval tp = {0, 0};

    /*
     * The time-zone argument is NULL: POSIX leaves the behaviour with a
     * non-null pointer unspecified.  POSIX also reserves no error return
     * value for this call, so the result is deliberately not inspected.
     */
    (void) gettimeofday(&tp, NULL);
    return (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6;
}

/*
 * Returns 1 when `observed` is within relative tolerance `epsilon` of
 * `expected`, 0 otherwise.  The comparison is phrased so that a NaN
 * anywhere in the computation yields 0 (a failure) rather than slipping
 * through.
 */
static int
within_tolerance(double expected, double observed, double epsilon)
{
    double diff = expected - observed;

    if (diff < 0.0)
        diff = -diff;
    return (diff / expected <= epsilon) ? 1 : 0;
}

/*
 * Validates the contents of a[], b[] and c[] after the benchmark.
 *
 * The scalar sequence of operations that main() applies to every element is
 * replayed on single values; those are scaled by N and compared against the
 * sums of the arrays.  The first array found outside the tolerance is
 * reported on stdout; nothing is printed on success (unless VERBOSE).
 */
void
checkSTREAMresults(void)
{
    const double epsilon = 1.e-8;
    const double scalar  = 3.0;
    double aj, bj, cj;
    double asum, bsum, csum;
    int    j, k;

    /* reproduce initialization */
    aj = 1.0;
    bj = 2.0;
    cj = 0.0;
    /* a[] is modified during timing check */
    aj = 2.0E0 * aj;
    /* now execute timing loop */
    for (k = 0; k < NTIMES; k++) {
        cj = aj;
        bj = scalar*cj;
        cj = aj+bj;
        aj = bj+scalar*cj;
    }
    aj = aj * (double) (N);
    bj = bj * (double) (N);
    cj = cj * (double) (N);

    asum = 0.0;
    bsum = 0.0;
    csum = 0.0;
    for (j = 0; j < N; j++) {
        asum += a[j];
        bsum += b[j];
        csum += c[j];
    }
#ifdef VERBOSE
    printf ("Results Comparison: \n");
    printf (" Expected : %f %f %f \n",aj,bj,cj);
    printf (" Observed : %f %f %f \n",asum,bsum,csum);
#endif

    /*
     * The error is taken relative to the expected value, which is strictly
     * positive by construction, so a zero or negative observed sum cannot
     * hide a failure.
     */
    if (!within_tolerance(aj, asum, epsilon)) {
        printf ("Failed Validation on array a[]\n");
        printf (" Expected : %f \n",aj);
        printf (" Observed : %f \n",asum);
    }
    else if (!within_tolerance(bj, bsum, epsilon)) {
        printf ("Failed Validation on array b[]\n");
        printf (" Expected : %f \n",bj);
        printf (" Observed : %f \n",bsum);
    }
    else if (!within_tolerance(cj, csum, epsilon)) {
        printf ("Failed Validation on array c[]\n");
        printf (" Expected : %f \n",cj);
        printf (" Observed : %f \n",csum);
    }
}

/* Copy kernel: c[j] = a[j] for the first N elements. */
void
tuned_STREAM_Copy(void)
{
    int j;
#pragma omp parallel for
    for (j = 0; j < N; j++)
        c[j] = a[j];
}

/* Scale kernel: b[j] = scalar * c[j] for the first N elements. */
void
tuned_STREAM_Scale(double scalar)
{
    int j;
#pragma omp parallel for
    for (j = 0; j < N; j++)
        b[j] = scalar*c[j];
}

/* Add kernel: c[j] = a[j] + b[j] for the first N elements. */
void
tuned_STREAM_Add(void)
{
    int j;
#pragma omp parallel for
    for (j = 0; j < N; j++)
        c[j] = a[j]+b[j];
}

/* Triad kernel: a[j] = b[j] + scalar * c[j] for the first N elements. */
void
tuned_STREAM_Triad(double scalar)
{
    int j;
#pragma omp parallel for
    for (j = 0; j < N; j++)
        a[j] = b[j]+scalar*c[j];
}