/*-----------------------------------------------------------------------*/
/* Program: Stream                                                       */
/* Revision: $Id: stream.c,v 5.9 2009/04/11 16:35:00 mccalpin Exp mccalpin $ */
/* Original code developed by John D. McCalpin                           */
/* Programmers: John D. McCalpin                                         */
/*              Joe R. Zagar                                             */
/*                                                                       */
/* This program measures memory transfer rates in MB/s for simple        */
/* computational kernels coded in C.                                     */
/*-----------------------------------------------------------------------*/
/* Copyright 1991-2005: John D. McCalpin                                 */
/*-----------------------------------------------------------------------*/
/* License:                                                              */
/*  1. You are free to use this program and/or to redistribute           */
/*     this program.                                                     */
/*  2. You are free to modify this program for your own use,             */
/*     including commercial use, subject to the publication              */
/*     restrictions in item 3.                                           */
/*  3. You are free to publish results obtained from running this        */
/*     program, or from works that you derive from this program,         */
/*     with the following limitations:                                   */
/*     3a. In order to be referred to as "STREAM benchmark results",     */
/*         published results must be in conformance to the STREAM        */
/*         Run Rules, (briefly reviewed below) published at              */
/*         http://www.cs.virginia.edu/stream/ref.html                    */
/*         and incorporated herein by reference.                         */
/*         As the copyright holder, John McCalpin retains the            */
/*         right to determine conformity with the Run Rules.             */
/*     3b. Results based on modified source code or on runs not in       */
/*         accordance with the STREAM Run Rules must be clearly          */
/*         labelled whenever they are published.  Examples of            */
/*         proper labelling include:                                     */
/*         "tuned STREAM benchmark results"                              */
/*         "based on a variant of the STREAM benchmark code"             */
/*         Other comparable, clear and reasonable labelling is           */
/*         acceptable.                                                   */
/*     3c. Submission of results to the STREAM benchmark web site        */
/*         is encouraged, but not required.                              */
/*  4. Use of this program or creation of derived works based on this    */
/*     program constitutes acceptance of these licensing restrictions.   */
/*  5. Absolutely no warranty is expressed or implied.                   */
/*-----------------------------------------------------------------------*/
# include <stdio.h>
# include <math.h>
# include <limits.h>
# include <float.h>
# include <sys/time.h>
# include <stdlib.h>

/* INSTRUCTIONS:
 *
 *	1) Stream requires a good bit of memory to run.  Adjust the
 *	   value of 'N' (below) to give a 'timing calibration' of
 *	   at least 20 clock-ticks.  This will provide rate estimates
 *	   that should be good to about 5% precision.
 */

#if !defined(N)
#   define N	2000000
#endif
#if !defined(NTIMES)
#   define NTIMES	50
#endif
#if !defined(OFFSET)
#   define OFFSET	0
#endif

/*
 *	3) Compile the code with full optimization.  Many compilers
 *	   generate unreasonably bad code before the optimizer tightens
 *	   things up.  If the results are unreasonably good, on the
 *	   other hand, the optimizer might be too smart for me!
 *
 *         Try compiling with:
 *               cc -O stream_omp.c -o stream_omp
 *
 *         This is known to work on Cray, SGI, IBM, and Sun machines.
 *
 *
 *	4) Mail the results to mccalpin@cs.virginia.edu
 *	   Be sure to include:
 *		a) computer hardware model number and software revision
 *		b) the compiler flags
 *		c) all of the output from the test case.
 * Thanks!
 *
 */

# define HLINE "-------------------------------------------------------------\n"

# if !defined(MIN)
# define MIN(x,y) ((x)<(y) ? (x) : (y))
# endif
# if !defined(MAX)
# define MAX(x,y) ((x)>(y) ? (x) : (y))
# endif

/* File used to carry the single-thread rate from one run to the next. */
# define BASELINE_FILE "flops"

/* Number of distinct clock readings sampled by checktick(). */
# define M 20

/* Work arrays for the triad kernel a[j] = b[j] + scalar*c[j]. */
static double	a[N+OFFSET],
		b[N+OFFSET],
		c[N+OFFSET];

/* Bytes counted per triad pass: three arrays of N doubles each. */
static const double triad_bytes =
    3.0 * (double) sizeof(double) * (double) (N);

extern double mysecond(void);
extern int omp_get_num_threads(void);
int checktick(void);

/*
 * Parse the thread count from the OMP_NUM_THREADS environment variable.
 *
 * On success stores the leading integer of the variable in *size and
 * returns 0.  Returns -1 when the variable is unset or does not start
 * with an integer; *size is left untouched in that case.
 */
static int
read_thread_count(int *size)
{
    const char *env = getenv("OMP_NUM_THREADS");
    int         value;

    /* getenv() yields NULL for an unset variable; never hand that to sscanf. */
    if (env == NULL || sscanf(env, "%d", &value) != 1)
        return -1;

    *size = value;
    return 0;
}

/*
 * Write the measured rate to BASELINE_FILE (one "%g" value and a newline).
 * Returns 0 on success, -1 if the file could not be opened, written or
 * closed cleanly.
 */
static int
write_baseline(double rate)
{
    FILE *fd = fopen(BASELINE_FILE, "w");
    int   ok;

    if (fd == NULL)
        return -1;

    ok = (fprintf(fd, "%g\n", rate) > 0);
    /* fclose() flushes buffered output, so its result matters on a write stream. */
    if (fclose(fd) != 0)
        ok = 0;

    return ok ? 0 : -1;
}

/*
 * Read the rate previously stored in BASELINE_FILE into *prate.
 * Returns 0 on success, -1 if the file is missing, unparsable, or holds a
 * non-positive value (which could not be used as a divisor).
 */
static int
read_baseline(double *prate)
{
    FILE *fd = fopen(BASELINE_FILE, "r");
    int   nread;

    if (fd == NULL)
        return -1;

    nread = fscanf(fd, "%lg", prate);
    (void) fclose(fd);

    return (nread == 1 && *prate > 0.0) ? 0 : -1;
}

/*
 * Time the triad kernel NTIMES times and report the best rate in MB/s.
 *
 * The thread count is taken from OMP_NUM_THREADS.  When it is 1 the rate is
 * printed and saved to BASELINE_FILE; for any other value the rate is
 * printed together with its ratio to the saved baseline.
 *
 * Returns 0 on success, EXIT_FAILURE if OMP_NUM_THREADS cannot be parsed or
 * the baseline file cannot be written or read.
 */
int
main(void)
{
    int     quantum;
    int     j, k, first;
    int     size;
    double  scalar, t, best, rate, prate;
    double  times[NTIMES];

    if (read_thread_count(&size) != 0) {
        fprintf(stderr,
                "stream: OMP_NUM_THREADS is not set or is not an integer\n");
        return EXIT_FAILURE;
    }

    /* --- SETUP --- determine precision and check timing --- */

    /* Initialise the arrays (in parallel when built with OpenMP). */
#pragma omp parallel for
    for (j = 0; j < N; j++) {
        a[j] = 1.0;
        b[j] = 2.0;
        c[j] = 0.0;
    }

    /*
     * Clock granularity estimate and one calibration pass over a[].
     * This variant does not report either value; both steps are kept so
     * the work done before the timed loop is unchanged.
     */
    quantum = checktick();
    if (quantum < 1)
        quantum = 1;

    t = mysecond();
#pragma omp parallel for
    for (j = 0; j < N; j++)
        a[j] = 2.0E0 * a[j];
    t = 1.0E6 * (mysecond() - t);

    (void) quantum;
    (void) t;

    /* --- MAIN LOOP --- repeat the triad NTIMES times --- */

    scalar = 3.0;
    for (k = 0; k < NTIMES; k++) {
        times[k] = mysecond();
#pragma omp parallel for
        for (j = 0; j < N; j++)
            a[j] = b[j] + scalar * c[j];
        times[k] = mysecond() - times[k];
    }

    /* --- SUMMARY --- */

    /*
     * Best (minimum) time, skipping the first iteration.  If only one
     * iteration was run there is nothing to skip, so use it.
     */
    first = (NTIMES > 1) ? 1 : 0;
    best = DBL_MAX;
    for (k = first; k < NTIMES; k++)
        best = MIN(best, times[k]);

    rate = 1.0E-06 * triad_bytes / best;

    if (size == 1) {
        printf("%d %11.4f Rate (MB/s)\n", size, rate);
        if (write_baseline(rate) != 0) {
            perror("stream: cannot write " BASELINE_FILE);
            return EXIT_FAILURE;
        }
    } else {
        if (read_baseline(&prate) != 0) {
            /* No usable baseline: still report the rate, but flag the failure. */
            printf("%d %11.4f Rate (MB/s)\n", size, rate);
            fprintf(stderr,
                    "stream: no usable baseline in '" BASELINE_FILE
                    "'; run with OMP_NUM_THREADS=1 first\n");
            return EXIT_FAILURE;
        }
        printf("%d %11.4f Rate (MB/s) %g \n", size, rate, rate/prate);
    }

    return 0;
}

/*
 * Estimate the clock granularity in microseconds.
 *
 * Collects M time values from mysecond(), each at least one microsecond
 * after the previous one, and returns the smallest difference between
 * consecutive values, truncated to an integer (never negative).
 */
int
checktick(void)
{
    int		i, minDelta, Delta;
    double	t1, t2, timesfound[M];

    /* Collect a sequence of M unique time values from the system. */

    for (i = 0; i < M; i++) {
        t1 = mysecond();
        while (((t2 = mysecond()) - t1) < 1.0E-6)
            ;
        timesfound[i] = t1 = t2;
    }

    /*
     * Determine the minimum difference between these M values.
     * This result will be our estimate (in microseconds) for the
     * clock granularity.
     */

    minDelta = 1000000;
    for (i = 1; i < M; i++) {
        Delta = (int)(1.0E6 * (timesfound[i] - timesfound[i-1]));
        minDelta = MIN(minDelta, MAX(Delta, 0));
    }

    return minDelta;
}

/*
 * A gettimeofday routine to give access to the wall
 * clock timer on most UNIX-like systems.
 *
 * Returns the current time in seconds, including the microsecond part as a
 * fraction.
 */
double
mysecond(void)
{
    struct timeval tp;

    /* The timezone argument is passed as NULL; only the timeval is used. */
    (void) gettimeofday(&tp, NULL);
    return ((double) tp.tv_sec + (double) tp.tv_usec * 1.e-6);
}