1 static const char help[] = "STREAM benchmark specialized for SSE2\n\\n"; 2 3 /* Note: this file has been modified significantly from its original version */ 4 #include <emmintrin.h> 5 #include <petsctime.h> 6 #include <petscsys.h> 7 #if defined(HAVE_NUMA) 8 #include <numa.h> 9 #endif 10 #include <limits.h> 11 #include <float.h> 12 13 #if !defined(SSE2) 14 # define SSE2 1 15 #endif 16 #if !defined(__SSE2__) 17 # error SSE2 instruction set is not enabled, try adding -march=native to CFLAGS or disable by adding -DSSE2=0 18 #endif 19 #if !defined(PREFETCH_NTA) /* Use software prefetch and set non-temporal policy so that lines evicted from L1D will not subsequently reside in L2 or L3. */ 20 # define PREFETCH_NTA 1 21 #endif 22 #if !defined(STATIC_ALLOC) /* Statically allocate the vectors. Most platforms do not find physical pages when memory is allocated, therefore the faulting strategy still affects performance. */ 23 # define STATIC_ALLOC 0 24 #endif 25 #if !defined(FAULT_TOGETHER) /* Faults all three vectors together which usually interleaves DRAM pages in physical memory. */ 26 # define FAULT_TOGETHER 0 27 #endif 28 #if !defined(USE_MEMCPY) /* Literally call memcpy(3) for the COPY benchmark. Some compilers detect the unoptimized loop as memcpy and call this anyway. */ 29 # define USE_MEMCPY 0 30 #endif 31 32 /* 33 * Program: Stream 34 * Programmer: Joe R. Zagar 35 * Revision: 4.0-BETA, October 24, 1995 36 * Original code developed by John D. McCalpin 37 * 38 * This program measures memory transfer rates in MB/s for simple 39 * computational kernels coded in C. These numbers reveal the quality 40 * of code generation for simple uncacheable kernels as well as showing 41 * the cost of floating-point operations relative to memory accesses. 42 * 43 * INSTRUCTIONS: 44 * 45 * 1) Stream requires a good bit of memory to run. Adjust the 46 * value of 'N' (below) to give a 'timing calibration' of 47 * at least 20 clock-ticks. This will provide rate estimates 48 * that should be good to about 5% precision. 49 */ 50 51 # define N 4000000 52 # define NTIMES 100 53 # define OFFSET 0 54 55 # define HLINE "-------------------------------------------------------------\n" 56 57 # if !defined(MIN) 58 # define MIN(x,y) ((x)<(y)?(x):(y)) 59 # endif 60 # if !defined(MAX) 61 # define MAX(x,y) ((x)>(y)?(x):(y)) 62 # endif 63 64 #if STATIC_ALLOC 65 double a[N+OFFSET],b[N+OFFSET],c[N+OFFSET]; 66 #endif 67 68 static int checktick(void); 69 static double Second(void); 70 71 int main(int argc,char *argv[]) 72 { 73 const char *label[4] = {"Copy", "Scale","Add", "Triad"}; 74 const double bytes[4] = {2 * sizeof(double) * N, 75 2 * sizeof(double) * N, 76 3 * sizeof(double) * N, 77 3 * sizeof(double) * N}; 78 double rmstime[4] = {0},maxtime[4] = {0},mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; 79 int quantum; 80 int BytesPerWord,j,k,size; 81 PetscInt node = -1; 82 double scalar, t, times[4][NTIMES]; 83 #if !STATIC_ALLOC 84 double *PETSC_RESTRICT a,*PETSC_RESTRICT b,*PETSC_RESTRICT c; 85 #endif 86 87 PetscInitialize(&argc,&argv,0,help); 88 MPI_Comm_size(PETSC_COMM_WORLD,&size); 89 PetscOptionsGetInt(PETSC_NULL,"-node",&node,PETSC_NULL); 90 /* --- SETUP --- determine precision and check timing --- */ 91 92 PetscPrintf(PETSC_COMM_WORLD,HLINE); 93 BytesPerWord = sizeof(double); 94 PetscPrintf(PETSC_COMM_WORLD,"This system uses %d bytes per DOUBLE PRECISION word.\n", 95 BytesPerWord); 96 97 PetscPrintf(PETSC_COMM_WORLD,HLINE); 98 PetscPrintf(PETSC_COMM_WORLD,"Array size = %d, Offset = %d\n" , N, OFFSET); 99 PetscPrintf(PETSC_COMM_WORLD,"Total memory required = %.1f MB per process.\n", 100 (3 * N * BytesPerWord) / 1048576.0); 101 PetscPrintf(PETSC_COMM_WORLD,"Each test is run %d times, but only\n", NTIMES); 102 PetscPrintf(PETSC_COMM_WORLD,"the *best* time for each is used.\n"); 103 104 /* Get initial value for system clock. */ 105 106 #if !STATIC_ALLOC 107 if (node == -1) { 108 posix_memalign((void**)&a,64,N*sizeof(double)); 109 posix_memalign((void**)&b,64,N*sizeof(double)); 110 posix_memalign((void**)&c,64,N*sizeof(double)); 111 } else if (node == -2) { 112 a = malloc(N*sizeof(double)); 113 b = malloc(N*sizeof(double)); 114 c = malloc(N*sizeof(double)); 115 #if defined(HAVE_NUMA) 116 } else { 117 a = numa_alloc_onnode(N*sizeof(double),node); 118 b = numa_alloc_onnode(N*sizeof(double),node); 119 c = numa_alloc_onnode(N*sizeof(double),node); 120 #endif 121 } 122 #endif 123 #if FAULT_TOGETHER 124 for (j=0; j<N; j++) { 125 a[j] = 1.0; 126 b[j] = 2.0; 127 c[j] = 0.0; 128 } 129 #else 130 for (j=0; j<N; j++) a[j] = 1.0; 131 for (j=0; j<N; j++) b[j] = 2.0; 132 for (j=0; j<N; j++) c[j] = 0.0; 133 #endif 134 135 PetscPrintf(PETSC_COMM_WORLD,HLINE); 136 137 if ((quantum = checktick()) >= 1) { 138 PetscPrintf(PETSC_COMM_WORLD,"Your clock granularity/precision appears to be " 139 "%d microseconds.\n", quantum); 140 } else { 141 PetscPrintf(PETSC_COMM_WORLD,"Your clock granularity appears to be " 142 "less than one microsecond.\n"); 143 } 144 145 t = Second(); 146 for (j = 0; j < N; j++) 147 a[j] = 2.0E0 * a[j]; 148 t = 1.0E6 * (Second() - t); 149 150 PetscPrintf(PETSC_COMM_WORLD,"Each test below will take on the order" 151 " of %d microseconds.\n", (int) t); 152 PetscPrintf(PETSC_COMM_WORLD," (= %d clock ticks)\n", (int) (t/quantum)); 153 PetscPrintf(PETSC_COMM_WORLD,"Increase the size of the arrays if this shows that\n"); 154 PetscPrintf(PETSC_COMM_WORLD,"you are not getting at least 20 clock ticks per test.\n"); 155 156 PetscPrintf(PETSC_COMM_WORLD,HLINE); 157 158 PetscPrintf(PETSC_COMM_WORLD,"WARNING -- The above is only a rough guideline.\n"); 159 PetscPrintf(PETSC_COMM_WORLD,"For best results, please be sure you know the\n"); 160 PetscPrintf(PETSC_COMM_WORLD,"precision of your system timer.\n"); 161 PetscPrintf(PETSC_COMM_WORLD,HLINE); 162 163 /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ 164 165 scalar = 3.0; 166 for (k=0; k<NTIMES; k++) { 167 MPI_Barrier(PETSC_COMM_WORLD); 168 /* ### COPY: c <- a ### */ 169 times[0][k] = Second(); 170 MPI_Barrier(PETSC_COMM_WORLD); 171 #if USE_MEMCPY 172 memcpy(c,a,N*sizeof(double)); 173 #elif SSE2 174 for (j=0; j<N; j+=8) { 175 _mm_stream_pd(c+j+0,_mm_load_pd(a+j+0)); 176 _mm_stream_pd(c+j+2,_mm_load_pd(a+j+2)); 177 _mm_stream_pd(c+j+4,_mm_load_pd(a+j+4)); 178 _mm_stream_pd(c+j+6,_mm_load_pd(a+j+6)); 179 # if PREFETCH_NTA 180 _mm_prefetch(a+j+64,_MM_HINT_NTA); 181 # endif 182 } 183 #else 184 for (j=0; j<N; j++) c[j] = a[j]; 185 #endif 186 MPI_Barrier(PETSC_COMM_WORLD); 187 times[0][k] = Second() - times[0][k]; 188 189 /* ### SCALE: b <- scalar * c ### */ 190 times[1][k] = Second(); 191 MPI_Barrier(PETSC_COMM_WORLD); 192 #if SSE2 193 { 194 __m128d scalar2 = _mm_set1_pd(scalar); 195 for (j=0; j<N; j+=8) { 196 _mm_stream_pd(b+j+0,_mm_mul_pd(scalar2,_mm_load_pd(c+j+0))); 197 _mm_stream_pd(b+j+2,_mm_mul_pd(scalar2,_mm_load_pd(c+j+2))); 198 _mm_stream_pd(b+j+4,_mm_mul_pd(scalar2,_mm_load_pd(c+j+4))); 199 _mm_stream_pd(b+j+6,_mm_mul_pd(scalar2,_mm_load_pd(c+j+6))); 200 # if PREFETCH_NTA 201 _mm_prefetch(c+j+64,_MM_HINT_NTA); 202 # endif 203 } 204 } 205 #else 206 for (j=0; j<N; j++) b[j] = scalar*c[j]; 207 #endif 208 MPI_Barrier(PETSC_COMM_WORLD); 209 times[1][k] = Second() - times[1][k]; 210 211 /* ### ADD: c <- a + b ### */ 212 times[2][k] = Second(); 213 MPI_Barrier(PETSC_COMM_WORLD); 214 #if SSE2 215 { 216 for (j=0; j<N; j+=8) { 217 _mm_stream_pd(c+j+0,_mm_add_pd(_mm_load_pd(a+j+0),_mm_load_pd(b+j+0))); 218 _mm_stream_pd(c+j+2,_mm_add_pd(_mm_load_pd(a+j+2),_mm_load_pd(b+j+2))); 219 _mm_stream_pd(c+j+4,_mm_add_pd(_mm_load_pd(a+j+4),_mm_load_pd(b+j+4))); 220 _mm_stream_pd(c+j+6,_mm_add_pd(_mm_load_pd(a+j+6),_mm_load_pd(b+j+6))); 221 # if PREFETCH_NTA 222 _mm_prefetch(a+j+64,_MM_HINT_NTA); 223 _mm_prefetch(b+j+64,_MM_HINT_NTA); 224 # endif 225 } 226 } 227 #else 228 for (j=0; j<N; j++) c[j] = a[j]+b[j]; 229 #endif 230 MPI_Barrier(PETSC_COMM_WORLD); 231 times[2][k] = Second() - times[2][k]; 232 233 /* ### TRIAD: a <- b + scalar * c ### */ 234 times[3][k] = Second(); 235 MPI_Barrier(PETSC_COMM_WORLD); 236 #if SSE2 237 { 238 __m128d scalar2 = _mm_set1_pd(scalar); 239 for (j=0; j<N; j+=8) { 240 _mm_stream_pd(a+j+0,_mm_add_pd(_mm_load_pd(b+j+0),_mm_mul_pd(scalar2,_mm_load_pd(c+j+0)))); 241 _mm_stream_pd(a+j+2,_mm_add_pd(_mm_load_pd(b+j+2),_mm_mul_pd(scalar2,_mm_load_pd(c+j+2)))); 242 _mm_stream_pd(a+j+4,_mm_add_pd(_mm_load_pd(b+j+4),_mm_mul_pd(scalar2,_mm_load_pd(c+j+4)))); 243 _mm_stream_pd(a+j+6,_mm_add_pd(_mm_load_pd(b+j+6),_mm_mul_pd(scalar2,_mm_load_pd(c+j+6)))); 244 # if PREFETCH_NTA 245 _mm_prefetch(b+j+64,_MM_HINT_NTA); 246 _mm_prefetch(c+j+64,_MM_HINT_NTA); 247 # endif 248 } 249 } 250 #else 251 for (j=0; j<N; j++) a[j] = b[j]+scalar*c[j]; 252 #endif 253 MPI_Barrier(PETSC_COMM_WORLD); 254 times[3][k] = Second() - times[3][k]; 255 } 256 257 /* --- SUMMARY --- */ 258 259 for (k=0; k<NTIMES; k++) { 260 for (j=0; j<4; j++) { 261 rmstime[j] = rmstime[j] + (times[j][k] * times[j][k]); 262 mintime[j] = MIN(mintime[j], times[j][k]); 263 maxtime[j] = MAX(maxtime[j], times[j][k]); 264 } 265 } 266 267 268 PetscPrintf(PETSC_COMM_WORLD,"%8s: %11s %11s %11s %11s %11s\n","Function","Rate (MB/s)","Total (MB/s)","RMS time","Min time","Max time"); 269 for (j=0; j<4; j++) { 270 rmstime[j] = sqrt(rmstime[j]/(double)NTIMES); 271 PetscPrintf(PETSC_COMM_WORLD,"%8s: %11.4f %11.4f %11.4f %11.4f %11.4f\n", label[j], 1.0e-06*bytes[j]/mintime[j], size*1.0e-06*bytes[j]/mintime[j], rmstime[j], mintime[j], maxtime[j]); 272 } 273 PetscFinalize(); 274 return 0; 275 } 276 277 static double Second() 278 { 279 double t; 280 PetscTime(t); 281 return t; 282 } 283 284 #define M 20 285 static int checktick() 286 { 287 int i, minDelta, Delta; 288 double t1, t2, timesfound[M]; 289 290 /* Collect a sequence of M unique time values from the system. */ 291 292 for (i = 0; i < M; i++) { 293 t1 = Second(); 294 while ((t2 = Second()) - t1 < 1.0E-6) {} 295 timesfound[i] = t1 = t2; 296 } 297 298 /* 299 * Determine the minimum difference between these M values. 300 * This result will be our estimate (in microseconds) for the 301 * clock granularity. 302 */ 303 304 minDelta = 1000000; 305 for (i = 1; i < M; i++) { 306 Delta = (int)(1.0E6 * (timesfound[i]-timesfound[i-1])); 307 minDelta = MIN(minDelta, MAX(Delta,0)); 308 } 309 310 return(minDelta); 311 } 312