1 static const char help[] = "STREAM benchmark specialized for SSE2\n\\n"; 2 3 /* Note: this file has been modified significantly from its original version */ 4 #include <emmintrin.h> 5 #include <petsctime.h> 6 #include <petscsys.h> 7 #if defined(HAVE_NUMA) 8 #include <numa.h> 9 #endif 10 #include <float.h> 11 12 #if !defined(SSE2) 13 # define SSE2 1 14 #endif 15 #if !defined(__SSE2__) 16 # error SSE2 instruction set is not enabled, try adding -march=native to CFLAGS or disable by adding -DSSE2=0 17 #endif 18 #if !defined(PREFETCH_NTA) /* Use software prefetch and set non-temporal policy so that lines evicted from L1D will not subsequently reside in L2 or L3. */ 19 # define PREFETCH_NTA 1 20 #endif 21 #if !defined(STATIC_ALLOC) /* Statically allocate the vectors. Most platforms do not find physical pages when memory is allocated, therefore the faulting strategy still affects performance. */ 22 # define STATIC_ALLOC 0 23 #endif 24 #if !defined(FAULT_TOGETHER) /* Faults all three vectors together which usually interleaves DRAM pages in physical memory. */ 25 # define FAULT_TOGETHER 0 26 #endif 27 #if !defined(USE_MEMCPY) /* Literally call memcpy(3) for the COPY benchmark. Some compilers detect the unoptimized loop as memcpy and call this anyway. */ 28 # define USE_MEMCPY 0 29 #endif 30 31 /* 32 * Program: Stream 33 * Programmer: Joe R. Zagar 34 * Revision: 4.0-BETA, October 24, 1995 35 * Original code developed by John D. McCalpin 36 * 37 * This program measures memory transfer rates in MB/s for simple 38 * computational kernels coded in C. These numbers reveal the quality 39 * of code generation for simple uncacheable kernels as well as showing 40 * the cost of floating-point operations relative to memory accesses. 41 * 42 * INSTRUCTIONS: 43 * 44 * 1) Stream requires a good bit of memory to run. Adjust the 45 * value of 'N' (below) to give a 'timing calibration' of 46 * at least 20 clock-ticks. This will provide rate estimates 47 * that should be good to about 5% precision. 48 */ 49 50 # define N 4000000 51 # define NTIMES 100 52 # define OFFSET 0 53 54 # define HLINE "-------------------------------------------------------------\n" 55 56 # if !defined(MIN) 57 # define MIN(x,y) ((x)<(y) ? (x) : (y)) 58 # endif 59 # if !defined(MAX) 60 # define MAX(x,y) ((x)>(y) ? (x) : (y)) 61 # endif 62 63 #if STATIC_ALLOC 64 double a[N+OFFSET],b[N+OFFSET],c[N+OFFSET]; 65 #endif 66 67 static int checktick(void); 68 static double Second(void); 69 70 int main(int argc,char *argv[]) 71 { 72 const char *label[4] = {"Copy", "Scale","Add", "Triad"}; 73 const double bytes[4] = {2 * sizeof(double) * N, 74 2 * sizeof(double) * N, 75 3 * sizeof(double) * N, 76 3 * sizeof(double) * N}; 77 double rmstime[4] = {0},maxtime[4] = {0},mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; 78 int quantum; 79 int BytesPerWord,j,k,size; 80 PetscInt node = -1; 81 double scalar, t, times[4][NTIMES]; 82 #if !STATIC_ALLOC 83 double *PETSC_RESTRICT a,*PETSC_RESTRICT b,*PETSC_RESTRICT c; 84 #endif 85 86 PetscInitialize(&argc,&argv,0,help); 87 MPI_Comm_size(PETSC_COMM_WORLD,&size); 88 PetscOptionsGetInt(NULL,"-node",&node,NULL); 89 /* --- SETUP --- determine precision and check timing --- */ 90 91 PetscPrintf(PETSC_COMM_WORLD,HLINE); 92 BytesPerWord = sizeof(double); 93 PetscPrintf(PETSC_COMM_WORLD,"This system uses %d bytes per DOUBLE PRECISION word.\n", 94 BytesPerWord); 95 96 PetscPrintf(PETSC_COMM_WORLD,HLINE); 97 PetscPrintf(PETSC_COMM_WORLD,"Array size = %d, Offset = %d\n", N, OFFSET); 98 PetscPrintf(PETSC_COMM_WORLD,"Total memory required = %.1f MB per process.\n", 99 (3 * N * BytesPerWord) / 1048576.0); 100 PetscPrintf(PETSC_COMM_WORLD,"Each test is run %d times, but only\n", NTIMES); 101 PetscPrintf(PETSC_COMM_WORLD,"the *best* time for each is used.\n"); 102 103 /* Get initial value for system clock. */ 104 105 #if !STATIC_ALLOC 106 if (node == -1) { 107 posix_memalign((void**)&a,64,N*sizeof(double)); 108 posix_memalign((void**)&b,64,N*sizeof(double)); 109 posix_memalign((void**)&c,64,N*sizeof(double)); 110 } else if (node == -2) { 111 a = malloc(N*sizeof(double)); 112 b = malloc(N*sizeof(double)); 113 c = malloc(N*sizeof(double)); 114 #if defined(HAVE_NUMA) 115 } else { 116 a = numa_alloc_onnode(N*sizeof(double),node); 117 b = numa_alloc_onnode(N*sizeof(double),node); 118 c = numa_alloc_onnode(N*sizeof(double),node); 119 #endif 120 } 121 #endif 122 #if FAULT_TOGETHER 123 for (j=0; j<N; j++) { 124 a[j] = 1.0; 125 b[j] = 2.0; 126 c[j] = 0.0; 127 } 128 #else 129 for (j=0; j<N; j++) a[j] = 1.0; 130 for (j=0; j<N; j++) b[j] = 2.0; 131 for (j=0; j<N; j++) c[j] = 0.0; 132 #endif 133 134 PetscPrintf(PETSC_COMM_WORLD,HLINE); 135 136 if ((quantum = checktick()) >= 1) PetscPrintf(PETSC_COMM_WORLD,"Your clock granularity/precision appears to be %d microseconds.\n", quantum); 137 else PetscPrintf(PETSC_COMM_WORLD,"Your clock granularity appears to be less than one microsecond.\n"); 138 139 t = Second(); 140 for (j = 0; j < N; j++) a[j] = 2.0E0 * a[j]; 141 t = 1.0E6 * (Second() - t); 142 143 PetscPrintf(PETSC_COMM_WORLD,"Each test below will take on the order" 144 " of %d microseconds.\n", (int) t); 145 PetscPrintf(PETSC_COMM_WORLD," (= %d clock ticks)\n", (int) (t/quantum)); 146 PetscPrintf(PETSC_COMM_WORLD,"Increase the size of the arrays if this shows that\n"); 147 PetscPrintf(PETSC_COMM_WORLD,"you are not getting at least 20 clock ticks per test.\n"); 148 149 PetscPrintf(PETSC_COMM_WORLD,HLINE); 150 151 PetscPrintf(PETSC_COMM_WORLD,"WARNING -- The above is only a rough guideline.\n"); 152 PetscPrintf(PETSC_COMM_WORLD,"For best results, please be sure you know the\n"); 153 PetscPrintf(PETSC_COMM_WORLD,"precision of your system timer.\n"); 154 PetscPrintf(PETSC_COMM_WORLD,HLINE); 155 156 /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ 157 158 scalar = 3.0; 159 for (k=0; k<NTIMES; k++) { 160 MPI_Barrier(PETSC_COMM_WORLD); 161 /* ### COPY: c <- a ### */ 162 times[0][k] = Second(); 163 MPI_Barrier(PETSC_COMM_WORLD); 164 #if USE_MEMCPY 165 memcpy(c,a,N*sizeof(double)); 166 #elif SSE2 167 for (j=0; j<N; j+=8) { 168 _mm_stream_pd(c+j+0,_mm_load_pd(a+j+0)); 169 _mm_stream_pd(c+j+2,_mm_load_pd(a+j+2)); 170 _mm_stream_pd(c+j+4,_mm_load_pd(a+j+4)); 171 _mm_stream_pd(c+j+6,_mm_load_pd(a+j+6)); 172 # if PREFETCH_NTA 173 _mm_prefetch(a+j+64,_MM_HINT_NTA); 174 # endif 175 } 176 #else 177 for (j=0; j<N; j++) c[j] = a[j]; 178 #endif 179 MPI_Barrier(PETSC_COMM_WORLD); 180 times[0][k] = Second() - times[0][k]; 181 182 /* ### SCALE: b <- scalar * c ### */ 183 times[1][k] = Second(); 184 MPI_Barrier(PETSC_COMM_WORLD); 185 #if SSE2 186 { 187 __m128d scalar2 = _mm_set1_pd(scalar); 188 for (j=0; j<N; j+=8) { 189 _mm_stream_pd(b+j+0,_mm_mul_pd(scalar2,_mm_load_pd(c+j+0))); 190 _mm_stream_pd(b+j+2,_mm_mul_pd(scalar2,_mm_load_pd(c+j+2))); 191 _mm_stream_pd(b+j+4,_mm_mul_pd(scalar2,_mm_load_pd(c+j+4))); 192 _mm_stream_pd(b+j+6,_mm_mul_pd(scalar2,_mm_load_pd(c+j+6))); 193 # if PREFETCH_NTA 194 _mm_prefetch(c+j+64,_MM_HINT_NTA); 195 # endif 196 } 197 } 198 #else 199 for (j=0; j<N; j++) b[j] = scalar*c[j]; 200 #endif 201 MPI_Barrier(PETSC_COMM_WORLD); 202 times[1][k] = Second() - times[1][k]; 203 204 /* ### ADD: c <- a + b ### */ 205 times[2][k] = Second(); 206 MPI_Barrier(PETSC_COMM_WORLD); 207 #if SSE2 208 { 209 for (j=0; j<N; j+=8) { 210 _mm_stream_pd(c+j+0,_mm_add_pd(_mm_load_pd(a+j+0),_mm_load_pd(b+j+0))); 211 _mm_stream_pd(c+j+2,_mm_add_pd(_mm_load_pd(a+j+2),_mm_load_pd(b+j+2))); 212 _mm_stream_pd(c+j+4,_mm_add_pd(_mm_load_pd(a+j+4),_mm_load_pd(b+j+4))); 213 _mm_stream_pd(c+j+6,_mm_add_pd(_mm_load_pd(a+j+6),_mm_load_pd(b+j+6))); 214 # if PREFETCH_NTA 215 _mm_prefetch(a+j+64,_MM_HINT_NTA); 216 _mm_prefetch(b+j+64,_MM_HINT_NTA); 217 # endif 218 } 219 } 220 #else 221 for (j=0; j<N; j++) c[j] = a[j]+b[j]; 222 #endif 223 MPI_Barrier(PETSC_COMM_WORLD); 224 times[2][k] = Second() - times[2][k]; 225 226 /* ### TRIAD: a <- b + scalar * c ### */ 227 times[3][k] = Second(); 228 MPI_Barrier(PETSC_COMM_WORLD); 229 #if SSE2 230 { 231 __m128d scalar2 = _mm_set1_pd(scalar); 232 for (j=0; j<N; j+=8) { 233 _mm_stream_pd(a+j+0,_mm_add_pd(_mm_load_pd(b+j+0),_mm_mul_pd(scalar2,_mm_load_pd(c+j+0)))); 234 _mm_stream_pd(a+j+2,_mm_add_pd(_mm_load_pd(b+j+2),_mm_mul_pd(scalar2,_mm_load_pd(c+j+2)))); 235 _mm_stream_pd(a+j+4,_mm_add_pd(_mm_load_pd(b+j+4),_mm_mul_pd(scalar2,_mm_load_pd(c+j+4)))); 236 _mm_stream_pd(a+j+6,_mm_add_pd(_mm_load_pd(b+j+6),_mm_mul_pd(scalar2,_mm_load_pd(c+j+6)))); 237 # if PREFETCH_NTA 238 _mm_prefetch(b+j+64,_MM_HINT_NTA); 239 _mm_prefetch(c+j+64,_MM_HINT_NTA); 240 # endif 241 } 242 } 243 #else 244 for (j=0; j<N; j++) a[j] = b[j]+scalar*c[j]; 245 #endif 246 MPI_Barrier(PETSC_COMM_WORLD); 247 times[3][k] = Second() - times[3][k]; 248 } 249 250 /* --- SUMMARY --- */ 251 252 for (k=0; k<NTIMES; k++) 253 for (j=0; j<4; j++) { 254 rmstime[j] = rmstime[j] + (times[j][k] * times[j][k]); 255 mintime[j] = MIN(mintime[j], times[j][k]); 256 maxtime[j] = MAX(maxtime[j], times[j][k]); 257 } 258 259 260 PetscPrintf(PETSC_COMM_WORLD,"%8s: %11s %11s %11s %11s %11s\n","Function","Rate (MB/s)","Total (MB/s)","RMS time","Min time","Max time"); 261 for (j=0; j<4; j++) { 262 rmstime[j] = sqrt(rmstime[j]/(double)NTIMES); 263 PetscPrintf(PETSC_COMM_WORLD,"%8s: %11.4f %11.4f %11.4f %11.4f %11.4f\n", label[j], 1.0e-06*bytes[j]/mintime[j], size*1.0e-06*bytes[j]/mintime[j], rmstime[j], mintime[j], maxtime[j]); 264 } 265 PetscFinalize(); 266 return 0; 267 } 268 269 static double Second() 270 { 271 double t; 272 PetscTime(&t); 273 return t; 274 } 275 276 #define M 20 277 static int checktick() 278 { 279 int i, minDelta, Delta; 280 double t1, t2, timesfound[M]; 281 282 /* Collect a sequence of M unique time values from the system. */ 283 284 for (i = 0; i < M; i++) { 285 t1 = Second(); 286 while ((t2 = Second()) - t1 < 1.0E-6) { 287 } 288 timesfound[i] = t1 = t2; 289 } 290 291 /* 292 * Determine the minimum difference between these M values. 293 * This result will be our estimate (in microseconds) for the 294 * clock granularity. 295 */ 296 297 minDelta = 1000000; 298 for (i = 1; i < M; i++) { 299 Delta = (int)(1.0E6 * (timesfound[i]-timesfound[i-1])); 300 minDelta = MIN(minDelta, MAX(Delta,0)); 301 } 302 303 return(minDelta); 304 } 305