1 static const char help[] = "STREAM benchmark specialized for SSE2\n\\n"; 2 3 /* Note: this file has been modified significantly from its original version */ 4 #include <emmintrin.h> 5 #include <petsctime.h> 6 #include <petscsys.h> 7 #include <numa.h> 8 #include <limits.h> 9 #include <float.h> 10 11 #ifndef SSE2 12 # define SSE2 1 13 #endif 14 #ifndef __SSE2__ 15 # error SSE2 instruction set is not enabled, try adding -march=native to CFLAGS or disable by adding -DSSE2=0 16 #endif 17 #ifndef PREFETCH_NTA /* Use software prefetch and set non-temporal policy so that lines evicted from L1D will not subsequently reside in L2 or L3. */ 18 # define PREFETCH_NTA 1 19 #endif 20 #ifndef STATIC_ALLOC /* Statically allocate the vectors. Most platforms do not find physical pages when memory is allocated, therefore the faulting strategy still affects performance. */ 21 # define STATIC_ALLOC 0 22 #endif 23 #ifndef FAULT_TOGETHER /* Faults all three vectors together which usually interleaves DRAM pages in physical memory. */ 24 # define FAULT_TOGETHER 0 25 #endif 26 #ifndef USE_MEMCPY /* Literally call memcpy(3) for the COPY benchmark. Some compilers detect the unoptimized loop as memcpy and call this anyway. */ 27 # define USE_MEMCPY 0 28 #endif 29 30 /* 31 * Program: Stream 32 * Programmer: Joe R. Zagar 33 * Revision: 4.0-BETA, October 24, 1995 34 * Original code developed by John D. McCalpin 35 * 36 * This program measures memory transfer rates in MB/s for simple 37 * computational kernels coded in C. These numbers reveal the quality 38 * of code generation for simple uncacheable kernels as well as showing 39 * the cost of floating-point operations relative to memory accesses. 40 * 41 * INSTRUCTIONS: 42 * 43 * 1) Stream requires a good bit of memory to run. Adjust the 44 * value of 'N' (below) to give a 'timing calibration' of 45 * at least 20 clock-ticks. This will provide rate estimates 46 * that should be good to about 5% precision. 47 */ 48 49 # define N 4000000 50 # define NTIMES 100 51 # define OFFSET 0 52 53 # define HLINE "-------------------------------------------------------------\n" 54 55 # ifndef MIN 56 # define MIN(x,y) ((x)<(y)?(x):(y)) 57 # endif 58 # ifndef MAX 59 # define MAX(x,y) ((x)>(y)?(x):(y)) 60 # endif 61 62 #if STATIC_ALLOC 63 double a[N+OFFSET],b[N+OFFSET],c[N+OFFSET]; 64 #endif 65 66 static int checktick(void); 67 static double Second(void); 68 69 int main(int argc,char *argv[]) 70 { 71 const char *label[4] = {"Copy", "Scale","Add", "Triad"}; 72 const double bytes[4] = {2 * sizeof(double) * N, 73 2 * sizeof(double) * N, 74 3 * sizeof(double) * N, 75 3 * sizeof(double) * N}; 76 double rmstime[4] = {0},maxtime[4] = {0},mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; 77 int quantum; 78 int BytesPerWord,j,k,size; 79 PetscInt node = -1; 80 double scalar, t, times[4][NTIMES]; 81 #if !STATIC_ALLOC 82 double *PETSC_RESTRICT a,*PETSC_RESTRICT b,*PETSC_RESTRICT c; 83 #endif 84 85 PetscInitialize(&argc,&argv,0,help); 86 MPI_Comm_size(PETSC_COMM_WORLD,&size); 87 PetscOptionsGetInt(PETSC_NULL,"-node",&node,PETSC_NULL); 88 /* --- SETUP --- determine precision and check timing --- */ 89 90 PetscPrintf(PETSC_COMM_WORLD,HLINE); 91 BytesPerWord = sizeof(double); 92 PetscPrintf(PETSC_COMM_WORLD,"This system uses %d bytes per DOUBLE PRECISION word.\n", 93 BytesPerWord); 94 95 PetscPrintf(PETSC_COMM_WORLD,HLINE); 96 PetscPrintf(PETSC_COMM_WORLD,"Array size = %d, Offset = %d\n" , N, OFFSET); 97 PetscPrintf(PETSC_COMM_WORLD,"Total memory required = %.1f MB per process.\n", 98 (3 * N * BytesPerWord) / 1048576.0); 99 PetscPrintf(PETSC_COMM_WORLD,"Each test is run %d times, but only\n", NTIMES); 100 PetscPrintf(PETSC_COMM_WORLD,"the *best* time for each is used.\n"); 101 102 /* Get initial value for system clock. */ 103 104 #if !STATIC_ALLOC 105 if (node == -1) { 106 posix_memalign((void**)&a,64,N*sizeof(double)); 107 posix_memalign((void**)&b,64,N*sizeof(double)); 108 posix_memalign((void**)&c,64,N*sizeof(double)); 109 } else if (node == -2) { 110 a = malloc(N*sizeof(double)); 111 b = malloc(N*sizeof(double)); 112 c = malloc(N*sizeof(double)); 113 } else { 114 a = numa_alloc_onnode(N*sizeof(double),node); 115 b = numa_alloc_onnode(N*sizeof(double),node); 116 c = numa_alloc_onnode(N*sizeof(double),node); 117 } 118 #endif 119 #if FAULT_TOGETHER 120 for (j=0; j<N; j++) { 121 a[j] = 1.0; 122 b[j] = 2.0; 123 c[j] = 0.0; 124 } 125 #else 126 for (j=0; j<N; j++) a[j] = 1.0; 127 for (j=0; j<N; j++) b[j] = 2.0; 128 for (j=0; j<N; j++) c[j] = 0.0; 129 #endif 130 131 PetscPrintf(PETSC_COMM_WORLD,HLINE); 132 133 if ( (quantum = checktick()) >= 1) 134 PetscPrintf(PETSC_COMM_WORLD,"Your clock granularity/precision appears to be " 135 "%d microseconds.\n", quantum); 136 else 137 PetscPrintf(PETSC_COMM_WORLD,"Your clock granularity appears to be " 138 "less than one microsecond.\n"); 139 140 t = Second(); 141 for (j = 0; j < N; j++) 142 a[j] = 2.0E0 * a[j]; 143 t = 1.0E6 * (Second() - t); 144 145 PetscPrintf(PETSC_COMM_WORLD,"Each test below will take on the order" 146 " of %d microseconds.\n", (int) t ); 147 PetscPrintf(PETSC_COMM_WORLD," (= %d clock ticks)\n", (int) (t/quantum) ); 148 PetscPrintf(PETSC_COMM_WORLD,"Increase the size of the arrays if this shows that\n"); 149 PetscPrintf(PETSC_COMM_WORLD,"you are not getting at least 20 clock ticks per test.\n"); 150 151 PetscPrintf(PETSC_COMM_WORLD,HLINE); 152 153 PetscPrintf(PETSC_COMM_WORLD,"WARNING -- The above is only a rough guideline.\n"); 154 PetscPrintf(PETSC_COMM_WORLD,"For best results, please be sure you know the\n"); 155 PetscPrintf(PETSC_COMM_WORLD,"precision of your system timer.\n"); 156 PetscPrintf(PETSC_COMM_WORLD,HLINE); 157 158 /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ 159 160 scalar = 3.0; 161 for (k=0; k<NTIMES; k++) { 162 MPI_Barrier(PETSC_COMM_WORLD); 163 /* ### COPY: c <- a ### */ 164 times[0][k] = Second(); 165 MPI_Barrier(PETSC_COMM_WORLD); 166 #if USE_MEMCPY 167 memcpy(c,a,N*sizeof(double)); 168 #elif SSE2 169 for (j=0; j<N; j+=8) { 170 _mm_stream_pd(c+j+0,_mm_load_pd(a+j+0)); 171 _mm_stream_pd(c+j+2,_mm_load_pd(a+j+2)); 172 _mm_stream_pd(c+j+4,_mm_load_pd(a+j+4)); 173 _mm_stream_pd(c+j+6,_mm_load_pd(a+j+6)); 174 # if PREFETCH_NTA 175 _mm_prefetch(a+j+64,_MM_HINT_NTA); 176 # endif 177 } 178 #else 179 for (j=0; j<N; j++) c[j] = a[j]; 180 #endif 181 MPI_Barrier(PETSC_COMM_WORLD); 182 times[0][k] = Second() - times[0][k]; 183 184 /* ### SCALE: b <- scalar * c ### */ 185 times[1][k] = Second(); 186 MPI_Barrier(PETSC_COMM_WORLD); 187 #if SSE2 188 { 189 __m128d scalar2 = _mm_set1_pd(scalar); 190 for (j=0; j<N; j+=8) { 191 _mm_stream_pd(b+j+0,_mm_mul_pd(scalar2,_mm_load_pd(c+j+0))); 192 _mm_stream_pd(b+j+2,_mm_mul_pd(scalar2,_mm_load_pd(c+j+2))); 193 _mm_stream_pd(b+j+4,_mm_mul_pd(scalar2,_mm_load_pd(c+j+4))); 194 _mm_stream_pd(b+j+6,_mm_mul_pd(scalar2,_mm_load_pd(c+j+6))); 195 # if PREFETCH_NTA 196 _mm_prefetch(c+j+64,_MM_HINT_NTA); 197 # endif 198 } 199 } 200 #else 201 for (j=0; j<N; j++) b[j] = scalar*c[j]; 202 #endif 203 MPI_Barrier(PETSC_COMM_WORLD); 204 times[1][k] = Second() - times[1][k]; 205 206 /* ### ADD: c <- a + b ### */ 207 times[2][k] = Second(); 208 MPI_Barrier(PETSC_COMM_WORLD); 209 #if SSE2 210 { 211 for (j=0; j<N; j+=8) { 212 _mm_stream_pd(c+j+0,_mm_add_pd(_mm_load_pd(a+j+0),_mm_load_pd(b+j+0))); 213 _mm_stream_pd(c+j+2,_mm_add_pd(_mm_load_pd(a+j+2),_mm_load_pd(b+j+2))); 214 _mm_stream_pd(c+j+4,_mm_add_pd(_mm_load_pd(a+j+4),_mm_load_pd(b+j+4))); 215 _mm_stream_pd(c+j+6,_mm_add_pd(_mm_load_pd(a+j+6),_mm_load_pd(b+j+6))); 216 # if PREFETCH_NTA 217 _mm_prefetch(a+j+64,_MM_HINT_NTA); 218 _mm_prefetch(b+j+64,_MM_HINT_NTA); 219 # endif 220 } 221 } 222 #else 223 for (j=0; j<N; j++) c[j] = a[j]+b[j]; 224 #endif 225 MPI_Barrier(PETSC_COMM_WORLD); 226 times[2][k] = Second() - times[2][k]; 227 228 /* ### TRIAD: a <- b + scalar * c ### */ 229 times[3][k] = Second(); 230 MPI_Barrier(PETSC_COMM_WORLD); 231 #if SSE2 232 { 233 __m128d scalar2 = _mm_set1_pd(scalar); 234 for (j=0; j<N; j+=8) { 235 _mm_stream_pd(a+j+0,_mm_add_pd(_mm_load_pd(b+j+0),_mm_mul_pd(scalar2,_mm_load_pd(c+j+0)))); 236 _mm_stream_pd(a+j+2,_mm_add_pd(_mm_load_pd(b+j+2),_mm_mul_pd(scalar2,_mm_load_pd(c+j+2)))); 237 _mm_stream_pd(a+j+4,_mm_add_pd(_mm_load_pd(b+j+4),_mm_mul_pd(scalar2,_mm_load_pd(c+j+4)))); 238 _mm_stream_pd(a+j+6,_mm_add_pd(_mm_load_pd(b+j+6),_mm_mul_pd(scalar2,_mm_load_pd(c+j+6)))); 239 # if PREFETCH_NTA 240 _mm_prefetch(b+j+64,_MM_HINT_NTA); 241 _mm_prefetch(c+j+64,_MM_HINT_NTA); 242 # endif 243 } 244 } 245 #else 246 for (j=0; j<N; j++) a[j] = b[j]+scalar*c[j]; 247 #endif 248 MPI_Barrier(PETSC_COMM_WORLD); 249 times[3][k] = Second() - times[3][k]; 250 } 251 252 /* --- SUMMARY --- */ 253 254 for (k=0; k<NTIMES; k++) { 255 for (j=0; j<4; j++) { 256 rmstime[j] = rmstime[j] + (times[j][k] * times[j][k]); 257 mintime[j] = MIN(mintime[j], times[j][k]); 258 maxtime[j] = MAX(maxtime[j], times[j][k]); 259 } 260 } 261 262 263 PetscPrintf(PETSC_COMM_WORLD,"%8s: %11s %11s %11s %11s %11s\n","Function","Rate (MB/s)","Total (MB/s)","RMS time","Min time","Max time"); 264 for (j=0; j<4; j++) { 265 rmstime[j] = sqrt(rmstime[j]/(double)NTIMES); 266 PetscPrintf(PETSC_COMM_WORLD,"%8s: %11.4f %11.4f %11.4f %11.4f %11.4f\n", label[j], 1.0e-06*bytes[j]/mintime[j], size*1.0e-06*bytes[j]/mintime[j], rmstime[j], mintime[j], maxtime[j]); 267 } 268 PetscFinalize(); 269 return 0; 270 } 271 272 static double Second() { 273 double t; 274 PetscTime(t); 275 return t; 276 } 277 278 #define M 20 279 static int checktick() 280 { 281 int i, minDelta, Delta; 282 double t1, t2, timesfound[M]; 283 284 /* Collect a sequence of M unique time values from the system. */ 285 286 for (i = 0; i < M; i++) { 287 t1 = Second(); 288 while ((t2 = Second()) - t1 < 1.0E-6) {} 289 timesfound[i] = t1 = t2; 290 } 291 292 /* 293 * Determine the minimum difference between these M values. 294 * This result will be our estimate (in microseconds) for the 295 * clock granularity. 296 */ 297 298 minDelta = 1000000; 299 for (i = 1; i < M; i++) { 300 Delta = (int)( 1.0E6 * (timesfound[i]-timesfound[i-1])); 301 minDelta = MIN(minDelta, MAX(Delta,0)); 302 } 303 304 return(minDelta); 305 } 306