1 static const char help[] = "STREAM benchmark specialized for SSE2\n\\n"; 2 3 /* Note: this file has been modified significantly from its original version */ 4 #include <emmintrin.h> 5 #include <petsctime.h> 6 #include <petscsys.h> 7 #if defined(HAVE_NUMA) 8 #include <numa.h> 9 #endif 10 #include <float.h> 11 12 #if !defined(SSE2) 13 # define SSE2 1 14 #endif 15 #if !defined(__SSE2__) 16 # error SSE2 instruction set is not enabled, try adding -march=native to CFLAGS or disable by adding -DSSE2=0 17 #endif 18 #if !defined(PREFETCH_NTA) /* Use software prefetch and set non-temporal policy so that lines evicted from L1D will not subsequently reside in L2 or L3. */ 19 # define PREFETCH_NTA 1 20 #endif 21 #if !defined(STATIC_ALLOC) /* Statically allocate the vectors. Most platforms do not find physical pages when memory is allocated, therefore the faulting strategy still affects performance. */ 22 # define STATIC_ALLOC 0 23 #endif 24 #if !defined(FAULT_TOGETHER) /* Faults all three vectors together which usually interleaves DRAM pages in physical memory. */ 25 # define FAULT_TOGETHER 0 26 #endif 27 #if !defined(USE_MEMCPY) /* Literally call memcpy(3) for the COPY benchmark. Some compilers detect the unoptimized loop as memcpy and call this anyway. */ 28 # define USE_MEMCPY 0 29 #endif 30 31 /* 32 * Program: Stream 33 * Programmer: Joe R. Zagar 34 * Revision: 4.0-BETA, October 24, 1995 35 * Original code developed by John D. McCalpin 36 * 37 * This program measures memory transfer rates in MB/s for simple 38 * computational kernels coded in C. These numbers reveal the quality 39 * of code generation for simple uncacheable kernels as well as showing 40 * the cost of floating-point operations relative to memory accesses. 41 * 42 * INSTRUCTIONS: 43 * 44 * 1) Stream requires a good bit of memory to run. Adjust the 45 * value of 'N' (below) to give a 'timing calibration' of 46 * at least 20 clock-ticks. This will provide rate estimates 47 * that should be good to about 5% precision. 48 */ 49 50 # define N 4000000 51 # define NTIMES 100 52 # define OFFSET 0 53 54 # define HLINE "-------------------------------------------------------------\n" 55 56 # if !defined(MIN) 57 # define MIN(x,y) ((x)<(y) ? (x) : (y)) 58 # endif 59 # if !defined(MAX) 60 # define MAX(x,y) ((x)>(y) ? (x) : (y)) 61 # endif 62 63 #if STATIC_ALLOC 64 double a[N+OFFSET],b[N+OFFSET],c[N+OFFSET]; 65 #endif 66 67 static int checktick(void); 68 static double Second(void); 69 70 int main(int argc,char *argv[]) 71 { 72 const char *label[4] = {"Copy", "Scale","Add", "Triad"}; 73 const double bytes[4] = {2 * sizeof(double) * N, 74 2 * sizeof(double) * N, 75 3 * sizeof(double) * N, 76 3 * sizeof(double) * N}; 77 double rmstime[4] = {0},maxtime[4] = {0},mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; 78 int quantum; 79 int BytesPerWord,j,k,size; 80 PetscInt node = -1; 81 double scalar, t, times[4][NTIMES]; 82 PetscErrorCode ierr; 83 #if !STATIC_ALLOC 84 double *PETSC_RESTRICT a,*PETSC_RESTRICT b,*PETSC_RESTRICT c; 85 #endif 86 87 ierr = PetscInitialize(&argc,&argv,0,help);if (ierr) return ierr; 88 ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRMPI(ierr); 89 ierr = PetscOptionsGetInt(NULL,NULL,"-node",&node,NULL);CHKERRQ(ierr); 90 /* --- SETUP --- determine precision and check timing --- */ 91 92 PetscPrintf(PETSC_COMM_WORLD,HLINE); 93 BytesPerWord = sizeof(double); 94 PetscPrintf(PETSC_COMM_WORLD,"This system uses %d bytes per DOUBLE PRECISION word.\n",BytesPerWord); 95 96 PetscPrintf(PETSC_COMM_WORLD,HLINE); 97 PetscPrintf(PETSC_COMM_WORLD,"Array size = %d, Offset = %d\n", N, OFFSET); 98 PetscPrintf(PETSC_COMM_WORLD,"Total memory required = %.1f MB per process.\n",(3 * N * BytesPerWord) / 1048576.0); 99 PetscPrintf(PETSC_COMM_WORLD,"Each test is run %d times, but only\n", NTIMES); 100 PetscPrintf(PETSC_COMM_WORLD,"the *best* time for each is used.\n"); 101 102 /* Get initial value for system clock. */ 103 104 #if !STATIC_ALLOC 105 if (node == -1) { 106 posix_memalign((void**)&a,64,N*sizeof(double)); 107 posix_memalign((void**)&b,64,N*sizeof(double)); 108 posix_memalign((void**)&c,64,N*sizeof(double)); 109 } else if (node == -2) { 110 a = malloc(N*sizeof(double)); 111 b = malloc(N*sizeof(double)); 112 c = malloc(N*sizeof(double)); 113 #if defined(HAVE_NUMA) 114 } else { 115 a = numa_alloc_onnode(N*sizeof(double),node); 116 b = numa_alloc_onnode(N*sizeof(double),node); 117 c = numa_alloc_onnode(N*sizeof(double),node); 118 #endif 119 } 120 #endif 121 #if FAULT_TOGETHER 122 for (j=0; j<N; j++) { 123 a[j] = 1.0; 124 b[j] = 2.0; 125 c[j] = 0.0; 126 } 127 #else 128 for (j=0; j<N; j++) a[j] = 1.0; 129 for (j=0; j<N; j++) b[j] = 2.0; 130 for (j=0; j<N; j++) c[j] = 0.0; 131 #endif 132 133 PetscPrintf(PETSC_COMM_WORLD,HLINE); 134 135 if ((quantum = checktick()) >= 1) PetscPrintf(PETSC_COMM_WORLD,"Your clock granularity/precision appears to be %d microseconds.\n", quantum); 136 else PetscPrintf(PETSC_COMM_WORLD,"Your clock granularity appears to be less than one microsecond.\n"); 137 138 t = Second(); 139 for (j = 0; j < N; j++) a[j] = 2.0E0 * a[j]; 140 t = 1.0E6 * (Second() - t); 141 142 PetscPrintf(PETSC_COMM_WORLD,"Each test below will take on the order of %d microseconds.\n", (int) t); 143 PetscPrintf(PETSC_COMM_WORLD," (= %d clock ticks)\n", (int) (t/quantum)); 144 PetscPrintf(PETSC_COMM_WORLD,"Increase the size of the arrays if this shows that\n"); 145 PetscPrintf(PETSC_COMM_WORLD,"you are not getting at least 20 clock ticks per test.\n"); 146 147 PetscPrintf(PETSC_COMM_WORLD,HLINE); 148 149 PetscPrintf(PETSC_COMM_WORLD,"WARNING -- The above is only a rough guideline.\n"); 150 PetscPrintf(PETSC_COMM_WORLD,"For best results, please be sure you know the\n"); 151 PetscPrintf(PETSC_COMM_WORLD,"precision of your system timer.\n"); 152 PetscPrintf(PETSC_COMM_WORLD,HLINE); 153 154 /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ 155 156 scalar = 3.0; 157 for (k=0; k<NTIMES; k++) { 158 MPI_Barrier(PETSC_COMM_WORLD); 159 /* ### COPY: c <- a ### */ 160 times[0][k] = Second(); 161 MPI_Barrier(PETSC_COMM_WORLD); 162 #if USE_MEMCPY 163 memcpy(c,a,N*sizeof(double)); 164 #elif SSE2 165 for (j=0; j<N; j+=8) { 166 _mm_stream_pd(c+j+0,_mm_load_pd(a+j+0)); 167 _mm_stream_pd(c+j+2,_mm_load_pd(a+j+2)); 168 _mm_stream_pd(c+j+4,_mm_load_pd(a+j+4)); 169 _mm_stream_pd(c+j+6,_mm_load_pd(a+j+6)); 170 # if PREFETCH_NTA 171 _mm_prefetch(a+j+64,_MM_HINT_NTA); 172 # endif 173 } 174 #else 175 for (j=0; j<N; j++) c[j] = a[j]; 176 #endif 177 MPI_Barrier(PETSC_COMM_WORLD); 178 times[0][k] = Second() - times[0][k]; 179 180 /* ### SCALE: b <- scalar * c ### */ 181 times[1][k] = Second(); 182 MPI_Barrier(PETSC_COMM_WORLD); 183 #if SSE2 184 { 185 __m128d scalar2 = _mm_set1_pd(scalar); 186 for (j=0; j<N; j+=8) { 187 _mm_stream_pd(b+j+0,_mm_mul_pd(scalar2,_mm_load_pd(c+j+0))); 188 _mm_stream_pd(b+j+2,_mm_mul_pd(scalar2,_mm_load_pd(c+j+2))); 189 _mm_stream_pd(b+j+4,_mm_mul_pd(scalar2,_mm_load_pd(c+j+4))); 190 _mm_stream_pd(b+j+6,_mm_mul_pd(scalar2,_mm_load_pd(c+j+6))); 191 # if PREFETCH_NTA 192 _mm_prefetch(c+j+64,_MM_HINT_NTA); 193 # endif 194 } 195 } 196 #else 197 for (j=0; j<N; j++) b[j] = scalar*c[j]; 198 #endif 199 MPI_Barrier(PETSC_COMM_WORLD); 200 times[1][k] = Second() - times[1][k]; 201 202 /* ### ADD: c <- a + b ### */ 203 times[2][k] = Second(); 204 MPI_Barrier(PETSC_COMM_WORLD); 205 #if SSE2 206 { 207 for (j=0; j<N; j+=8) { 208 _mm_stream_pd(c+j+0,_mm_add_pd(_mm_load_pd(a+j+0),_mm_load_pd(b+j+0))); 209 _mm_stream_pd(c+j+2,_mm_add_pd(_mm_load_pd(a+j+2),_mm_load_pd(b+j+2))); 210 _mm_stream_pd(c+j+4,_mm_add_pd(_mm_load_pd(a+j+4),_mm_load_pd(b+j+4))); 211 _mm_stream_pd(c+j+6,_mm_add_pd(_mm_load_pd(a+j+6),_mm_load_pd(b+j+6))); 212 # if PREFETCH_NTA 213 _mm_prefetch(a+j+64,_MM_HINT_NTA); 214 _mm_prefetch(b+j+64,_MM_HINT_NTA); 215 # endif 216 } 217 } 218 #else 219 for (j=0; j<N; j++) c[j] = a[j]+b[j]; 220 #endif 221 MPI_Barrier(PETSC_COMM_WORLD); 222 times[2][k] = Second() - times[2][k]; 223 224 /* ### TRIAD: a <- b + scalar * c ### */ 225 times[3][k] = Second(); 226 MPI_Barrier(PETSC_COMM_WORLD); 227 #if SSE2 228 { 229 __m128d scalar2 = _mm_set1_pd(scalar); 230 for (j=0; j<N; j+=8) { 231 _mm_stream_pd(a+j+0,_mm_add_pd(_mm_load_pd(b+j+0),_mm_mul_pd(scalar2,_mm_load_pd(c+j+0)))); 232 _mm_stream_pd(a+j+2,_mm_add_pd(_mm_load_pd(b+j+2),_mm_mul_pd(scalar2,_mm_load_pd(c+j+2)))); 233 _mm_stream_pd(a+j+4,_mm_add_pd(_mm_load_pd(b+j+4),_mm_mul_pd(scalar2,_mm_load_pd(c+j+4)))); 234 _mm_stream_pd(a+j+6,_mm_add_pd(_mm_load_pd(b+j+6),_mm_mul_pd(scalar2,_mm_load_pd(c+j+6)))); 235 # if PREFETCH_NTA 236 _mm_prefetch(b+j+64,_MM_HINT_NTA); 237 _mm_prefetch(c+j+64,_MM_HINT_NTA); 238 # endif 239 } 240 } 241 #else 242 for (j=0; j<N; j++) a[j] = b[j]+scalar*c[j]; 243 #endif 244 MPI_Barrier(PETSC_COMM_WORLD); 245 times[3][k] = Second() - times[3][k]; 246 } 247 248 /* --- SUMMARY --- */ 249 250 for (k=0; k<NTIMES; k++) 251 for (j=0; j<4; j++) { 252 rmstime[j] = rmstime[j] + (times[j][k] * times[j][k]); 253 mintime[j] = MIN(mintime[j], times[j][k]); 254 maxtime[j] = MAX(maxtime[j], times[j][k]); 255 } 256 257 PetscPrintf(PETSC_COMM_WORLD,"%8s: %11s %11s %11s %11s %11s\n","Function","Rate (MB/s)","Total (MB/s)","RMS time","Min time","Max time"); 258 for (j=0; j<4; j++) { 259 rmstime[j] = sqrt(rmstime[j]/(double)NTIMES); 260 PetscPrintf(PETSC_COMM_WORLD,"%8s: %11.4f %11.4f %11.4f %11.4f %11.4f\n", label[j], 1.0e-06*bytes[j]/mintime[j], size*1.0e-06*bytes[j]/mintime[j], rmstime[j], mintime[j], maxtime[j]); 261 } 262 ierr = PetscFinalize(); 263 return ierr; 264 } 265 266 static double Second() 267 { 268 double t; 269 PetscTime(&t); 270 return t; 271 } 272 273 #define M 20 274 static int checktick(void) 275 { 276 int i, minDelta, Delta; 277 double t1, t2, timesfound[M]; 278 279 /* Collect a sequence of M unique time values from the system. */ 280 281 for (i = 0; i < M; i++) { 282 t1 = Second(); 283 while ((t2 = Second()) - t1 < 1.0E-6) { 284 } 285 timesfound[i] = t1 = t2; 286 } 287 288 /* 289 * Determine the minimum difference between these M values. 290 * This result will be our estimate (in microseconds) for the 291 * clock granularity. 292 */ 293 294 minDelta = 1000000; 295 for (i = 1; i < M; i++) { 296 Delta = (int)(1.0E6 * (timesfound[i]-timesfound[i-1])); 297 minDelta = MIN(minDelta, MAX(Delta,0)); 298 } 299 300 return(minDelta); 301 } 302