1 static const char help[] = "STREAM benchmark specialized for SSE2\n\\n"; 2 3 /* Note: this file has been modified significantly from its original version */ 4 #include <emmintrin.h> 5 #include <petsctime.h> 6 #include <petscsys.h> 7 #if defined(HAVE_NUMA) 8 #include <numa.h> 9 #endif 10 #include <float.h> 11 12 #if !defined(SSE2) 13 # define SSE2 1 14 #endif 15 #if !defined(__SSE2__) 16 # error SSE2 instruction set is not enabled, try adding -march=native to CFLAGS or disable by adding -DSSE2=0 17 #endif 18 #if !defined(PREFETCH_NTA) /* Use software prefetch and set non-temporal policy so that lines evicted from L1D will not subsequently reside in L2 or L3. */ 19 # define PREFETCH_NTA 1 20 #endif 21 #if !defined(STATIC_ALLOC) /* Statically allocate the vectors. Most platforms do not find physical pages when memory is allocated, therefore the faulting strategy still affects performance. */ 22 # define STATIC_ALLOC 0 23 #endif 24 #if !defined(FAULT_TOGETHER) /* Faults all three vectors together which usually interleaves DRAM pages in physical memory. */ 25 # define FAULT_TOGETHER 0 26 #endif 27 #if !defined(USE_MEMCPY) /* Literally call memcpy(3) for the COPY benchmark. Some compilers detect the unoptimized loop as memcpy and call this anyway. */ 28 # define USE_MEMCPY 0 29 #endif 30 31 /* 32 * Program: Stream 33 * Programmer: Joe R. Zagar 34 * Revision: 4.0-BETA, October 24, 1995 35 * Original code developed by John D. McCalpin 36 * 37 * This program measures memory transfer rates in MB/s for simple 38 * computational kernels coded in C. These numbers reveal the quality 39 * of code generation for simple uncacheable kernels as well as showing 40 * the cost of floating-point operations relative to memory accesses. 41 * 42 * INSTRUCTIONS: 43 * 44 * 1) Stream requires a good bit of memory to run. Adjust the 45 * value of 'N' (below) to give a 'timing calibration' of 46 * at least 20 clock-ticks. This will provide rate estimates 47 * that should be good to about 5% precision. 48 */ 49 50 # define N 4000000 51 # define NTIMES 100 52 # define OFFSET 0 53 54 # define HLINE "-------------------------------------------------------------\n" 55 56 # if !defined(MIN) 57 # define MIN(x,y) ((x)<(y) ? (x) : (y)) 58 # endif 59 # if !defined(MAX) 60 # define MAX(x,y) ((x)>(y) ? (x) : (y)) 61 # endif 62 63 #if STATIC_ALLOC 64 double a[N+OFFSET],b[N+OFFSET],c[N+OFFSET]; 65 #endif 66 67 static int checktick(void); 68 static double Second(void); 69 70 int main(int argc,char *argv[]) 71 { 72 const char *label[4] = {"Copy", "Scale","Add", "Triad"}; 73 const double bytes[4] = {2 * sizeof(double) * N, 74 2 * sizeof(double) * N, 75 3 * sizeof(double) * N, 76 3 * sizeof(double) * N}; 77 double rmstime[4] = {0},maxtime[4] = {0},mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; 78 int quantum; 79 int BytesPerWord,j,k,size; 80 PetscInt node = -1; 81 double scalar, t, times[4][NTIMES]; 82 #if !STATIC_ALLOC 83 double *PETSC_RESTRICT a,*PETSC_RESTRICT b,*PETSC_RESTRICT c; 84 #endif 85 86 PetscCall(PetscInitialize(&argc,&argv,0,help)); 87 PetscCallMPI(MPI_Comm_size(PETSC_COMM_WORLD,&size)); 88 PetscCall(PetscOptionsGetInt(NULL,NULL,"-node",&node,NULL)); 89 /* --- SETUP --- determine precision and check timing --- */ 90 91 PetscPrintf(PETSC_COMM_WORLD,HLINE); 92 BytesPerWord = sizeof(double); 93 PetscPrintf(PETSC_COMM_WORLD,"This system uses %d bytes per DOUBLE PRECISION word.\n",BytesPerWord); 94 95 PetscPrintf(PETSC_COMM_WORLD,HLINE); 96 PetscPrintf(PETSC_COMM_WORLD,"Array size = %d, Offset = %d\n", N, OFFSET); 97 PetscPrintf(PETSC_COMM_WORLD,"Total memory required = %.1f MB per process.\n",(3 * N * BytesPerWord) / 1048576.0); 98 PetscPrintf(PETSC_COMM_WORLD,"Each test is run %d times, but only\n", NTIMES); 99 PetscPrintf(PETSC_COMM_WORLD,"the *best* time for each is used.\n"); 100 101 /* Get initial value for system clock. */ 102 103 #if !STATIC_ALLOC 104 if (node == -1) { 105 posix_memalign((void**)&a,64,N*sizeof(double)); 106 posix_memalign((void**)&b,64,N*sizeof(double)); 107 posix_memalign((void**)&c,64,N*sizeof(double)); 108 } else if (node == -2) { 109 a = malloc(N*sizeof(double)); 110 b = malloc(N*sizeof(double)); 111 c = malloc(N*sizeof(double)); 112 #if defined(HAVE_NUMA) 113 } else { 114 a = numa_alloc_onnode(N*sizeof(double),node); 115 b = numa_alloc_onnode(N*sizeof(double),node); 116 c = numa_alloc_onnode(N*sizeof(double),node); 117 #endif 118 } 119 #endif 120 #if FAULT_TOGETHER 121 for (j=0; j<N; j++) { 122 a[j] = 1.0; 123 b[j] = 2.0; 124 c[j] = 0.0; 125 } 126 #else 127 for (j=0; j<N; j++) a[j] = 1.0; 128 for (j=0; j<N; j++) b[j] = 2.0; 129 for (j=0; j<N; j++) c[j] = 0.0; 130 #endif 131 132 PetscPrintf(PETSC_COMM_WORLD,HLINE); 133 134 if ((quantum = checktick()) >= 1) PetscPrintf(PETSC_COMM_WORLD,"Your clock granularity/precision appears to be %d microseconds.\n", quantum); 135 else PetscPrintf(PETSC_COMM_WORLD,"Your clock granularity appears to be less than one microsecond.\n"); 136 137 t = Second(); 138 for (j = 0; j < N; j++) a[j] = 2.0E0 * a[j]; 139 t = 1.0E6 * (Second() - t); 140 141 PetscPrintf(PETSC_COMM_WORLD,"Each test below will take on the order of %d microseconds.\n", (int) t); 142 PetscPrintf(PETSC_COMM_WORLD," (= %d clock ticks)\n", (int) (t/quantum)); 143 PetscPrintf(PETSC_COMM_WORLD,"Increase the size of the arrays if this shows that\n"); 144 PetscPrintf(PETSC_COMM_WORLD,"you are not getting at least 20 clock ticks per test.\n"); 145 146 PetscPrintf(PETSC_COMM_WORLD,HLINE); 147 148 PetscPrintf(PETSC_COMM_WORLD,"WARNING -- The above is only a rough guideline.\n"); 149 PetscPrintf(PETSC_COMM_WORLD,"For best results, please be sure you know the\n"); 150 PetscPrintf(PETSC_COMM_WORLD,"precision of your system timer.\n"); 151 PetscPrintf(PETSC_COMM_WORLD,HLINE); 152 153 /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ 154 155 scalar = 3.0; 156 for (k=0; k<NTIMES; k++) { 157 MPI_Barrier(PETSC_COMM_WORLD); 158 /* ### COPY: c <- a ### */ 159 times[0][k] = Second(); 160 MPI_Barrier(PETSC_COMM_WORLD); 161 #if USE_MEMCPY 162 memcpy(c,a,N*sizeof(double)); 163 #elif SSE2 164 for (j=0; j<N; j+=8) { 165 _mm_stream_pd(c+j+0,_mm_load_pd(a+j+0)); 166 _mm_stream_pd(c+j+2,_mm_load_pd(a+j+2)); 167 _mm_stream_pd(c+j+4,_mm_load_pd(a+j+4)); 168 _mm_stream_pd(c+j+6,_mm_load_pd(a+j+6)); 169 # if PREFETCH_NTA 170 _mm_prefetch(a+j+64,_MM_HINT_NTA); 171 # endif 172 } 173 #else 174 for (j=0; j<N; j++) c[j] = a[j]; 175 #endif 176 MPI_Barrier(PETSC_COMM_WORLD); 177 times[0][k] = Second() - times[0][k]; 178 179 /* ### SCALE: b <- scalar * c ### */ 180 times[1][k] = Second(); 181 MPI_Barrier(PETSC_COMM_WORLD); 182 #if SSE2 183 { 184 __m128d scalar2 = _mm_set1_pd(scalar); 185 for (j=0; j<N; j+=8) { 186 _mm_stream_pd(b+j+0,_mm_mul_pd(scalar2,_mm_load_pd(c+j+0))); 187 _mm_stream_pd(b+j+2,_mm_mul_pd(scalar2,_mm_load_pd(c+j+2))); 188 _mm_stream_pd(b+j+4,_mm_mul_pd(scalar2,_mm_load_pd(c+j+4))); 189 _mm_stream_pd(b+j+6,_mm_mul_pd(scalar2,_mm_load_pd(c+j+6))); 190 # if PREFETCH_NTA 191 _mm_prefetch(c+j+64,_MM_HINT_NTA); 192 # endif 193 } 194 } 195 #else 196 for (j=0; j<N; j++) b[j] = scalar*c[j]; 197 #endif 198 MPI_Barrier(PETSC_COMM_WORLD); 199 times[1][k] = Second() - times[1][k]; 200 201 /* ### ADD: c <- a + b ### */ 202 times[2][k] = Second(); 203 MPI_Barrier(PETSC_COMM_WORLD); 204 #if SSE2 205 { 206 for (j=0; j<N; j+=8) { 207 _mm_stream_pd(c+j+0,_mm_add_pd(_mm_load_pd(a+j+0),_mm_load_pd(b+j+0))); 208 _mm_stream_pd(c+j+2,_mm_add_pd(_mm_load_pd(a+j+2),_mm_load_pd(b+j+2))); 209 _mm_stream_pd(c+j+4,_mm_add_pd(_mm_load_pd(a+j+4),_mm_load_pd(b+j+4))); 210 _mm_stream_pd(c+j+6,_mm_add_pd(_mm_load_pd(a+j+6),_mm_load_pd(b+j+6))); 211 # if PREFETCH_NTA 212 _mm_prefetch(a+j+64,_MM_HINT_NTA); 213 _mm_prefetch(b+j+64,_MM_HINT_NTA); 214 # endif 215 } 216 } 217 #else 218 for (j=0; j<N; j++) c[j] = a[j]+b[j]; 219 #endif 220 MPI_Barrier(PETSC_COMM_WORLD); 221 times[2][k] = Second() - times[2][k]; 222 223 /* ### TRIAD: a <- b + scalar * c ### */ 224 times[3][k] = Second(); 225 MPI_Barrier(PETSC_COMM_WORLD); 226 #if SSE2 227 { 228 __m128d scalar2 = _mm_set1_pd(scalar); 229 for (j=0; j<N; j+=8) { 230 _mm_stream_pd(a+j+0,_mm_add_pd(_mm_load_pd(b+j+0),_mm_mul_pd(scalar2,_mm_load_pd(c+j+0)))); 231 _mm_stream_pd(a+j+2,_mm_add_pd(_mm_load_pd(b+j+2),_mm_mul_pd(scalar2,_mm_load_pd(c+j+2)))); 232 _mm_stream_pd(a+j+4,_mm_add_pd(_mm_load_pd(b+j+4),_mm_mul_pd(scalar2,_mm_load_pd(c+j+4)))); 233 _mm_stream_pd(a+j+6,_mm_add_pd(_mm_load_pd(b+j+6),_mm_mul_pd(scalar2,_mm_load_pd(c+j+6)))); 234 # if PREFETCH_NTA 235 _mm_prefetch(b+j+64,_MM_HINT_NTA); 236 _mm_prefetch(c+j+64,_MM_HINT_NTA); 237 # endif 238 } 239 } 240 #else 241 for (j=0; j<N; j++) a[j] = b[j]+scalar*c[j]; 242 #endif 243 MPI_Barrier(PETSC_COMM_WORLD); 244 times[3][k] = Second() - times[3][k]; 245 } 246 247 /* --- SUMMARY --- */ 248 249 for (k=0; k<NTIMES; k++) 250 for (j=0; j<4; j++) { 251 rmstime[j] = rmstime[j] + (times[j][k] * times[j][k]); 252 mintime[j] = MIN(mintime[j], times[j][k]); 253 maxtime[j] = MAX(maxtime[j], times[j][k]); 254 } 255 256 PetscPrintf(PETSC_COMM_WORLD,"%8s: %11s %11s %11s %11s %11s\n","Function","Rate (MB/s)","Total (MB/s)","RMS time","Min time","Max time"); 257 for (j=0; j<4; j++) { 258 rmstime[j] = sqrt(rmstime[j]/(double)NTIMES); 259 PetscPrintf(PETSC_COMM_WORLD,"%8s: %11.4f %11.4f %11.4f %11.4f %11.4f\n", label[j], 1.0e-06*bytes[j]/mintime[j], size*1.0e-06*bytes[j]/mintime[j], rmstime[j], mintime[j], maxtime[j]); 260 } 261 PetscCall(PetscFinalize()); 262 return 0; 263 } 264 265 static double Second() 266 { 267 double t; 268 PetscTime(&t); 269 return t; 270 } 271 272 #define M 20 273 static int checktick(void) 274 { 275 int i, minDelta, Delta; 276 double t1, t2, timesfound[M]; 277 278 /* Collect a sequence of M unique time values from the system. */ 279 280 for (i = 0; i < M; i++) { 281 t1 = Second(); 282 while ((t2 = Second()) - t1 < 1.0E-6) { 283 } 284 timesfound[i] = t1 = t2; 285 } 286 287 /* 288 * Determine the minimum difference between these M values. 289 * This result will be our estimate (in microseconds) for the 290 * clock granularity. 291 */ 292 293 minDelta = 1000000; 294 for (i = 1; i < M; i++) { 295 Delta = (int)(1.0E6 * (timesfound[i]-timesfound[i-1])); 296 minDelta = MIN(minDelta, MAX(Delta,0)); 297 } 298 299 return minDelta; 300 } 301