1b8a1809bSJed Brown static const char help[] = "STREAM benchmark specialized for SSE2\n\\n"; 2b8a1809bSJed Brown 3b8a1809bSJed Brown /* Note: this file has been modified significantly from its original version */ 4b8a1809bSJed Brown #include <emmintrin.h> 5b8a1809bSJed Brown #include <petsctime.h> 6b8a1809bSJed Brown #include <petscsys.h> 7fa8572e2SBarry Smith #if defined(HAVE_NUMA) 8b8a1809bSJed Brown #include <numa.h> 9fa8572e2SBarry Smith #endif 10b8a1809bSJed Brown #include <limits.h> 11b8a1809bSJed Brown #include <float.h> 12b8a1809bSJed Brown 13b8a1809bSJed Brown #ifndef SSE2 14b8a1809bSJed Brown # define SSE2 1 15b8a1809bSJed Brown #endif 16b8a1809bSJed Brown #ifndef __SSE2__ 17b8a1809bSJed Brown # error SSE2 instruction set is not enabled, try adding -march=native to CFLAGS or disable by adding -DSSE2=0 18b8a1809bSJed Brown #endif 19b8a1809bSJed Brown #ifndef PREFETCH_NTA /* Use software prefetch and set non-temporal policy so that lines evicted from L1D will not subsequently reside in L2 or L3. */ 20b8a1809bSJed Brown # define PREFETCH_NTA 1 21b8a1809bSJed Brown #endif 22b8a1809bSJed Brown #ifndef STATIC_ALLOC /* Statically allocate the vectors. Most platforms do not find physical pages when memory is allocated, therefore the faulting strategy still affects performance. */ 23b8a1809bSJed Brown # define STATIC_ALLOC 0 24b8a1809bSJed Brown #endif 25b8a1809bSJed Brown #ifndef FAULT_TOGETHER /* Faults all three vectors together which usually interleaves DRAM pages in physical memory. */ 26b8a1809bSJed Brown # define FAULT_TOGETHER 0 27b8a1809bSJed Brown #endif 28b8a1809bSJed Brown #ifndef USE_MEMCPY /* Literally call memcpy(3) for the COPY benchmark. Some compilers detect the unoptimized loop as memcpy and call this anyway. */ 29b8a1809bSJed Brown # define USE_MEMCPY 0 30b8a1809bSJed Brown #endif 31b8a1809bSJed Brown 32b8a1809bSJed Brown /* 33b8a1809bSJed Brown * Program: Stream 34b8a1809bSJed Brown * Programmer: Joe R. Zagar 35b8a1809bSJed Brown * Revision: 4.0-BETA, October 24, 1995 36b8a1809bSJed Brown * Original code developed by John D. McCalpin 37b8a1809bSJed Brown * 38b8a1809bSJed Brown * This program measures memory transfer rates in MB/s for simple 39b8a1809bSJed Brown * computational kernels coded in C. These numbers reveal the quality 40b8a1809bSJed Brown * of code generation for simple uncacheable kernels as well as showing 41b8a1809bSJed Brown * the cost of floating-point operations relative to memory accesses. 42b8a1809bSJed Brown * 43b8a1809bSJed Brown * INSTRUCTIONS: 44b8a1809bSJed Brown * 45b8a1809bSJed Brown * 1) Stream requires a good bit of memory to run. Adjust the 46b8a1809bSJed Brown * value of 'N' (below) to give a 'timing calibration' of 47b8a1809bSJed Brown * at least 20 clock-ticks. This will provide rate estimates 48b8a1809bSJed Brown * that should be good to about 5% precision. 49b8a1809bSJed Brown */ 50b8a1809bSJed Brown 51b8a1809bSJed Brown # define N 4000000 52b8a1809bSJed Brown # define NTIMES 100 53b8a1809bSJed Brown # define OFFSET 0 54b8a1809bSJed Brown 55b8a1809bSJed Brown # define HLINE "-------------------------------------------------------------\n" 56b8a1809bSJed Brown 57b8a1809bSJed Brown # ifndef MIN 58b8a1809bSJed Brown # define MIN(x,y) ((x)<(y)?(x):(y)) 59b8a1809bSJed Brown # endif 60b8a1809bSJed Brown # ifndef MAX 61b8a1809bSJed Brown # define MAX(x,y) ((x)>(y)?(x):(y)) 62b8a1809bSJed Brown # endif 63b8a1809bSJed Brown 64b8a1809bSJed Brown #if STATIC_ALLOC 65b8a1809bSJed Brown double a[N+OFFSET],b[N+OFFSET],c[N+OFFSET]; 66b8a1809bSJed Brown #endif 67b8a1809bSJed Brown 68b8a1809bSJed Brown static int checktick(void); 69b8a1809bSJed Brown static double Second(void); 70b8a1809bSJed Brown 71b8a1809bSJed Brown int main(int argc,char *argv[]) 72b8a1809bSJed Brown { 73b8a1809bSJed Brown const char *label[4] = {"Copy", "Scale","Add", "Triad"}; 74b8a1809bSJed Brown const double bytes[4] = {2 * sizeof(double) * N, 75b8a1809bSJed Brown 2 * sizeof(double) * N, 76b8a1809bSJed Brown 3 * sizeof(double) * N, 77b8a1809bSJed Brown 3 * sizeof(double) * N}; 78b8a1809bSJed Brown double rmstime[4] = {0},maxtime[4] = {0},mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; 79b8a1809bSJed Brown int quantum; 80b8a1809bSJed Brown int BytesPerWord,j,k,size; 81b8a1809bSJed Brown PetscInt node = -1; 82b8a1809bSJed Brown double scalar, t, times[4][NTIMES]; 83b8a1809bSJed Brown #if !STATIC_ALLOC 84b8a1809bSJed Brown double *PETSC_RESTRICT a,*PETSC_RESTRICT b,*PETSC_RESTRICT c; 85b8a1809bSJed Brown #endif 86b8a1809bSJed Brown 87b8a1809bSJed Brown PetscInitialize(&argc,&argv,0,help); 88b8a1809bSJed Brown MPI_Comm_size(PETSC_COMM_WORLD,&size); 89b8a1809bSJed Brown PetscOptionsGetInt(PETSC_NULL,"-node",&node,PETSC_NULL); 90b8a1809bSJed Brown /* --- SETUP --- determine precision and check timing --- */ 91b8a1809bSJed Brown 92b8a1809bSJed Brown PetscPrintf(PETSC_COMM_WORLD,HLINE); 93b8a1809bSJed Brown BytesPerWord = sizeof(double); 94b8a1809bSJed Brown PetscPrintf(PETSC_COMM_WORLD,"This system uses %d bytes per DOUBLE PRECISION word.\n", 95b8a1809bSJed Brown BytesPerWord); 96b8a1809bSJed Brown 97b8a1809bSJed Brown PetscPrintf(PETSC_COMM_WORLD,HLINE); 98b8a1809bSJed Brown PetscPrintf(PETSC_COMM_WORLD,"Array size = %d, Offset = %d\n" , N, OFFSET); 99b8a1809bSJed Brown PetscPrintf(PETSC_COMM_WORLD,"Total memory required = %.1f MB per process.\n", 100b8a1809bSJed Brown (3 * N * BytesPerWord) / 1048576.0); 101b8a1809bSJed Brown PetscPrintf(PETSC_COMM_WORLD,"Each test is run %d times, but only\n", NTIMES); 102b8a1809bSJed Brown PetscPrintf(PETSC_COMM_WORLD,"the *best* time for each is used.\n"); 103b8a1809bSJed Brown 104b8a1809bSJed Brown /* Get initial value for system clock. */ 105b8a1809bSJed Brown 106b8a1809bSJed Brown #if !STATIC_ALLOC 107b8a1809bSJed Brown if (node == -1) { 108b8a1809bSJed Brown posix_memalign((void**)&a,64,N*sizeof(double)); 109b8a1809bSJed Brown posix_memalign((void**)&b,64,N*sizeof(double)); 110b8a1809bSJed Brown posix_memalign((void**)&c,64,N*sizeof(double)); 111b8a1809bSJed Brown } else if (node == -2) { 112b8a1809bSJed Brown a = malloc(N*sizeof(double)); 113b8a1809bSJed Brown b = malloc(N*sizeof(double)); 114b8a1809bSJed Brown c = malloc(N*sizeof(double)); 115fa8572e2SBarry Smith #if defined(HAVE_NUMA) 116b8a1809bSJed Brown } else { 117b8a1809bSJed Brown a = numa_alloc_onnode(N*sizeof(double),node); 118b8a1809bSJed Brown b = numa_alloc_onnode(N*sizeof(double),node); 119b8a1809bSJed Brown c = numa_alloc_onnode(N*sizeof(double),node); 120fa8572e2SBarry Smith #endif 121b8a1809bSJed Brown } 122b8a1809bSJed Brown #endif 123b8a1809bSJed Brown #if FAULT_TOGETHER 124b8a1809bSJed Brown for (j=0; j<N; j++) { 125b8a1809bSJed Brown a[j] = 1.0; 126b8a1809bSJed Brown b[j] = 2.0; 127b8a1809bSJed Brown c[j] = 0.0; 128b8a1809bSJed Brown } 129b8a1809bSJed Brown #else 130b8a1809bSJed Brown for (j=0; j<N; j++) a[j] = 1.0; 131b8a1809bSJed Brown for (j=0; j<N; j++) b[j] = 2.0; 132b8a1809bSJed Brown for (j=0; j<N; j++) c[j] = 0.0; 133b8a1809bSJed Brown #endif 134b8a1809bSJed Brown 135b8a1809bSJed Brown PetscPrintf(PETSC_COMM_WORLD,HLINE); 136b8a1809bSJed Brown 137db4deed7SKarl Rupp if ( (quantum = checktick()) >= 1) { 138b8a1809bSJed Brown PetscPrintf(PETSC_COMM_WORLD,"Your clock granularity/precision appears to be " 139b8a1809bSJed Brown "%d microseconds.\n", quantum); 140db4deed7SKarl Rupp } else { 141b8a1809bSJed Brown PetscPrintf(PETSC_COMM_WORLD,"Your clock granularity appears to be " 142b8a1809bSJed Brown "less than one microsecond.\n"); 143db4deed7SKarl Rupp } 144b8a1809bSJed Brown 145b8a1809bSJed Brown t = Second(); 146b8a1809bSJed Brown for (j = 0; j < N; j++) 147b8a1809bSJed Brown a[j] = 2.0E0 * a[j]; 148b8a1809bSJed Brown t = 1.0E6 * (Second() - t); 149b8a1809bSJed Brown 150b8a1809bSJed Brown PetscPrintf(PETSC_COMM_WORLD,"Each test below will take on the order" 151b8a1809bSJed Brown " of %d microseconds.\n", (int) t ); 152b8a1809bSJed Brown PetscPrintf(PETSC_COMM_WORLD," (= %d clock ticks)\n", (int) (t/quantum) ); 153b8a1809bSJed Brown PetscPrintf(PETSC_COMM_WORLD,"Increase the size of the arrays if this shows that\n"); 154b8a1809bSJed Brown PetscPrintf(PETSC_COMM_WORLD,"you are not getting at least 20 clock ticks per test.\n"); 155b8a1809bSJed Brown 156b8a1809bSJed Brown PetscPrintf(PETSC_COMM_WORLD,HLINE); 157b8a1809bSJed Brown 158b8a1809bSJed Brown PetscPrintf(PETSC_COMM_WORLD,"WARNING -- The above is only a rough guideline.\n"); 159b8a1809bSJed Brown PetscPrintf(PETSC_COMM_WORLD,"For best results, please be sure you know the\n"); 160b8a1809bSJed Brown PetscPrintf(PETSC_COMM_WORLD,"precision of your system timer.\n"); 161b8a1809bSJed Brown PetscPrintf(PETSC_COMM_WORLD,HLINE); 162b8a1809bSJed Brown 163b8a1809bSJed Brown /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ 164b8a1809bSJed Brown 165b8a1809bSJed Brown scalar = 3.0; 166b8a1809bSJed Brown for (k=0; k<NTIMES; k++) { 167106b1c52SJed Brown MPI_Barrier(PETSC_COMM_WORLD); 168b8a1809bSJed Brown /* ### COPY: c <- a ### */ 169b8a1809bSJed Brown times[0][k] = Second(); 170106b1c52SJed Brown MPI_Barrier(PETSC_COMM_WORLD); 171b8a1809bSJed Brown #if USE_MEMCPY 172b8a1809bSJed Brown memcpy(c,a,N*sizeof(double)); 173b8a1809bSJed Brown #elif SSE2 174b8a1809bSJed Brown for (j=0; j<N; j+=8) { 175b8a1809bSJed Brown _mm_stream_pd(c+j+0,_mm_load_pd(a+j+0)); 176b8a1809bSJed Brown _mm_stream_pd(c+j+2,_mm_load_pd(a+j+2)); 177b8a1809bSJed Brown _mm_stream_pd(c+j+4,_mm_load_pd(a+j+4)); 178b8a1809bSJed Brown _mm_stream_pd(c+j+6,_mm_load_pd(a+j+6)); 179b8a1809bSJed Brown # if PREFETCH_NTA 180b8a1809bSJed Brown _mm_prefetch(a+j+64,_MM_HINT_NTA); 181b8a1809bSJed Brown # endif 182b8a1809bSJed Brown } 183b8a1809bSJed Brown #else 184b8a1809bSJed Brown for (j=0; j<N; j++) c[j] = a[j]; 185b8a1809bSJed Brown #endif 186106b1c52SJed Brown MPI_Barrier(PETSC_COMM_WORLD); 187b8a1809bSJed Brown times[0][k] = Second() - times[0][k]; 188b8a1809bSJed Brown 189b8a1809bSJed Brown /* ### SCALE: b <- scalar * c ### */ 190b8a1809bSJed Brown times[1][k] = Second(); 191106b1c52SJed Brown MPI_Barrier(PETSC_COMM_WORLD); 192b8a1809bSJed Brown #if SSE2 193b8a1809bSJed Brown { 194b8a1809bSJed Brown __m128d scalar2 = _mm_set1_pd(scalar); 195b8a1809bSJed Brown for (j=0; j<N; j+=8) { 196b8a1809bSJed Brown _mm_stream_pd(b+j+0,_mm_mul_pd(scalar2,_mm_load_pd(c+j+0))); 197b8a1809bSJed Brown _mm_stream_pd(b+j+2,_mm_mul_pd(scalar2,_mm_load_pd(c+j+2))); 198b8a1809bSJed Brown _mm_stream_pd(b+j+4,_mm_mul_pd(scalar2,_mm_load_pd(c+j+4))); 199b8a1809bSJed Brown _mm_stream_pd(b+j+6,_mm_mul_pd(scalar2,_mm_load_pd(c+j+6))); 200b8a1809bSJed Brown # if PREFETCH_NTA 201b8a1809bSJed Brown _mm_prefetch(c+j+64,_MM_HINT_NTA); 202b8a1809bSJed Brown # endif 203b8a1809bSJed Brown } 204b8a1809bSJed Brown } 205b8a1809bSJed Brown #else 206b8a1809bSJed Brown for (j=0; j<N; j++) b[j] = scalar*c[j]; 207b8a1809bSJed Brown #endif 208106b1c52SJed Brown MPI_Barrier(PETSC_COMM_WORLD); 209b8a1809bSJed Brown times[1][k] = Second() - times[1][k]; 210b8a1809bSJed Brown 211b8a1809bSJed Brown /* ### ADD: c <- a + b ### */ 212b8a1809bSJed Brown times[2][k] = Second(); 213106b1c52SJed Brown MPI_Barrier(PETSC_COMM_WORLD); 214b8a1809bSJed Brown #if SSE2 215b8a1809bSJed Brown { 216b8a1809bSJed Brown for (j=0; j<N; j+=8) { 217b8a1809bSJed Brown _mm_stream_pd(c+j+0,_mm_add_pd(_mm_load_pd(a+j+0),_mm_load_pd(b+j+0))); 218b8a1809bSJed Brown _mm_stream_pd(c+j+2,_mm_add_pd(_mm_load_pd(a+j+2),_mm_load_pd(b+j+2))); 219b8a1809bSJed Brown _mm_stream_pd(c+j+4,_mm_add_pd(_mm_load_pd(a+j+4),_mm_load_pd(b+j+4))); 220b8a1809bSJed Brown _mm_stream_pd(c+j+6,_mm_add_pd(_mm_load_pd(a+j+6),_mm_load_pd(b+j+6))); 221b8a1809bSJed Brown # if PREFETCH_NTA 222b8a1809bSJed Brown _mm_prefetch(a+j+64,_MM_HINT_NTA); 223b8a1809bSJed Brown _mm_prefetch(b+j+64,_MM_HINT_NTA); 224b8a1809bSJed Brown # endif 225b8a1809bSJed Brown } 226b8a1809bSJed Brown } 227b8a1809bSJed Brown #else 228b8a1809bSJed Brown for (j=0; j<N; j++) c[j] = a[j]+b[j]; 229b8a1809bSJed Brown #endif 230106b1c52SJed Brown MPI_Barrier(PETSC_COMM_WORLD); 231b8a1809bSJed Brown times[2][k] = Second() - times[2][k]; 232b8a1809bSJed Brown 233b8a1809bSJed Brown /* ### TRIAD: a <- b + scalar * c ### */ 234b8a1809bSJed Brown times[3][k] = Second(); 235106b1c52SJed Brown MPI_Barrier(PETSC_COMM_WORLD); 236b8a1809bSJed Brown #if SSE2 237b8a1809bSJed Brown { 238b8a1809bSJed Brown __m128d scalar2 = _mm_set1_pd(scalar); 239b8a1809bSJed Brown for (j=0; j<N; j+=8) { 240b8a1809bSJed Brown _mm_stream_pd(a+j+0,_mm_add_pd(_mm_load_pd(b+j+0),_mm_mul_pd(scalar2,_mm_load_pd(c+j+0)))); 241b8a1809bSJed Brown _mm_stream_pd(a+j+2,_mm_add_pd(_mm_load_pd(b+j+2),_mm_mul_pd(scalar2,_mm_load_pd(c+j+2)))); 242b8a1809bSJed Brown _mm_stream_pd(a+j+4,_mm_add_pd(_mm_load_pd(b+j+4),_mm_mul_pd(scalar2,_mm_load_pd(c+j+4)))); 243b8a1809bSJed Brown _mm_stream_pd(a+j+6,_mm_add_pd(_mm_load_pd(b+j+6),_mm_mul_pd(scalar2,_mm_load_pd(c+j+6)))); 244b8a1809bSJed Brown # if PREFETCH_NTA 245b8a1809bSJed Brown _mm_prefetch(b+j+64,_MM_HINT_NTA); 246b8a1809bSJed Brown _mm_prefetch(c+j+64,_MM_HINT_NTA); 247b8a1809bSJed Brown # endif 248b8a1809bSJed Brown } 249b8a1809bSJed Brown } 250b8a1809bSJed Brown #else 251b8a1809bSJed Brown for (j=0; j<N; j++) a[j] = b[j]+scalar*c[j]; 252b8a1809bSJed Brown #endif 253106b1c52SJed Brown MPI_Barrier(PETSC_COMM_WORLD); 254b8a1809bSJed Brown times[3][k] = Second() - times[3][k]; 255b8a1809bSJed Brown } 256b8a1809bSJed Brown 257b8a1809bSJed Brown /* --- SUMMARY --- */ 258b8a1809bSJed Brown 259b8a1809bSJed Brown for (k=0; k<NTIMES; k++) { 260b8a1809bSJed Brown for (j=0; j<4; j++) { 261b8a1809bSJed Brown rmstime[j] = rmstime[j] + (times[j][k] * times[j][k]); 262b8a1809bSJed Brown mintime[j] = MIN(mintime[j], times[j][k]); 263b8a1809bSJed Brown maxtime[j] = MAX(maxtime[j], times[j][k]); 264b8a1809bSJed Brown } 265b8a1809bSJed Brown } 266b8a1809bSJed Brown 267b8a1809bSJed Brown 268b8a1809bSJed Brown PetscPrintf(PETSC_COMM_WORLD,"%8s: %11s %11s %11s %11s %11s\n","Function","Rate (MB/s)","Total (MB/s)","RMS time","Min time","Max time"); 269b8a1809bSJed Brown for (j=0; j<4; j++) { 270b8a1809bSJed Brown rmstime[j] = sqrt(rmstime[j]/(double)NTIMES); 271b8a1809bSJed Brown PetscPrintf(PETSC_COMM_WORLD,"%8s: %11.4f %11.4f %11.4f %11.4f %11.4f\n", label[j], 1.0e-06*bytes[j]/mintime[j], size*1.0e-06*bytes[j]/mintime[j], rmstime[j], mintime[j], maxtime[j]); 272b8a1809bSJed Brown } 273b8a1809bSJed Brown PetscFinalize(); 274b8a1809bSJed Brown return 0; 275b8a1809bSJed Brown } 276b8a1809bSJed Brown 277*a6dfd86eSKarl Rupp static double Second() 278*a6dfd86eSKarl Rupp { 279b8a1809bSJed Brown double t; 280b8a1809bSJed Brown PetscTime(t); 281b8a1809bSJed Brown return t; 282b8a1809bSJed Brown } 283b8a1809bSJed Brown 284b8a1809bSJed Brown #define M 20 285b8a1809bSJed Brown static int checktick() 286b8a1809bSJed Brown { 287b8a1809bSJed Brown int i, minDelta, Delta; 288b8a1809bSJed Brown double t1, t2, timesfound[M]; 289b8a1809bSJed Brown 290b8a1809bSJed Brown /* Collect a sequence of M unique time values from the system. */ 291b8a1809bSJed Brown 292b8a1809bSJed Brown for (i = 0; i < M; i++) { 293b8a1809bSJed Brown t1 = Second(); 294106b1c52SJed Brown while ((t2 = Second()) - t1 < 1.0E-6) {} 295b8a1809bSJed Brown timesfound[i] = t1 = t2; 296b8a1809bSJed Brown } 297b8a1809bSJed Brown 298b8a1809bSJed Brown /* 299b8a1809bSJed Brown * Determine the minimum difference between these M values. 300b8a1809bSJed Brown * This result will be our estimate (in microseconds) for the 301b8a1809bSJed Brown * clock granularity. 302b8a1809bSJed Brown */ 303b8a1809bSJed Brown 304b8a1809bSJed Brown minDelta = 1000000; 305b8a1809bSJed Brown for (i = 1; i < M; i++) { 306b8a1809bSJed Brown Delta = (int)( 1.0E6 * (timesfound[i]-timesfound[i-1])); 307b8a1809bSJed Brown minDelta = MIN(minDelta, MAX(Delta,0)); 308b8a1809bSJed Brown } 309b8a1809bSJed Brown 310b8a1809bSJed Brown return(minDelta); 311b8a1809bSJed Brown } 312