static const char help[] = "STREAM benchmark specialized for SSE2\n\\n"; /* Note: this file has been modified significantly from its original version */ #include #include #include #if defined(HAVE_NUMA) #include #endif #include #if !defined(SSE2) # define SSE2 1 #endif #if !defined(__SSE2__) # error SSE2 instruction set is not enabled, try adding -march=native to CFLAGS or disable by adding -DSSE2=0 #endif #if !defined(PREFETCH_NTA) /* Use software prefetch and set non-temporal policy so that lines evicted from L1D will not subsequently reside in L2 or L3. */ # define PREFETCH_NTA 1 #endif #if !defined(STATIC_ALLOC) /* Statically allocate the vectors. Most platforms do not find physical pages when memory is allocated, therefore the faulting strategy still affects performance. */ # define STATIC_ALLOC 0 #endif #if !defined(FAULT_TOGETHER) /* Faults all three vectors together which usually interleaves DRAM pages in physical memory. */ # define FAULT_TOGETHER 0 #endif #if !defined(USE_MEMCPY) /* Literally call memcpy(3) for the COPY benchmark. Some compilers detect the unoptimized loop as memcpy and call this anyway. */ # define USE_MEMCPY 0 #endif /* * Program: Stream * Programmer: Joe R. Zagar * Revision: 4.0-BETA, October 24, 1995 * Original code developed by John D. McCalpin * * This program measures memory transfer rates in MB/s for simple * computational kernels coded in C. These numbers reveal the quality * of code generation for simple uncacheable kernels as well as showing * the cost of floating-point operations relative to memory accesses. * * INSTRUCTIONS: * * 1) Stream requires a good bit of memory to run. Adjust the * value of 'N' (below) to give a 'timing calibration' of * at least 20 clock-ticks. This will provide rate estimates * that should be good to about 5% precision. */ # define N 4000000 # define NTIMES 100 # define OFFSET 0 # define HLINE "-------------------------------------------------------------\n" # if !defined(MIN) # define MIN(x,y) ((x)<(y) ? (x) : (y)) # endif # if !defined(MAX) # define MAX(x,y) ((x)>(y) ? (x) : (y)) # endif #if STATIC_ALLOC double a[N+OFFSET],b[N+OFFSET],c[N+OFFSET]; #endif static int checktick(void); static double Second(void); int main(int argc,char *argv[]) { const char *label[4] = {"Copy", "Scale","Add", "Triad"}; const double bytes[4] = {2 * sizeof(double) * N, 2 * sizeof(double) * N, 3 * sizeof(double) * N, 3 * sizeof(double) * N}; double rmstime[4] = {0},maxtime[4] = {0},mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; int quantum; int BytesPerWord,j,k,size; PetscInt node = -1; double scalar, t, times[4][NTIMES]; PetscErrorCode ierr; #if !STATIC_ALLOC double *PETSC_RESTRICT a,*PETSC_RESTRICT b,*PETSC_RESTRICT c; #endif ierr = PetscInitialize(&argc,&argv,0,help);if (ierr) return ierr; ierr = MPI_Comm_size(PETSC_COMM_WORLD,&size);CHKERRMPI(ierr); ierr = PetscOptionsGetInt(NULL,NULL,"-node",&node,NULL);CHKERRQ(ierr); /* --- SETUP --- determine precision and check timing --- */ PetscPrintf(PETSC_COMM_WORLD,HLINE); BytesPerWord = sizeof(double); PetscPrintf(PETSC_COMM_WORLD,"This system uses %d bytes per DOUBLE PRECISION word.\n",BytesPerWord); PetscPrintf(PETSC_COMM_WORLD,HLINE); PetscPrintf(PETSC_COMM_WORLD,"Array size = %d, Offset = %d\n", N, OFFSET); PetscPrintf(PETSC_COMM_WORLD,"Total memory required = %.1f MB per process.\n",(3 * N * BytesPerWord) / 1048576.0); PetscPrintf(PETSC_COMM_WORLD,"Each test is run %d times, but only\n", NTIMES); PetscPrintf(PETSC_COMM_WORLD,"the *best* time for each is used.\n"); /* Get initial value for system clock. */ #if !STATIC_ALLOC if (node == -1) { posix_memalign((void**)&a,64,N*sizeof(double)); posix_memalign((void**)&b,64,N*sizeof(double)); posix_memalign((void**)&c,64,N*sizeof(double)); } else if (node == -2) { a = malloc(N*sizeof(double)); b = malloc(N*sizeof(double)); c = malloc(N*sizeof(double)); #if defined(HAVE_NUMA) } else { a = numa_alloc_onnode(N*sizeof(double),node); b = numa_alloc_onnode(N*sizeof(double),node); c = numa_alloc_onnode(N*sizeof(double),node); #endif } #endif #if FAULT_TOGETHER for (j=0; j= 1) PetscPrintf(PETSC_COMM_WORLD,"Your clock granularity/precision appears to be %d microseconds.\n", quantum); else PetscPrintf(PETSC_COMM_WORLD,"Your clock granularity appears to be less than one microsecond.\n"); t = Second(); for (j = 0; j < N; j++) a[j] = 2.0E0 * a[j]; t = 1.0E6 * (Second() - t); PetscPrintf(PETSC_COMM_WORLD,"Each test below will take on the order of %d microseconds.\n", (int) t); PetscPrintf(PETSC_COMM_WORLD," (= %d clock ticks)\n", (int) (t/quantum)); PetscPrintf(PETSC_COMM_WORLD,"Increase the size of the arrays if this shows that\n"); PetscPrintf(PETSC_COMM_WORLD,"you are not getting at least 20 clock ticks per test.\n"); PetscPrintf(PETSC_COMM_WORLD,HLINE); PetscPrintf(PETSC_COMM_WORLD,"WARNING -- The above is only a rough guideline.\n"); PetscPrintf(PETSC_COMM_WORLD,"For best results, please be sure you know the\n"); PetscPrintf(PETSC_COMM_WORLD,"precision of your system timer.\n"); PetscPrintf(PETSC_COMM_WORLD,HLINE); /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ scalar = 3.0; for (k=0; k