1 static const char help[] = "STREAM benchmark specialized for SSE2\n\\n"; 2 3 /* Note: this file has been modified significantly from its original version */ 4 #include <emmintrin.h> 5 #include <petsctime.h> 6 #include <petscsys.h> 7 #if defined(HAVE_NUMA) 8 #include <numa.h> 9 #endif 10 #include <float.h> 11 12 #if !defined(SSE2) 13 #define SSE2 1 14 #endif 15 #if !defined(__SSE2__) 16 #error SSE2 instruction set is not enabled, try adding -march=native to CFLAGS or disable by adding -DSSE2=0 17 #endif 18 #if !defined(PREFETCH_NTA) /* Use software prefetch and set non-temporal policy so that lines evicted from L1D will not subsequently reside in L2 or L3. */ 19 #define PREFETCH_NTA 1 20 #endif 21 #if !defined(STATIC_ALLOC) /* Statically allocate the vectors. Most platforms do not find physical pages when memory is allocated, therefore the faulting strategy still affects performance. */ 22 #define STATIC_ALLOC 0 23 #endif 24 #if !defined(FAULT_TOGETHER) /* Faults all three vectors together which usually interleaves DRAM pages in physical memory. */ 25 #define FAULT_TOGETHER 0 26 #endif 27 #if !defined(USE_MEMCPY) /* Literally call memcpy(3) for the COPY benchmark. Some compilers detect the unoptimized loop as memcpy and call this anyway. */ 28 #define USE_MEMCPY 0 29 #endif 30 31 /* 32 * Program: Stream 33 * Programmer: Joe R. Zagar 34 * Revision: 4.0-BETA, October 24, 1995 35 * Original code developed by John D. McCalpin 36 * 37 * This program measures memory transfer rates in MB/s for simple 38 * computational kernels coded in C. These numbers reveal the quality 39 * of code generation for simple uncacheable kernels as well as showing 40 * the cost of floating-point operations relative to memory accesses. 41 * 42 * INSTRUCTIONS: 43 * 44 * 1) Stream requires a good bit of memory to run. Adjust the 45 * value of 'N' (below) to give a 'timing calibration' of 46 * at least 20 clock-ticks. This will provide rate estimates 47 * that should be good to about 5% precision. 48 */ 49 50 #define N 4000000 51 #define NTIMES 100 52 #define OFFSET 0 53 54 #define HLINE "-------------------------------------------------------------\n" 55 56 #if !defined(MIN) 57 #define MIN(x, y) ((x) < (y) ? (x) : (y)) 58 #endif 59 #if !defined(MAX) 60 #define MAX(x, y) ((x) > (y) ? (x) : (y)) 61 #endif 62 63 #if STATIC_ALLOC 64 double a[N + OFFSET], b[N + OFFSET], c[N + OFFSET]; 65 #endif 66 67 static int checktick(void); 68 static double Second(void); 69 70 int main(int argc, char *argv[]) 71 { 72 const char *label[4] = {"Copy", "Scale", "Add", "Triad"}; 73 const double bytes[4] = {2 * sizeof(double) * N, 2 * sizeof(double) * N, 3 * sizeof(double) * N, 3 * sizeof(double) * N}; 74 double rmstime[4] = {0}, maxtime[4] = {0}, mintime[4] = {FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX}; 75 int quantum; 76 int BytesPerWord, j, k, size; 77 PetscInt node = -1; 78 double scalar, t, times[4][NTIMES]; 79 #if !STATIC_ALLOC 80 double *PETSC_RESTRICT a, *PETSC_RESTRICT b, *PETSC_RESTRICT c; 81 #endif 82 83 PetscCall(PetscInitialize(&argc, &argv, 0, help)); 84 PetscCallMPI(MPI_Comm_size(PETSC_COMM_WORLD, &size)); 85 PetscCall(PetscOptionsGetInt(NULL, NULL, "-node", &node, NULL)); 86 /* --- SETUP --- determine precision and check timing --- */ 87 88 PetscPrintf(PETSC_COMM_WORLD, HLINE); 89 BytesPerWord = sizeof(double); 90 PetscPrintf(PETSC_COMM_WORLD, "This system uses %d bytes per DOUBLE PRECISION word.\n", BytesPerWord); 91 92 PetscPrintf(PETSC_COMM_WORLD, HLINE); 93 PetscPrintf(PETSC_COMM_WORLD, "Array size = %d, Offset = %d\n", N, OFFSET); 94 PetscPrintf(PETSC_COMM_WORLD, "Total memory required = %.1f MB per process.\n", (3 * N * BytesPerWord) / 1048576.0); 95 PetscPrintf(PETSC_COMM_WORLD, "Each test is run %d times, but only\n", NTIMES); 96 PetscPrintf(PETSC_COMM_WORLD, "the *best* time for each is used.\n"); 97 98 /* Get initial value for system clock. */ 99 100 #if !STATIC_ALLOC 101 if (node == -1) { 102 posix_memalign((void **)&a, 64, N * sizeof(double)); 103 posix_memalign((void **)&b, 64, N * sizeof(double)); 104 posix_memalign((void **)&c, 64, N * sizeof(double)); 105 } else if (node == -2) { 106 a = malloc(N * sizeof(double)); 107 b = malloc(N * sizeof(double)); 108 c = malloc(N * sizeof(double)); 109 #if defined(HAVE_NUMA) 110 } else { 111 a = numa_alloc_onnode(N * sizeof(double), node); 112 b = numa_alloc_onnode(N * sizeof(double), node); 113 c = numa_alloc_onnode(N * sizeof(double), node); 114 #endif 115 } 116 #endif 117 #if FAULT_TOGETHER 118 for (j = 0; j < N; j++) { 119 a[j] = 1.0; 120 b[j] = 2.0; 121 c[j] = 0.0; 122 } 123 #else 124 for (j = 0; j < N; j++) a[j] = 1.0; 125 for (j = 0; j < N; j++) b[j] = 2.0; 126 for (j = 0; j < N; j++) c[j] = 0.0; 127 #endif 128 129 PetscPrintf(PETSC_COMM_WORLD, HLINE); 130 131 if ((quantum = checktick()) >= 1) PetscPrintf(PETSC_COMM_WORLD, "Your clock granularity/precision appears to be %d microseconds.\n", quantum); 132 else PetscPrintf(PETSC_COMM_WORLD, "Your clock granularity appears to be less than one microsecond.\n"); 133 134 t = Second(); 135 for (j = 0; j < N; j++) a[j] = 2.0E0 * a[j]; 136 t = 1.0E6 * (Second() - t); 137 138 PetscPrintf(PETSC_COMM_WORLD, "Each test below will take on the order of %d microseconds.\n", (int)t); 139 PetscPrintf(PETSC_COMM_WORLD, " (= %d clock ticks)\n", (int)(t / quantum)); 140 PetscPrintf(PETSC_COMM_WORLD, "Increase the size of the arrays if this shows that\n"); 141 PetscPrintf(PETSC_COMM_WORLD, "you are not getting at least 20 clock ticks per test.\n"); 142 143 PetscPrintf(PETSC_COMM_WORLD, HLINE); 144 145 PetscPrintf(PETSC_COMM_WORLD, "WARNING -- The above is only a rough guideline.\n"); 146 PetscPrintf(PETSC_COMM_WORLD, "For best results, please be sure you know the\n"); 147 PetscPrintf(PETSC_COMM_WORLD, "precision of your system timer.\n"); 148 PetscPrintf(PETSC_COMM_WORLD, HLINE); 149 150 /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ 151 152 scalar = 3.0; 153 for (k = 0; k < NTIMES; k++) { 154 MPI_Barrier(PETSC_COMM_WORLD); 155 /* ### COPY: c <- a ### */ 156 times[0][k] = Second(); 157 MPI_Barrier(PETSC_COMM_WORLD); 158 #if USE_MEMCPY 159 memcpy(c, a, N * sizeof(double)); 160 #elif SSE2 161 for (j = 0; j < N; j += 8) { 162 _mm_stream_pd(c + j + 0, _mm_load_pd(a + j + 0)); 163 _mm_stream_pd(c + j + 2, _mm_load_pd(a + j + 2)); 164 _mm_stream_pd(c + j + 4, _mm_load_pd(a + j + 4)); 165 _mm_stream_pd(c + j + 6, _mm_load_pd(a + j + 6)); 166 #if PREFETCH_NTA 167 _mm_prefetch(a + j + 64, _MM_HINT_NTA); 168 #endif 169 } 170 #else 171 for (j = 0; j < N; j++) c[j] = a[j]; 172 #endif 173 MPI_Barrier(PETSC_COMM_WORLD); 174 times[0][k] = Second() - times[0][k]; 175 176 /* ### SCALE: b <- scalar * c ### */ 177 times[1][k] = Second(); 178 MPI_Barrier(PETSC_COMM_WORLD); 179 #if SSE2 180 { 181 __m128d scalar2 = _mm_set1_pd(scalar); 182 for (j = 0; j < N; j += 8) { 183 _mm_stream_pd(b + j + 0, _mm_mul_pd(scalar2, _mm_load_pd(c + j + 0))); 184 _mm_stream_pd(b + j + 2, _mm_mul_pd(scalar2, _mm_load_pd(c + j + 2))); 185 _mm_stream_pd(b + j + 4, _mm_mul_pd(scalar2, _mm_load_pd(c + j + 4))); 186 _mm_stream_pd(b + j + 6, _mm_mul_pd(scalar2, _mm_load_pd(c + j + 6))); 187 #if PREFETCH_NTA 188 _mm_prefetch(c + j + 64, _MM_HINT_NTA); 189 #endif 190 } 191 } 192 #else 193 for (j = 0; j < N; j++) b[j] = scalar * c[j]; 194 #endif 195 MPI_Barrier(PETSC_COMM_WORLD); 196 times[1][k] = Second() - times[1][k]; 197 198 /* ### ADD: c <- a + b ### */ 199 times[2][k] = Second(); 200 MPI_Barrier(PETSC_COMM_WORLD); 201 #if SSE2 202 { 203 for (j = 0; j < N; j += 8) { 204 _mm_stream_pd(c + j + 0, _mm_add_pd(_mm_load_pd(a + j + 0), _mm_load_pd(b + j + 0))); 205 _mm_stream_pd(c + j + 2, _mm_add_pd(_mm_load_pd(a + j + 2), _mm_load_pd(b + j + 2))); 206 _mm_stream_pd(c + j + 4, _mm_add_pd(_mm_load_pd(a + j + 4), _mm_load_pd(b + j + 4))); 207 _mm_stream_pd(c + j + 6, _mm_add_pd(_mm_load_pd(a + j + 6), _mm_load_pd(b + j + 6))); 208 #if PREFETCH_NTA 209 _mm_prefetch(a + j + 64, _MM_HINT_NTA); 210 _mm_prefetch(b + j + 64, _MM_HINT_NTA); 211 #endif 212 } 213 } 214 #else 215 for (j = 0; j < N; j++) c[j] = a[j] + b[j]; 216 #endif 217 MPI_Barrier(PETSC_COMM_WORLD); 218 times[2][k] = Second() - times[2][k]; 219 220 /* ### TRIAD: a <- b + scalar * c ### */ 221 times[3][k] = Second(); 222 MPI_Barrier(PETSC_COMM_WORLD); 223 #if SSE2 224 { 225 __m128d scalar2 = _mm_set1_pd(scalar); 226 for (j = 0; j < N; j += 8) { 227 _mm_stream_pd(a + j + 0, _mm_add_pd(_mm_load_pd(b + j + 0), _mm_mul_pd(scalar2, _mm_load_pd(c + j + 0)))); 228 _mm_stream_pd(a + j + 2, _mm_add_pd(_mm_load_pd(b + j + 2), _mm_mul_pd(scalar2, _mm_load_pd(c + j + 2)))); 229 _mm_stream_pd(a + j + 4, _mm_add_pd(_mm_load_pd(b + j + 4), _mm_mul_pd(scalar2, _mm_load_pd(c + j + 4)))); 230 _mm_stream_pd(a + j + 6, _mm_add_pd(_mm_load_pd(b + j + 6), _mm_mul_pd(scalar2, _mm_load_pd(c + j + 6)))); 231 #if PREFETCH_NTA 232 _mm_prefetch(b + j + 64, _MM_HINT_NTA); 233 _mm_prefetch(c + j + 64, _MM_HINT_NTA); 234 #endif 235 } 236 } 237 #else 238 for (j = 0; j < N; j++) a[j] = b[j] + scalar * c[j]; 239 #endif 240 MPI_Barrier(PETSC_COMM_WORLD); 241 times[3][k] = Second() - times[3][k]; 242 } 243 244 /* --- SUMMARY --- */ 245 246 for (k = 0; k < NTIMES; k++) 247 for (j = 0; j < 4; j++) { 248 rmstime[j] = rmstime[j] + (times[j][k] * times[j][k]); 249 mintime[j] = MIN(mintime[j], times[j][k]); 250 maxtime[j] = MAX(maxtime[j], times[j][k]); 251 } 252 253 PetscPrintf(PETSC_COMM_WORLD, "%8s: %11s %11s %11s %11s %11s\n", "Function", "Rate (MB/s)", "Total (MB/s)", "RMS time", "Min time", "Max time"); 254 for (j = 0; j < 4; j++) { 255 rmstime[j] = sqrt(rmstime[j] / (double)NTIMES); 256 PetscPrintf(PETSC_COMM_WORLD, "%8s: %11.4f %11.4f %11.4f %11.4f %11.4f\n", label[j], 1.0e-06 * bytes[j] / mintime[j], size * 1.0e-06 * bytes[j] / mintime[j], rmstime[j], mintime[j], maxtime[j]); 257 } 258 PetscCall(PetscFinalize()); 259 return 0; 260 } 261 262 static double Second() 263 { 264 double t; 265 PetscTime(&t); 266 return t; 267 } 268 269 #define M 20 270 static int checktick(void) 271 { 272 int i, minDelta, Delta; 273 double t1, t2, timesfound[M]; 274 275 /* Collect a sequence of M unique time values from the system. */ 276 277 for (i = 0; i < M; i++) { 278 t1 = Second(); 279 while ((t2 = Second()) - t1 < 1.0E-6) { } 280 timesfound[i] = t1 = t2; 281 } 282 283 /* 284 * Determine the minimum difference between these M values. 285 * This result will be our estimate (in microseconds) for the 286 * clock granularity. 287 */ 288 289 minDelta = 1000000; 290 for (i = 1; i < M; i++) { 291 Delta = (int)(1.0E6 * (timesfound[i] - timesfound[i - 1])); 292 minDelta = MIN(minDelta, MAX(Delta, 0)); 293 } 294 295 return minDelta; 296 } 297