1 static const char help[] = "STREAM benchmark specialized for SSE2\n\\n";
2
3 /* Note: this file has been modified significantly from its original version */
4 #include <emmintrin.h>
5 #include <petsctime.h>
6 #include <petscsys.h>
7 #if defined(HAVE_NUMA)
8 #include <numa.h>
9 #endif
10 #include <float.h>
11
12 #if !defined(SSE2)
13 #define SSE2 1
14 #endif
15 #if !defined(__SSE2__)
16 #error SSE2 instruction set is not enabled, try adding -march=native to CFLAGS or disable by adding -DSSE2=0
17 #endif
18 #if !defined(PREFETCH_NTA) /* Use software prefetch and set non-temporal policy so that lines evicted from L1D will not subsequently reside in L2 or L3. */
19 #define PREFETCH_NTA 1
20 #endif
21 #if !defined(STATIC_ALLOC) /* Statically allocate the vectors. Most platforms do not find physical pages when memory is allocated, therefore the faulting strategy still affects performance. */
22 #define STATIC_ALLOC 0
23 #endif
24 #if !defined(FAULT_TOGETHER) /* Faults all three vectors together which usually interleaves DRAM pages in physical memory. */
25 #define FAULT_TOGETHER 0
26 #endif
27 #if !defined(USE_MEMCPY) /* Literally call memcpy(3) for the COPY benchmark. Some compilers detect the unoptimized loop as memcpy and call this anyway. */
28 #define USE_MEMCPY 0
29 #endif
30
31 /*
32 * Program: Stream
33 * Programmer: Joe R. Zagar
34 * Revision: 4.0-BETA, October 24, 1995
35 * Original code developed by John D. McCalpin
36 *
37 * This program measures memory transfer rates in MB/s for simple
38 * computational kernels coded in C. These numbers reveal the quality
39 * of code generation for simple uncacheable kernels as well as showing
40 * the cost of floating-point operations relative to memory accesses.
41 *
42 * INSTRUCTIONS:
43 *
44 * 1) Stream requires a good bit of memory to run. Adjust the
45 * value of 'N' (below) to give a 'timing calibration' of
46 * at least 20 clock-ticks. This will provide rate estimates
47 * that should be good to about 5% precision.
48 */
49
50 #define N 4000000
51 #define NTIMES 100
52 #define OFFSET 0
53
54 #define HLINE "-------------------------------------------------------------\n"
55
56 #if !defined(MIN)
57 #define MIN(x, y) ((x) < (y) ? (x) : (y))
58 #endif
59 #if !defined(MAX)
60 #define MAX(x, y) ((x) > (y) ? (x) : (y))
61 #endif
62
63 #if STATIC_ALLOC
64 double a[N + OFFSET], b[N + OFFSET], c[N + OFFSET];
65 #endif
66
67 static int checktick(void);
68 static double Second(void);
69
main(int argc,char * argv[])70 int main(int argc, char *argv[])
71 {
72 const char *label[4] = {"Copy", "Scale", "Add", "Triad"};
73 const double bytes[4] = {2 * sizeof(double) * N, 2 * sizeof(double) * N, 3 * sizeof(double) * N, 3 * sizeof(double) * N};
74 double rmstime[4] = {0}, maxtime[4] = {0}, mintime[4] = {FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX};
75 int quantum;
76 int BytesPerWord, j, k, size;
77 PetscInt node = -1;
78 double scalar, t, times[4][NTIMES];
79 #if !STATIC_ALLOC
80 double *PETSC_RESTRICT a, *PETSC_RESTRICT b, *PETSC_RESTRICT c;
81 #endif
82
83 PetscCall(PetscInitialize(&argc, &argv, 0, help));
84 PetscCallMPI(MPI_Comm_size(PETSC_COMM_WORLD, &size));
85 PetscCall(PetscOptionsGetInt(NULL, NULL, "-node", &node, NULL));
86 /* --- SETUP --- determine precision and check timing --- */
87
88 PetscPrintf(PETSC_COMM_WORLD, HLINE);
89 BytesPerWord = sizeof(double);
90 PetscPrintf(PETSC_COMM_WORLD, "This system uses %d bytes per DOUBLE PRECISION word.\n", BytesPerWord);
91
92 PetscPrintf(PETSC_COMM_WORLD, HLINE);
93 PetscPrintf(PETSC_COMM_WORLD, "Array size = %d, Offset = %d\n", N, OFFSET);
94 PetscPrintf(PETSC_COMM_WORLD, "Total memory required = %.1f MB per process.\n", (3 * N * BytesPerWord) / 1048576.0);
95 PetscPrintf(PETSC_COMM_WORLD, "Each test is run %d times, but only\n", NTIMES);
96 PetscPrintf(PETSC_COMM_WORLD, "the *best* time for each is used.\n");
97
98 /* Get initial value for system clock. */
99
100 #if !STATIC_ALLOC
101 if (node == -1) {
102 posix_memalign((void **)&a, 64, N * sizeof(double));
103 posix_memalign((void **)&b, 64, N * sizeof(double));
104 posix_memalign((void **)&c, 64, N * sizeof(double));
105 } else if (node == -2) {
106 a = malloc(N * sizeof(double));
107 b = malloc(N * sizeof(double));
108 c = malloc(N * sizeof(double));
109 #if defined(HAVE_NUMA)
110 } else {
111 a = numa_alloc_onnode(N * sizeof(double), node);
112 b = numa_alloc_onnode(N * sizeof(double), node);
113 c = numa_alloc_onnode(N * sizeof(double), node);
114 #endif
115 }
116 #endif
117 #if FAULT_TOGETHER
118 for (j = 0; j < N; j++) {
119 a[j] = 1.0;
120 b[j] = 2.0;
121 c[j] = 0.0;
122 }
123 #else
124 for (j = 0; j < N; j++) a[j] = 1.0;
125 for (j = 0; j < N; j++) b[j] = 2.0;
126 for (j = 0; j < N; j++) c[j] = 0.0;
127 #endif
128
129 PetscPrintf(PETSC_COMM_WORLD, HLINE);
130
131 if ((quantum = checktick()) >= 1) PetscPrintf(PETSC_COMM_WORLD, "Your clock granularity/precision appears to be %d microseconds.\n", quantum);
132 else PetscPrintf(PETSC_COMM_WORLD, "Your clock granularity appears to be less than one microsecond.\n");
133
134 t = Second();
135 for (j = 0; j < N; j++) a[j] = 2.0E0 * a[j];
136 t = 1.0E6 * (Second() - t);
137
138 PetscPrintf(PETSC_COMM_WORLD, "Each test below will take on the order of %d microseconds.\n", (int)t);
139 PetscPrintf(PETSC_COMM_WORLD, " (= %d clock ticks)\n", (int)(t / quantum));
140 PetscPrintf(PETSC_COMM_WORLD, "Increase the size of the arrays if this shows that\n");
141 PetscPrintf(PETSC_COMM_WORLD, "you are not getting at least 20 clock ticks per test.\n");
142
143 PetscPrintf(PETSC_COMM_WORLD, HLINE);
144
145 PetscPrintf(PETSC_COMM_WORLD, "WARNING -- The above is only a rough guideline.\n");
146 PetscPrintf(PETSC_COMM_WORLD, "For best results, please be sure you know the\n");
147 PetscPrintf(PETSC_COMM_WORLD, "precision of your system timer.\n");
148 PetscPrintf(PETSC_COMM_WORLD, HLINE);
149
150 /* --- MAIN LOOP --- repeat test cases NTIMES times --- */
151
152 scalar = 3.0;
153 for (k = 0; k < NTIMES; k++) {
154 MPI_Barrier(PETSC_COMM_WORLD);
155 /* ### COPY: c <- a ### */
156 times[0][k] = Second();
157 MPI_Barrier(PETSC_COMM_WORLD);
158 #if USE_MEMCPY
159 memcpy(c, a, N * sizeof(double));
160 #elif SSE2
161 for (j = 0; j < N; j += 8) {
162 _mm_stream_pd(c + j + 0, _mm_load_pd(a + j + 0));
163 _mm_stream_pd(c + j + 2, _mm_load_pd(a + j + 2));
164 _mm_stream_pd(c + j + 4, _mm_load_pd(a + j + 4));
165 _mm_stream_pd(c + j + 6, _mm_load_pd(a + j + 6));
166 #if PREFETCH_NTA
167 _mm_prefetch(a + j + 64, _MM_HINT_NTA);
168 #endif
169 }
170 #else
171 for (j = 0; j < N; j++) c[j] = a[j];
172 #endif
173 MPI_Barrier(PETSC_COMM_WORLD);
174 times[0][k] = Second() - times[0][k];
175
176 /* ### SCALE: b <- scalar * c ### */
177 times[1][k] = Second();
178 MPI_Barrier(PETSC_COMM_WORLD);
179 #if SSE2
180 {
181 __m128d scalar2 = _mm_set1_pd(scalar);
182 for (j = 0; j < N; j += 8) {
183 _mm_stream_pd(b + j + 0, _mm_mul_pd(scalar2, _mm_load_pd(c + j + 0)));
184 _mm_stream_pd(b + j + 2, _mm_mul_pd(scalar2, _mm_load_pd(c + j + 2)));
185 _mm_stream_pd(b + j + 4, _mm_mul_pd(scalar2, _mm_load_pd(c + j + 4)));
186 _mm_stream_pd(b + j + 6, _mm_mul_pd(scalar2, _mm_load_pd(c + j + 6)));
187 #if PREFETCH_NTA
188 _mm_prefetch(c + j + 64, _MM_HINT_NTA);
189 #endif
190 }
191 }
192 #else
193 for (j = 0; j < N; j++) b[j] = scalar * c[j];
194 #endif
195 MPI_Barrier(PETSC_COMM_WORLD);
196 times[1][k] = Second() - times[1][k];
197
198 /* ### ADD: c <- a + b ### */
199 times[2][k] = Second();
200 MPI_Barrier(PETSC_COMM_WORLD);
201 #if SSE2
202 {
203 for (j = 0; j < N; j += 8) {
204 _mm_stream_pd(c + j + 0, _mm_add_pd(_mm_load_pd(a + j + 0), _mm_load_pd(b + j + 0)));
205 _mm_stream_pd(c + j + 2, _mm_add_pd(_mm_load_pd(a + j + 2), _mm_load_pd(b + j + 2)));
206 _mm_stream_pd(c + j + 4, _mm_add_pd(_mm_load_pd(a + j + 4), _mm_load_pd(b + j + 4)));
207 _mm_stream_pd(c + j + 6, _mm_add_pd(_mm_load_pd(a + j + 6), _mm_load_pd(b + j + 6)));
208 #if PREFETCH_NTA
209 _mm_prefetch(a + j + 64, _MM_HINT_NTA);
210 _mm_prefetch(b + j + 64, _MM_HINT_NTA);
211 #endif
212 }
213 }
214 #else
215 for (j = 0; j < N; j++) c[j] = a[j] + b[j];
216 #endif
217 MPI_Barrier(PETSC_COMM_WORLD);
218 times[2][k] = Second() - times[2][k];
219
220 /* ### TRIAD: a <- b + scalar * c ### */
221 times[3][k] = Second();
222 MPI_Barrier(PETSC_COMM_WORLD);
223 #if SSE2
224 {
225 __m128d scalar2 = _mm_set1_pd(scalar);
226 for (j = 0; j < N; j += 8) {
227 _mm_stream_pd(a + j + 0, _mm_add_pd(_mm_load_pd(b + j + 0), _mm_mul_pd(scalar2, _mm_load_pd(c + j + 0))));
228 _mm_stream_pd(a + j + 2, _mm_add_pd(_mm_load_pd(b + j + 2), _mm_mul_pd(scalar2, _mm_load_pd(c + j + 2))));
229 _mm_stream_pd(a + j + 4, _mm_add_pd(_mm_load_pd(b + j + 4), _mm_mul_pd(scalar2, _mm_load_pd(c + j + 4))));
230 _mm_stream_pd(a + j + 6, _mm_add_pd(_mm_load_pd(b + j + 6), _mm_mul_pd(scalar2, _mm_load_pd(c + j + 6))));
231 #if PREFETCH_NTA
232 _mm_prefetch(b + j + 64, _MM_HINT_NTA);
233 _mm_prefetch(c + j + 64, _MM_HINT_NTA);
234 #endif
235 }
236 }
237 #else
238 for (j = 0; j < N; j++) a[j] = b[j] + scalar * c[j];
239 #endif
240 MPI_Barrier(PETSC_COMM_WORLD);
241 times[3][k] = Second() - times[3][k];
242 }
243
244 /* --- SUMMARY --- */
245
246 for (k = 0; k < NTIMES; k++)
247 for (j = 0; j < 4; j++) {
248 rmstime[j] = rmstime[j] + (times[j][k] * times[j][k]);
249 mintime[j] = MIN(mintime[j], times[j][k]);
250 maxtime[j] = MAX(maxtime[j], times[j][k]);
251 }
252
253 PetscPrintf(PETSC_COMM_WORLD, "%8s: %11s %11s %11s %11s %11s\n", "Function", "Rate (MB/s)", "Total (MB/s)", "RMS time", "Min time", "Max time");
254 for (j = 0; j < 4; j++) {
255 rmstime[j] = sqrt(rmstime[j] / (double)NTIMES);
256 PetscPrintf(PETSC_COMM_WORLD, "%8s: %11.4f %11.4f %11.4f %11.4f %11.4f\n", label[j], 1.0e-06 * bytes[j] / mintime[j], size * 1.0e-06 * bytes[j] / mintime[j], rmstime[j], mintime[j], maxtime[j]);
257 }
258 PetscCall(PetscFinalize());
259 return 0;
260 }
261
Second()262 static double Second()
263 {
264 double t;
265 PetscTime(&t);
266 return t;
267 }
268
269 #define M 20
checktick(void)270 static int checktick(void)
271 {
272 int i, minDelta, Delta;
273 double t1, t2, timesfound[M];
274
275 /* Collect a sequence of M unique time values from the system. */
276
277 for (i = 0; i < M; i++) {
278 t1 = Second();
279 while ((t2 = Second()) - t1 < 1.0E-6) { }
280 timesfound[i] = t1 = t2;
281 }
282
283 /*
284 * Determine the minimum difference between these M values.
285 * This result will be our estimate (in microseconds) for the
286 * clock granularity.
287 */
288
289 minDelta = 1000000;
290 for (i = 1; i < M; i++) {
291 Delta = (int)(1.0E6 * (timesfound[i] - timesfound[i - 1]));
292 minDelta = MIN(minDelta, MAX(Delta, 0));
293 }
294
295 return minDelta;
296 }
297