xref: /petsc/src/benchmarks/streams/SSEVersion.c (revision ccfb0f9f40a0131988d7995ed9679700dae2a75a)
1 static const char help[] = "STREAM benchmark specialized for SSE2\n\\n";
2 
3 /* Note: this file has been modified significantly from its original version */
4 #include <emmintrin.h>
5 #include <petsctime.h>
6 #include <petscsys.h>
7 #if defined(HAVE_NUMA)
8   #include <numa.h>
9 #endif
10 #include <float.h>
11 
12 #if !defined(SSE2)
13   #define SSE2 1
14 #endif
15 #if !defined(__SSE2__)
16   #error SSE2 instruction set is not enabled, try adding -march=native to CFLAGS or disable by adding -DSSE2=0
17 #endif
18 #if !defined(PREFETCH_NTA) /* Use software prefetch and set non-temporal policy so that lines evicted from L1D will not subsequently reside in L2 or L3. */
19   #define PREFETCH_NTA 1
20 #endif
21 #if !defined(STATIC_ALLOC) /* Statically allocate the vectors. Most platforms do not find physical pages when memory is allocated, therefore the faulting strategy still affects performance. */
22   #define STATIC_ALLOC 0
23 #endif
24 #if !defined(FAULT_TOGETHER) /* Faults all three vectors together which usually interleaves DRAM pages in physical memory. */
25   #define FAULT_TOGETHER 0
26 #endif
27 #if !defined(USE_MEMCPY) /* Literally call memcpy(3) for the COPY benchmark. Some compilers detect the unoptimized loop as memcpy and call this anyway. */
28   #define USE_MEMCPY 0
29 #endif
30 
31 /*
32  * Program: Stream
33  * Programmer: Joe R. Zagar
34  * Revision: 4.0-BETA, October 24, 1995
35  * Original code developed by John D. McCalpin
36  *
37  * This program measures memory transfer rates in MB/s for simple
38  * computational kernels coded in C.  These numbers reveal the quality
39  * of code generation for simple uncacheable kernels as well as showing
40  * the cost of floating-point operations relative to memory accesses.
41  *
42  * INSTRUCTIONS:
43  *
44  *       1) Stream requires a good bit of memory to run.  Adjust the
45  *          value of 'N' (below) to give a 'timing calibration' of
46  *          at least 20 clock-ticks.  This will provide rate estimates
47  *          that should be good to about 5% precision.
48  */
49 
50 #define N      4000000
51 #define NTIMES 100
52 #define OFFSET 0
53 
54 #define HLINE "-------------------------------------------------------------\n"
55 
56 #if !defined(MIN)
57   #define MIN(x, y) ((x) < (y) ? (x) : (y))
58 #endif
59 #if !defined(MAX)
60   #define MAX(x, y) ((x) > (y) ? (x) : (y))
61 #endif
62 
63 #if STATIC_ALLOC
64 double a[N + OFFSET], b[N + OFFSET], c[N + OFFSET];
65 #endif
66 
67 static int    checktick(void);
68 static double Second(void);
69 
70 int main(int argc, char *argv[])
71 {
72   const char  *label[4]   = {"Copy", "Scale", "Add", "Triad"};
73   const double bytes[4]   = {2 * sizeof(double) * N, 2 * sizeof(double) * N, 3 * sizeof(double) * N, 3 * sizeof(double) * N};
74   double       rmstime[4] = {0}, maxtime[4] = {0}, mintime[4] = {FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX};
75   int          quantum;
76   int          BytesPerWord, j, k, size;
77   PetscInt     node = -1;
78   double       scalar, t, times[4][NTIMES];
79 #if !STATIC_ALLOC
80   double *PETSC_RESTRICT a, *PETSC_RESTRICT b, *PETSC_RESTRICT c;
81 #endif
82 
83   PetscCall(PetscInitialize(&argc, &argv, 0, help));
84   PetscCallMPI(MPI_Comm_size(PETSC_COMM_WORLD, &size));
85   PetscCall(PetscOptionsGetInt(NULL, NULL, "-node", &node, NULL));
86   /* --- SETUP --- determine precision and check timing --- */
87 
88   PetscPrintf(PETSC_COMM_WORLD, HLINE);
89   BytesPerWord = sizeof(double);
90   PetscPrintf(PETSC_COMM_WORLD, "This system uses %d bytes per DOUBLE PRECISION word.\n", BytesPerWord);
91 
92   PetscPrintf(PETSC_COMM_WORLD, HLINE);
93   PetscPrintf(PETSC_COMM_WORLD, "Array size = %d, Offset = %d\n", N, OFFSET);
94   PetscPrintf(PETSC_COMM_WORLD, "Total memory required = %.1f MB per process.\n", (3 * N * BytesPerWord) / 1048576.0);
95   PetscPrintf(PETSC_COMM_WORLD, "Each test is run %d times, but only\n", NTIMES);
96   PetscPrintf(PETSC_COMM_WORLD, "the *best* time for each is used.\n");
97 
98   /* Get initial value for system clock. */
99 
100 #if !STATIC_ALLOC
101   if (node == -1) {
102     posix_memalign((void **)&a, 64, N * sizeof(double));
103     posix_memalign((void **)&b, 64, N * sizeof(double));
104     posix_memalign((void **)&c, 64, N * sizeof(double));
105   } else if (node == -2) {
106     a = malloc(N * sizeof(double));
107     b = malloc(N * sizeof(double));
108     c = malloc(N * sizeof(double));
109   #if defined(HAVE_NUMA)
110   } else {
111     a = numa_alloc_onnode(N * sizeof(double), node);
112     b = numa_alloc_onnode(N * sizeof(double), node);
113     c = numa_alloc_onnode(N * sizeof(double), node);
114   #endif
115   }
116 #endif
117 #if FAULT_TOGETHER
118   for (j = 0; j < N; j++) {
119     a[j] = 1.0;
120     b[j] = 2.0;
121     c[j] = 0.0;
122   }
123 #else
124   for (j = 0; j < N; j++) a[j] = 1.0;
125   for (j = 0; j < N; j++) b[j] = 2.0;
126   for (j = 0; j < N; j++) c[j] = 0.0;
127 #endif
128 
129   PetscPrintf(PETSC_COMM_WORLD, HLINE);
130 
131   if ((quantum = checktick()) >= 1) PetscPrintf(PETSC_COMM_WORLD, "Your clock granularity/precision appears to be %d microseconds.\n", quantum);
132   else PetscPrintf(PETSC_COMM_WORLD, "Your clock granularity appears to be less than one microsecond.\n");
133 
134   t = Second();
135   for (j = 0; j < N; j++) a[j] = 2.0E0 * a[j];
136   t = 1.0E6 * (Second() - t);
137 
138   PetscPrintf(PETSC_COMM_WORLD, "Each test below will take on the order of %d microseconds.\n", (int)t);
139   PetscPrintf(PETSC_COMM_WORLD, "   (= %d clock ticks)\n", (int)(t / quantum));
140   PetscPrintf(PETSC_COMM_WORLD, "Increase the size of the arrays if this shows that\n");
141   PetscPrintf(PETSC_COMM_WORLD, "you are not getting at least 20 clock ticks per test.\n");
142 
143   PetscPrintf(PETSC_COMM_WORLD, HLINE);
144 
145   PetscPrintf(PETSC_COMM_WORLD, "WARNING -- The above is only a rough guideline.\n");
146   PetscPrintf(PETSC_COMM_WORLD, "For best results, please be sure you know the\n");
147   PetscPrintf(PETSC_COMM_WORLD, "precision of your system timer.\n");
148   PetscPrintf(PETSC_COMM_WORLD, HLINE);
149 
150   /* --- MAIN LOOP --- repeat test cases NTIMES times --- */
151 
152   scalar = 3.0;
153   for (k = 0; k < NTIMES; k++) {
154     MPI_Barrier(PETSC_COMM_WORLD);
155     /* ### COPY: c <- a ### */
156     times[0][k] = Second();
157     MPI_Barrier(PETSC_COMM_WORLD);
158 #if USE_MEMCPY
159     memcpy(c, a, N * sizeof(double));
160 #elif SSE2
161     for (j = 0; j < N; j += 8) {
162       _mm_stream_pd(c + j + 0, _mm_load_pd(a + j + 0));
163       _mm_stream_pd(c + j + 2, _mm_load_pd(a + j + 2));
164       _mm_stream_pd(c + j + 4, _mm_load_pd(a + j + 4));
165       _mm_stream_pd(c + j + 6, _mm_load_pd(a + j + 6));
166   #if PREFETCH_NTA
167       _mm_prefetch(a + j + 64, _MM_HINT_NTA);
168   #endif
169     }
170 #else
171     for (j = 0; j < N; j++) c[j] = a[j];
172 #endif
173     MPI_Barrier(PETSC_COMM_WORLD);
174     times[0][k] = Second() - times[0][k];
175 
176     /* ### SCALE: b <- scalar * c ### */
177     times[1][k] = Second();
178     MPI_Barrier(PETSC_COMM_WORLD);
179 #if SSE2
180     {
181       __m128d scalar2 = _mm_set1_pd(scalar);
182       for (j = 0; j < N; j += 8) {
183         _mm_stream_pd(b + j + 0, _mm_mul_pd(scalar2, _mm_load_pd(c + j + 0)));
184         _mm_stream_pd(b + j + 2, _mm_mul_pd(scalar2, _mm_load_pd(c + j + 2)));
185         _mm_stream_pd(b + j + 4, _mm_mul_pd(scalar2, _mm_load_pd(c + j + 4)));
186         _mm_stream_pd(b + j + 6, _mm_mul_pd(scalar2, _mm_load_pd(c + j + 6)));
187   #if PREFETCH_NTA
188         _mm_prefetch(c + j + 64, _MM_HINT_NTA);
189   #endif
190       }
191     }
192 #else
193     for (j = 0; j < N; j++) b[j] = scalar * c[j];
194 #endif
195     MPI_Barrier(PETSC_COMM_WORLD);
196     times[1][k] = Second() - times[1][k];
197 
198     /* ### ADD: c <- a + b ### */
199     times[2][k] = Second();
200     MPI_Barrier(PETSC_COMM_WORLD);
201 #if SSE2
202     {
203       for (j = 0; j < N; j += 8) {
204         _mm_stream_pd(c + j + 0, _mm_add_pd(_mm_load_pd(a + j + 0), _mm_load_pd(b + j + 0)));
205         _mm_stream_pd(c + j + 2, _mm_add_pd(_mm_load_pd(a + j + 2), _mm_load_pd(b + j + 2)));
206         _mm_stream_pd(c + j + 4, _mm_add_pd(_mm_load_pd(a + j + 4), _mm_load_pd(b + j + 4)));
207         _mm_stream_pd(c + j + 6, _mm_add_pd(_mm_load_pd(a + j + 6), _mm_load_pd(b + j + 6)));
208   #if PREFETCH_NTA
209         _mm_prefetch(a + j + 64, _MM_HINT_NTA);
210         _mm_prefetch(b + j + 64, _MM_HINT_NTA);
211   #endif
212       }
213     }
214 #else
215     for (j = 0; j < N; j++) c[j] = a[j] + b[j];
216 #endif
217     MPI_Barrier(PETSC_COMM_WORLD);
218     times[2][k] = Second() - times[2][k];
219 
220     /* ### TRIAD: a <- b + scalar * c ### */
221     times[3][k] = Second();
222     MPI_Barrier(PETSC_COMM_WORLD);
223 #if SSE2
224     {
225       __m128d scalar2 = _mm_set1_pd(scalar);
226       for (j = 0; j < N; j += 8) {
227         _mm_stream_pd(a + j + 0, _mm_add_pd(_mm_load_pd(b + j + 0), _mm_mul_pd(scalar2, _mm_load_pd(c + j + 0))));
228         _mm_stream_pd(a + j + 2, _mm_add_pd(_mm_load_pd(b + j + 2), _mm_mul_pd(scalar2, _mm_load_pd(c + j + 2))));
229         _mm_stream_pd(a + j + 4, _mm_add_pd(_mm_load_pd(b + j + 4), _mm_mul_pd(scalar2, _mm_load_pd(c + j + 4))));
230         _mm_stream_pd(a + j + 6, _mm_add_pd(_mm_load_pd(b + j + 6), _mm_mul_pd(scalar2, _mm_load_pd(c + j + 6))));
231   #if PREFETCH_NTA
232         _mm_prefetch(b + j + 64, _MM_HINT_NTA);
233         _mm_prefetch(c + j + 64, _MM_HINT_NTA);
234   #endif
235       }
236     }
237 #else
238     for (j = 0; j < N; j++) a[j] = b[j] + scalar * c[j];
239 #endif
240     MPI_Barrier(PETSC_COMM_WORLD);
241     times[3][k] = Second() - times[3][k];
242   }
243 
244   /* --- SUMMARY --- */
245 
246   for (k = 0; k < NTIMES; k++)
247     for (j = 0; j < 4; j++) {
248       rmstime[j] = rmstime[j] + (times[j][k] * times[j][k]);
249       mintime[j] = MIN(mintime[j], times[j][k]);
250       maxtime[j] = MAX(maxtime[j], times[j][k]);
251     }
252 
253   PetscPrintf(PETSC_COMM_WORLD, "%8s:  %11s  %11s  %11s  %11s  %11s\n", "Function", "Rate (MB/s)", "Total (MB/s)", "RMS time", "Min time", "Max time");
254   for (j = 0; j < 4; j++) {
255     rmstime[j] = sqrt(rmstime[j] / (double)NTIMES);
256     PetscPrintf(PETSC_COMM_WORLD, "%8s: %11.4f  %11.4f  %11.4f  %11.4f  %11.4f\n", label[j], 1.0e-06 * bytes[j] / mintime[j], size * 1.0e-06 * bytes[j] / mintime[j], rmstime[j], mintime[j], maxtime[j]);
257   }
258   PetscCall(PetscFinalize());
259   return 0;
260 }
261 
262 static double Second()
263 {
264   double t;
265   PetscTime(&t);
266   return t;
267 }
268 
269 #define M 20
270 static int checktick(void)
271 {
272   int    i, minDelta, Delta;
273   double t1, t2, timesfound[M];
274 
275   /*  Collect a sequence of M unique time values from the system. */
276 
277   for (i = 0; i < M; i++) {
278     t1 = Second();
279     while ((t2 = Second()) - t1 < 1.0E-6) { }
280     timesfound[i] = t1 = t2;
281   }
282 
283   /*
284    * Determine the minimum difference between these M values.
285    * This result will be our estimate (in microseconds) for the
286    * clock granularity.
287    */
288 
289   minDelta = 1000000;
290   for (i = 1; i < M; i++) {
291     Delta    = (int)(1.0E6 * (timesfound[i] - timesfound[i - 1]));
292     minDelta = MIN(minDelta, MAX(Delta, 0));
293   }
294 
295   return minDelta;
296 }
297