19f0612e4SBarry Smith /*
29f0612e4SBarry Smith A simplification of the Stream benchmark for OpenMP
39f0612e4SBarry Smith The array for each thread is a large distance from the array for the other threads
49f0612e4SBarry Smith Original code developed by John D. McCalpin
59f0612e4SBarry Smith */
69f0612e4SBarry Smith #include <stdio.h>
79f0612e4SBarry Smith #include <math.h>
89f0612e4SBarry Smith #include <limits.h>
99f0612e4SBarry Smith #include <float.h>
109f0612e4SBarry Smith #include <sys/time.h>
119f0612e4SBarry Smith #include <stdlib.h>
129f0612e4SBarry Smith #include <omp.h>
139f0612e4SBarry Smith #include <petscsys.h>
149f0612e4SBarry Smith
159f0612e4SBarry Smith #define NTIMESINNER 1
169f0612e4SBarry Smith #define N 2 * 4 * 20000000
179f0612e4SBarry Smith //#define N 1200000
189f0612e4SBarry Smith //#define N 120000
199f0612e4SBarry Smith #define NTIMES 50
209f0612e4SBarry Smith #define OFFSET 0
219f0612e4SBarry Smith
229f0612e4SBarry Smith #if !defined(MIN)
239f0612e4SBarry Smith #define MIN(x, y) ((x) < (y) ? (x) : (y))
249f0612e4SBarry Smith #endif
259f0612e4SBarry Smith #if !defined(MAX)
269f0612e4SBarry Smith #define MAX(x, y) ((x) > (y) ? (x) : (y))
279f0612e4SBarry Smith #endif
289f0612e4SBarry Smith
299f0612e4SBarry Smith static double a[64][10000000], b[64][10000000], c[64][10000000];
309f0612e4SBarry Smith static double mintime = FLT_MAX;
319f0612e4SBarry Smith static double bytes = 3 * sizeof(double) * N;
329f0612e4SBarry Smith
main()339f0612e4SBarry Smith int main()
349f0612e4SBarry Smith {
359f0612e4SBarry Smith const static double scalar = 3.0;
369f0612e4SBarry Smith #pragma omp threadprivate(scalar)
379f0612e4SBarry Smith double times[NTIMES], rate;
389f0612e4SBarry Smith int size;
399f0612e4SBarry Smith static int n;
409f0612e4SBarry Smith #pragma omp threadprivate(n)
419f0612e4SBarry Smith char *env;
429f0612e4SBarry Smith FILE *fd;
439f0612e4SBarry Smith
449f0612e4SBarry Smith env = getenv("OMP_NUM_THREADS");
459f0612e4SBarry Smith if (!env) env = (char *)"1";
469f0612e4SBarry Smith sscanf(env, "%d", &size);
479f0612e4SBarry Smith
489f0612e4SBarry Smith #pragma omp parallel for schedule(static)
499f0612e4SBarry Smith for (int j = 0; j < size; j++) {
509f0612e4SBarry Smith n = (N / size + ((N % size) > omp_get_thread_num()));
519f0612e4SBarry Smith for (int i = 0; i < n; i++) {
529f0612e4SBarry Smith a[j][i] = 1.0;
539f0612e4SBarry Smith b[j][i] = 2.0;
549f0612e4SBarry Smith c[j][i] = 3.0;
559f0612e4SBarry Smith }
569f0612e4SBarry Smith }
579f0612e4SBarry Smith
589f0612e4SBarry Smith /* --- MAIN LOOP --- repeat test cases NTIMES times --- */
59*67595998SJunchao Zhang for (int k = 0; k < NTIMES; k++) {
609f0612e4SBarry Smith times[k] = MPI_Wtime();
619f0612e4SBarry Smith // https://www.openmp.org/wp-content/uploads/OpenMP-API-Specification-5-2.pdf
629f0612e4SBarry Smith // #pragma omp parallel for (same performance as below)
639f0612e4SBarry Smith // #pragma omp parallel for simd schedule(static) (same performance as below)
649f0612e4SBarry Smith #pragma omp parallel for schedule(static)
659f0612e4SBarry Smith for (int j = 0; j < size; j++) {
669f0612e4SBarry Smith n = (N / size + ((N % size) > omp_get_thread_num()));
679f0612e4SBarry Smith double *aa = a[j]; // these don't change the timings
689f0612e4SBarry Smith const double *bb = b[j];
699f0612e4SBarry Smith const double *cc = c[j];
709f0612e4SBarry Smith for (int l = 0; l < NTIMESINNER; l++) {
719f0612e4SBarry Smith for (register int i = 0; i < n; i++) aa[i] = bb[i] + scalar * cc[i];
729f0612e4SBarry Smith if (size == 65) printf("never printed %g\n", a[0][11]);
739f0612e4SBarry Smith }
749f0612e4SBarry Smith }
759f0612e4SBarry Smith times[k] = MPI_Wtime() - times[k];
769f0612e4SBarry Smith }
779f0612e4SBarry Smith for (int k = 1; k < NTIMES; k++) { /* note -- skip first iteration */
789f0612e4SBarry Smith mintime = MIN(mintime, times[k]);
799f0612e4SBarry Smith }
809f0612e4SBarry Smith rate = 1.0E-06 * bytes * NTIMESINNER / mintime;
819f0612e4SBarry Smith
829f0612e4SBarry Smith if (size == 1) {
839f0612e4SBarry Smith printf("%d %11.4f Rate (MB/s) 1\n", size, rate);
849f0612e4SBarry Smith fd = fopen("flops", "w");
859f0612e4SBarry Smith fprintf(fd, "%g\n", rate);
869f0612e4SBarry Smith fclose(fd);
879f0612e4SBarry Smith } else {
889f0612e4SBarry Smith double prate;
899f0612e4SBarry Smith fd = fopen("flops", "r");
909f0612e4SBarry Smith fscanf(fd, "%lg", &prate);
919f0612e4SBarry Smith fclose(fd);
929f0612e4SBarry Smith printf("%d %11.4f Rate (MB/s) %g\n", size, rate, rate / prate);
939f0612e4SBarry Smith }
949f0612e4SBarry Smith return 0;
959f0612e4SBarry Smith }
96