xref: /petsc/src/benchmarks/streams/OpenMPVersionLikeMPI.c (revision 9f0612e409f6220a780be6348417bea34ef34962)
1*9f0612e4SBarry Smith /*
2*9f0612e4SBarry Smith   A simplification of the Stream benchmark for OpenMP
3*9f0612e4SBarry Smith   The array for each thread is a large distance from the array for the other threads
4*9f0612e4SBarry Smith   Original code developed by John D. McCalpin
5*9f0612e4SBarry Smith */
6*9f0612e4SBarry Smith #include <stdio.h>
7*9f0612e4SBarry Smith #include <math.h>
8*9f0612e4SBarry Smith #include <limits.h>
9*9f0612e4SBarry Smith #include <float.h>
10*9f0612e4SBarry Smith #include <sys/time.h>
11*9f0612e4SBarry Smith #include <stdlib.h>
12*9f0612e4SBarry Smith #include <omp.h>
13*9f0612e4SBarry Smith #include <petscsys.h>
14*9f0612e4SBarry Smith 
15*9f0612e4SBarry Smith #define NTIMESINNER 1
16*9f0612e4SBarry Smith #define N 2*4*20000000
17*9f0612e4SBarry Smith //#define N 1200000
18*9f0612e4SBarry Smith //#define N 120000
19*9f0612e4SBarry Smith #define NTIMES       50
20*9f0612e4SBarry Smith #define OFFSET       0
21*9f0612e4SBarry Smith 
22*9f0612e4SBarry Smith # if !defined(MIN)
23*9f0612e4SBarry Smith # define MIN(x,y) ((x)<(y) ? (x) : (y))
24*9f0612e4SBarry Smith # endif
25*9f0612e4SBarry Smith # if !defined(MAX)
26*9f0612e4SBarry Smith # define MAX(x,y) ((x)>(y) ? (x) : (y))
27*9f0612e4SBarry Smith # endif
28*9f0612e4SBarry Smith 
29*9f0612e4SBarry Smith static double a[64][10000000],b[64][10000000],c[64][10000000];
30*9f0612e4SBarry Smith static double  mintime = FLT_MAX;
31*9f0612e4SBarry Smith static double bytes =  3 * sizeof(double) * N;
32*9f0612e4SBarry Smith 
33*9f0612e4SBarry Smith int main()
34*9f0612e4SBarry Smith {
35*9f0612e4SBarry Smith   const static double scalar = 3.0;
36*9f0612e4SBarry Smith #pragma omp threadprivate(scalar)
37*9f0612e4SBarry Smith   double       times[NTIMES],rate;
38*9f0612e4SBarry Smith   int          size;
39*9f0612e4SBarry Smith   static int   n;
40*9f0612e4SBarry Smith #pragma omp threadprivate(n)
41*9f0612e4SBarry Smith   char         *env;
42*9f0612e4SBarry Smith   FILE         *fd;
43*9f0612e4SBarry Smith 
44*9f0612e4SBarry Smith   env = getenv("OMP_NUM_THREADS");
45*9f0612e4SBarry Smith   if (!env) env = (char *) "1";
46*9f0612e4SBarry Smith   sscanf(env,"%d",&size);
47*9f0612e4SBarry Smith 
48*9f0612e4SBarry Smith #pragma omp parallel for schedule(static)
49*9f0612e4SBarry Smith   for (int j=0; j<size; j++) {
50*9f0612e4SBarry Smith     n = (N / size + ((N % size) > omp_get_thread_num()));
51*9f0612e4SBarry Smith     for (int i=0; i<n; i++){
52*9f0612e4SBarry Smith       a[j][i] = 1.0;
53*9f0612e4SBarry Smith       b[j][i] = 2.0;
54*9f0612e4SBarry Smith       c[j][i] = 3.0;
55*9f0612e4SBarry Smith     }
56*9f0612e4SBarry Smith   }
57*9f0612e4SBarry Smith 
58*9f0612e4SBarry Smith   /*  --- MAIN LOOP --- repeat test cases NTIMES times --- */
59*9f0612e4SBarry Smith   for (int k=0; k<NTIMES; k++)
60*9f0612e4SBarry Smith   {
61*9f0612e4SBarry Smith     times[k] = MPI_Wtime();
62*9f0612e4SBarry Smith     // https://www.openmp.org/wp-content/uploads/OpenMP-API-Specification-5-2.pdf
63*9f0612e4SBarry Smith // #pragma omp parallel for  (same performance as below)
64*9f0612e4SBarry Smith // #pragma omp parallel for simd schedule(static)  (same performance as below)
65*9f0612e4SBarry Smith #pragma omp parallel for schedule(static)
66*9f0612e4SBarry Smith     for (int j=0; j<size; j++) {
67*9f0612e4SBarry Smith       n = (N / size + ((N % size) > omp_get_thread_num()));
68*9f0612e4SBarry Smith       double *aa = a[j];  // these don't change the timings
69*9f0612e4SBarry Smith       const double *bb = b[j];
70*9f0612e4SBarry Smith       const double *cc = c[j];
71*9f0612e4SBarry Smith       for (int l=0; l<NTIMESINNER; l++) {
72*9f0612e4SBarry Smith         for (register int i=0; i<n; i++) aa[i] = bb[i]+scalar*cc[i];
73*9f0612e4SBarry Smith           if (size == 65) printf("never printed %g\n",a[0][11]);
74*9f0612e4SBarry Smith       }
75*9f0612e4SBarry Smith     }
76*9f0612e4SBarry Smith     times[k] = MPI_Wtime() - times[k];
77*9f0612e4SBarry Smith   }
78*9f0612e4SBarry Smith   for (int k=1; k<NTIMES; k++) {  /* note -- skip first iteration */
79*9f0612e4SBarry Smith       mintime = MIN(mintime, times[k]);
80*9f0612e4SBarry Smith   }
81*9f0612e4SBarry Smith   rate = 1.0E-06 * bytes*NTIMESINNER/mintime;
82*9f0612e4SBarry Smith 
83*9f0612e4SBarry Smith   if (size == 1) {
84*9f0612e4SBarry Smith     printf("%d %11.4f   Rate (MB/s) 1\n",size, rate);
85*9f0612e4SBarry Smith     fd = fopen("flops","w");
86*9f0612e4SBarry Smith     fprintf(fd,"%g\n",rate);
87*9f0612e4SBarry Smith     fclose(fd);
88*9f0612e4SBarry Smith   } else {
89*9f0612e4SBarry Smith     double prate;
90*9f0612e4SBarry Smith     fd = fopen("flops","r");
91*9f0612e4SBarry Smith     fscanf(fd,"%lg",&prate);
92*9f0612e4SBarry Smith     fclose(fd);
93*9f0612e4SBarry Smith     printf("%d %11.4f   Rate (MB/s) %g \n", size, rate,rate/prate);
94*9f0612e4SBarry Smith   }
95*9f0612e4SBarry Smith   return 0;
96*9f0612e4SBarry Smith }
97