1 #include <stdio.h> 2 #include <math.h> 3 #include <limits.h> 4 #include <float.h> 5 #include <petscsys.h> 6 7 /* 8 Program: Stream 9 Programmer: Joe R. Zagar 10 Revision: 4.0-BETA, October 24, 1995 11 Original code developed by John D. McCalpin 12 13 This program measures memory transfer rates in MB/s for simple 14 computational kernels coded in C. These numbers reveal the quality 15 of code generation for simple uncacheable kernels as well as showing 16 the cost of floating-point operations relative to memory accesses. 17 18 INSTRUCTIONS: 19 20 1) Stream requires a good bit of memory to run. Adjust the 21 value of 'N' (below) to give a 'timing calibration' of 22 at least 20 clock-ticks. This will provide rate estimates 23 that should be good to about 5% precision. 24 */ 25 26 #define N 2000000 27 #define M 20 28 #define NTIMES 50 29 #define OFFSET 0 30 31 /* 32 3) Compile the code with full optimization. Many compilers 33 generate unreasonably bad code before the optimizer tightens 34 things up. If the results are unreasonably good, on the 35 other hand, the optimizer might be too smart for me! 36 37 Try compiling with: 38 cc -O stream_d.c second.c -o stream_d -lm 39 40 This is known to work on Cray, SGI, IBM, and Sun machines. 41 42 4) Mail the results to mccalpin@cs.virginia.edu 43 Be sure to include: 44 a) computer hardware model number and software revision 45 b) the compiler flags 46 c) all of the output from the test case. 47 Thanks! 48 49 */ 50 51 #define HLINE "-------------------------------------------------------------\n" 52 53 static double a[N+OFFSET],b[N+OFFSET],c[N+OFFSET]; 54 55 static double mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; 56 57 static int checktick(void) 58 { 59 int minDelta = 1000000; 60 double timesfound[M]; 61 62 /* Collect a sequence of M unique time values from the system. */ 63 64 for (int i = 0; i < M; ++i) { 65 const double t1 = MPI_Wtime(); 66 67 while (((timesfound[i] = MPI_Wtime()) - t1) < 1.0E-6) ; 68 } 69 70 /* 71 Determine the minimum difference between these M values. 72 This result will be our estimate (in microseconds) for the 73 clock granularity. 74 */ 75 76 for (int i = 1; i < M; ++i) { 77 int Delta = (int)(1.0E6*(timesfound[i]-timesfound[i-1])); 78 79 minDelta = PetscMin(minDelta,PetscMax(Delta,0)); 80 } 81 return minDelta; 82 } 83 84 static double bytes[4] = { 85 2 * sizeof(double) * N, 86 2 * sizeof(double) * N, 87 3 * sizeof(double) * N, 88 3 * sizeof(double) * N 89 }; 90 91 int main(int argc,char **args) 92 { 93 const double scalar = 3.0; 94 double t,times[4][NTIMES],irate[4],rate[4]; 95 PetscMPIInt rank,size,resultlen; 96 char hostname[MPI_MAX_PROCESSOR_NAME] = {0}; 97 98 PetscCall(PetscInitialize(&argc,&args,NULL,NULL)); 99 PetscCallMPI(MPI_Comm_rank(MPI_COMM_WORLD,&rank)); 100 PetscCallMPI(MPI_Comm_size(MPI_COMM_WORLD,&size)); 101 102 PetscCallMPI(MPI_Get_processor_name(hostname,&resultlen));(void)resultlen; 103 if (rank) PetscCallMPI(MPI_Send(hostname,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,0,0,MPI_COMM_WORLD)); 104 else { 105 for (int j = 1; j < size; ++j) { 106 PetscCallMPI(MPI_Recv(hostname,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,j,0,MPI_COMM_WORLD,MPI_STATUS_IGNORE)); 107 } 108 } 109 PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD)); 110 111 /* --- SETUP --- determine precision and check timing --- */ 112 113 if (rank == 0) { 114 /* 115 printf(HLINE); 116 printf("Array size = %d, Offset = %d\n" , N, OFFSET); 117 printf("Total memory required = %.1f MB.\n", (3 * N * BytesPerWord) / 1048576.0); 118 printf("Each test is run %d times, but only\n", NTIMES); 119 printf("the *best* time for each is used.\n"); 120 printf(HLINE); 121 */ 122 } 123 124 /* Get initial value for system clock. */ 125 for (int j = 0; j < N; ++j) { 126 a[j] = 1.0; 127 b[j] = 2.0; 128 c[j] = 0.0; 129 } 130 131 if (rank == 0) { 132 int quantum; 133 if ((quantum = checktick()) >= 1) { } /* printf("Your clock granularity/precision appears to be %d microseconds.\n", quantum); */ 134 else { } /* printf("Your clock granularity appears to be less than one microsecond.\n");*/ 135 } 136 137 t = MPI_Wtime(); 138 for (int j = 0; j < N; ++j) a[j] *= 2.0; 139 t = 1.0E6 * (MPI_Wtime() - t); 140 141 if (rank == 0) { 142 /* 143 printf("Each test below will take on the order of %d microseconds.\n", (int) t); 144 printf(" (= %d clock ticks)\n", (int) (t/quantum)); 145 printf("Increase the size of the arrays if this shows that\n"); 146 printf("you are not getting at least 20 clock ticks per test.\n"); 147 printf(HLINE); 148 */ 149 } 150 151 /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ 152 153 for (int k = 0; k < NTIMES; ++k) { 154 PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD)); 155 times[0][k] = MPI_Wtime(); 156 /* should all these barriers be pulled outside of the time call? */ 157 PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD)); 158 PetscCall(PetscArraycpy(c,a,N)); 159 PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD)); 160 times[0][k] = MPI_Wtime() - times[0][k]; 161 162 times[1][k] = MPI_Wtime(); 163 PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD)); 164 for (int j = 0; j < N; ++j) b[j] = scalar*c[j]; 165 PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD)); 166 times[1][k] = MPI_Wtime() - times[1][k]; 167 168 times[2][k] = MPI_Wtime(); 169 PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD)); 170 for (int j = 0; j < N; ++j) c[j] = a[j]+b[j]; 171 PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD)); 172 times[2][k] = MPI_Wtime() - times[2][k]; 173 174 times[3][k] = MPI_Wtime(); 175 PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD)); 176 for (int j = 0; j < N; ++j) a[j] = b[j]+scalar*c[j]; 177 PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD)); 178 times[3][k] = MPI_Wtime() - times[3][k]; 179 } 180 181 /* --- SUMMARY --- */ 182 183 for (int k = 0; k < NTIMES; ++k) { 184 for (int j = 0; j < 4; ++j) mintime[j] = PetscMin(mintime[j],times[j][k]); 185 } 186 187 for (int j = 0; j < 4; ++j) irate[j] = 1.0E-06 * bytes[j]/mintime[j]; 188 PetscCallMPI(MPI_Reduce(irate,rate,4,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD)); 189 190 if (!rank) { 191 FILE *fd; 192 193 if (size != 1) { 194 double prate; 195 196 fd = fopen("flops","r"); 197 fscanf(fd,"%lg",&prate); 198 fclose(fd); 199 printf("%d %11.4f Rate (MB/s) %g \n",size,rate[3],rate[3]/prate); 200 } else { 201 fd = fopen("flops","w"); 202 fprintf(fd,"%g\n",rate[3]); 203 fclose(fd); 204 printf("%d %11.4f Rate (MB/s)\n",size,rate[3]); 205 } 206 } 207 PetscCall(PetscFinalize()); 208 return 0; 209 } 210