1 2 # include <stdio.h> 3 # include <math.h> 4 # include <limits.h> 5 # include <float.h> 6 #include <petscsys.h> 7 8 /* 9 Program: Stream 10 Programmer: Joe R. Zagar 11 Revision: 4.0-BETA, October 24, 1995 12 Original code developed by John D. McCalpin 13 14 This program measures memory transfer rates in MB/s for simple 15 computational kernels coded in C. These numbers reveal the quality 16 of code generation for simple uncacheable kernels as well as showing 17 the cost of floating-point operations relative to memory accesses. 18 19 INSTRUCTIONS: 20 21 1) Stream requires a good bit of memory to run. Adjust the 22 value of 'N' (below) to give a 'timing calibration' of 23 at least 20 clock-ticks. This will provide rate estimates 24 that should be good to about 5% precision. 25 */ 26 27 # define N 2000000 28 # define NTIMES 50 29 # define OFFSET 0 30 31 /* 32 3) Compile the code with full optimization. Many compilers 33 generate unreasonably bad code before the optimizer tightens 34 things up. If the results are unreasonably good, on the 35 other hand, the optimizer might be too smart for me! 36 37 Try compiling with: 38 cc -O stream_d.c second.c -o stream_d -lm 39 40 This is known to work on Cray, SGI, IBM, and Sun machines. 41 42 4) Mail the results to mccalpin@cs.virginia.edu 43 Be sure to include: 44 a) computer hardware model number and software revision 45 b) the compiler flags 46 c) all of the output from the test case. 47 Thanks! 48 49 */ 50 51 # define HLINE "-------------------------------------------------------------\n" 52 53 # ifndef MIN 54 # define MIN(x,y) ((x)<(y) ? (x) : (y)) 55 # endif 56 # ifndef MAX 57 # define MAX(x,y) ((x)>(y) ? (x) : (y)) 58 # endif 59 60 static double a[N+OFFSET], 61 b[N+OFFSET], 62 c[N+OFFSET]; 63 /*double *a,*b,*c;*/ 64 65 static double mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; 66 67 static double bytes[4] = { 68 2 * sizeof(double) * N, 69 2 * sizeof(double) * N, 70 3 * sizeof(double) * N, 71 3 * sizeof(double) * N 72 }; 73 74 int main(int argc,char **args) 75 { 76 int quantum, checktick(void); 77 register int j, k; 78 double scalar, t, times[4][NTIMES],irate[4],rate[4]; 79 int rank,size,resultlen; 80 char hostname[MPI_MAX_PROCESSOR_NAME]; 81 MPI_Status status; 82 int ierr; 83 FILE *fd; 84 85 ierr = PetscInitialize(&argc,&args,NULL,NULL);if (ierr) return ierr; 86 ierr = MPI_Comm_rank(MPI_COMM_WORLD,&rank);if (ierr) return ierr; 87 ierr = MPI_Comm_size(MPI_COMM_WORLD,&size);if (ierr) return ierr; 88 89 for (j=0; j<MPI_MAX_PROCESSOR_NAME; j++) { 90 hostname[j] = 0; 91 } 92 ierr = MPI_Get_processor_name(hostname,&resultlen);if (ierr) return ierr; 93 if (!rank) { 94 for (j=1; j<size; j++) { 95 ierr = MPI_Recv(hostname,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,j,0,MPI_COMM_WORLD,&status);if (ierr) return ierr; 96 } 97 } else { 98 ierr = MPI_Send(hostname,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,0,0,MPI_COMM_WORLD);if (ierr) return ierr; 99 } 100 ierr = MPI_Barrier(MPI_COMM_WORLD); 101 102 /* --- SETUP --- determine precision and check timing --- */ 103 104 if (!rank) { 105 /*printf(HLINE); 106 printf("Array size = %d, Offset = %d\n" , N, OFFSET); 107 printf("Total memory required = %.1f MB.\n", (3 * N * BytesPerWord) / 1048576.0); 108 printf("Each test is run %d times, but only\n", NTIMES); 109 printf("the *best* time for each is used.\n"); 110 printf(HLINE); */ 111 } 112 113 /* Get initial value for system clock. */ 114 115 /* a = malloc(N*sizeof(double)); 116 b = malloc(N*sizeof(double)); 117 c = malloc(N*sizeof(double));*/ 118 for (j=0; j<N; j++) { 119 a[j] = 1.0; 120 b[j] = 2.0; 121 c[j] = 0.0; 122 } 123 124 if (!rank) { 125 if ((quantum = checktick()) >= 1) ; /* printf("Your clock granularity/precision appears to be %d microseconds.\n", quantum); */ 126 else ; /* printf("Your clock granularity appears to be less than one microsecond.\n");*/ 127 } 128 129 t = MPI_Wtime(); 130 for (j = 0; j < N; j++) a[j] = 2.0E0 * a[j]; 131 t = 1.0E6 * (MPI_Wtime() - t); 132 133 if (!rank) { 134 /* printf("Each test below will take on the order of %d microseconds.\n", (int) t); 135 printf(" (= %d clock ticks)\n", (int) (t/quantum)); 136 printf("Increase the size of the arrays if this shows that\n"); 137 printf("you are not getting at least 20 clock ticks per test.\n"); 138 printf(HLINE);*/ 139 } 140 141 /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ 142 143 scalar = 3.0; 144 for (k=0; k<NTIMES; k++) 145 { 146 ierr = MPI_Barrier(MPI_COMM_WORLD); 147 times[0][k] = MPI_Wtime(); 148 /* should all these barriers be pulled outside of the time call? */ 149 ierr = MPI_Barrier(MPI_COMM_WORLD); 150 for (j=0; j<N; j++) c[j] = a[j]; 151 ierr = MPI_Barrier(MPI_COMM_WORLD); 152 times[0][k] = MPI_Wtime() - times[0][k]; 153 154 times[1][k] = MPI_Wtime(); 155 ierr = MPI_Barrier(MPI_COMM_WORLD); 156 for (j=0; j<N; j++) b[j] = scalar*c[j]; 157 ierr = MPI_Barrier(MPI_COMM_WORLD); 158 times[1][k] = MPI_Wtime() - times[1][k]; 159 160 times[2][k] = MPI_Wtime(); 161 ierr = MPI_Barrier(MPI_COMM_WORLD); 162 for (j=0; j<N; j++) c[j] = a[j]+b[j]; 163 ierr = MPI_Barrier(MPI_COMM_WORLD); 164 times[2][k] = MPI_Wtime() - times[2][k]; 165 166 times[3][k] = MPI_Wtime(); 167 ierr = MPI_Barrier(MPI_COMM_WORLD); 168 for (j=0; j<N; j++) a[j] = b[j]+scalar*c[j]; 169 ierr = MPI_Barrier(MPI_COMM_WORLD); 170 times[3][k] = MPI_Wtime() - times[3][k]; 171 } 172 173 /* --- SUMMARY --- */ 174 175 for (k=0; k<NTIMES; k++) 176 for (j=0; j<4; j++) mintime[j] = MIN(mintime[j], times[j][k]); 177 178 for (j=0; j<4; j++) irate[j] = 1.0E-06 * bytes[j]/mintime[j]; 179 ierr = MPI_Reduce(irate,rate,4,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); 180 if (ierr) printf("Error calling MPI\n"); 181 182 if (!rank) { 183 if (size == 1) { 184 printf("%d %11.4f Rate (MB/s)\n",size, rate[3]); 185 fd = fopen("flops","w"); 186 fprintf(fd,"%g\n",rate[3]); 187 fclose(fd); 188 } else { 189 double prate; 190 fd = fopen("flops","r"); 191 fscanf(fd,"%lg",&prate); 192 fclose(fd); 193 printf("%d %11.4f Rate (MB/s) %g \n", size, rate[3],rate[3]/prate); 194 } 195 } 196 PetscFinalize(); 197 return 0; 198 } 199 200 # define M 20 201 202 int checktick(void) 203 { 204 int i, minDelta, Delta; 205 double t1, t2, timesfound[M]; 206 207 /* Collect a sequence of M unique time values from the system. */ 208 209 for (i = 0; i < M; i++) { 210 t1 = MPI_Wtime(); 211 while (((t2=MPI_Wtime()) - t1) < 1.0E-6) ; 212 timesfound[i] = t1 = t2; 213 } 214 215 /* 216 Determine the minimum difference between these M values. 217 This result will be our estimate (in microseconds) for the 218 clock granularity. 219 */ 220 221 minDelta = 1000000; 222 for (i = 1; i < M; i++) { 223 Delta = (int)(1.0E6 * (timesfound[i]-timesfound[i-1])); 224 minDelta = MIN(minDelta, MAX(Delta,0)); 225 } 226 227 return(minDelta); 228 } 229 230