1 2 # include <stdio.h> 3 # include <math.h> 4 # include <limits.h> 5 # include <float.h> 6 #include <petscsys.h> 7 8 /* 9 Program: Stream 10 Programmer: Joe R. Zagar 11 Revision: 4.0-BETA, October 24, 1995 12 Original code developed by John D. McCalpin 13 14 This program measures memory transfer rates in MB/s for simple 15 computational kernels coded in C. These numbers reveal the quality 16 of code generation for simple uncacheable kernels as well as showing 17 the cost of floating-point operations relative to memory accesses. 18 19 INSTRUCTIONS: 20 21 1) Stream requires a good bit of memory to run. Adjust the 22 value of 'N' (below) to give a 'timing calibration' of 23 at least 20 clock-ticks. This will provide rate estimates 24 that should be good to about 5% precision. 25 */ 26 27 # define N 2000000 28 # define NTIMES 50 29 # define OFFSET 0 30 31 /* 32 3) Compile the code with full optimization. Many compilers 33 generate unreasonably bad code before the optimizer tightens 34 things up. If the results are unreasonably good, on the 35 other hand, the optimizer might be too smart for me! 36 37 Try compiling with: 38 cc -O stream_d.c second.c -o stream_d -lm 39 40 This is known to work on Cray, SGI, IBM, and Sun machines. 41 42 43 4) Mail the results to mccalpin@cs.virginia.edu 44 Be sure to include: 45 a) computer hardware model number and software revision 46 b) the compiler flags 47 c) all of the output from the test case. 48 Thanks! 49 50 */ 51 52 # define HLINE "-------------------------------------------------------------\n" 53 54 # ifndef MIN 55 # define MIN(x,y) ((x)<(y) ? (x) : (y)) 56 # endif 57 # ifndef MAX 58 # define MAX(x,y) ((x)>(y) ? (x) : (y)) 59 # endif 60 61 static double a[N+OFFSET], 62 b[N+OFFSET], 63 c[N+OFFSET]; 64 /*double *a,*b,*c;*/ 65 66 static double mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; 67 68 static double bytes[4] = { 69 2 * sizeof(double) * N, 70 2 * sizeof(double) * N, 71 3 * sizeof(double) * N, 72 3 * sizeof(double) * N 73 }; 74 75 int main(int argc,char **args) 76 { 77 int quantum, checktick(void); 78 register int j, k; 79 double scalar, t, times[4][NTIMES],irate[4],rate[4]; 80 int rank,size,resultlen; 81 char hostname[MPI_MAX_PROCESSOR_NAME]; 82 MPI_Status status; 83 int ierr; 84 FILE *fd; 85 86 ierr = PetscInitialize(&argc,&args,NULL,NULL);if (ierr) return ierr; 87 ierr = MPI_Comm_rank(MPI_COMM_WORLD,&rank);if (ierr) return ierr; 88 ierr = MPI_Comm_size(MPI_COMM_WORLD,&size);if (ierr) return ierr; 89 90 for (j=0; j<MPI_MAX_PROCESSOR_NAME; j++) { 91 hostname[j] = 0; 92 } 93 ierr = MPI_Get_processor_name(hostname,&resultlen);if (ierr) return ierr; 94 if (!rank) { 95 for (j=1; j<size; j++) { 96 ierr = MPI_Recv(hostname,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,j,0,MPI_COMM_WORLD,&status);if (ierr) return ierr; 97 } 98 } else { 99 ierr = MPI_Send(hostname,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,0,0,MPI_COMM_WORLD);if (ierr) return ierr; 100 } 101 ierr = MPI_Barrier(MPI_COMM_WORLD); 102 103 /* --- SETUP --- determine precision and check timing --- */ 104 105 if (!rank) { 106 /*printf(HLINE); 107 printf("Array size = %d, Offset = %d\n" , N, OFFSET); 108 printf("Total memory required = %.1f MB.\n", (3 * N * BytesPerWord) / 1048576.0); 109 printf("Each test is run %d times, but only\n", NTIMES); 110 printf("the *best* time for each is used.\n"); 111 printf(HLINE); */ 112 } 113 114 /* Get initial value for system clock. */ 115 116 /* a = malloc(N*sizeof(double)); 117 b = malloc(N*sizeof(double)); 118 c = malloc(N*sizeof(double));*/ 119 for (j=0; j<N; j++) { 120 a[j] = 1.0; 121 b[j] = 2.0; 122 c[j] = 0.0; 123 } 124 125 if (!rank) { 126 if ((quantum = checktick()) >= 1) ; /* printf("Your clock granularity/precision appears to be %d microseconds.\n", quantum); */ 127 else ; /* printf("Your clock granularity appears to be less than one microsecond.\n");*/ 128 } 129 130 t = MPI_Wtime(); 131 for (j = 0; j < N; j++) a[j] = 2.0E0 * a[j]; 132 t = 1.0E6 * (MPI_Wtime() - t); 133 134 if (!rank) { 135 /* printf("Each test below will take on the order of %d microseconds.\n", (int) t); 136 printf(" (= %d clock ticks)\n", (int) (t/quantum)); 137 printf("Increase the size of the arrays if this shows that\n"); 138 printf("you are not getting at least 20 clock ticks per test.\n"); 139 printf(HLINE);*/ 140 } 141 142 /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ 143 144 scalar = 3.0; 145 for (k=0; k<NTIMES; k++) 146 { 147 ierr = MPI_Barrier(MPI_COMM_WORLD); 148 times[0][k] = MPI_Wtime(); 149 /* should all these barriers be pulled outside of the time call? */ 150 ierr = MPI_Barrier(MPI_COMM_WORLD); 151 for (j=0; j<N; j++) c[j] = a[j]; 152 ierr = MPI_Barrier(MPI_COMM_WORLD); 153 times[0][k] = MPI_Wtime() - times[0][k]; 154 155 times[1][k] = MPI_Wtime(); 156 ierr = MPI_Barrier(MPI_COMM_WORLD); 157 for (j=0; j<N; j++) b[j] = scalar*c[j]; 158 ierr = MPI_Barrier(MPI_COMM_WORLD); 159 times[1][k] = MPI_Wtime() - times[1][k]; 160 161 times[2][k] = MPI_Wtime(); 162 ierr = MPI_Barrier(MPI_COMM_WORLD); 163 for (j=0; j<N; j++) c[j] = a[j]+b[j]; 164 ierr = MPI_Barrier(MPI_COMM_WORLD); 165 times[2][k] = MPI_Wtime() - times[2][k]; 166 167 times[3][k] = MPI_Wtime(); 168 ierr = MPI_Barrier(MPI_COMM_WORLD); 169 for (j=0; j<N; j++) a[j] = b[j]+scalar*c[j]; 170 ierr = MPI_Barrier(MPI_COMM_WORLD); 171 times[3][k] = MPI_Wtime() - times[3][k]; 172 } 173 174 /* --- SUMMARY --- */ 175 176 for (k=0; k<NTIMES; k++) 177 for (j=0; j<4; j++) mintime[j] = MIN(mintime[j], times[j][k]); 178 179 for (j=0; j<4; j++) irate[j] = 1.0E-06 * bytes[j]/mintime[j]; 180 ierr = MPI_Reduce(irate,rate,4,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); 181 if (ierr) printf("Error calling MPI\n"); 182 183 if (!rank) { 184 if (size == 1) { 185 printf("%d %11.4f Rate (MB/s)\n",size, rate[3]); 186 fd = fopen("flops","w"); 187 fprintf(fd,"%g\n",rate[3]); 188 fclose(fd); 189 } else { 190 double prate; 191 fd = fopen("flops","r"); 192 fscanf(fd,"%lg",&prate); 193 fclose(fd); 194 printf("%d %11.4f Rate (MB/s) %g \n", size, rate[3],rate[3]/prate); 195 } 196 } 197 PetscFinalize(); 198 return 0; 199 } 200 201 # define M 20 202 203 int checktick(void) 204 { 205 int i, minDelta, Delta; 206 double t1, t2, timesfound[M]; 207 208 /* Collect a sequence of M unique time values from the system. */ 209 210 for (i = 0; i < M; i++) { 211 t1 = MPI_Wtime(); 212 while (((t2=MPI_Wtime()) - t1) < 1.0E-6) ; 213 timesfound[i] = t1 = t2; 214 } 215 216 /* 217 Determine the minimum difference between these M values. 218 This result will be our estimate (in microseconds) for the 219 clock granularity. 220 */ 221 222 minDelta = 1000000; 223 for (i = 1; i < M; i++) { 224 Delta = (int)(1.0E6 * (timesfound[i]-timesfound[i-1])); 225 minDelta = MIN(minDelta, MAX(Delta,0)); 226 } 227 228 return(minDelta); 229 } 230 231