/* NOTE(review): the header names of the five include directives below are
   missing in this copy of the file. The code that follows uses FLT_MAX, FILE,
   MPI_* symbols and PetscInitialize, so the matching headers need to be
   restored -- confirm against the project's original source. */
# include
# include
# include
# include
#include

/*
  Program: Stream
  Programmer: Joe R. Zagar
  Revision: 4.0-BETA, October 24, 1995
  Original code developed by John D. McCalpin

  This program measures memory transfer rates in MB/s for simple
  computational kernels coded in C.  These numbers reveal the quality
  of code generation for simple uncacheable kernels as well as showing
  the cost of floating-point operations relative to memory accesses.

  INSTRUCTIONS:

        1) Stream requires a good bit of memory to run.  Adjust the
           value of 'N' (below) to give a 'timing calibration' of
           at least 20 clock-ticks.  This will provide rate estimates
           that should be good to about 5% precision.

  NOTE(review): the numbering of these instructions jumps from 1) to 3);
  no step 2) is present in this copy of the file.
*/

/* Number of elements in each of the static arrays a, b and c. */
# define N 2000000
/* Number of repetitions of the test cases; also the second dimension of
   the times[4][NTIMES] table in main. */
# define NTIMES 50
/* Extra elements added to the length of each array (a[N+OFFSET], ...). */
# define OFFSET 0

/*
        3) Compile the code with full optimization.  Many compilers
           generate unreasonably bad code before the optimizer tightens
           things up.  If the results are unreasonably good, on the
           other hand, the optimizer might be too smart for me!

           Try compiling with:
                 cc -O stream_d.c second.c -o stream_d -lm

           This is known to work on Cray, SGI, IBM, and Sun machines.

           NOTE(review): main in this file calls PetscInitialize and
           MPI routines (MPI_Comm_rank, MPI_Wtime, ...), so the plain cc
           command above is presumably out of date for this variant --
           confirm against the project's build files.

        4) Mail the results to mccalpin@cs.virginia.edu
           Be sure to include:
                a) computer hardware model number and software revision
                b) the compiler flags
                c) all of the output from the test case.
  Thanks!
*/

/* Horizontal separator line for printed output. */
# define HLINE "-------------------------------------------------------------\n"

/* MIN and MAX are only defined here if no earlier definition exists.
   Each macro expands its selected argument twice, so arguments with side
   effects (e.g. i++) must not be passed. */
# ifndef MIN
# define MIN(x,y) ((x)<(y) ? (x) : (y))
# endif
# ifndef MAX
# define MAX(x,y) ((x)>(y) ? 
(x) : (y)) # endif static double a[N+OFFSET], b[N+OFFSET], c[N+OFFSET]; /*double *a,*b,*c;*/ static double mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; static double bytes[4] = { 2 * sizeof(double) * N, 2 * sizeof(double) * N, 3 * sizeof(double) * N, 3 * sizeof(double) * N }; int main(int argc,char **args) { int quantum, checktick(void); register int j, k; double scalar, t, times[4][NTIMES],irate[4],rate[4]; int rank,size,resultlen; char hostname[MPI_MAX_PROCESSOR_NAME]; MPI_Status status; int ierr; FILE *fd; ierr = PetscInitialize(&argc,&args,NULL,NULL);if (ierr) return ierr; ierr = MPI_Comm_rank(MPI_COMM_WORLD,&rank);if (ierr) return ierr; ierr = MPI_Comm_size(MPI_COMM_WORLD,&size);if (ierr) return ierr; for (j=0; j= 1) ; /* printf("Your clock granularity/precision appears to be %d microseconds.\n", quantum); */ else ; /* printf("Your clock granularity appears to be less than one microsecond.\n");*/ } t = MPI_Wtime(); for (j = 0; j < N; j++) a[j] = 2.0E0 * a[j]; t = 1.0E6 * (MPI_Wtime() - t); if (rank == 0) { /* printf("Each test below will take on the order of %d microseconds.\n", (int) t); printf(" (= %d clock ticks)\n", (int) (t/quantum)); printf("Increase the size of the arrays if this shows that\n"); printf("you are not getting at least 20 clock ticks per test.\n"); printf(HLINE);*/ } /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ scalar = 3.0; for (k=0; k