xref: /petsc/src/benchmarks/streams/CUDAVersion.cu (revision 403adfb6d7fc5442a9d3fb849d3da717f6691bef)
1*403adfb6SMatthew G Knepley /*
2*403adfb6SMatthew G Knepley   STREAM benchmark implementation in CUDA.
3*403adfb6SMatthew G Knepley 
4*403adfb6SMatthew G Knepley     COPY:       a(i) = b(i)
5*403adfb6SMatthew G Knepley     SCALE:      a(i) = q*b(i)
6*403adfb6SMatthew G Knepley     SUM:        a(i) = b(i) + c(i)
7*403adfb6SMatthew G Knepley     TRIAD:      a(i) = b(i) + q*c(i)
8*403adfb6SMatthew G Knepley 
9*403adfb6SMatthew G Knepley   It measures the memory system on the device.
10*403adfb6SMatthew G Knepley   The implementation is in single precision.
11*403adfb6SMatthew G Knepley 
12*403adfb6SMatthew G Knepley   Code based on the code developed by John D. McCalpin
13*403adfb6SMatthew G Knepley   http://www.cs.virginia.edu/stream/FTP/Code/stream.c
14*403adfb6SMatthew G Knepley 
15*403adfb6SMatthew G Knepley   Written by: Massimiliano Fatica, NVIDIA Corporation
16*403adfb6SMatthew G Knepley   Modified by: Douglas Enright (dpephd-nvidia@yahoo.com), 1 December 2010
17*403adfb6SMatthew G Knepley   Extensive Revisions, 4 December 2010
18*403adfb6SMatthew G Knepley   Modified for PETSc by: Matthew G. Knepley 14 Aug 2011
19*403adfb6SMatthew G Knepley 
20*403adfb6SMatthew G Knepley   User interface motivated by bandwidthTest NVIDIA SDK example.
21*403adfb6SMatthew G Knepley */
22*403adfb6SMatthew G Knepley static char *help =  "Single-Precision STREAM Benchmark implementation in CUDA\n"
23*403adfb6SMatthew G Knepley                      "Performs Copy, Scale, Add, and Triad single-precision kernels\n\n";
24*403adfb6SMatthew G Knepley 
25*403adfb6SMatthew G Knepley #include <petscconf.h>
26*403adfb6SMatthew G Knepley #include <petscsys.h>
27*403adfb6SMatthew G Knepley #include <petsctime.h>
28*403adfb6SMatthew G Knepley 
29*403adfb6SMatthew G Knepley #define N	2000000
30*403adfb6SMatthew G Knepley #define NTIMES	10
31*403adfb6SMatthew G Knepley 
32*403adfb6SMatthew G Knepley # ifndef MIN
33*403adfb6SMatthew G Knepley # define MIN(x,y) ((x)<(y)?(x):(y))
34*403adfb6SMatthew G Knepley # endif
35*403adfb6SMatthew G Knepley # ifndef MAX
36*403adfb6SMatthew G Knepley # define MAX(x,y) ((x)>(y)?(x):(y))
37*403adfb6SMatthew G Knepley # endif
38*403adfb6SMatthew G Knepley 
39*403adfb6SMatthew G Knepley const float flt_eps = 1.192092896e-07f;
40*403adfb6SMatthew G Knepley 
41*403adfb6SMatthew G Knepley __global__ void set_array(float *a,  float value, size_t len)
42*403adfb6SMatthew G Knepley {
43*403adfb6SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
44*403adfb6SMatthew G Knepley   while (idx < len) {
45*403adfb6SMatthew G Knepley     a[idx] = value;
46*403adfb6SMatthew G Knepley     idx += blockDim.x * gridDim.x;
47*403adfb6SMatthew G Knepley   }
48*403adfb6SMatthew G Knepley }
49*403adfb6SMatthew G Knepley 
50*403adfb6SMatthew G Knepley __global__ void STREAM_Copy(float *a, float *b, size_t len)
51*403adfb6SMatthew G Knepley {
52*403adfb6SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
53*403adfb6SMatthew G Knepley   while (idx < len) {
54*403adfb6SMatthew G Knepley     b[idx] = a[idx];
55*403adfb6SMatthew G Knepley     idx += blockDim.x * gridDim.x;
56*403adfb6SMatthew G Knepley   }
57*403adfb6SMatthew G Knepley }
58*403adfb6SMatthew G Knepley 
59*403adfb6SMatthew G Knepley __global__ void STREAM_Copy_Optimized(float *a, float *b, size_t len)
60*403adfb6SMatthew G Knepley {
61*403adfb6SMatthew G Knepley   /*
62*403adfb6SMatthew G Knepley    * Ensure size of thread index space is as large as or greater than
63*403adfb6SMatthew G Knepley    * vector index space else return.
64*403adfb6SMatthew G Knepley    */
65*403adfb6SMatthew G Knepley   if (blockDim.x * gridDim.x < len) return;
66*403adfb6SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
67*403adfb6SMatthew G Knepley   if (idx < len) b[idx] = a[idx];
68*403adfb6SMatthew G Knepley }
69*403adfb6SMatthew G Knepley 
70*403adfb6SMatthew G Knepley __global__ void STREAM_Scale(float *a, float *b, float scale,  size_t len)
71*403adfb6SMatthew G Knepley {
72*403adfb6SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
73*403adfb6SMatthew G Knepley   while (idx < len) {
74*403adfb6SMatthew G Knepley     b[idx] = scale* a[idx];
75*403adfb6SMatthew G Knepley     idx += blockDim.x * gridDim.x;
76*403adfb6SMatthew G Knepley   }
77*403adfb6SMatthew G Knepley }
78*403adfb6SMatthew G Knepley 
79*403adfb6SMatthew G Knepley __global__ void STREAM_Add( float *a, float *b, float *c,  size_t len)
80*403adfb6SMatthew G Knepley {
81*403adfb6SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
82*403adfb6SMatthew G Knepley   while (idx < len) {
83*403adfb6SMatthew G Knepley     c[idx] = a[idx]+b[idx];
84*403adfb6SMatthew G Knepley     idx += blockDim.x * gridDim.x;
85*403adfb6SMatthew G Knepley   }
86*403adfb6SMatthew G Knepley }
87*403adfb6SMatthew G Knepley 
88*403adfb6SMatthew G Knepley __global__ void STREAM_Triad( float *a, float *b, float *c, float scalar, size_t len)
89*403adfb6SMatthew G Knepley {
90*403adfb6SMatthew G Knepley   size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
91*403adfb6SMatthew G Knepley   while (idx < len) {
92*403adfb6SMatthew G Knepley     c[idx] = a[idx]+scalar*b[idx];
93*403adfb6SMatthew G Knepley     idx += blockDim.x * gridDim.x;
94*403adfb6SMatthew G Knepley   }
95*403adfb6SMatthew G Knepley }
96*403adfb6SMatthew G Knepley 
97*403adfb6SMatthew G Knepley /* Host side verification routines */
98*403adfb6SMatthew G Knepley bool STREAM_Copy_verify(float *a, float *b, size_t len) {
99*403adfb6SMatthew G Knepley   size_t idx;
100*403adfb6SMatthew G Knepley   bool bDifferent = false;
101*403adfb6SMatthew G Knepley 
102*403adfb6SMatthew G Knepley   for (idx = 0; idx < len && !bDifferent; idx++) {
103*403adfb6SMatthew G Knepley     float expectedResult = a[idx];
104*403adfb6SMatthew G Knepley     float diffResultExpected = (b[idx] - expectedResult);
105*403adfb6SMatthew G Knepley     float relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
106*403adfb6SMatthew G Knepley     /* element-wise relative error determination */
107*403adfb6SMatthew G Knepley     bDifferent = (relErrorULPS > 2.f);
108*403adfb6SMatthew G Knepley   }
109*403adfb6SMatthew G Knepley 
110*403adfb6SMatthew G Knepley   return bDifferent;
111*403adfb6SMatthew G Knepley }
112*403adfb6SMatthew G Knepley 
113*403adfb6SMatthew G Knepley bool STREAM_Scale_verify(float *a, float *b, float scale, size_t len) {
114*403adfb6SMatthew G Knepley   size_t idx;
115*403adfb6SMatthew G Knepley   bool bDifferent = false;
116*403adfb6SMatthew G Knepley 
117*403adfb6SMatthew G Knepley   for (idx = 0; idx < len && !bDifferent; idx++) {
118*403adfb6SMatthew G Knepley     float expectedResult = scale*a[idx];
119*403adfb6SMatthew G Knepley     float diffResultExpected = (b[idx] - expectedResult);
120*403adfb6SMatthew G Knepley     float relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
121*403adfb6SMatthew G Knepley     /* element-wise relative error determination */
122*403adfb6SMatthew G Knepley     bDifferent = (relErrorULPS > 2.f);
123*403adfb6SMatthew G Knepley   }
124*403adfb6SMatthew G Knepley 
125*403adfb6SMatthew G Knepley   return bDifferent;
126*403adfb6SMatthew G Knepley }
127*403adfb6SMatthew G Knepley 
128*403adfb6SMatthew G Knepley bool STREAM_Add_verify(float *a, float *b, float *c, size_t len) {
129*403adfb6SMatthew G Knepley   size_t idx;
130*403adfb6SMatthew G Knepley   bool bDifferent = false;
131*403adfb6SMatthew G Knepley 
132*403adfb6SMatthew G Knepley   for (idx = 0; idx < len && !bDifferent; idx++) {
133*403adfb6SMatthew G Knepley     float expectedResult = a[idx] + b[idx];
134*403adfb6SMatthew G Knepley     float diffResultExpected = (c[idx] - expectedResult);
135*403adfb6SMatthew G Knepley     float relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
136*403adfb6SMatthew G Knepley     /* element-wise relative error determination */
137*403adfb6SMatthew G Knepley     bDifferent = (relErrorULPS > 2.f);
138*403adfb6SMatthew G Knepley   }
139*403adfb6SMatthew G Knepley 
140*403adfb6SMatthew G Knepley   return bDifferent;
141*403adfb6SMatthew G Knepley }
142*403adfb6SMatthew G Knepley 
143*403adfb6SMatthew G Knepley bool STREAM_Triad_verify(float *a, float *b, float *c, float scalar, size_t len) {
144*403adfb6SMatthew G Knepley   size_t idx;
145*403adfb6SMatthew G Knepley   bool bDifferent = false;
146*403adfb6SMatthew G Knepley 
147*403adfb6SMatthew G Knepley   for (idx = 0; idx < len && !bDifferent; idx++) {
148*403adfb6SMatthew G Knepley     float expectedResult = a[idx] + scalar*b[idx];
149*403adfb6SMatthew G Knepley     float diffResultExpected = (c[idx] - expectedResult);
150*403adfb6SMatthew G Knepley     float relErrorULPS = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
151*403adfb6SMatthew G Knepley     /* element-wise relative error determination */
152*403adfb6SMatthew G Knepley     bDifferent = (relErrorULPS > 3.f);
153*403adfb6SMatthew G Knepley   }
154*403adfb6SMatthew G Knepley 
155*403adfb6SMatthew G Knepley   return bDifferent;
156*403adfb6SMatthew G Knepley }
157*403adfb6SMatthew G Knepley 
158*403adfb6SMatthew G Knepley /* forward declarations */
159*403adfb6SMatthew G Knepley PetscErrorCode setupStream(PetscInt device, PetscBool cpuTiming);
160*403adfb6SMatthew G Knepley PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming);
161*403adfb6SMatthew G Knepley PetscErrorCode printResultsReadable(float times[][NTIMES]);
162*403adfb6SMatthew G Knepley 
163*403adfb6SMatthew G Knepley int main(int argc, char *argv[])
164*403adfb6SMatthew G Knepley {
165*403adfb6SMatthew G Knepley   PetscInt       device    = 0;
166*403adfb6SMatthew G Knepley   PetscBool      cpuTiming = PETSC_FALSE;
167*403adfb6SMatthew G Knepley   PetscErrorCode ierr;
168*403adfb6SMatthew G Knepley 
169*403adfb6SMatthew G Knepley   ierr = PetscInitialize(&argc, &argv, 0, help);CHKERRQ(ierr);
170*403adfb6SMatthew G Knepley   ierr = PetscPrintf(PETSC_COMM_SELF, "[Single-Precision Device-Only STREAM Benchmark implementation in CUDA]\n");CHKERRQ(ierr);
171*403adfb6SMatthew G Knepley   ierr = PetscPrintf(PETSC_COMM_SELF, "%s Starting...\n\n", argv[0]);CHKERRQ(ierr);
172*403adfb6SMatthew G Knepley 
173*403adfb6SMatthew G Knepley   ierr = PetscOptionsBegin(PETSC_COMM_WORLD, "", "STREAM Benchmark Options", "STREAM");CHKERRQ(ierr);
174*403adfb6SMatthew G Knepley     ierr = PetscOptionsInt("-device", "Specify the CUDA device to be used", "STREAM", device, &device, PETSC_NULL);CHKERRQ(ierr);
175*403adfb6SMatthew G Knepley     ierr = PetscOptionsBool("-cputiming", "Force CPU-based timing to be used", "STREAM", cpuTiming, &cpuTiming, PETSC_NULL);CHKERRQ(ierr);
176*403adfb6SMatthew G Knepley   ierr = PetscOptionsEnd();
177*403adfb6SMatthew G Knepley 
178*403adfb6SMatthew G Knepley   ierr = setupStream(device, cpuTiming);
179*403adfb6SMatthew G Knepley   if (ierr >= 0) {
180*403adfb6SMatthew G Knepley     PetscErrorCode ierr2 = PetscPrintf(PETSC_COMM_SELF, "\n[streamBenchmark] - results:\t%s\n\n", (ierr == 0) ? "PASSES" : "FAILED");CHKERRQ(ierr2);
181*403adfb6SMatthew G Knepley   }
182*403adfb6SMatthew G Knepley   PetscFinalize();
183*403adfb6SMatthew G Knepley   return 0;
184*403adfb6SMatthew G Knepley }
185*403adfb6SMatthew G Knepley 
186*403adfb6SMatthew G Knepley ///////////////////////////////////////////////////////////////////////////////
187*403adfb6SMatthew G Knepley //Run the appropriate tests
188*403adfb6SMatthew G Knepley ///////////////////////////////////////////////////////////////////////////////
189*403adfb6SMatthew G Knepley PetscErrorCode setupStream(PetscInt deviceNum, PetscBool cpuTiming)
190*403adfb6SMatthew G Knepley {
191*403adfb6SMatthew G Knepley   PetscInt       iNumThreadsPerBlock = 128;
192*403adfb6SMatthew G Knepley   PetscErrorCode ierr;
193*403adfb6SMatthew G Knepley 
194*403adfb6SMatthew G Knepley   PetscFunctionBegin;
195*403adfb6SMatthew G Knepley   // Check device
196*403adfb6SMatthew G Knepley   {
197*403adfb6SMatthew G Knepley     int deviceCount;
198*403adfb6SMatthew G Knepley 
199*403adfb6SMatthew G Knepley     cudaGetDeviceCount(&deviceCount);
200*403adfb6SMatthew G Knepley     if (deviceCount == 0) {
201*403adfb6SMatthew G Knepley       ierr = PetscPrintf(PETSC_COMM_SELF, "!!!!!No devices found!!!!!\n");CHKERRQ(ierr);
202*403adfb6SMatthew G Knepley       return -1000;
203*403adfb6SMatthew G Knepley     }
204*403adfb6SMatthew G Knepley 
205*403adfb6SMatthew G Knepley     if (deviceNum >= deviceCount || deviceNum < 0) {
206*403adfb6SMatthew G Knepley       ierr = PetscPrintf(PETSC_COMM_SELF, "\n!!!!!Invalid GPU number %d given hence default gpu %d will be used !!!!!\n", deviceNum, 0);CHKERRQ(ierr);
207*403adfb6SMatthew G Knepley       deviceNum = 0;
208*403adfb6SMatthew G Knepley     }
209*403adfb6SMatthew G Knepley   }
210*403adfb6SMatthew G Knepley 
211*403adfb6SMatthew G Knepley   cudaSetDevice(deviceNum);
212*403adfb6SMatthew G Knepley   ierr = PetscPrintf(PETSC_COMM_SELF, "Running on...\n\n");CHKERRQ(ierr);
213*403adfb6SMatthew G Knepley   cudaDeviceProp deviceProp;
214*403adfb6SMatthew G Knepley   if (cudaGetDeviceProperties(&deviceProp, deviceNum) == cudaSuccess) {
215*403adfb6SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " Device %d: %s\n", deviceNum, deviceProp.name);CHKERRQ(ierr);
216*403adfb6SMatthew G Knepley   } else {
217*403adfb6SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " Unable to determine device %d properties, exiting\n");CHKERRQ(ierr);
218*403adfb6SMatthew G Knepley     return -1;
219*403adfb6SMatthew G Knepley   }
220*403adfb6SMatthew G Knepley 
221*403adfb6SMatthew G Knepley   if (deviceProp.major == 2 && deviceProp.minor == 1) {
222*403adfb6SMatthew G Knepley     iNumThreadsPerBlock = 192; /* GF104 architecture / 48 CUDA Cores per MP */
223*403adfb6SMatthew G Knepley   } else {
224*403adfb6SMatthew G Knepley     iNumThreadsPerBlock = 128; /* GF100 architecture / 32 CUDA Cores per MP */
225*403adfb6SMatthew G Knepley   }
226*403adfb6SMatthew G Knepley 
227*403adfb6SMatthew G Knepley   /*cutilSafeCall(cudaSetDeviceFlags(cudaDeviceBlockingSync));*/
228*403adfb6SMatthew G Knepley 
229*403adfb6SMatthew G Knepley   if (cpuTiming) {
230*403adfb6SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " Using cpu-only timer.\n");CHKERRQ(ierr);
231*403adfb6SMatthew G Knepley   }
232*403adfb6SMatthew G Knepley 
233*403adfb6SMatthew G Knepley   ierr = runStream(iNumThreadsPerBlock, cpuTiming);CHKERRQ(ierr);
234*403adfb6SMatthew G Knepley   PetscFunctionReturn(0);
235*403adfb6SMatthew G Knepley }
236*403adfb6SMatthew G Knepley 
237*403adfb6SMatthew G Knepley ///////////////////////////////////////////////////////////////////////////
238*403adfb6SMatthew G Knepley // runStream
239*403adfb6SMatthew G Knepley ///////////////////////////////////////////////////////////////////////////
240*403adfb6SMatthew G Knepley PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming)
241*403adfb6SMatthew G Knepley {
242*403adfb6SMatthew G Knepley   float *d_a, *d_b, *d_c;
243*403adfb6SMatthew G Knepley   int k;
244*403adfb6SMatthew G Knepley   float times[5][NTIMES];
245*403adfb6SMatthew G Knepley   float scalar;
246*403adfb6SMatthew G Knepley   PetscErrorCode ierr;
247*403adfb6SMatthew G Knepley 
248*403adfb6SMatthew G Knepley   PetscFunctionBegin;
249*403adfb6SMatthew G Knepley   /* Allocate memory on device */
250*403adfb6SMatthew G Knepley   ierr = cudaMalloc((void**)&d_a, sizeof(float)*N);CHKERRQ(ierr);
251*403adfb6SMatthew G Knepley   ierr = cudaMalloc((void**)&d_b, sizeof(float)*N);CHKERRQ(ierr);
252*403adfb6SMatthew G Knepley   ierr = cudaMalloc((void**)&d_c, sizeof(float)*N);CHKERRQ(ierr);
253*403adfb6SMatthew G Knepley 
254*403adfb6SMatthew G Knepley   /* Compute execution configuration */
255*403adfb6SMatthew G Knepley 
256*403adfb6SMatthew G Knepley   dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */
257*403adfb6SMatthew G Knepley   dim3 dimGrid(N/dimBlock.x); /* (N/dimBlock.x,1,1) */
258*403adfb6SMatthew G Knepley   if (N % dimBlock.x != 0) dimGrid.x+=1;
259*403adfb6SMatthew G Knepley 
260*403adfb6SMatthew G Knepley   ierr = PetscPrintf(PETSC_COMM_SELF, " Array size (single precision) = %u\n",N);CHKERRQ(ierr);
261*403adfb6SMatthew G Knepley   ierr = PetscPrintf(PETSC_COMM_SELF, " using %u threads per block, %u blocks\n",dimBlock.x,dimGrid.x);CHKERRQ(ierr);
262*403adfb6SMatthew G Knepley 
263*403adfb6SMatthew G Knepley   /* Initialize memory on the device */
264*403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
265*403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
266*403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);
267*403adfb6SMatthew G Knepley 
268*403adfb6SMatthew G Knepley   /*	--- MAIN LOOP --- repeat test cases NTIMES times --- */
269*403adfb6SMatthew G Knepley   PetscLogDouble cpuTimer = 0.0;
270*403adfb6SMatthew G Knepley   cudaEvent_t start, stop;
271*403adfb6SMatthew G Knepley 
272*403adfb6SMatthew G Knepley   /* both timers report msec */
273*403adfb6SMatthew G Knepley   ierr = cudaEventCreate( &start );CHKERRQ(ierr); /* gpu timer facility */
274*403adfb6SMatthew G Knepley   ierr = cudaEventCreate( &stop );CHKERRQ(ierr);  /* gpu timer facility */
275*403adfb6SMatthew G Knepley 
276*403adfb6SMatthew G Knepley   scalar=3.0f;
277*403adfb6SMatthew G Knepley   for(k = 0; k < NTIMES; ++k) {
278*403adfb6SMatthew G Knepley     PetscTimeSubtract(cpuTimer);
279*403adfb6SMatthew G Knepley     ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
280*403adfb6SMatthew G Knepley     STREAM_Copy<<<dimGrid,dimBlock>>>(d_a, d_c, N);
281*403adfb6SMatthew G Knepley     ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
282*403adfb6SMatthew G Knepley     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
283*403adfb6SMatthew G Knepley     //get the the total elapsed time in ms
284*403adfb6SMatthew G Knepley     if (bDontUseGPUTiming) {
285*403adfb6SMatthew G Knepley       PetscTimeAdd(cpuTimer);
286*403adfb6SMatthew G Knepley       times[0][k] = cpuTimer;
287*403adfb6SMatthew G Knepley     } else {
288*403adfb6SMatthew G Knepley       ierr = cudaEventElapsedTime( &times[0][k], start, stop );CHKERRQ(ierr);
289*403adfb6SMatthew G Knepley     }
290*403adfb6SMatthew G Knepley 
291*403adfb6SMatthew G Knepley     cpuTimer = 0.0;
292*403adfb6SMatthew G Knepley     PetscTimeSubtract(cpuTimer);
293*403adfb6SMatthew G Knepley     ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
294*403adfb6SMatthew G Knepley     STREAM_Copy_Optimized<<<dimGrid,dimBlock>>>(d_a, d_c, N);
295*403adfb6SMatthew G Knepley     ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
296*403adfb6SMatthew G Knepley     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
297*403adfb6SMatthew G Knepley     //get the the total elapsed time in ms
298*403adfb6SMatthew G Knepley     if (bDontUseGPUTiming) {
299*403adfb6SMatthew G Knepley       PetscTimeAdd(cpuTimer);
300*403adfb6SMatthew G Knepley       times[1][k] = cpuTimer;
301*403adfb6SMatthew G Knepley     } else {
302*403adfb6SMatthew G Knepley       ierr = cudaEventElapsedTime( &times[1][k], start, stop );CHKERRQ(ierr);
303*403adfb6SMatthew G Knepley     }
304*403adfb6SMatthew G Knepley 
305*403adfb6SMatthew G Knepley     cpuTimer = 0.0;
306*403adfb6SMatthew G Knepley     PetscTimeSubtract(cpuTimer);
307*403adfb6SMatthew G Knepley     ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
308*403adfb6SMatthew G Knepley     STREAM_Scale<<<dimGrid,dimBlock>>>(d_b, d_c, scalar,  N);
309*403adfb6SMatthew G Knepley     ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
310*403adfb6SMatthew G Knepley     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
311*403adfb6SMatthew G Knepley     //get the the total elapsed time in ms
312*403adfb6SMatthew G Knepley     PetscTimeAdd(cpuTimer);
313*403adfb6SMatthew G Knepley     if (bDontUseGPUTiming) {
314*403adfb6SMatthew G Knepley       times[2][k] = cpuTimer;
315*403adfb6SMatthew G Knepley     } else {
316*403adfb6SMatthew G Knepley       ierr = cudaEventElapsedTime( &times[2][k], start, stop );CHKERRQ(ierr);
317*403adfb6SMatthew G Knepley     }
318*403adfb6SMatthew G Knepley 
319*403adfb6SMatthew G Knepley     cpuTimer = 0.0;
320*403adfb6SMatthew G Knepley     PetscTimeSubtract(cpuTimer);
321*403adfb6SMatthew G Knepley     ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
322*403adfb6SMatthew G Knepley     STREAM_Add<<<dimGrid,dimBlock>>>(d_a, d_b, d_c,  N);
323*403adfb6SMatthew G Knepley     ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
324*403adfb6SMatthew G Knepley     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
325*403adfb6SMatthew G Knepley     //get the the total elapsed time in ms
326*403adfb6SMatthew G Knepley     PetscTimeAdd(cpuTimer);
327*403adfb6SMatthew G Knepley     if (bDontUseGPUTiming) {
328*403adfb6SMatthew G Knepley       times[3][k] = cpuTimer;
329*403adfb6SMatthew G Knepley     } else {
330*403adfb6SMatthew G Knepley       ierr = cudaEventElapsedTime( &times[3][k], start, stop );CHKERRQ(ierr);
331*403adfb6SMatthew G Knepley     }
332*403adfb6SMatthew G Knepley 
333*403adfb6SMatthew G Knepley     cpuTimer = 0.0;
334*403adfb6SMatthew G Knepley     PetscTimeSubtract(cpuTimer);
335*403adfb6SMatthew G Knepley     ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
336*403adfb6SMatthew G Knepley     STREAM_Triad<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar,  N);
337*403adfb6SMatthew G Knepley     ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
338*403adfb6SMatthew G Knepley     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
339*403adfb6SMatthew G Knepley     //get the the total elapsed time in ms
340*403adfb6SMatthew G Knepley     PetscTimeAdd(cpuTimer);
341*403adfb6SMatthew G Knepley     if (bDontUseGPUTiming) {
342*403adfb6SMatthew G Knepley       times[4][k] = cpuTimer;
343*403adfb6SMatthew G Knepley     } else {
344*403adfb6SMatthew G Knepley       ierr = cudaEventElapsedTime( &times[4][k], start, stop );CHKERRQ(ierr);
345*403adfb6SMatthew G Knepley     }
346*403adfb6SMatthew G Knepley 
347*403adfb6SMatthew G Knepley   }
348*403adfb6SMatthew G Knepley 
349*403adfb6SMatthew G Knepley   /* verify kernels */
350*403adfb6SMatthew G Knepley   float *h_a, *h_b, *h_c;
351*403adfb6SMatthew G Knepley   bool errorSTREAMkernel = true;
352*403adfb6SMatthew G Knepley 
353*403adfb6SMatthew G Knepley   if ( (h_a = (float*)calloc( N, sizeof(float) )) == (float*)NULL ) {
354*403adfb6SMatthew G Knepley     printf("Unable to allocate array h_a, exiting ...\n");
355*403adfb6SMatthew G Knepley     exit(1);
356*403adfb6SMatthew G Knepley   }
357*403adfb6SMatthew G Knepley   if ( (h_b = (float*)calloc( N, sizeof(float) )) == (float*)NULL ) {
358*403adfb6SMatthew G Knepley     printf("Unable to allocate array h_b, exiting ...\n");
359*403adfb6SMatthew G Knepley     exit(1);
360*403adfb6SMatthew G Knepley   }
361*403adfb6SMatthew G Knepley 
362*403adfb6SMatthew G Knepley   if ( (h_c = (float*)calloc( N, sizeof(float) )) == (float*)NULL ) {
363*403adfb6SMatthew G Knepley     printf("Unalbe to allocate array h_c, exiting ...\n");
364*403adfb6SMatthew G Knepley     exit(1);
365*403adfb6SMatthew G Knepley   }
366*403adfb6SMatthew G Knepley 
367*403adfb6SMatthew G Knepley   /*
368*403adfb6SMatthew G Knepley    * perform kernel, copy device memory into host memory and verify each
369*403adfb6SMatthew G Knepley    * device kernel output
370*403adfb6SMatthew G Knepley    */
371*403adfb6SMatthew G Knepley 
372*403adfb6SMatthew G Knepley   /* Initialize memory on the device */
373*403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
374*403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
375*403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);
376*403adfb6SMatthew G Knepley 
377*403adfb6SMatthew G Knepley   STREAM_Copy<<<dimGrid,dimBlock>>>(d_a, d_c, N);
378*403adfb6SMatthew G Knepley   ierr = cudaMemcpy( h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
379*403adfb6SMatthew G Knepley   ierr = cudaMemcpy( h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
380*403adfb6SMatthew G Knepley   errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
381*403adfb6SMatthew G Knepley   if (errorSTREAMkernel) {
382*403adfb6SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n");CHKERRQ(ierr);
383*403adfb6SMatthew G Knepley     exit(-2000);
384*403adfb6SMatthew G Knepley   } else {
385*403adfb6SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tPass\n");CHKERRQ(ierr);
386*403adfb6SMatthew G Knepley   }
387*403adfb6SMatthew G Knepley 
388*403adfb6SMatthew G Knepley   /* Initialize memory on the device */
389*403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
390*403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
391*403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);
392*403adfb6SMatthew G Knepley 
393*403adfb6SMatthew G Knepley   STREAM_Copy_Optimized<<<dimGrid,dimBlock>>>(d_a, d_c, N);
394*403adfb6SMatthew G Knepley   ierr = cudaMemcpy( h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
395*403adfb6SMatthew G Knepley   ierr = cudaMemcpy( h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
396*403adfb6SMatthew G Knepley   errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
397*403adfb6SMatthew G Knepley   if (errorSTREAMkernel) {
398*403adfb6SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n");CHKERRQ(ierr);
399*403adfb6SMatthew G Knepley     exit(-3000);
400*403adfb6SMatthew G Knepley   } else {
401*403adfb6SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tPass\n");CHKERRQ(ierr);
402*403adfb6SMatthew G Knepley   }
403*403adfb6SMatthew G Knepley 
404*403adfb6SMatthew G Knepley   /* Initialize memory on the device */
405*403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
406*403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);
407*403adfb6SMatthew G Knepley 
408*403adfb6SMatthew G Knepley   STREAM_Scale<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
409*403adfb6SMatthew G Knepley   ierr = cudaMemcpy( h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
410*403adfb6SMatthew G Knepley   ierr = cudaMemcpy( h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
411*403adfb6SMatthew G Knepley   errorSTREAMkernel = STREAM_Scale_verify(h_b, h_c, scalar, N);
412*403adfb6SMatthew G Knepley   if (errorSTREAMkernel) {
413*403adfb6SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n");CHKERRQ(ierr);
414*403adfb6SMatthew G Knepley     exit(-4000);
415*403adfb6SMatthew G Knepley   } else {
416*403adfb6SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tPass\n");CHKERRQ(ierr);
417*403adfb6SMatthew G Knepley   }
418*403adfb6SMatthew G Knepley 
419*403adfb6SMatthew G Knepley   /* Initialize memory on the device */
420*403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
421*403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
422*403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);
423*403adfb6SMatthew G Knepley 
424*403adfb6SMatthew G Knepley   STREAM_Add<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
425*403adfb6SMatthew G Knepley   ierr = cudaMemcpy( h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
426*403adfb6SMatthew G Knepley   ierr = cudaMemcpy( h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
427*403adfb6SMatthew G Knepley   ierr = cudaMemcpy( h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
428*403adfb6SMatthew G Knepley   errorSTREAMkernel = STREAM_Add_verify(h_a, h_b, h_c, N);
429*403adfb6SMatthew G Knepley   if (errorSTREAMkernel) {
430*403adfb6SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n");CHKERRQ(ierr);
431*403adfb6SMatthew G Knepley     exit(-5000);
432*403adfb6SMatthew G Knepley   } else {
433*403adfb6SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tPass\n");CHKERRQ(ierr);
434*403adfb6SMatthew G Knepley   }
435*403adfb6SMatthew G Knepley 
436*403adfb6SMatthew G Knepley   /* Initialize memory on the device */
437*403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
438*403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
439*403adfb6SMatthew G Knepley   set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);
440*403adfb6SMatthew G Knepley 
441*403adfb6SMatthew G Knepley   STREAM_Triad<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
442*403adfb6SMatthew G Knepley   ierr = cudaMemcpy( h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
443*403adfb6SMatthew G Knepley   ierr = cudaMemcpy( h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
444*403adfb6SMatthew G Knepley   ierr = cudaMemcpy( h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
445*403adfb6SMatthew G Knepley   errorSTREAMkernel = STREAM_Triad_verify(h_b, h_c, h_a, scalar, N);
446*403adfb6SMatthew G Knepley   if (errorSTREAMkernel) {
447*403adfb6SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n");CHKERRQ(ierr);
448*403adfb6SMatthew G Knepley     exit(-6000);
449*403adfb6SMatthew G Knepley   } else {
450*403adfb6SMatthew G Knepley     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tPass\n");CHKERRQ(ierr);
451*403adfb6SMatthew G Knepley   }
452*403adfb6SMatthew G Knepley 
453*403adfb6SMatthew G Knepley   /* continue from here */
454*403adfb6SMatthew G Knepley   printResultsReadable(times);
455*403adfb6SMatthew G Knepley 
456*403adfb6SMatthew G Knepley   //clean up timers
457*403adfb6SMatthew G Knepley   ierr = cudaEventDestroy( stop );CHKERRQ(ierr);
458*403adfb6SMatthew G Knepley   ierr = cudaEventDestroy( start );CHKERRQ(ierr);
459*403adfb6SMatthew G Knepley 
460*403adfb6SMatthew G Knepley   /* Free memory on device */
461*403adfb6SMatthew G Knepley   ierr = cudaFree(d_a);CHKERRQ(ierr);
462*403adfb6SMatthew G Knepley   ierr = cudaFree(d_b);CHKERRQ(ierr);
463*403adfb6SMatthew G Knepley   ierr = cudaFree(d_c);CHKERRQ(ierr);
464*403adfb6SMatthew G Knepley   PetscFunctionReturn(0);
465*403adfb6SMatthew G Knepley }
466*403adfb6SMatthew G Knepley 
467*403adfb6SMatthew G Knepley ///////////////////////////////////////////////////////////////////////////
468*403adfb6SMatthew G Knepley //Print Results to Screen and File
469*403adfb6SMatthew G Knepley ///////////////////////////////////////////////////////////////////////////
470*403adfb6SMatthew G Knepley PetscErrorCode printResultsReadable(float times[][NTIMES]) {
471*403adfb6SMatthew G Knepley   PetscErrorCode ierr;
472*403adfb6SMatthew G Knepley   PetscInt       j, k;
473*403adfb6SMatthew G Knepley   float	avgtime[5] = {0., 0., 0., 0., 0.};
474*403adfb6SMatthew G Knepley   float	maxtime[5] = {0., 0., 0., 0., 0.};
475*403adfb6SMatthew G Knepley   float	mintime[5] = {1e30,1e30,1e30,1e30,1e30};
476*403adfb6SMatthew G Knepley   char *label[5]   = {"Copy:      ", "Copy Opt.: ", "Scale:     ", "Add:       ", "Triad:     "};
477*403adfb6SMatthew G Knepley   float	bytes_per_kernel[5] = {
478*403adfb6SMatthew G Knepley     2. * sizeof(float) * N,
479*403adfb6SMatthew G Knepley     2. * sizeof(float) * N,
480*403adfb6SMatthew G Knepley     2. * sizeof(float) * N,
481*403adfb6SMatthew G Knepley     3. * sizeof(float) * N,
482*403adfb6SMatthew G Knepley     3. * sizeof(float) * N
483*403adfb6SMatthew G Knepley   };
484*403adfb6SMatthew G Knepley 
485*403adfb6SMatthew G Knepley   PetscFunctionBegin;
486*403adfb6SMatthew G Knepley   /* --- SUMMARY --- */
487*403adfb6SMatthew G Knepley   for(k = 1; k < NTIMES; ++k) { /* note -- skip first iteration */
488*403adfb6SMatthew G Knepley     for(j = 0; j < 5; ++j) {
489*403adfb6SMatthew G Knepley       avgtime[j] = avgtime[j] + (1.e-03f * times[j][k]);
490*403adfb6SMatthew G Knepley       mintime[j] = MIN(mintime[j], (1.e-03f * times[j][k]));
491*403adfb6SMatthew G Knepley       maxtime[j] = MAX(maxtime[j], (1.e-03f * times[j][k]));
492*403adfb6SMatthew G Knepley     }
493*403adfb6SMatthew G Knepley   }
494*403adfb6SMatthew G Knepley 
495*403adfb6SMatthew G Knepley   ierr = PetscPrintf(PETSC_COMM_SELF, "Function    Rate (MB/s)    Avg time      Min time      Max time\n");CHKERRQ(ierr);
496*403adfb6SMatthew G Knepley 
497*403adfb6SMatthew G Knepley   for(j = 0; j < 5; ++j) {
498*403adfb6SMatthew G Knepley      avgtime[j] = avgtime[j]/(float)(NTIMES-1);
499*403adfb6SMatthew G Knepley      ierr = PetscPrintf(PETSC_COMM_SELF, "%s%11.4f  %11.6f  %12.6f  %12.6f\n", label[j], 1.0E-06 * bytes_per_kernel[j]/mintime[j], avgtime[j], mintime[j], maxtime[j]);CHKERRQ(ierr);
500*403adfb6SMatthew G Knepley   }
501*403adfb6SMatthew G Knepley   PetscFunctionReturn(0);
502*403adfb6SMatthew G Knepley }
503