xref: /petsc/src/benchmarks/streams/CUDAVersion.cu (revision db4deed73f06ae556a0f2e02a0fd6e016e156849)
1 /*
2   STREAM benchmark implementation in CUDA.
3 
4     COPY:       a(i) = b(i)
5     SCALE:      a(i) = q*b(i)
6     SUM:        a(i) = b(i) + c(i)
7     TRIAD:      a(i) = b(i) + q*c(i)
8 
9   It measures the memory system on the device.
10   The implementation is in single precision.
11 
12   Code based on the code developed by John D. McCalpin
13   http://www.cs.virginia.edu/stream/FTP/Code/stream.c
14 
15   Written by: Massimiliano Fatica, NVIDIA Corporation
16   Modified by: Douglas Enright (dpephd-nvidia@yahoo.com), 1 December 2010
17   Extensive Revisions, 4 December 2010
18   Modified for PETSc by: Matthew G. Knepley 14 Aug 2011
19 
20   User interface motivated by bandwidthTest NVIDIA SDK example.
21 */
22 static char *help =  "Single-Precision STREAM Benchmark implementation in CUDA\n"
23                      "Performs Copy, Scale, Add, and Triad single-precision kernels\n\n";
24 
25 #include <petscconf.h>
26 #include <petscsys.h>
27 #include <petsctime.h>
28 
29 #define N        2000000
30 #define N_DOUBLE 8000000
31 #define NTIMES   10
32 
33 # ifndef MIN
34 # define MIN(x,y) ((x)<(y)?(x):(y))
35 # endif
36 # ifndef MAX
37 # define MAX(x,y) ((x)>(y)?(x):(y))
38 # endif
39 
40 const float flt_eps  = 1.192092896e-07f;
41 const double dbl_eps = 2.2204460492503131e-16;
42 
/* Fill a[0..len) with `value`; grid-stride loop so any launch size works. */
__global__ void set_array(float *a,  float value, size_t len)
{
  const size_t stride = blockDim.x * gridDim.x;
  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += stride) a[i] = value;
}
51 
/* Fill a[0..len) with `value` (double precision); grid-stride loop. */
__global__ void set_array_double(double *a,  double value, size_t len)
{
  const size_t stride = blockDim.x * gridDim.x;
  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += stride) a[i] = value;
}
60 
/* COPY: b(i) = a(i); grid-stride loop over len elements. */
__global__ void STREAM_Copy(float *a, float *b, size_t len)
{
  const size_t stride = blockDim.x * gridDim.x;
  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += stride) b[i] = a[i];
}
69 
/* COPY (double precision): b(i) = a(i); grid-stride loop. */
__global__ void STREAM_Copy_double(double *a, double *b, size_t len)
{
  const size_t stride = blockDim.x * gridDim.x;
  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += stride) b[i] = a[i];
}
78 
/* COPY, one element per thread (no grid-stride).  Does nothing unless the
   launch covers the whole vector (blockDim.x*gridDim.x >= len). */
__global__ void STREAM_Copy_Optimized(float *a, float *b, size_t len)
{
  if (blockDim.x * gridDim.x < len) return;
  const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < len) b[i] = a[i];
}
89 
/* COPY (double precision), one element per thread (no grid-stride).
   Does nothing unless the launch covers the whole vector. */
__global__ void STREAM_Copy_Optimized_double(double *a, double *b, size_t len)
{
  if (blockDim.x * gridDim.x < len) return;
  const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < len) b[i] = a[i];
}
100 
/* SCALE: b(i) = scale*a(i); grid-stride loop over len elements. */
__global__ void STREAM_Scale(float *a, float *b, float scale,  size_t len)
{
  const size_t stride = blockDim.x * gridDim.x;
  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += stride) b[i] = scale * a[i];
}
109 
/* SCALE (double precision): b(i) = scale*a(i); grid-stride loop. */
__global__ void STREAM_Scale_double(double *a, double *b, double scale,  size_t len)
{
  const size_t stride = blockDim.x * gridDim.x;
  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += stride) b[i] = scale * a[i];
}
118 
/* SCALE, one element per thread (no grid-stride).  Does nothing unless the
   launch covers the whole vector (blockDim.x*gridDim.x >= len). */
__global__ void STREAM_Scale_Optimized(float *a, float *b, float scale,  size_t len)
{
  if (blockDim.x * gridDim.x < len) return;
  const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < len) b[i] = scale * a[i];
}
129 
/* SCALE (double precision), one element per thread (no grid-stride).
   Does nothing unless the launch covers the whole vector. */
__global__ void STREAM_Scale_Optimized_double(double *a, double *b, double scale,  size_t len)
{
  if (blockDim.x * gridDim.x < len) return;
  const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < len) b[i] = scale * a[i];
}
140 
/* ADD: c(i) = a(i) + b(i); grid-stride loop over len elements. */
__global__ void STREAM_Add( float *a, float *b, float *c,  size_t len)
{
  const size_t stride = blockDim.x * gridDim.x;
  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += stride) c[i] = a[i] + b[i];
}
149 
/* ADD (double precision): c(i) = a(i) + b(i); grid-stride loop. */
__global__ void STREAM_Add_double( double *a, double *b, double *c,  size_t len)
{
  const size_t stride = blockDim.x * gridDim.x;
  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += stride) c[i] = a[i] + b[i];
}
158 
/* ADD, one element per thread (no grid-stride).  Does nothing unless the
   launch covers the whole vector (blockDim.x*gridDim.x >= len). */
__global__ void STREAM_Add_Optimized( float *a, float *b, float *c,  size_t len)
{
  if (blockDim.x * gridDim.x < len) return;
  const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < len) c[i] = a[i] + b[i];
}
169 
/* ADD (double precision), one element per thread (no grid-stride).
   Does nothing unless the launch covers the whole vector. */
__global__ void STREAM_Add_Optimized_double( double *a, double *b, double *c,  size_t len)
{
  if (blockDim.x * gridDim.x < len) return;
  const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < len) c[i] = a[i] + b[i];
}
180 
/* TRIAD: c(i) = a(i) + scalar*b(i); grid-stride loop over len elements. */
__global__ void STREAM_Triad( float *a, float *b, float *c, float scalar, size_t len)
{
  const size_t stride = blockDim.x * gridDim.x;
  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += stride) c[i] = a[i] + scalar * b[i];
}
189 
/* TRIAD (double precision): c(i) = a(i) + scalar*b(i); grid-stride loop. */
__global__ void STREAM_Triad_double( double *a, double *b, double *c, double scalar, size_t len)
{
  const size_t stride = blockDim.x * gridDim.x;
  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += stride) c[i] = a[i] + scalar * b[i];
}
198 
/* TRIAD, one element per thread (no grid-stride).  Does nothing unless the
   launch covers the whole vector (blockDim.x*gridDim.x >= len). */
__global__ void STREAM_Triad_Optimized( float *a, float *b, float *c, float scalar, size_t len)
{
  if (blockDim.x * gridDim.x < len) return;
  const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < len) c[i] = a[i] + scalar * b[i];
}
209 
/* TRIAD (double precision), one element per thread (no grid-stride).
   Does nothing unless the launch covers the whole vector. */
__global__ void STREAM_Triad_Optimized_double( double *a, double *b, double *c, double scalar, size_t len)
{
  if (blockDim.x * gridDim.x < len) return;
  const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < len) c[i] = a[i] + scalar * b[i];
}
220 
221 /* Host side verification routines */
/* Host-side COPY check: true iff some b[i] deviates from a[i] by more than
   2 float ulps in relative terms (stops at the first mismatch). */
bool STREAM_Copy_verify(float *a, float *b, size_t len) {
  for (size_t idx = 0; idx < len; idx++) {
    const float expected = a[idx];
    const float ulps     = fabsf(b[idx] - expected)/fabsf(expected)/flt_eps;
    if (ulps > 2.f) return true; /* element-wise relative error too large */
  }
  return false;
}
236 
/* Host-side COPY check (double precision): true iff some b[i] deviates from
   a[idx] by more than 2 double ulps in relative terms. */
bool STREAM_Copy_verify_double(double *a, double *b, size_t len) {
  size_t idx;
  bool bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult = a[idx];
    double diffResultExpected = (b[idx] - expectedResult);
    /* BUG FIX: use double-precision fabs() and the double machine epsilon;
       the original used fabsf()/flt_eps, truncating to float and measuring
       ulps on the single-precision scale. dbl_eps is the (previously unused)
       double epsilon declared at the top of the file. */
    double relErrorULPS = (fabs(diffResultExpected)/fabs(expectedResult))/dbl_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.);
  }

  return bDifferent;
}
251 
/* Host-side SCALE check: true iff some b[i] deviates from scale*a[i] by more
   than 2 float ulps in relative terms (stops at the first mismatch). */
bool STREAM_Scale_verify(float *a, float *b, float scale, size_t len) {
  for (size_t idx = 0; idx < len; idx++) {
    const float expected = scale*a[idx];
    const float ulps     = fabsf(b[idx] - expected)/fabsf(expected)/flt_eps;
    if (ulps > 2.f) return true; /* element-wise relative error too large */
  }
  return false;
}
266 
/* Host-side SCALE check (double precision): true iff some b[i] deviates from
   scale*a[i] by more than 2 double ulps in relative terms. */
bool STREAM_Scale_verify_double(double *a, double *b, double scale, size_t len) {
  size_t idx;
  bool bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult = scale*a[idx];
    double diffResultExpected = (b[idx] - expectedResult);
    /* BUG FIX: use double-precision fabs() and the double machine epsilon;
       the original used fabsf()/flt_eps (single-precision scale). */
    double relErrorULPS = (fabs(diffResultExpected)/fabs(expectedResult))/dbl_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.);
  }

  return bDifferent;
}
281 
/* Host-side ADD check: true iff some c[i] deviates from a[i]+b[i] by more
   than 2 float ulps in relative terms (stops at the first mismatch). */
bool STREAM_Add_verify(float *a, float *b, float *c, size_t len) {
  for (size_t idx = 0; idx < len; idx++) {
    const float expected = a[idx] + b[idx];
    const float ulps     = fabsf(c[idx] - expected)/fabsf(expected)/flt_eps;
    if (ulps > 2.f) return true; /* element-wise relative error too large */
  }
  return false;
}
296 
/* Host-side ADD check (double precision): true iff some c[i] deviates from
   a[i]+b[i] by more than 2 double ulps in relative terms. */
bool STREAM_Add_verify_double(double *a, double *b, double *c, size_t len) {
  size_t idx;
  bool bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult = a[idx] + b[idx];
    double diffResultExpected = (c[idx] - expectedResult);
    /* BUG FIX: use double-precision fabs() and the double machine epsilon;
       the original used fabsf()/flt_eps (single-precision scale). */
    double relErrorULPS = (fabs(diffResultExpected)/fabs(expectedResult))/dbl_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.);
  }

  return bDifferent;
}
311 
/* Host-side TRIAD check: true iff some c[i] deviates from a[i]+scalar*b[i]
   by more than 3 float ulps in relative terms (stops at first mismatch;
   looser bound than Copy/Scale/Add because two roundings are involved). */
bool STREAM_Triad_verify(float *a, float *b, float *c, float scalar, size_t len) {
  for (size_t idx = 0; idx < len; idx++) {
    const float expected = a[idx] + scalar*b[idx];
    const float ulps     = fabsf(c[idx] - expected)/fabsf(expected)/flt_eps;
    if (ulps > 3.f) return true; /* element-wise relative error too large */
  }
  return false;
}
326 
/* Host-side TRIAD check (double precision): true iff some c[i] deviates from
   a[i]+scalar*b[i] by more than 3 double ulps in relative terms. */
bool STREAM_Triad_verify_double(double *a, double *b, double *c, double scalar, size_t len) {
  size_t idx;
  bool bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult = a[idx] + scalar*b[idx];
    double diffResultExpected = (c[idx] - expectedResult);
    /* BUG FIX: use double-precision fabs() and the double machine epsilon;
       the original used fabsf()/flt_eps (single-precision scale). */
    double relErrorULPS = (fabs(diffResultExpected)/fabs(expectedResult))/dbl_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 3.);
  }

  return bDifferent;
}
341 
342 /* forward declarations */
343 PetscErrorCode setupStream(PetscInt device, PetscBool runDouble, PetscBool cpuTiming);
344 PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming);
345 PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming);
346 PetscErrorCode printResultsReadable(float times[][NTIMES]);
347 
/*
  Entry point: parses the -device, -double and -cputiming options, runs the
  benchmark via setupStream(), and reports PASSES/FAILED based on its
  return value (0 = pass, negative = failure/skip).
*/
int main(int argc, char *argv[])
{
  PetscInt       device    = 0;           /* CUDA device index to benchmark */
  PetscBool      runDouble = PETSC_FALSE; /* also run double-precision kernels */
  PetscBool      cpuTiming = PETSC_FALSE; /* force CPU-based timing instead of CUDA events */
  PetscErrorCode ierr;

  ierr = PetscInitialize(&argc, &argv, 0, help);CHKERRQ(ierr);
  ierr = PetscPrintf(PETSC_COMM_SELF, "[Single and Double-Precision Device-Only STREAM Benchmark implementation in CUDA]\n");CHKERRQ(ierr);
  ierr = PetscPrintf(PETSC_COMM_SELF, "%s Starting...\n\n", argv[0]);CHKERRQ(ierr);

  /* PetscOptionsBegin/End open and close a macro scope; keep the calls paired */
  ierr = PetscOptionsBegin(PETSC_COMM_WORLD, "", "STREAM Benchmark Options", "STREAM");CHKERRQ(ierr);
    ierr = PetscOptionsInt("-device", "Specify the CUDA device to be used", "STREAM", device, &device, PETSC_NULL);CHKERRQ(ierr);
    ierr = PetscOptionsBool("-double",    "Also run double precision tests",   "STREAM", runDouble, &runDouble, PETSC_NULL);CHKERRQ(ierr);
    ierr = PetscOptionsBool("-cputiming", "Force CPU-based timing to be used", "STREAM", cpuTiming, &cpuTiming, PETSC_NULL);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();

  /* setupStream's return value doubles as the pass/fail status, so it is
     inspected rather than passed to CHKERRQ */
  ierr = setupStream(device, runDouble, cpuTiming);
  if (ierr >= 0) {
    PetscErrorCode ierr2 = PetscPrintf(PETSC_COMM_SELF, "\n[streamBenchmark] - results:\t%s\n\n", (ierr == 0) ? "PASSES" : "FAILED");CHKERRQ(ierr2);
  }
  PetscFinalize();
  return 0;
}
372 
373 ///////////////////////////////////////////////////////////////////////////////
374 //Run the appropriate tests
375 ///////////////////////////////////////////////////////////////////////////////
/*
  Select and validate the CUDA device, print its identity, choose a
  threads-per-block count suited to the detected architecture, and run the
  single-precision (and optionally double-precision) STREAM benchmarks.

  deviceNum - requested CUDA device; falls back to device 0 when invalid
  runDouble - also run the double-precision kernels (requires CC >= 1.3)
  cpuTiming - time with the CPU timer instead of CUDA events

  Returns 0 on success, a negative value when no/unsuitable device is found.
*/
PetscErrorCode setupStream(PetscInt deviceNum, PetscBool runDouble, PetscBool cpuTiming)
{
  PetscInt       iNumThreadsPerBlock = 128;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  // Check device
  {
    int deviceCount;

    cudaGetDeviceCount(&deviceCount);
    if (deviceCount == 0) {
      ierr = PetscPrintf(PETSC_COMM_SELF, "!!!!!No devices found!!!!!\n");CHKERRQ(ierr);
      return -1000;
    }

    if (deviceNum >= deviceCount || deviceNum < 0) {
      ierr = PetscPrintf(PETSC_COMM_SELF, "\n!!!!!Invalid GPU number %d given hence default gpu %d will be used !!!!!\n", deviceNum, 0);CHKERRQ(ierr);
      deviceNum = 0;
    }
  }

  cudaSetDevice(deviceNum);
  ierr = PetscPrintf(PETSC_COMM_SELF, "Running on...\n\n");CHKERRQ(ierr);
  cudaDeviceProp deviceProp;
  if (cudaGetDeviceProperties(&deviceProp, deviceNum) == cudaSuccess) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " Device %d: %s\n", deviceNum, deviceProp.name);CHKERRQ(ierr);
  } else {
    /* BUG FIX: the %d conversion had no matching argument (undefined
       behavior in the varargs call); pass deviceNum */
    ierr = PetscPrintf(PETSC_COMM_SELF, " Unable to determine device %d properties, exiting\n", deviceNum);CHKERRQ(ierr);
    return -1;
  }

  if (runDouble && deviceProp.major == 1 && deviceProp.minor < 3) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " Unable to run double-precision STREAM benchmark on a compute capability GPU less than 1.3\n");CHKERRQ(ierr);
    return -1;
  }
  /* Match the block size to the CUDA-cores-per-multiprocessor count */
  if (deviceProp.major == 2 && deviceProp.minor == 1) {
    iNumThreadsPerBlock = 192; /* GF104 architecture / 48 CUDA Cores per MP */
  } else {
    iNumThreadsPerBlock = 128; /* GF100 architecture / 32 CUDA Cores per MP */
  }

  if (cpuTiming) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " Using cpu-only timer.\n");CHKERRQ(ierr);
  }

  ierr = runStream(iNumThreadsPerBlock, cpuTiming);CHKERRQ(ierr);
  if (runDouble) {
    /* NOTE(review): cudaSetDeviceFlags() must be called before the CUDA
       context is active; here kernels have already run in runStream(), so
       this call is expected to fail with cudaErrorSetOnActiveProcess —
       confirm the intent before relying on blocking-sync behavior. */
    ierr = cudaSetDeviceFlags(cudaDeviceBlockingSync);CHKERRQ(ierr);
    ierr = runStreamDouble(iNumThreadsPerBlock, cpuTiming);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
429 
430 ///////////////////////////////////////////////////////////////////////////
431 // runStream
432 ///////////////////////////////////////////////////////////////////////////
/*
  Run the single-precision STREAM kernels (Copy/Scale/Add/Triad, plain and
  "Optimized" one-thread-per-element variants) NTIMES times each, recording
  per-iteration timings, then verify every plain kernel against a host-side
  reference and print a summary via printResultsReadable().

  Rows of times[]: 0 Copy, 1 Copy_Optimized, 2 Scale, 3 Scale_Optimized,
                   4 Add, 5 Add_Optimized, 6 Triad, 7 Triad_Optimized.

  iNumThreadsPerBlock - CUDA block size to launch with
  bDontUseGPUTiming   - if true, store the host timer reading instead of the
                        CUDA-event elapsed time.
                        NOTE(review): cudaEventElapsedTime() reports
                        milliseconds while PetscTime works in seconds, so the
                        two paths appear to store different units into the
                        same array — confirm what printResultsReadable()
                        expects.
*/
PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming)
{
  float *d_a, *d_b, *d_c;   /* device vectors, N floats each */
  int k;
  float times[8][NTIMES];   /* per-kernel, per-iteration timings (rows listed above) */
  float scalar;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Allocate memory on device */
  ierr = cudaMalloc((void**)&d_a, sizeof(float)*N);CHKERRQ(ierr);
  ierr = cudaMalloc((void**)&d_b, sizeof(float)*N);CHKERRQ(ierr);
  ierr = cudaMalloc((void**)&d_c, sizeof(float)*N);CHKERRQ(ierr);

  /* Compute execution configuration: 1-D launch, one thread per element,
     rounding the grid up when N is not a multiple of the block size */

  dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */
  dim3 dimGrid(N/dimBlock.x); /* (N/dimBlock.x,1,1) */
  if (N % dimBlock.x != 0) dimGrid.x+=1;

  ierr = PetscPrintf(PETSC_COMM_SELF, " Array size (single precision) = %u\n",N);CHKERRQ(ierr);
  ierr = PetscPrintf(PETSC_COMM_SELF, " using %u threads per block, %u blocks\n",dimBlock.x,dimGrid.x);CHKERRQ(ierr);

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  /* --- MAIN LOOP --- repeat test cases NTIMES times --- */
  PetscLogDouble cpuTimer = 0.0;
  cudaEvent_t start, stop;

  /* create the GPU timer events; cudaEventElapsedTime reports msec */
  ierr = cudaEventCreate( &start );CHKERRQ(ierr); /* gpu timer facility */
  ierr = cudaEventCreate( &stop );CHKERRQ(ierr);  /* gpu timer facility */

  scalar=3.0f;
  for (k = 0; k < NTIMES; ++k) {
    /* row 0: Copy (grid-stride kernel) */
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
    STREAM_Copy<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    /* get the total elapsed time */
    PetscTimeAdd(cpuTimer);
    if (bDontUseGPUTiming) {
      times[0][k] = cpuTimer;
    } else {
      ierr = cudaEventElapsedTime( &times[0][k], start, stop );CHKERRQ(ierr);
    }

    /* row 1: Copy (one-thread-per-element kernel) */
    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
    STREAM_Copy_Optimized<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    /* get the total elapsed time */
    PetscTimeAdd(cpuTimer);
    if (bDontUseGPUTiming) {
      times[1][k] = cpuTimer;
    } else {
      ierr = cudaEventElapsedTime( &times[1][k], start, stop );CHKERRQ(ierr);
    }

    /* row 2: Scale */
    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
    STREAM_Scale<<<dimGrid,dimBlock>>>(d_b, d_c, scalar,  N);
    ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    /* get the total elapsed time */
    PetscTimeAdd(cpuTimer);
    if (bDontUseGPUTiming) {
      times[2][k] = cpuTimer;
    } else {
      ierr = cudaEventElapsedTime( &times[2][k], start, stop );CHKERRQ(ierr);
    }

    /* row 3: Scale (optimized variant) */
    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
    STREAM_Scale_Optimized<<<dimGrid,dimBlock>>>(d_b, d_c, scalar,  N);
    ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    /* get the total elapsed time */
    PetscTimeAdd(cpuTimer);
    if (bDontUseGPUTiming) {
      times[3][k] = cpuTimer;
    } else {
      ierr = cudaEventElapsedTime( &times[3][k], start, stop );CHKERRQ(ierr);
    }

    /* row 4: Add */
    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
    STREAM_Add<<<dimGrid,dimBlock>>>(d_a, d_b, d_c,  N);
    ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    /* get the total elapsed time */
    PetscTimeAdd(cpuTimer);
    if (bDontUseGPUTiming) {
      times[4][k] = cpuTimer;
    } else {
      ierr = cudaEventElapsedTime( &times[4][k], start, stop );CHKERRQ(ierr);
    }

    /* row 5: Add (optimized variant) */
    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
    STREAM_Add_Optimized<<<dimGrid,dimBlock>>>(d_a, d_b, d_c,  N);
    ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    /* get the total elapsed time */
    PetscTimeAdd(cpuTimer);
    if (bDontUseGPUTiming) {
      times[5][k] = cpuTimer;
    } else {
      ierr = cudaEventElapsedTime( &times[5][k], start, stop );CHKERRQ(ierr);
    }

    /* row 6: Triad (computes d_a = d_b + scalar*d_c) */
    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
    STREAM_Triad<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar,  N);
    ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    /* get the total elapsed time */
    PetscTimeAdd(cpuTimer);
    if (bDontUseGPUTiming) {
      times[6][k] = cpuTimer;
    } else {
      ierr = cudaEventElapsedTime( &times[6][k], start, stop );CHKERRQ(ierr);
    }

    /* row 7: Triad (optimized variant) */
    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
    STREAM_Triad_Optimized<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar,  N);
    ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    /* get the total elapsed time */
    PetscTimeAdd(cpuTimer);
    if (bDontUseGPUTiming) {
      times[7][k] = cpuTimer;
    } else {
      ierr = cudaEventElapsedTime( &times[7][k], start, stop );CHKERRQ(ierr);
    }

  }

  /* verify kernels against host-side references; on any mismatch the
     process exits with a distinct negative code.  The host buffers are
     never freed — the process exits shortly after. */
  float *h_a, *h_b, *h_c;
  bool errorSTREAMkernel = true;

  if ( (h_a = (float*)calloc( N, sizeof(float) )) == (float*)NULL ) {
    printf("Unable to allocate array h_a, exiting ...\n");
    exit(1);
  }
  if ( (h_b = (float*)calloc( N, sizeof(float) )) == (float*)NULL ) {
    printf("Unable to allocate array h_b, exiting ...\n");
    exit(1);
  }

  if ( (h_c = (float*)calloc( N, sizeof(float) )) == (float*)NULL ) {
    printf("Unalbe to allocate array h_c, exiting ...\n");
    exit(1);
  }

  /*
   * perform kernel, copy device memory into host memory and verify each
   * device kernel output
   */

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  /* Copy: d_c <- d_a, check h_c against h_a */
  STREAM_Copy<<<dimGrid,dimBlock>>>(d_a, d_c, N);
  ierr = cudaMemcpy( h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy( h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n");CHKERRQ(ierr);
    exit(-2000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tPass\n");CHKERRQ(ierr);
  }

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  /* Copy_Optimized: d_c <- d_a, verified with the same host reference */
  STREAM_Copy_Optimized<<<dimGrid,dimBlock>>>(d_a, d_c, N);
  ierr = cudaMemcpy( h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy( h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n");CHKERRQ(ierr);
    exit(-3000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tPass\n");CHKERRQ(ierr);
  }

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  /* Scale: d_c <- scalar*d_b */
  STREAM_Scale<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
  ierr = cudaMemcpy( h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy( h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Scale_verify(h_b, h_c, scalar, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n");CHKERRQ(ierr);
    exit(-4000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tPass\n");CHKERRQ(ierr);
  }

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  /* Add: d_c <- d_a + d_b */
  STREAM_Add<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
  ierr = cudaMemcpy( h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy( h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy( h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Add_verify(h_a, h_b, h_c, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n");CHKERRQ(ierr);
    exit(-5000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tPass\n");CHKERRQ(ierr);
  }

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  /* Triad: d_a <- d_b + scalar*d_c (note the argument order) */
  STREAM_Triad<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
  ierr = cudaMemcpy( h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy( h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy( h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Triad_verify(h_b, h_c, h_a, scalar, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n");CHKERRQ(ierr);
    exit(-6000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tPass\n");CHKERRQ(ierr);
  }

  /* print the timing summary */
  printResultsReadable(times);

  //clean up timers
  ierr = cudaEventDestroy( stop );CHKERRQ(ierr);
  ierr = cudaEventDestroy( start );CHKERRQ(ierr);

  /* Free memory on device */
  ierr = cudaFree(d_a);CHKERRQ(ierr);
  ierr = cudaFree(d_b);CHKERRQ(ierr);
  ierr = cudaFree(d_c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
701 
702 PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming)
703 {
704   double *d_a, *d_b, *d_c;
705   int k;
706   float times[8][NTIMES];
707   double scalar;
708   PetscErrorCode ierr;
709 
710   PetscFunctionBegin;
711   /* Allocate memory on device */
712   ierr = cudaMalloc((void**)&d_a, sizeof(double)*N);CHKERRQ(ierr);
713   ierr = cudaMalloc((void**)&d_b, sizeof(double)*N);CHKERRQ(ierr);
714   ierr = cudaMalloc((void**)&d_c, sizeof(double)*N);CHKERRQ(ierr);
715 
716   /* Compute execution configuration */
717 
718   dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */
719   dim3 dimGrid(N/dimBlock.x); /* (N/dimBlock.x,1,1) */
720   if (N % dimBlock.x != 0) dimGrid.x+=1;
721 
722   ierr = PetscPrintf(PETSC_COMM_SELF, " Array size (double precision) = %u\n",N);CHKERRQ(ierr);
723   ierr = PetscPrintf(PETSC_COMM_SELF, " using %u threads per block, %u blocks\n",dimBlock.x,dimGrid.x);CHKERRQ(ierr);
724 
725   /* Initialize memory on the device */
726   set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
727   set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
728   set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);
729 
730   /* --- MAIN LOOP --- repeat test cases NTIMES times --- */
731   PetscLogDouble cpuTimer = 0.0;
732   cudaEvent_t start, stop;
733 
734   /* both timers report msec */
735   ierr = cudaEventCreate( &start );CHKERRQ(ierr); /* gpu timer facility */
736   ierr = cudaEventCreate( &stop );CHKERRQ(ierr);  /* gpu timer facility */
737 
738   scalar=3.0;
739   for (k = 0; k < NTIMES; ++k) {
740     PetscTimeSubtract(cpuTimer);
741     ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
742     STREAM_Copy_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
743     ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
744     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
745     //get the the total elapsed time in ms
746     if (bDontUseGPUTiming) {
747       PetscTimeAdd(cpuTimer);
748       times[0][k] = cpuTimer;
749     } else {
750       ierr = cudaEventElapsedTime( &times[0][k], start, stop );CHKERRQ(ierr);
751     }
752 
753     cpuTimer = 0.0;
754     PetscTimeSubtract(cpuTimer);
755     ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
756     STREAM_Copy_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
757     ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
758     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
759     //get the the total elapsed time in ms
760     if (bDontUseGPUTiming) {
761       PetscTimeAdd(cpuTimer);
762       times[1][k] = cpuTimer;
763     } else {
764       ierr = cudaEventElapsedTime( &times[1][k], start, stop );CHKERRQ(ierr);
765     }
766 
767     cpuTimer = 0.0;
768     PetscTimeSubtract(cpuTimer);
769     ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
770     STREAM_Scale_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar,  N);
771     ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
772     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
773     //get the the total elapsed time in ms
774     PetscTimeAdd(cpuTimer);
775     if (bDontUseGPUTiming) {
776       times[2][k] = cpuTimer;
777     } else {
778       ierr = cudaEventElapsedTime( &times[2][k], start, stop );CHKERRQ(ierr);
779     }
780 
781     cpuTimer = 0.0;
782     PetscTimeSubtract(cpuTimer);
783     ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
784     STREAM_Scale_Optimized_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar,  N);
785     ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
786     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
787     //get the the total elapsed time in ms
788     PetscTimeAdd(cpuTimer);
789     if (bDontUseGPUTiming) {
790       times[3][k] = cpuTimer;
791     } else {
792       ierr = cudaEventElapsedTime( &times[2][k], start, stop );CHKERRQ(ierr);
793     }
794 
795     cpuTimer = 0.0;
796     PetscTimeSubtract(cpuTimer);
797     ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
798     STREAM_Add_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c,  N);
799     ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
800     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
801     //get the the total elapsed time in ms
802     PetscTimeAdd(cpuTimer);
803     if (bDontUseGPUTiming) {
804       times[4][k] = cpuTimer;
805     } else {
806       ierr = cudaEventElapsedTime( &times[3][k], start, stop );CHKERRQ(ierr);
807     }
808 
809     cpuTimer = 0.0;
810     PetscTimeSubtract(cpuTimer);
811     ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
812     STREAM_Add_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c,  N);
813     ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
814     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
815     //get the the total elapsed time in ms
816     PetscTimeAdd(cpuTimer);
817     if (bDontUseGPUTiming) {
818       times[5][k] = cpuTimer;
819     } else {
820       ierr = cudaEventElapsedTime( &times[3][k], start, stop );CHKERRQ(ierr);
821     }
822 
823     cpuTimer = 0.0;
824     PetscTimeSubtract(cpuTimer);
825     ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
826     STREAM_Triad_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar,  N);
827     ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
828     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
829     //get the the total elapsed time in ms
830     PetscTimeAdd(cpuTimer);
831     if (bDontUseGPUTiming) {
832       times[6][k] = cpuTimer;
833     } else {
834       ierr = cudaEventElapsedTime( &times[4][k], start, stop );CHKERRQ(ierr);
835     }
836 
837     cpuTimer = 0.0;
838     PetscTimeSubtract(cpuTimer);
839     ierr = cudaEventRecord( start, 0 );CHKERRQ(ierr);
840     STREAM_Triad_Optimized_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar,  N);
841     ierr = cudaEventRecord( stop, 0 );CHKERRQ(ierr);
842     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
843     //get the the total elapsed time in ms
844     PetscTimeAdd(cpuTimer);
845     if (bDontUseGPUTiming) {
846       times[7][k] = cpuTimer;
847     } else {
848       ierr = cudaEventElapsedTime( &times[4][k], start, stop );CHKERRQ(ierr);
849     }
850 
851   }
852 
853   /* verify kernels */
854   double *h_a, *h_b, *h_c;
855   bool errorSTREAMkernel = true;
856 
857   if ( (h_a = (double*)calloc( N, sizeof(double) )) == (double*)NULL ) {
858     printf("Unable to allocate array h_a, exiting ...\n");
859     exit(1);
860   }
861   if ( (h_b = (double*)calloc( N, sizeof(double) )) == (double*)NULL ) {
862     printf("Unable to allocate array h_b, exiting ...\n");
863     exit(1);
864   }
865 
866   if ( (h_c = (double*)calloc( N, sizeof(double) )) == (double*)NULL ) {
867     printf("Unalbe to allocate array h_c, exiting ...\n");
868     exit(1);
869   }
870 
871   /*
872    * perform kernel, copy device memory into host memory and verify each
873    * device kernel output
874    */
875 
876   /* Initialize memory on the device */
877   set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
878   set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
879   set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);
880 
881   STREAM_Copy_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
882   ierr = cudaMemcpy( h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
883   ierr = cudaMemcpy( h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
884   errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N);
885   if (errorSTREAMkernel) {
886     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n");CHKERRQ(ierr);
887     exit(-2000);
888   } else {
889     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tPass\n");CHKERRQ(ierr);
890   }
891 
892   /* Initialize memory on the device */
893   set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
894   set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
895   set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);
896 
897   STREAM_Copy_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
898   ierr = cudaMemcpy( h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
899   ierr = cudaMemcpy( h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
900   errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N);
901   if (errorSTREAMkernel) {
902     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n");CHKERRQ(ierr);
903     exit(-3000);
904   } else {
905     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tPass\n");CHKERRQ(ierr);
906   }
907 
908   /* Initialize memory on the device */
909   set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
910   set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);
911 
912   STREAM_Scale_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
913   ierr = cudaMemcpy( h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
914   ierr = cudaMemcpy( h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
915   errorSTREAMkernel = STREAM_Scale_verify_double(h_b, h_c, scalar, N);
916   if (errorSTREAMkernel) {
917     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n");CHKERRQ(ierr);
918     exit(-4000);
919   } else {
920     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tPass\n");CHKERRQ(ierr);
921   }
922 
923   /* Initialize memory on the device */
924   set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
925   set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
926   set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);
927 
928   STREAM_Add_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
929   ierr = cudaMemcpy( h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
930   ierr = cudaMemcpy( h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
931   ierr = cudaMemcpy( h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
932   errorSTREAMkernel = STREAM_Add_verify_double(h_a, h_b, h_c, N);
933   if (errorSTREAMkernel) {
934     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n");CHKERRQ(ierr);
935     exit(-5000);
936   } else {
937     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tPass\n");CHKERRQ(ierr);
938   }
939 
940   /* Initialize memory on the device */
941   set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
942   set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
943   set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);
944 
945   STREAM_Triad_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
946   ierr = cudaMemcpy( h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
947   ierr = cudaMemcpy( h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
948   ierr = cudaMemcpy( h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
949   errorSTREAMkernel = STREAM_Triad_verify_double(h_b, h_c, h_a, scalar, N);
950   if (errorSTREAMkernel) {
951     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n");CHKERRQ(ierr);
952     exit(-6000);
953   } else {
954     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tPass\n");CHKERRQ(ierr);
955   }
956 
957   /* continue from here */
958   printResultsReadable(times);
959 
960   //clean up timers
961   ierr = cudaEventDestroy( stop );CHKERRQ(ierr);
962   ierr = cudaEventDestroy( start );CHKERRQ(ierr);
963 
964   /* Free memory on device */
965   ierr = cudaFree(d_a);CHKERRQ(ierr);
966   ierr = cudaFree(d_b);CHKERRQ(ierr);
967   ierr = cudaFree(d_c);CHKERRQ(ierr);
968   PetscFunctionReturn(0);
969 }
970 
971 ///////////////////////////////////////////////////////////////////////////
972 //Print Results to Screen and File
973 ///////////////////////////////////////////////////////////////////////////
/*
  printResultsReadable - Summarize STREAM timings and print one line per kernel.

  Input:
    times - times[j][k] is the elapsed time in milliseconds of kernel j
            (0..7: Copy, Copy Opt, Scale, Scale Opt, Add, Add Opt, Triad,
            Triad Opt) on trial k (0..NTIMES-1).

  Prints, for each kernel: sustained rate in MB/s (based on the minimum time),
  plus average/min/max times in seconds. The first trial (k = 0) is excluded
  from the statistics, per STREAM convention, to skip warm-up effects.
*/
PetscErrorCode printResultsReadable(float times[][NTIMES]) {
  PetscErrorCode ierr;
  PetscInt       j, k;
  float          avgtime[8] = {0., 0., 0., 0., 0., 0., 0., 0.};
  float          maxtime[8] = {0., 0., 0., 0., 0., 0., 0., 0.};
  /* use float literals so no double->float truncation warning is emitted */
  float          mintime[8] = {1e30f,1e30f,1e30f,1e30f,1e30f,1e30f,1e30f,1e30f};
  /* string literals are immutable: the pointers must be const-qualified
     (char* from a literal is ill-formed in modern C++ / nvcc) */
  const char    *label[8]   = {"Copy:      ", "Copy Opt.: ", "Scale:     ", "Scale Opt: ", "Add:       ", "Add Opt:   ", "Triad:     ", "Triad Opt: "};
  /* bytes moved per trial: copy/scale touch 2 arrays, add/triad touch 3.
     NOTE(review): these assume single-precision kernels; this routine also
     appears to be reached from the double-precision path — confirm whether
     sizeof(double) is intended there. */
  float          bytes_per_kernel[8] = {
    2. * sizeof(float) * N,
    2. * sizeof(float) * N,
    2. * sizeof(float) * N,
    2. * sizeof(float) * N,
    3. * sizeof(float) * N,
    3. * sizeof(float) * N,
    3. * sizeof(float) * N,
    3. * sizeof(float) * N
  };

  PetscFunctionBegin;
  /* --- SUMMARY --- */
  for (k = 1; k < NTIMES; ++k) { /* note -- skip first iteration (warm-up) */
    for (j = 0; j < 8; ++j) {
      /* convert ms -> s while accumulating */
      avgtime[j] = avgtime[j] + (1.e-03f * times[j][k]);
      mintime[j] = MIN(mintime[j], (1.e-03f * times[j][k]));
      maxtime[j] = MAX(maxtime[j], (1.e-03f * times[j][k]));
    }
  }

  ierr = PetscPrintf(PETSC_COMM_SELF, "Function    Rate (MB/s)    Avg time      Min time      Max time\n");CHKERRQ(ierr);

  for (j = 0; j < 8; ++j) {
     avgtime[j] = avgtime[j]/(float)(NTIMES-1);
     /* rate is computed from the best (minimum) time, per STREAM convention */
     ierr = PetscPrintf(PETSC_COMM_SELF, "%s%11.4f  %11.6f  %12.6f  %12.6f\n", label[j], 1.0E-06 * bytes_per_kernel[j]/mintime[j], avgtime[j], mintime[j], maxtime[j]);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1010