xref: /petsc/src/benchmarks/streams/CUDAVersion.cu (revision 5c8f6a953e7ed1c81f507d64aebddb11080b60e9)
1 /*
2   STREAM benchmark implementation in CUDA.
3 
4     COPY:       a(i) = b(i)
5     SCALE:      a(i) = q*b(i)
6     SUM:        a(i) = b(i) + c(i)
7     TRIAD:      a(i) = b(i) + q*c(i)
8 
9   It measures the memory system on the device.
10   The implementation is in single precision.
11 
12   Code based on the code developed by John D. McCalpin
13   http://www.cs.virginia.edu/stream/FTP/Code/stream.c
14 
15   Written by: Massimiliano Fatica, NVIDIA Corporation
16   Modified by: Douglas Enright (dpephd-nvidia@yahoo.com), 1 December 2010
17   Extensive Revisions, 4 December 2010
18   Modified for PETSc by: Matthew G. Knepley 14 Aug 2011
19 
20   User interface motivated by bandwidthTest NVIDIA SDK example.
21 */
/* Usage text shown by PetscInitialize for -help.  Stored as a const char
   array rather than a mutable char* bound to a string literal: writing
   through such a pointer is undefined behavior and the literal is const
   in C++, so the old declaration relied on a deprecated conversion. */
static const char help[] = "Single-Precision STREAM Benchmark implementation in CUDA\n"
                           "Performs Copy, Scale, Add, and Triad single-precision kernels\n\n";
24 
25 #include <petscconf.h>
26 #include <petscsys.h>
27 #include <petsctime.h>
28 
/* Benchmark sizing: vector lengths and number of timed repetitions */
#define N        2000000  /* elements per vector in the timed kernels */
#define N_DOUBLE 8000000
#define NTIMES   10       /* each kernel is launched and timed NTIMES */

# ifndef MIN
# define MIN(x,y) ((x)<(y)?(x):(y))
# endif
# ifndef MAX
# define MAX(x,y) ((x)>(y)?(x):(y))
# endif

/* Machine epsilons used by the host-side verification routines to express
   element-wise relative error in "ulps" (values match FLT_EPSILON and
   DBL_EPSILON respectively) */
const float flt_eps  = 1.192092896e-07f;
const double dbl_eps = 2.2204460492503131e-16;
42 
/*
 * Fill a[0..len) with 'value'.  Grid-stride loop: correct for any
 * grid/block configuration, including grids smaller than len.
 */
__global__ void set_array(float *a,  float value, size_t len)
{
  /* compute index/stride in size_t so the thread-count product cannot
     overflow 32-bit unsigned arithmetic on very large launches */
  size_t idx    = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  size_t stride = (size_t)gridDim.x * blockDim.x;
  for (; idx < len; idx += stride) a[idx] = value;
}
51 
/*
 * Fill a[0..len) with 'value' (double-precision variant).
 * Grid-stride loop: correct for any grid/block configuration.
 */
__global__ void set_array_double(double *a,  double value, size_t len)
{
  /* size_t arithmetic avoids 32-bit overflow of the thread-count product */
  size_t idx    = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  size_t stride = (size_t)gridDim.x * blockDim.x;
  for (; idx < len; idx += stride) a[idx] = value;
}
60 
/*
 * STREAM Copy: b[i] = a[i] for i in [0, len).
 * Grid-stride loop: correct for any grid/block configuration.
 */
__global__ void STREAM_Copy(float *a, float *b, size_t len)
{
  /* size_t arithmetic avoids 32-bit overflow of the thread-count product */
  size_t idx    = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  size_t stride = (size_t)gridDim.x * blockDim.x;
  for (; idx < len; idx += stride) b[idx] = a[idx];
}
69 
/*
 * STREAM Copy, double precision: b[i] = a[i] for i in [0, len).
 * Grid-stride loop: correct for any grid/block configuration.
 */
__global__ void STREAM_Copy_double(double *a, double *b, size_t len)
{
  /* size_t arithmetic avoids 32-bit overflow of the thread-count product */
  size_t idx    = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  size_t stride = (size_t)gridDim.x * blockDim.x;
  for (; idx < len; idx += stride) b[idx] = a[idx];
}
78 
/*
 * STREAM Copy, one-element-per-thread variant: b[i] = a[i].
 * If the launch supplies fewer threads than elements, do nothing (so a
 * partial — and misleadingly fast — run is never timed).
 */
__global__ void STREAM_Copy_Optimized(float *a, float *b, size_t len)
{
  /* guard computed in size_t: the 32-bit product blockDim.x*gridDim.x
     could overflow for very large grids and defeat the check */
  if ((size_t)gridDim.x * blockDim.x < len) return;
  size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < len) b[idx] = a[idx];
}
89 
/*
 * STREAM Copy, double precision, one-element-per-thread variant.
 * Requires the thread index space to cover [0, len); otherwise a no-op.
 */
__global__ void STREAM_Copy_Optimized_double(double *a, double *b, size_t len)
{
  /* size_t guard avoids 32-bit overflow of the thread-count product */
  if ((size_t)gridDim.x * blockDim.x < len) return;
  size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < len) b[idx] = a[idx];
}
100 
/*
 * STREAM Scale: b[i] = scale * a[i] for i in [0, len).
 * Grid-stride loop: correct for any grid/block configuration.
 */
__global__ void STREAM_Scale(float *a, float *b, float scale,  size_t len)
{
  /* size_t arithmetic avoids 32-bit overflow of the thread-count product */
  size_t idx    = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  size_t stride = (size_t)gridDim.x * blockDim.x;
  for (; idx < len; idx += stride) b[idx] = scale * a[idx];
}
109 
/*
 * STREAM Scale, double precision: b[i] = scale * a[i] for i in [0, len).
 * Grid-stride loop: correct for any grid/block configuration.
 */
__global__ void STREAM_Scale_double(double *a, double *b, double scale,  size_t len)
{
  /* size_t arithmetic avoids 32-bit overflow of the thread-count product */
  size_t idx    = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  size_t stride = (size_t)gridDim.x * blockDim.x;
  for (; idx < len; idx += stride) b[idx] = scale * a[idx];
}
118 
/*
 * STREAM Scale, one-element-per-thread variant: b[i] = scale * a[i].
 * Requires the thread index space to cover [0, len); otherwise a no-op.
 */
__global__ void STREAM_Scale_Optimized(float *a, float *b, float scale,  size_t len)
{
  /* size_t guard avoids 32-bit overflow of the thread-count product */
  if ((size_t)gridDim.x * blockDim.x < len) return;
  size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < len) b[idx] = scale * a[idx];
}
129 
/*
 * STREAM Scale, double precision, one-element-per-thread variant.
 * Requires the thread index space to cover [0, len); otherwise a no-op.
 */
__global__ void STREAM_Scale_Optimized_double(double *a, double *b, double scale,  size_t len)
{
  /* size_t guard avoids 32-bit overflow of the thread-count product */
  if ((size_t)gridDim.x * blockDim.x < len) return;
  size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < len) b[idx] = scale * a[idx];
}
140 
/*
 * STREAM Add: c[i] = a[i] + b[i] for i in [0, len).
 * Grid-stride loop: correct for any grid/block configuration.
 */
__global__ void STREAM_Add(float *a, float *b, float *c,  size_t len)
{
  /* size_t arithmetic avoids 32-bit overflow of the thread-count product */
  size_t idx    = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  size_t stride = (size_t)gridDim.x * blockDim.x;
  for (; idx < len; idx += stride) c[idx] = a[idx] + b[idx];
}
149 
/*
 * STREAM Add, double precision: c[i] = a[i] + b[i] for i in [0, len).
 * Grid-stride loop: correct for any grid/block configuration.
 */
__global__ void STREAM_Add_double(double *a, double *b, double *c,  size_t len)
{
  /* size_t arithmetic avoids 32-bit overflow of the thread-count product */
  size_t idx    = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  size_t stride = (size_t)gridDim.x * blockDim.x;
  for (; idx < len; idx += stride) c[idx] = a[idx] + b[idx];
}
158 
/*
 * STREAM Add, one-element-per-thread variant: c[i] = a[i] + b[i].
 * Requires the thread index space to cover [0, len); otherwise a no-op.
 */
__global__ void STREAM_Add_Optimized(float *a, float *b, float *c,  size_t len)
{
  /* size_t guard avoids 32-bit overflow of the thread-count product */
  if ((size_t)gridDim.x * blockDim.x < len) return;
  size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < len) c[idx] = a[idx] + b[idx];
}
169 
/*
 * STREAM Add, double precision, one-element-per-thread variant.
 * Requires the thread index space to cover [0, len); otherwise a no-op.
 */
__global__ void STREAM_Add_Optimized_double(double *a, double *b, double *c,  size_t len)
{
  /* size_t guard avoids 32-bit overflow of the thread-count product */
  if ((size_t)gridDim.x * blockDim.x < len) return;
  size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < len) c[idx] = a[idx] + b[idx];
}
180 
/*
 * STREAM Triad: c[i] = a[i] + scalar * b[i] for i in [0, len).
 * Grid-stride loop: correct for any grid/block configuration.
 */
__global__ void STREAM_Triad(float *a, float *b, float *c, float scalar, size_t len)
{
  /* size_t arithmetic avoids 32-bit overflow of the thread-count product */
  size_t idx    = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  size_t stride = (size_t)gridDim.x * blockDim.x;
  for (; idx < len; idx += stride) c[idx] = a[idx] + scalar * b[idx];
}
189 
/*
 * STREAM Triad, double precision: c[i] = a[i] + scalar * b[i].
 * Grid-stride loop: correct for any grid/block configuration.
 */
__global__ void STREAM_Triad_double(double *a, double *b, double *c, double scalar, size_t len)
{
  /* size_t arithmetic avoids 32-bit overflow of the thread-count product */
  size_t idx    = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  size_t stride = (size_t)gridDim.x * blockDim.x;
  for (; idx < len; idx += stride) c[idx] = a[idx] + scalar * b[idx];
}
198 
/*
 * STREAM Triad, one-element-per-thread variant: c[i] = a[i] + scalar*b[i].
 * Requires the thread index space to cover [0, len); otherwise a no-op.
 */
__global__ void STREAM_Triad_Optimized(float *a, float *b, float *c, float scalar, size_t len)
{
  /* size_t guard avoids 32-bit overflow of the thread-count product */
  if ((size_t)gridDim.x * blockDim.x < len) return;
  size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < len) c[idx] = a[idx] + scalar * b[idx];
}
209 
/*
 * STREAM Triad, double precision, one-element-per-thread variant.
 * Requires the thread index space to cover [0, len); otherwise a no-op.
 */
__global__ void STREAM_Triad_Optimized_double(double *a, double *b, double *c, double scalar, size_t len)
{
  /* size_t guard avoids 32-bit overflow of the thread-count product */
  if ((size_t)gridDim.x * blockDim.x < len) return;
  size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < len) c[idx] = a[idx] + scalar * b[idx];
}
220 
221 /* Host side verification routines */
/*
 * Host-side check of the device Copy kernel output.
 * Returns true as soon as an element of b differs from a by more than
 * 2 ulps of relative error; false when every element agrees.
 */
bool STREAM_Copy_verify(float *a, float *b, size_t len)
{
  for (size_t i = 0; i < len; ++i) {
    const float expected = a[i];
    const float diff     = b[i] - expected;
    /* element-wise relative error in units of float epsilon */
    const float ulps     = (fabsf(diff)/fabsf(expected))/flt_eps;
    if (ulps > 2.f) return true;
  }
  return false;
}
237 
/*
 * Host-side check of the device double-precision Copy kernel output.
 * Returns true when some element exceeds a 2-ulp relative error.
 */
bool STREAM_Copy_verify_double(double *a, double *b, size_t len)
{
  size_t idx;
  bool bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult = a[idx];
    double diffResultExpected = (b[idx] - expectedResult);
    /* bug fix: previously used single-precision fabsf() and flt_eps on
       double data, making the tolerance ~1e9 times looser than intended */
    double relErrorULPS = (fabs(diffResultExpected)/fabs(expectedResult))/dbl_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.);
  }

  return bDifferent;
}
253 
/*
 * Host-side check of the device Scale kernel output.
 * Returns true as soon as an element of b differs from scale*a by more
 * than 2 ulps of relative error; false when every element agrees.
 */
bool STREAM_Scale_verify(float *a, float *b, float scale, size_t len)
{
  for (size_t i = 0; i < len; ++i) {
    const float expected = scale*a[i];
    const float diff     = b[i] - expected;
    /* element-wise relative error in units of float epsilon */
    const float ulps     = (fabsf(diff)/fabsf(expected))/flt_eps;
    if (ulps > 2.f) return true;
  }
  return false;
}
269 
/*
 * Host-side check of the device double-precision Scale kernel output.
 * Returns true when some element exceeds a 2-ulp relative error.
 */
bool STREAM_Scale_verify_double(double *a, double *b, double scale, size_t len)
{
  size_t idx;
  bool bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult = scale*a[idx];
    double diffResultExpected = (b[idx] - expectedResult);
    /* bug fix: previously used single-precision fabsf() and flt_eps on
       double data, making the tolerance ~1e9 times looser than intended */
    double relErrorULPS = (fabs(diffResultExpected)/fabs(expectedResult))/dbl_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.);
  }

  return bDifferent;
}
285 
/*
 * Host-side check of the device Add kernel output.
 * Returns true as soon as an element of c differs from a+b by more than
 * 2 ulps of relative error; false when every element agrees.
 */
bool STREAM_Add_verify(float *a, float *b, float *c, size_t len)
{
  for (size_t i = 0; i < len; ++i) {
    const float expected = a[i] + b[i];
    const float diff     = c[i] - expected;
    /* element-wise relative error in units of float epsilon */
    const float ulps     = (fabsf(diff)/fabsf(expected))/flt_eps;
    if (ulps > 2.f) return true;
  }
  return false;
}
301 
/*
 * Host-side check of the device double-precision Add kernel output.
 * Returns true when some element exceeds a 2-ulp relative error.
 */
bool STREAM_Add_verify_double(double *a, double *b, double *c, size_t len)
{
  size_t idx;
  bool bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult = a[idx] + b[idx];
    double diffResultExpected = (c[idx] - expectedResult);
    /* bug fix: previously used single-precision fabsf() and flt_eps on
       double data, making the tolerance ~1e9 times looser than intended */
    double relErrorULPS = (fabs(diffResultExpected)/fabs(expectedResult))/dbl_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.);
  }

  return bDifferent;
}
317 
/*
 * Host-side check of the device Triad kernel output.
 * Returns true as soon as an element of c differs from a + scalar*b by
 * more than 3 ulps of relative error (Triad performs two rounding
 * operations, hence the looser bound); false when all elements agree.
 */
bool STREAM_Triad_verify(float *a, float *b, float *c, float scalar, size_t len)
{
  for (size_t i = 0; i < len; ++i) {
    const float expected = a[i] + scalar*b[i];
    const float diff     = c[i] - expected;
    /* element-wise relative error in units of float epsilon */
    const float ulps     = (fabsf(diff)/fabsf(expected))/flt_eps;
    if (ulps > 3.f) return true;
  }
  return false;
}
333 
/*
 * Host-side check of the device double-precision Triad kernel output.
 * Returns true when some element exceeds a 3-ulp relative error.
 */
bool STREAM_Triad_verify_double(double *a, double *b, double *c, double scalar, size_t len)
{
  size_t idx;
  bool bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult = a[idx] + scalar*b[idx];
    double diffResultExpected = (c[idx] - expectedResult);
    /* bug fix: previously used single-precision fabsf() and flt_eps on
       double data, making the tolerance ~1e9 times looser than intended */
    double relErrorULPS = (fabs(diffResultExpected)/fabs(expectedResult))/dbl_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 3.);
  }

  return bDifferent;
}
349 
350 /* forward declarations */
351 PetscErrorCode setupStream(PetscInt device, PetscBool runDouble, PetscBool cpuTiming);
352 PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming);
353 PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming);
354 PetscErrorCode printResultsReadable(float times[][NTIMES]);
355 
int main(int argc, char *argv[])
{
  PetscInt       device    = 0;           /* CUDA device index to benchmark */
  PetscBool      runDouble = PETSC_FALSE; /* also run double-precision tests? */
  PetscBool      cpuTiming = PETSC_FALSE; /* use host timers instead of CUDA events */
  PetscErrorCode ierr;

  ierr = PetscInitialize(&argc, &argv, 0, help);CHKERRQ(ierr);
  ierr = PetscPrintf(PETSC_COMM_SELF, "[Single and Double-Precision Device-Only STREAM Benchmark implementation in CUDA]\n");CHKERRQ(ierr);
  ierr = PetscPrintf(PETSC_COMM_SELF, "%s Starting...\n\n", argv[0]);CHKERRQ(ierr);

  /* Command-line options: -device, -double, -cputiming */
  ierr = PetscOptionsBegin(PETSC_COMM_WORLD, "", "STREAM Benchmark Options", "STREAM");CHKERRQ(ierr);
    ierr = PetscOptionsInt("-device", "Specify the CUDA device to be used", "STREAM", device, &device, PETSC_NULL);CHKERRQ(ierr);
    ierr = PetscOptionsBool("-double",    "Also run double precision tests",   "STREAM", runDouble, &runDouble, PETSC_NULL);CHKERRQ(ierr);
    ierr = PetscOptionsBool("-cputiming", "Force CPU-based timing to be used", "STREAM", cpuTiming, &cpuTiming, PETSC_NULL);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();

  /* NOTE: setupStream's return value is used here as a status code, not a
     PETSc error: negative means the benchmark could not run at all (e.g. no
     device, see setupStream), 0 means it ran and all kernels verified.  The
     PASS/FAIL line is therefore only printed when it actually ran. */
  ierr = setupStream(device, runDouble, cpuTiming);
  if (ierr >= 0) {
    PetscErrorCode ierr2 = PetscPrintf(PETSC_COMM_SELF, "\n[streamBenchmark] - results:\t%s\n\n", (ierr == 0) ? "PASSES" : "FAILED");CHKERRQ(ierr2);
  }
  PetscFinalize();
  return 0;
}
380 
381 ///////////////////////////////////////////////////////////////////////////////
382 //Run the appropriate tests
383 ///////////////////////////////////////////////////////////////////////////////
/*
 * Select and validate the CUDA device, pick a block size for the detected
 * architecture, and run the single- (and optionally double-) precision
 * benchmarks.  Returns a negative status when no usable device exists
 * (the caller treats negative as "benchmark did not run").
 */
PetscErrorCode setupStream(PetscInt deviceNum, PetscBool runDouble, PetscBool cpuTiming)
{
  PetscInt       iNumThreadsPerBlock = 128;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Check device availability; fall back to device 0 on an invalid index */
  {
    int deviceCount = 0;

    /* robustness fix: also treat a failing cudaGetDeviceCount as "no devices" */
    if (cudaGetDeviceCount(&deviceCount) != cudaSuccess || deviceCount == 0) {
      ierr = PetscPrintf(PETSC_COMM_SELF, "!!!!!No devices found!!!!!\n");CHKERRQ(ierr);
      return -1000;
    }

    if (deviceNum >= deviceCount || deviceNum < 0) {
      ierr = PetscPrintf(PETSC_COMM_SELF, "\n!!!!!Invalid GPU number %d given hence default gpu %d will be used !!!!!\n", deviceNum, 0);CHKERRQ(ierr);
      deviceNum = 0;
    }
  }

  cudaSetDevice(deviceNum);
  ierr = PetscPrintf(PETSC_COMM_SELF, "Running on...\n\n");CHKERRQ(ierr);
  cudaDeviceProp deviceProp;
  if (cudaGetDeviceProperties(&deviceProp, deviceNum) == cudaSuccess) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " Device %d: %s\n", deviceNum, deviceProp.name);CHKERRQ(ierr);
  } else {
    /* bug fix: the format string has a %d but no argument was supplied,
       so the old call read garbage from the varargs list */
    ierr = PetscPrintf(PETSC_COMM_SELF, " Unable to determine device %d properties, exiting\n", deviceNum);CHKERRQ(ierr);
    return -1;
  }

  /* Double precision requires compute capability 1.3 or higher */
  if (runDouble && deviceProp.major == 1 && deviceProp.minor < 3) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " Unable to run double-precision STREAM benchmark on a compute capability GPU less than 1.3\n");CHKERRQ(ierr);
    return -1;
  }
  /* Threads per block chosen to match CUDA cores per multiprocessor */
  if (deviceProp.major == 2 && deviceProp.minor == 1) {
    iNumThreadsPerBlock = 192; /* GF104 architecture / 48 CUDA Cores per MP */
  } else {
    iNumThreadsPerBlock = 128; /* GF100 architecture / 32 CUDA Cores per MP */
  }

  if (cpuTiming) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " Using cpu-only timer.\n");CHKERRQ(ierr);
  }

  ierr = runStream(iNumThreadsPerBlock, cpuTiming);CHKERRQ(ierr);
  if (runDouble) {
    ierr = cudaSetDeviceFlags(cudaDeviceBlockingSync);CHKERRQ(ierr);
    ierr = runStreamDouble(iNumThreadsPerBlock, cpuTiming);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
437 
438 ///////////////////////////////////////////////////////////////////////////
439 // runStream
440 ///////////////////////////////////////////////////////////////////////////
/*
 * Run the single-precision STREAM kernels NTIMES each, recording per-run
 * times in ms (rows of 'times': 0 Copy, 1 Copy_Optimized, 2 Scale,
 * 3 Scale_Optimized, 4 Add, 5 Add_Optimized, 6 Triad, 7 Triad_Optimized),
 * then verify each kernel against a host computation and print results.
 * Timing uses CUDA events unless bDontUseGPUTiming requests host timers.
 */
PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming)
{
  float *d_a, *d_b, *d_c;
  int k;
  float times[8][NTIMES];
  float scalar;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Allocate the three device work arrays */
  ierr = cudaMalloc((void**)&d_a, sizeof(float)*N);CHKERRQ(ierr);
  ierr = cudaMalloc((void**)&d_b, sizeof(float)*N);CHKERRQ(ierr);
  ierr = cudaMalloc((void**)&d_c, sizeof(float)*N);CHKERRQ(ierr);

  /* Compute execution configuration: a 1-D grid covering all N elements */
  dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */
  dim3 dimGrid(N/dimBlock.x); /* (N/dimBlock.x,1,1) */
  if (N % dimBlock.x != 0) dimGrid.x+=1; /* round up for the partial tail block */

  ierr = PetscPrintf(PETSC_COMM_SELF, " Array size (single precision) = %u\n",N);CHKERRQ(ierr);
  ierr = PetscPrintf(PETSC_COMM_SELF, " using %u threads per block, %u blocks\n",dimBlock.x,dimGrid.x);CHKERRQ(ierr);

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  /* --- MAIN LOOP --- repeat test cases NTIMES times --- */
  PetscLogDouble cpuTimer = 0.0;
  cudaEvent_t start, stop;

  /* both timers report msec */
  ierr = cudaEventCreate(&start);CHKERRQ(ierr); /* gpu timer facility */
  ierr = cudaEventCreate(&stop);CHKERRQ(ierr);  /* gpu timer facility */

  scalar=3.0f;
  for (k = 0; k < NTIMES; ++k) {
    /* row 0: Copy.
       bug fix: cpuTimer is now zeroed before EVERY segment; previously the
       first segment of iterations k>0 accumulated the stale elapsed time
       of the last segment of iteration k-1 when -cputiming was used */
    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
    STREAM_Copy<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    PetscTimeAdd(cpuTimer);
    if (bDontUseGPUTiming) {
      times[0][k] = cpuTimer;
    } else {
      ierr = cudaEventElapsedTime(&times[0][k], start, stop);CHKERRQ(ierr);
    }

    /* row 1: Copy, one-element-per-thread variant */
    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
    STREAM_Copy_Optimized<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    PetscTimeAdd(cpuTimer);
    if (bDontUseGPUTiming) {
      times[1][k] = cpuTimer;
    } else {
      ierr = cudaEventElapsedTime(&times[1][k], start, stop);CHKERRQ(ierr);
    }

    /* row 2: Scale */
    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
    STREAM_Scale<<<dimGrid,dimBlock>>>(d_b, d_c, scalar,  N);
    ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    PetscTimeAdd(cpuTimer);
    if (bDontUseGPUTiming) {
      times[2][k] = cpuTimer;
    } else {
      ierr = cudaEventElapsedTime(&times[2][k], start, stop);CHKERRQ(ierr);
    }

    /* row 3: Scale, one-element-per-thread variant */
    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
    STREAM_Scale_Optimized<<<dimGrid,dimBlock>>>(d_b, d_c, scalar,  N);
    ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    PetscTimeAdd(cpuTimer);
    if (bDontUseGPUTiming) {
      times[3][k] = cpuTimer;
    } else {
      ierr = cudaEventElapsedTime(&times[3][k], start, stop);CHKERRQ(ierr);
    }

    /* row 4: Add */
    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
    STREAM_Add<<<dimGrid,dimBlock>>>(d_a, d_b, d_c,  N);
    ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    PetscTimeAdd(cpuTimer);
    if (bDontUseGPUTiming) {
      times[4][k] = cpuTimer;
    } else {
      ierr = cudaEventElapsedTime(&times[4][k], start, stop);CHKERRQ(ierr);
    }

    /* row 5: Add, one-element-per-thread variant */
    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
    STREAM_Add_Optimized<<<dimGrid,dimBlock>>>(d_a, d_b, d_c,  N);
    ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    PetscTimeAdd(cpuTimer);
    if (bDontUseGPUTiming) {
      times[5][k] = cpuTimer;
    } else {
      ierr = cudaEventElapsedTime(&times[5][k], start, stop);CHKERRQ(ierr);
    }

    /* row 6: Triad */
    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
    STREAM_Triad<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar,  N);
    ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    PetscTimeAdd(cpuTimer);
    if (bDontUseGPUTiming) {
      times[6][k] = cpuTimer;
    } else {
      ierr = cudaEventElapsedTime(&times[6][k], start, stop);CHKERRQ(ierr);
    }

    /* row 7: Triad, one-element-per-thread variant */
    cpuTimer = 0.0;
    PetscTimeSubtract(cpuTimer);
    ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
    STREAM_Triad_Optimized<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar,  N);
    ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    PetscTimeAdd(cpuTimer);
    if (bDontUseGPUTiming) {
      times[7][k] = cpuTimer;
    } else {
      ierr = cudaEventElapsedTime(&times[7][k], start, stop);CHKERRQ(ierr);
    }
  }

  /* verify kernels against host-computed references */
  float *h_a, *h_b, *h_c;
  bool errorSTREAMkernel = true;

  if ((h_a = (float*)calloc(N, sizeof(float))) == (float*)NULL) {
    printf("Unable to allocate array h_a, exiting ...\n");
    exit(1);
  }
  if ((h_b = (float*)calloc(N, sizeof(float))) == (float*)NULL) {
    printf("Unable to allocate array h_b, exiting ...\n");
    exit(1);
  }

  if ((h_c = (float*)calloc(N, sizeof(float))) == (float*)NULL) {
    /* bug fix: message previously read "Unalbe" */
    printf("Unable to allocate array h_c, exiting ...\n");
    exit(1);
  }

  /*
   * perform kernel, copy device memory into host memory and verify each
   * device kernel output
   */

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  STREAM_Copy<<<dimGrid,dimBlock>>>(d_a, d_c, N);
  ierr = cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n");CHKERRQ(ierr);
    exit(-2000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tPass\n");CHKERRQ(ierr);
  }

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  STREAM_Copy_Optimized<<<dimGrid,dimBlock>>>(d_a, d_c, N);
  ierr = cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n");CHKERRQ(ierr);
    exit(-3000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tPass\n");CHKERRQ(ierr);
  }

  /* Initialize memory on the device (Scale reads b, writes c) */
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  STREAM_Scale<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
  ierr = cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Scale_verify(h_b, h_c, scalar, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n");CHKERRQ(ierr);
    exit(-4000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tPass\n");CHKERRQ(ierr);
  }

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  STREAM_Add<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
  ierr = cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Add_verify(h_a, h_b, h_c, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n");CHKERRQ(ierr);
    exit(-5000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tPass\n");CHKERRQ(ierr);
  }

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  STREAM_Triad<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
  ierr = cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Triad_verify(h_b, h_c, h_a, scalar, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n");CHKERRQ(ierr);
    exit(-6000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tPass\n");CHKERRQ(ierr);
  }

  /* report timing results */
  printResultsReadable(times);

  /* clean up timers */
  ierr = cudaEventDestroy(stop);CHKERRQ(ierr);
  ierr = cudaEventDestroy(start);CHKERRQ(ierr);

  /* bug fix: release the host verification buffers (previously leaked) */
  free(h_a);
  free(h_b);
  free(h_c);

  /* Free memory on device */
  ierr = cudaFree(d_a);CHKERRQ(ierr);
  ierr = cudaFree(d_b);CHKERRQ(ierr);
  ierr = cudaFree(d_c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
709 
710 PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming)
711 {
712   double *d_a, *d_b, *d_c;
713   int k;
714   float times[8][NTIMES];
715   double scalar;
716   PetscErrorCode ierr;
717 
718   PetscFunctionBegin;
719   /* Allocate memory on device */
720   ierr = cudaMalloc((void**)&d_a, sizeof(double)*N);CHKERRQ(ierr);
721   ierr = cudaMalloc((void**)&d_b, sizeof(double)*N);CHKERRQ(ierr);
722   ierr = cudaMalloc((void**)&d_c, sizeof(double)*N);CHKERRQ(ierr);
723 
724   /* Compute execution configuration */
725 
726   dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */
727   dim3 dimGrid(N/dimBlock.x); /* (N/dimBlock.x,1,1) */
728   if (N % dimBlock.x != 0) dimGrid.x+=1;
729 
730   ierr = PetscPrintf(PETSC_COMM_SELF, " Array size (double precision) = %u\n",N);CHKERRQ(ierr);
731   ierr = PetscPrintf(PETSC_COMM_SELF, " using %u threads per block, %u blocks\n",dimBlock.x,dimGrid.x);CHKERRQ(ierr);
732 
733   /* Initialize memory on the device */
734   set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
735   set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
736   set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);
737 
738   /* --- MAIN LOOP --- repeat test cases NTIMES times --- */
739   PetscLogDouble cpuTimer = 0.0;
740   cudaEvent_t start, stop;
741 
742   /* both timers report msec */
743   ierr = cudaEventCreate(&start);CHKERRQ(ierr); /* gpu timer facility */
744   ierr = cudaEventCreate(&stop);CHKERRQ(ierr);  /* gpu timer facility */
745 
746   scalar=3.0;
747   for (k = 0; k < NTIMES; ++k) {
748     PetscTimeSubtract(cpuTimer);
749     ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
750     STREAM_Copy_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
751     ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
752     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
753     //get the the total elapsed time in ms
754     if (bDontUseGPUTiming) {
755       PetscTimeAdd(cpuTimer);
756       times[0][k] = cpuTimer;
757     } else {
758       ierr = cudaEventElapsedTime(&times[0][k], start, stop);CHKERRQ(ierr);
759     }
760 
761     cpuTimer = 0.0;
762     PetscTimeSubtract(cpuTimer);
763     ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
764     STREAM_Copy_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
765     ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
766     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
767     //get the the total elapsed time in ms
768     if (bDontUseGPUTiming) {
769       PetscTimeAdd(cpuTimer);
770       times[1][k] = cpuTimer;
771     } else {
772       ierr = cudaEventElapsedTime(&times[1][k], start, stop);CHKERRQ(ierr);
773     }
774 
775     cpuTimer = 0.0;
776     PetscTimeSubtract(cpuTimer);
777     ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
778     STREAM_Scale_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar,  N);
779     ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
780     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
781     //get the the total elapsed time in ms
782     PetscTimeAdd(cpuTimer);
783     if (bDontUseGPUTiming) {
784       times[2][k] = cpuTimer;
785     } else {
786       ierr = cudaEventElapsedTime(&times[2][k], start, stop);CHKERRQ(ierr);
787     }
788 
789     cpuTimer = 0.0;
790     PetscTimeSubtract(cpuTimer);
791     ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
792     STREAM_Scale_Optimized_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar,  N);
793     ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
794     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
795     //get the the total elapsed time in ms
796     PetscTimeAdd(cpuTimer);
797     if (bDontUseGPUTiming) {
798       times[3][k] = cpuTimer;
799     } else {
800       ierr = cudaEventElapsedTime(&times[2][k], start, stop);CHKERRQ(ierr);
801     }
802 
803     cpuTimer = 0.0;
804     PetscTimeSubtract(cpuTimer);
805     ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
806     STREAM_Add_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c,  N);
807     ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
808     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
809     //get the the total elapsed time in ms
810     PetscTimeAdd(cpuTimer);
811     if (bDontUseGPUTiming) {
812       times[4][k] = cpuTimer;
813     } else {
814       ierr = cudaEventElapsedTime(&times[3][k], start, stop);CHKERRQ(ierr);
815     }
816 
817     cpuTimer = 0.0;
818     PetscTimeSubtract(cpuTimer);
819     ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
820     STREAM_Add_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c,  N);
821     ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
822     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
823     //get the the total elapsed time in ms
824     PetscTimeAdd(cpuTimer);
825     if (bDontUseGPUTiming) {
826       times[5][k] = cpuTimer;
827     } else {
828       ierr = cudaEventElapsedTime(&times[3][k], start, stop);CHKERRQ(ierr);
829     }
830 
831     cpuTimer = 0.0;
832     PetscTimeSubtract(cpuTimer);
833     ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
834     STREAM_Triad_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar,  N);
835     ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
836     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
837     //get the the total elapsed time in ms
838     PetscTimeAdd(cpuTimer);
839     if (bDontUseGPUTiming) {
840       times[6][k] = cpuTimer;
841     } else {
842       ierr = cudaEventElapsedTime(&times[4][k], start, stop);CHKERRQ(ierr);
843     }
844 
845     cpuTimer = 0.0;
846     PetscTimeSubtract(cpuTimer);
847     ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
848     STREAM_Triad_Optimized_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar,  N);
849     ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
850     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
851     //get the the total elapsed time in ms
852     PetscTimeAdd(cpuTimer);
853     if (bDontUseGPUTiming) {
854       times[7][k] = cpuTimer;
855     } else {
856       ierr = cudaEventElapsedTime(&times[4][k], start, stop);CHKERRQ(ierr);
857     }
858 
859   }
860 
861   /* verify kernels */
862   double *h_a, *h_b, *h_c;
863   bool errorSTREAMkernel = true;
864 
865   if ((h_a = (double*)calloc(N, sizeof(double))) == (double*)NULL) {
866     printf("Unable to allocate array h_a, exiting ...\n");
867     exit(1);
868   }
869   if ((h_b = (double*)calloc(N, sizeof(double))) == (double*)NULL) {
870     printf("Unable to allocate array h_b, exiting ...\n");
871     exit(1);
872   }
873 
874   if ((h_c = (double*)calloc(N, sizeof(double))) == (double*)NULL) {
875     printf("Unalbe to allocate array h_c, exiting ...\n");
876     exit(1);
877   }
878 
  /*
   * perform kernel, copy device memory into host memory and verify each
   * device kernel output
   */

  /* Initialize memory on the device: a = 2, b = c = 0.5 */
  set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
  set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
  set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

  /* run COPY on d_a/d_c; the blocking cudaMemcpy calls below also order the
     host reads after the default-stream kernel launch */
  STREAM_Copy_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
  ierr = cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n");CHKERRQ(ierr);
    exit(-2000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tPass\n");CHKERRQ(ierr);
  }
899 
  /* Re-initialize memory on the device before the next kernel check */
  set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
  set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
  set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

  /* optimized COPY must produce the same result as the plain kernel, so the
     same verify routine is reused */
  STREAM_Copy_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
  ierr = cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n");CHKERRQ(ierr);
    exit(-3000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tPass\n");CHKERRQ(ierr);
  }
915 
  /* Initialize memory on the device (SCALE only touches d_b and d_c, so d_a
     is not re-initialized here) */
  set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
  set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

  STREAM_Scale_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
  ierr = cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Scale_verify_double(h_b, h_c, scalar, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n");CHKERRQ(ierr);
    exit(-4000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tPass\n");CHKERRQ(ierr);
  }
930 
  /* Re-initialize all three device arrays; ADD reads two and writes one */
  set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
  set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
  set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

  STREAM_Add_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
  ierr = cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Add_verify_double(h_a, h_b, h_c, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n");CHKERRQ(ierr);
    exit(-5000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tPass\n");CHKERRQ(ierr);
  }
947 
  /* Re-initialize all three device arrays for the TRIAD check; note the
     argument order (d_b, d_c, d_a) matches the timing loop above */
  set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
  set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
  set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

  STREAM_Triad_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
  ierr = cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr = cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Triad_verify_double(h_b, h_c, h_a, scalar, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n");CHKERRQ(ierr);
    exit(-6000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tPass\n");CHKERRQ(ierr);
  }
964 
965   /* continue from here */
966   printResultsReadable(times);
967 
968   //clean up timers
969   ierr = cudaEventDestroy(stop);CHKERRQ(ierr);
970   ierr = cudaEventDestroy(start);CHKERRQ(ierr);
971 
972   /* Free memory on device */
973   ierr = cudaFree(d_a);CHKERRQ(ierr);
974   ierr = cudaFree(d_b);CHKERRQ(ierr);
975   ierr = cudaFree(d_c);CHKERRQ(ierr);
976   PetscFunctionReturn(0);
977 }
978 
979 ///////////////////////////////////////////////////////////////////////////
//Print results summary table to screen
981 ///////////////////////////////////////////////////////////////////////////
/* Print a STREAM-style summary (rate, avg/min/max time) for the 8 kernels.
   times[j][k] holds the k-th measurement of kernel j in milliseconds; the
   first iteration of each kernel is treated as warm-up and skipped. */
PetscErrorCode printResultsReadable(float times[][NTIMES])
{
  PetscErrorCode ierr;
  PetscInt       j, k;
  float          avgtime[8] = {0., 0., 0., 0., 0., 0., 0., 0.};
  float          maxtime[8] = {0., 0., 0., 0., 0., 0., 0., 0.};
  float          mintime[8] = {1e30,1e30,1e30,1e30,1e30,1e30,1e30,1e30};
  /* string literals are read-only: const char* (the former char* was a
     deprecated conversion in C++ and is rejected by modern compilers) */
  const char     *label[8]  = {"Copy:      ", "Copy Opt.: ", "Scale:     ", "Scale Opt: ", "Add:       ", "Add Opt:   ", "Triad:     ", "Triad Opt: "};
  /* bytes moved per kernel pass: copy/scale touch 2 arrays, add/triad 3.
     NOTE(review): element size is hard-coded as sizeof(float); if this routine
     also reports the double-precision runs, the MB/s figures are understated
     by 2x -- confirm against the callers. */
  float          bytes_per_kernel[8] = {
    2. * sizeof(float) * N,
    2. * sizeof(float) * N,
    2. * sizeof(float) * N,
    2. * sizeof(float) * N,
    3. * sizeof(float) * N,
    3. * sizeof(float) * N,
    3. * sizeof(float) * N,
    3. * sizeof(float) * N
  };

  PetscFunctionBegin;
  /* --- SUMMARY --- accumulate avg/min/max, converting ms to seconds */
  for (k = 1; k < NTIMES; ++k) { /* note -- skip first (warm-up) iteration */
    for (j = 0; j < 8; ++j) {
      avgtime[j] = avgtime[j] + (1.e-03f * times[j][k]);
      mintime[j] = MIN(mintime[j], (1.e-03f * times[j][k]));
      maxtime[j] = MAX(maxtime[j], (1.e-03f * times[j][k]));
    }
  }

  ierr = PetscPrintf(PETSC_COMM_SELF, "Function    Rate (MB/s)    Avg time      Min time      Max time\n");CHKERRQ(ierr);

  for (j = 0; j < 8; ++j) {
     avgtime[j] = avgtime[j]/(float)(NTIMES-1); /* NTIMES-1 timed iterations */
     /* STREAM convention: report bandwidth from the best (minimum) time */
     ierr = PetscPrintf(PETSC_COMM_SELF, "%s%11.4f  %11.6f  %12.6f  %12.6f\n", label[j], 1.0E-06 * bytes_per_kernel[j]/mintime[j], avgtime[j], mintime[j], maxtime[j]);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1018 }
1019