xref: /petsc/src/benchmarks/streams/CUDAVersion.cu (revision 2e826d6f168293946e7dc17f87075462a7cd4791)
1 /*
2   STREAM benchmark implementation in CUDA.
3 
4     COPY:       a(i) = b(i)
5     SCALE:      a(i) = q*b(i)
6     SUM:        a(i) = b(i) + c(i)
7     TRIAD:      a(i) = b(i) + q*c(i)
8 
9   It measures the memory system on the device.
10   The implementation is in single precision.
11 
12   Code based on the code developed by John D. McCalpin
13   http://www.cs.virginia.edu/stream/FTP/Code/stream.c
14 
15   Written by: Massimiliano Fatica, NVIDIA Corporation
16   Modified by: Douglas Enright (dpephd-nvidia@yahoo.com), 1 December 2010
17   Extensive Revisions, 4 December 2010
18   Modified for PETSc by: Matthew G. Knepley 14 Aug 2011
19 
20   User interface motivated by bandwidthTest NVIDIA SDK example.
21 */
/* Help text printed by PetscInitialize() when the program is run with -help */
static char *help = "Single-Precision STREAM Benchmark implementation in CUDA\n"
                    "Performs Copy, Scale, Add, and Triad single-precision kernels\n\n";
24 
#include <petscconf.h>
#include <petscsys.h>
#include <petsctime.h>

#include <float.h> /* DBL_EPSILON, used by the double-precision verify routines */
28 
/* Vector length (elements, not bytes) for the benchmark arrays */
#define N        2000000
/* NOTE(review): N_DOUBLE is defined but not referenced by the visible code -- confirm */
#define N_DOUBLE 8000000
/* Number of timed repetitions of each kernel */
#define NTIMES   10

# ifndef MIN
# define MIN(x,y) ((x)<(y) ? (x) : (y))
# endif
# ifndef MAX
# define MAX(x,y) ((x)>(y) ? (x) : (y))
# endif

const float  flt_eps = 1.192092896e-07f;        /* single-precision machine epsilon (FLT_EPSILON) */
const double dbl_eps = 2.2204460492503131e-16;  /* double-precision machine epsilon (DBL_EPSILON) */
42 
/* Fill a[0..len) with value using a grid-stride loop (correct for any launch size). */
__global__ void set_array(float *a,  float value, size_t len)
{
  /* cast before multiplying: blockDim.x*gridDim.x is a 32-bit product and
     could silently wrap on very large launches */
  const size_t stride = (size_t)blockDim.x * gridDim.x;
  for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; idx < len; idx += stride) a[idx] = value;
}
51 
/* Fill a[0..len) with value (double precision) using a grid-stride loop. */
__global__ void set_array_double(double *a,  double value, size_t len)
{
  /* size_t cast prevents 32-bit overflow of the thread-count product */
  const size_t stride = (size_t)blockDim.x * gridDim.x;
  for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; idx < len; idx += stride) a[idx] = value;
}
60 
/* COPY kernel: b(i) = a(i); grid-stride loop, works for any launch size. */
__global__ void STREAM_Copy(float *a, float *b, size_t len)
{
  /* size_t cast prevents 32-bit overflow of the thread-count product */
  const size_t stride = (size_t)blockDim.x * gridDim.x;
  for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; idx < len; idx += stride) b[idx] = a[idx];
}
69 
/* COPY kernel (double precision): b(i) = a(i); grid-stride loop. */
__global__ void STREAM_Copy_double(double *a, double *b, size_t len)
{
  /* size_t cast prevents 32-bit overflow of the thread-count product */
  const size_t stride = (size_t)blockDim.x * gridDim.x;
  for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; idx < len; idx += stride) b[idx] = a[idx];
}
78 
/*
 * COPY kernel, one element per thread.
 * Requires a launch that covers the whole vector; otherwise it is a no-op.
 */
__global__ void STREAM_Copy_Optimized(float *a, float *b, size_t len)
{
  /* size_t cast: the 32-bit product blockDim.x*gridDim.x could wrap on huge
     launches, making the coverage check (and the index) wrong */
  if ((size_t)blockDim.x * gridDim.x < len) return;
  size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < len) b[idx] = a[idx];
}
89 
/*
 * COPY kernel (double precision), one element per thread.
 * Requires a launch that covers the whole vector; otherwise it is a no-op.
 */
__global__ void STREAM_Copy_Optimized_double(double *a, double *b, size_t len)
{
  /* size_t cast avoids 32-bit overflow of the thread-count product */
  if ((size_t)blockDim.x * gridDim.x < len) return;
  size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < len) b[idx] = a[idx];
}
100 
/* SCALE kernel: b(i) = scale*a(i); grid-stride loop. */
__global__ void STREAM_Scale(float *a, float *b, float scale,  size_t len)
{
  /* size_t cast prevents 32-bit overflow of the thread-count product */
  const size_t stride = (size_t)blockDim.x * gridDim.x;
  for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; idx < len; idx += stride) b[idx] = scale* a[idx];
}
109 
/* SCALE kernel (double precision): b(i) = scale*a(i); grid-stride loop. */
__global__ void STREAM_Scale_double(double *a, double *b, double scale,  size_t len)
{
  /* size_t cast prevents 32-bit overflow of the thread-count product */
  const size_t stride = (size_t)blockDim.x * gridDim.x;
  for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; idx < len; idx += stride) b[idx] = scale* a[idx];
}
118 
/*
 * SCALE kernel, one element per thread.
 * Requires a launch that covers the whole vector; otherwise it is a no-op.
 */
__global__ void STREAM_Scale_Optimized(float *a, float *b, float scale,  size_t len)
{
  /* size_t cast avoids 32-bit overflow of the thread-count product */
  if ((size_t)blockDim.x * gridDim.x < len) return;
  size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < len) b[idx] = scale* a[idx];
}
129 
/*
 * SCALE kernel (double precision), one element per thread.
 * Requires a launch that covers the whole vector; otherwise it is a no-op.
 */
__global__ void STREAM_Scale_Optimized_double(double *a, double *b, double scale,  size_t len)
{
  /* size_t cast avoids 32-bit overflow of the thread-count product */
  if ((size_t)blockDim.x * gridDim.x < len) return;
  size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < len) b[idx] = scale* a[idx];
}
140 
/* ADD kernel: c(i) = a(i) + b(i); grid-stride loop. */
__global__ void STREAM_Add(float *a, float *b, float *c,  size_t len)
{
  /* size_t cast prevents 32-bit overflow of the thread-count product */
  const size_t stride = (size_t)blockDim.x * gridDim.x;
  for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; idx < len; idx += stride) c[idx] = a[idx]+b[idx];
}
149 
/* ADD kernel (double precision): c(i) = a(i) + b(i); grid-stride loop. */
__global__ void STREAM_Add_double(double *a, double *b, double *c,  size_t len)
{
  /* size_t cast prevents 32-bit overflow of the thread-count product */
  const size_t stride = (size_t)blockDim.x * gridDim.x;
  for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; idx < len; idx += stride) c[idx] = a[idx]+b[idx];
}
158 
/*
 * ADD kernel, one element per thread.
 * Requires a launch that covers the whole vector; otherwise it is a no-op.
 */
__global__ void STREAM_Add_Optimized(float *a, float *b, float *c,  size_t len)
{
  /* size_t cast avoids 32-bit overflow of the thread-count product */
  if ((size_t)blockDim.x * gridDim.x < len) return;
  size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < len) c[idx] = a[idx]+b[idx];
}
169 
/*
 * ADD kernel (double precision), one element per thread.
 * Requires a launch that covers the whole vector; otherwise it is a no-op.
 */
__global__ void STREAM_Add_Optimized_double(double *a, double *b, double *c,  size_t len)
{
  /* size_t cast avoids 32-bit overflow of the thread-count product */
  if ((size_t)blockDim.x * gridDim.x < len) return;
  size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < len) c[idx] = a[idx]+b[idx];
}
180 
/* TRIAD kernel: c(i) = a(i) + scalar*b(i); grid-stride loop. */
__global__ void STREAM_Triad(float *a, float *b, float *c, float scalar, size_t len)
{
  /* size_t cast prevents 32-bit overflow of the thread-count product */
  const size_t stride = (size_t)blockDim.x * gridDim.x;
  for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; idx < len; idx += stride) c[idx] = a[idx]+scalar*b[idx];
}
189 
/* TRIAD kernel (double precision): c(i) = a(i) + scalar*b(i); grid-stride loop. */
__global__ void STREAM_Triad_double(double *a, double *b, double *c, double scalar, size_t len)
{
  /* size_t cast prevents 32-bit overflow of the thread-count product */
  const size_t stride = (size_t)blockDim.x * gridDim.x;
  for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; idx < len; idx += stride) c[idx] = a[idx]+scalar*b[idx];
}
198 
/*
 * TRIAD kernel, one element per thread.
 * Requires a launch that covers the whole vector; otherwise it is a no-op.
 */
__global__ void STREAM_Triad_Optimized(float *a, float *b, float *c, float scalar, size_t len)
{
  /* size_t cast avoids 32-bit overflow of the thread-count product */
  if ((size_t)blockDim.x * gridDim.x < len) return;
  size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < len) c[idx] = a[idx]+scalar*b[idx];
}
209 
/*
 * TRIAD kernel (double precision), one element per thread.
 * Requires a launch that covers the whole vector; otherwise it is a no-op.
 */
__global__ void STREAM_Triad_Optimized_double(double *a, double *b, double *c, double scalar, size_t len)
{
  /* size_t cast avoids 32-bit overflow of the thread-count product */
  if ((size_t)blockDim.x * gridDim.x < len) return;
  size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < len) c[idx] = a[idx]+scalar*b[idx];
}
220 
221 /* Host side verification routines */
/*
 * Host-side check of the device COPY result (single precision).
 * Returns true as soon as any element of b differs from the expected copy
 * of a by more than 2 ULPs of relative error (in units of float epsilon).
 */
bool STREAM_Copy_verify(float *a, float *b, size_t len)
{
  for (size_t i = 0; i < len; ++i) {
    const float expected = a[i];
    /* NOTE(review): divides by |expected| -- a zero element would give inf/NaN */
    const float relULPs  = (fabsf(b[i] - expected)/fabsf(expected))/flt_eps;
    if (relULPs > 2.f) return true; /* stop at the first mismatch */
  }
  return false;
}
237 
/*
 * Host-side check of the device COPY result (double precision).
 * Returns true ("different") if any element of b deviates from a[idx] by
 * more than 2 ULPs of relative error, false when all elements agree.
 *
 * Bug fix: the original used fabsf() and the single-precision epsilon
 * (flt_eps), truncating the double operands to float and measuring ULPs in
 * float units; now uses fabs() and DBL_EPSILON.
 */
bool STREAM_Copy_verify_double(double *a, double *b, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult     = a[idx];
    double diffResultExpected = (b[idx] - expectedResult);
    /* NOTE(review): divides by |expected| -- a zero element would give inf/NaN */
    double relErrorULPS       = (fabs(diffResultExpected)/fabs(expectedResult))/DBL_EPSILON;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.);
  }

  return bDifferent;
}
253 
/*
 * Host-side check of the device SCALE result (single precision).
 * Returns true as soon as any element of b differs from scale*a[i] by more
 * than 2 ULPs of relative error (in units of float epsilon).
 */
bool STREAM_Scale_verify(float *a, float *b, float scale, size_t len)
{
  for (size_t i = 0; i < len; ++i) {
    const float expected = scale*a[i];
    /* NOTE(review): divides by |expected| -- a zero product would give inf/NaN */
    const float relULPs  = (fabsf(b[i] - expected)/fabsf(expected))/flt_eps;
    if (relULPs > 2.f) return true; /* stop at the first mismatch */
  }
  return false;
}
269 
/*
 * Host-side check of the device SCALE result (double precision).
 * Returns true ("different") if any element of b deviates from scale*a[idx]
 * by more than 2 ULPs of relative error, false when all elements agree.
 *
 * Bug fix: the original used fabsf()/flt_eps (single precision) on double
 * data; now uses fabs() and DBL_EPSILON.
 */
bool STREAM_Scale_verify_double(double *a, double *b, double scale, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult     = scale*a[idx];
    double diffResultExpected = (b[idx] - expectedResult);
    /* NOTE(review): divides by |expected| -- a zero product would give inf/NaN */
    double relErrorULPS       = (fabs(diffResultExpected)/fabs(expectedResult))/DBL_EPSILON;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.);
  }

  return bDifferent;
}
285 
/*
 * Host-side check of the device ADD result (single precision).
 * Returns true as soon as any element of c differs from a[i]+b[i] by more
 * than 2 ULPs of relative error (in units of float epsilon).
 */
bool STREAM_Add_verify(float *a, float *b, float *c, size_t len)
{
  for (size_t i = 0; i < len; ++i) {
    const float expected = a[i] + b[i];
    /* NOTE(review): divides by |expected| -- a zero sum would give inf/NaN */
    const float relULPs  = (fabsf(c[i] - expected)/fabsf(expected))/flt_eps;
    if (relULPs > 2.f) return true; /* stop at the first mismatch */
  }
  return false;
}
301 
/*
 * Host-side check of the device ADD result (double precision).
 * Returns true ("different") if any element of c deviates from a[idx]+b[idx]
 * by more than 2 ULPs of relative error, false when all elements agree.
 *
 * Bug fix: the original used fabsf()/flt_eps (single precision) on double
 * data; now uses fabs() and DBL_EPSILON.
 */
bool STREAM_Add_verify_double(double *a, double *b, double *c, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult     = a[idx] + b[idx];
    double diffResultExpected = (c[idx] - expectedResult);
    /* NOTE(review): divides by |expected| -- a zero sum would give inf/NaN */
    double relErrorULPS       = (fabs(diffResultExpected)/fabs(expectedResult))/DBL_EPSILON;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.);
  }

  return bDifferent;
}
317 
/*
 * Host-side check of the device TRIAD result (single precision).
 * Returns true as soon as any element of c differs from a[i]+scalar*b[i] by
 * more than 3 ULPs of relative error (in units of float epsilon).
 */
bool STREAM_Triad_verify(float *a, float *b, float *c, float scalar, size_t len)
{
  for (size_t i = 0; i < len; ++i) {
    const float expected = a[i] + scalar*b[i];
    /* NOTE(review): divides by |expected| -- a zero result would give inf/NaN */
    const float relULPs  = (fabsf(c[i] - expected)/fabsf(expected))/flt_eps;
    if (relULPs > 3.f) return true; /* triad allows one more ULP than copy/scale/add */
  }
  return false;
}
333 
/*
 * Host-side check of the device TRIAD result (double precision).
 * Returns true ("different") if any element of c deviates from
 * a[idx]+scalar*b[idx] by more than 3 ULPs of relative error.
 *
 * Bug fix: the original used fabsf()/flt_eps (single precision) on double
 * data; now uses fabs() and DBL_EPSILON.
 */
bool STREAM_Triad_verify_double(double *a, double *b, double *c, double scalar, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult     = a[idx] + scalar*b[idx];
    double diffResultExpected = (c[idx] - expectedResult);
    /* NOTE(review): divides by |expected| -- a zero result would give inf/NaN */
    double relErrorULPS       = (fabs(diffResultExpected)/fabs(expectedResult))/DBL_EPSILON;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 3.);
  }

  return bDifferent;
}
349 
350 /* forward declarations */
351 PetscErrorCode setupStream(PetscInt device, PetscBool runDouble, PetscBool cpuTiming);
352 PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming);
353 PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming);
354 PetscErrorCode printResultsReadable(float times[][NTIMES]);
355 
/*
 * Entry point: parses the -device, -double and -cputiming options, runs the
 * benchmarks via setupStream(), and reports PASSES/FAILED.
 */
int main(int argc, char *argv[])
{
  PetscInt       device    = 0;           /* CUDA device index (-device) */
  PetscBool      runDouble = PETSC_FALSE; /* also run double-precision kernels (-double) */
  PetscBool      cpuTiming = PETSC_FALSE; /* use host timers instead of CUDA events (-cputiming) */
  PetscErrorCode ierr;

  ierr = PetscInitialize(&argc, &argv, 0, help);CHKERRQ(ierr);
  ierr = PetscPrintf(PETSC_COMM_SELF, "[Single and Double-Precision Device-Only STREAM Benchmark implementation in CUDA]\n");CHKERRQ(ierr);
  ierr = PetscPrintf(PETSC_COMM_SELF, "%s Starting...\n\n", argv[0]);CHKERRQ(ierr);

  /* PetscOptionsBegin/End must bracket the option queries */
  ierr = PetscOptionsBegin(PETSC_COMM_WORLD, "", "STREAM Benchmark Options", "STREAM");CHKERRQ(ierr);
  ierr = PetscOptionsInt("-device", "Specify the CUDA device to be used", "STREAM", device, &device, NULL);CHKERRQ(ierr);
  ierr = PetscOptionsBool("-double",    "Also run double precision tests",   "STREAM", runDouble, &runDouble, NULL);CHKERRQ(ierr);
  ierr = PetscOptionsBool("-cputiming", "Force CPU-based timing to be used", "STREAM", cpuTiming, &cpuTiming, NULL);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();

  /* setupStream() returns 0 on success and a negative code on failure;
     the summary line is only printed for non-negative returns */
  ierr = setupStream(device, runDouble, cpuTiming);
  if (ierr >= 0) {
    PetscErrorCode ierr2 = PetscPrintf(PETSC_COMM_SELF, "\n[streamBenchmark] - results:\t%s\n\n", (ierr == 0) ? "PASSES" : "FAILED");CHKERRQ(ierr2);
  }
  PetscFinalize();
  return 0;
}
380 
381 ///////////////////////////////////////////////////////////////////////////////
382 //Run the appropriate tests
383 ///////////////////////////////////////////////////////////////////////////////
/*
 * Select and validate the CUDA device, pick a block size suited to the
 * device architecture, and run the single-precision (and optionally the
 * double-precision) STREAM benchmark.
 * Returns 0 on success, a negative code on failure.
 */
PetscErrorCode setupStream(PetscInt deviceNum, PetscBool runDouble, PetscBool cpuTiming)
{
  PetscInt       iNumThreadsPerBlock = 128;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  // Check device: fall back to device 0 when the requested index is out of range
  {
    int deviceCount;

    cudaGetDeviceCount(&deviceCount);
    if (deviceCount == 0) {
      ierr = PetscPrintf(PETSC_COMM_SELF, "!!!!!No devices found!!!!!\n");CHKERRQ(ierr);
      return -1000;
    }

    if (deviceNum >= deviceCount || deviceNum < 0) {
      /* cast: PetscInt may be 64-bit while %d consumes an int */
      ierr      = PetscPrintf(PETSC_COMM_SELF, "\n!!!!!Invalid GPU number %d given hence default gpu %d will be used !!!!!\n", (int)deviceNum, 0);CHKERRQ(ierr);
      deviceNum = 0;
    }
  }

  cudaSetDevice(deviceNum);
  ierr = PetscPrintf(PETSC_COMM_SELF, "Running on...\n\n");CHKERRQ(ierr);
  cudaDeviceProp deviceProp;
  if (cudaGetDeviceProperties(&deviceProp, deviceNum) == cudaSuccess) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " Device %d: %s\n", (int)deviceNum, deviceProp.name);CHKERRQ(ierr);
  } else {
    /* bug fix: the original format string had a %d with no matching argument
       (undefined behavior); supply the device number */
    ierr = PetscPrintf(PETSC_COMM_SELF, " Unable to determine device %d properties, exiting\n", (int)deviceNum);CHKERRQ(ierr);
    return -1;
  }

  /* double precision needs compute capability 1.3 or newer */
  if (runDouble && deviceProp.major == 1 && deviceProp.minor < 3) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " Unable to run double-precision STREAM benchmark on a compute capability GPU less than 1.3\n");CHKERRQ(ierr);
    return -1;
  }
  if (deviceProp.major == 2 && deviceProp.minor == 1) iNumThreadsPerBlock = 192; /* GF104 architecture / 48 CUDA Cores per MP */
  else iNumThreadsPerBlock = 128; /* GF100 architecture / 32 CUDA Cores per MP */

  if (cpuTiming) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " Using cpu-only timer.\n");CHKERRQ(ierr);
  }

  ierr = runStream(iNumThreadsPerBlock, cpuTiming);CHKERRQ(ierr);
  if (runDouble) {
    /* NOTE(review): cudaSetDeviceFlags() is called after the device context is
       active and its cudaError_t is checked with CHKERRQ -- confirm intent */
    ierr = cudaSetDeviceFlags(cudaDeviceBlockingSync);CHKERRQ(ierr);
    ierr = runStreamDouble(iNumThreadsPerBlock, cpuTiming);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
434 
435 ///////////////////////////////////////////////////////////////////////////
436 // runStream
437 ///////////////////////////////////////////////////////////////////////////
/*
 * Run the single-precision STREAM kernels (Copy/Scale/Add/Triad, plain and
 * "optimized" one-element-per-thread variants) NTIMES each, timing every
 * launch with CUDA events (or host timers when bDontUseGPUTiming is set),
 * verify each kernel against a host reference, and print a summary via
 * printResultsReadable().
 */
PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming)
{
  float          *d_a, *d_b, *d_c;
  int            k;
  float          times[8][NTIMES]; /* one row per kernel variant */
  float          scalar;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Allocate memory on device */
  ierr = cudaMalloc((void**)&d_a, sizeof(float)*N);CHKERRQ(ierr);
  ierr = cudaMalloc((void**)&d_b, sizeof(float)*N);CHKERRQ(ierr);
  ierr = cudaMalloc((void**)&d_c, sizeof(float)*N);CHKERRQ(ierr);

  /* Compute execution configuration: enough blocks to cover all N elements */
  dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */
  dim3 dimGrid(N/dimBlock.x); /* (N/dimBlock.x,1,1) */
  if (N % dimBlock.x != 0) dimGrid.x+=1; /* round up for a partial last block */

  ierr = PetscPrintf(PETSC_COMM_SELF, " Array size (single precision) = %u\n",N);CHKERRQ(ierr);
  ierr = PetscPrintf(PETSC_COMM_SELF, " using %u threads per block, %u blocks\n",dimBlock.x,dimGrid.x);CHKERRQ(ierr);

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  /* --- MAIN LOOP --- repeat test cases NTIMES times --- */
  PetscLogDouble cpuTimer = 0.0;
  cudaEvent_t    start, stop;

  /* GPU event timers report msec.
     NOTE(review): the PetscTime-based cpuTimer appears to be in seconds while
     cudaEventElapsedTime() reports msec -- confirm the units expected by
     printResultsReadable() before trusting -cputiming numbers. */
  ierr = cudaEventCreate(&start);CHKERRQ(ierr); /* gpu timer facility */
  ierr = cudaEventCreate(&stop);CHKERRQ(ierr);  /* gpu timer facility */

  scalar=3.0f;
  for (k = 0; k < NTIMES; ++k) {
    /* bug fix: reset the CPU timer at the top of each iteration; previously
       the elapsed time of the last segment of iteration k-1 leaked into
       times[0][k] for k > 0 when CPU timing was selected */
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
    STREAM_Copy<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[0][k] = cpuTimer;
    else {
      ierr = cudaEventElapsedTime(&times[0][k], start, stop);CHKERRQ(ierr);
    }

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
    STREAM_Copy_Optimized<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[1][k] = cpuTimer;
    else {
      ierr = cudaEventElapsedTime(&times[1][k], start, stop);CHKERRQ(ierr);
    }

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
    STREAM_Scale<<<dimGrid,dimBlock>>>(d_b, d_c, scalar,  N);
    ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[2][k] = cpuTimer;
    else {
      ierr = cudaEventElapsedTime(&times[2][k], start, stop);CHKERRQ(ierr);
    }

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
    STREAM_Scale_Optimized<<<dimGrid,dimBlock>>>(d_b, d_c, scalar,  N);
    ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[3][k] = cpuTimer;
    else {
      ierr = cudaEventElapsedTime(&times[3][k], start, stop);CHKERRQ(ierr);
    }

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
    STREAM_Add<<<dimGrid,dimBlock>>>(d_a, d_b, d_c,  N);
    ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[4][k] = cpuTimer;
    else {
      ierr = cudaEventElapsedTime(&times[4][k], start, stop);CHKERRQ(ierr);
    }

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
    STREAM_Add_Optimized<<<dimGrid,dimBlock>>>(d_a, d_b, d_c,  N);
    ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[5][k] = cpuTimer;
    else {
      ierr = cudaEventElapsedTime(&times[5][k], start, stop);CHKERRQ(ierr);
    }

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
    STREAM_Triad<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar,  N);
    ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[6][k] = cpuTimer;
    else {
      ierr = cudaEventElapsedTime(&times[6][k], start, stop);CHKERRQ(ierr);
    }

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
    STREAM_Triad_Optimized<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar,  N);
    ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
    ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[7][k] = cpuTimer;
    else {
      ierr = cudaEventElapsedTime(&times[7][k], start, stop);CHKERRQ(ierr);
    }

  }

  /* verify kernels against host references computed below */
  float *h_a, *h_b, *h_c;
  bool  errorSTREAMkernel = true;

  if ((h_a = (float*)calloc(N, sizeof(float))) == (float*)NULL) {
    printf("Unable to allocate array h_a, exiting ...\n");
    exit(1);
  }
  if ((h_b = (float*)calloc(N, sizeof(float))) == (float*)NULL) {
    printf("Unable to allocate array h_b, exiting ...\n");
    exit(1);
  }

  if ((h_c = (float*)calloc(N, sizeof(float))) == (float*)NULL) {
    printf("Unable to allocate array h_c, exiting ...\n");
    exit(1);
  }

  /*
   * perform kernel, copy device memory into host memory and verify each
   * device kernel output
   */

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  STREAM_Copy<<<dimGrid,dimBlock>>>(d_a, d_c, N);
  ierr              = cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr              = cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n");CHKERRQ(ierr);
    exit(-2000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tPass\n");CHKERRQ(ierr);
  }

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  STREAM_Copy_Optimized<<<dimGrid,dimBlock>>>(d_a, d_c, N);
  ierr              = cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr              = cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n");CHKERRQ(ierr);
    exit(-3000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tPass\n");CHKERRQ(ierr);
  }

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  STREAM_Scale<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
  ierr              = cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr              = cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Scale_verify(h_b, h_c, scalar, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n");CHKERRQ(ierr);
    exit(-4000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tPass\n");CHKERRQ(ierr);
  }

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  STREAM_Add<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
  ierr              = cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr              = cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr              = cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Add_verify(h_a, h_b, h_c, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n");CHKERRQ(ierr);
    exit(-5000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tPass\n");CHKERRQ(ierr);
  }

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  STREAM_Triad<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
  ierr              = cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr              = cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr              = cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Triad_verify(h_b, h_c, h_a, scalar, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n");CHKERRQ(ierr);
    exit(-6000);
  } else {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tPass\n");CHKERRQ(ierr);
  }

  /* print the timing summary */
  printResultsReadable(times);

  //clean up timers
  ierr = cudaEventDestroy(stop);CHKERRQ(ierr);
  ierr = cudaEventDestroy(start);CHKERRQ(ierr);

  /* bug fix: release the host verification buffers (previously leaked) */
  free(h_a);
  free(h_b);
  free(h_c);

  /* Free memory on device */
  ierr = cudaFree(d_a);CHKERRQ(ierr);
  ierr = cudaFree(d_b);CHKERRQ(ierr);
  ierr = cudaFree(d_c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
698 
699 PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming)
700 {
701   double         *d_a, *d_b, *d_c;
702   int            k;
703   float          times[8][NTIMES];
704   double         scalar;
705   PetscErrorCode ierr;
706 
707   PetscFunctionBegin;
708   /* Allocate memory on device */
709   ierr = cudaMalloc((void**)&d_a, sizeof(double)*N);CHKERRQ(ierr);
710   ierr = cudaMalloc((void**)&d_b, sizeof(double)*N);CHKERRQ(ierr);
711   ierr = cudaMalloc((void**)&d_c, sizeof(double)*N);CHKERRQ(ierr);
712 
713   /* Compute execution configuration */
714 
715   dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */
716   dim3 dimGrid(N/dimBlock.x); /* (N/dimBlock.x,1,1) */
717   if (N % dimBlock.x != 0) dimGrid.x+=1;
718 
719   ierr = PetscPrintf(PETSC_COMM_SELF, " Array size (double precision) = %u\n",N);CHKERRQ(ierr);
720   ierr = PetscPrintf(PETSC_COMM_SELF, " using %u threads per block, %u blocks\n",dimBlock.x,dimGrid.x);CHKERRQ(ierr);
721 
722   /* Initialize memory on the device */
723   set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
724   set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
725   set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);
726 
727   /* --- MAIN LOOP --- repeat test cases NTIMES times --- */
728   PetscLogDouble cpuTimer = 0.0;
729   cudaEvent_t    start, stop;
730 
731   /* both timers report msec */
732   ierr = cudaEventCreate(&start);CHKERRQ(ierr); /* gpu timer facility */
733   ierr = cudaEventCreate(&stop);CHKERRQ(ierr);  /* gpu timer facility */
734 
735   scalar=3.0;
736   for (k = 0; k < NTIMES; ++k) {
737     PetscTimeSubtract(&cpuTimer);
738     ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
739     STREAM_Copy_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
740     ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
741     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
742     //get the total elapsed time in ms
743     if (bDontUseGPUTiming) {
744       PetscTimeAdd(&cpuTimer);
745       times[0][k] = cpuTimer;
746     } else {
747       ierr = cudaEventElapsedTime(&times[0][k], start, stop);CHKERRQ(ierr);
748     }
749 
750     cpuTimer = 0.0;
751     PetscTimeSubtract(&cpuTimer);
752     ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
753     STREAM_Copy_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
754     ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
755     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
756     //get the total elapsed time in ms
757     if (bDontUseGPUTiming) {
758       PetscTimeAdd(&cpuTimer);
759       times[1][k] = cpuTimer;
760     } else {
761       ierr = cudaEventElapsedTime(&times[1][k], start, stop);CHKERRQ(ierr);
762     }
763 
764     cpuTimer = 0.0;
765     PetscTimeSubtract(&cpuTimer);
766     ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
767     STREAM_Scale_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar,  N);
768     ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
769     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
770     //get the total elapsed time in ms
771     PetscTimeAdd(&cpuTimer);
772     if (bDontUseGPUTiming) times[2][k] = cpuTimer;
773     else {
774       ierr = cudaEventElapsedTime(&times[2][k], start, stop);CHKERRQ(ierr);
775     }
776 
777     cpuTimer = 0.0;
778     PetscTimeSubtract(&cpuTimer);
779     ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
780     STREAM_Scale_Optimized_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar,  N);
781     ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
782     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
783     //get the total elapsed time in ms
784     PetscTimeAdd(&cpuTimer);
785     if (bDontUseGPUTiming) times[3][k] = cpuTimer;
786     else {
787       ierr = cudaEventElapsedTime(&times[2][k], start, stop);CHKERRQ(ierr);
788     }
789 
790     cpuTimer = 0.0;
791     PetscTimeSubtract(&cpuTimer);
792     ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
793     STREAM_Add_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c,  N);
794     ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
795     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
796     //get the total elapsed time in ms
797     PetscTimeAdd(&cpuTimer);
798     if (bDontUseGPUTiming) times[4][k] = cpuTimer;
799     else {
800       ierr = cudaEventElapsedTime(&times[3][k], start, stop);CHKERRQ(ierr);
801     }
802 
803     cpuTimer = 0.0;
804     PetscTimeSubtract(&cpuTimer);
805     ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
806     STREAM_Add_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c,  N);
807     ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
808     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
809     //get the total elapsed time in ms
810     PetscTimeAdd(&cpuTimer);
811     if (bDontUseGPUTiming) times[5][k] = cpuTimer;
812     else {
813       ierr = cudaEventElapsedTime(&times[3][k], start, stop);CHKERRQ(ierr);
814     }
815 
816     cpuTimer = 0.0;
817     PetscTimeSubtract(&cpuTimer);
818     ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
819     STREAM_Triad_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar,  N);
820     ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
821     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
822     //get the total elapsed time in ms
823     PetscTimeAdd(&cpuTimer);
824     if (bDontUseGPUTiming) times[6][k] = cpuTimer;
825     else {
826       ierr = cudaEventElapsedTime(&times[4][k], start, stop);CHKERRQ(ierr);
827     }
828 
829     cpuTimer = 0.0;
830     PetscTimeSubtract(&cpuTimer);
831     ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
832     STREAM_Triad_Optimized_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar,  N);
833     ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
834     ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
835     //get the total elapsed time in ms
836     PetscTimeAdd(&cpuTimer);
837     if (bDontUseGPUTiming) times[7][k] = cpuTimer;
838     else {
839       ierr = cudaEventElapsedTime(&times[4][k], start, stop);CHKERRQ(ierr);
840     }
841 
842   }
843 
844   /* verify kernels */
845   double *h_a, *h_b, *h_c;
846   bool   errorSTREAMkernel = true;
847 
848   if ((h_a = (double*)calloc(N, sizeof(double))) == (double*)NULL) {
849     printf("Unable to allocate array h_a, exiting ...\n");
850     exit(1);
851   }
852   if ((h_b = (double*)calloc(N, sizeof(double))) == (double*)NULL) {
853     printf("Unable to allocate array h_b, exiting ...\n");
854     exit(1);
855   }
856 
857   if ((h_c = (double*)calloc(N, sizeof(double))) == (double*)NULL) {
858     printf("Unalbe to allocate array h_c, exiting ...\n");
859     exit(1);
860   }
861 
862   /*
863    * perform kernel, copy device memory into host memory and verify each
864    * device kernel output
865    */
866 
867   /* Initialize memory on the device */
868   set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
869   set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
870   set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);
871 
872   STREAM_Copy_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
873   ierr              = cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
874   ierr              = cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
875   errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N);
876   if (errorSTREAMkernel) {
877     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n");CHKERRQ(ierr);
878     exit(-2000);
879   } else {
880     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tPass\n");CHKERRQ(ierr);
881   }
882 
883   /* Initialize memory on the device */
884   set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
885   set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
886   set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);
887 
888   STREAM_Copy_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
889   ierr              = cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
890   ierr              = cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
891   errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N);
892   if (errorSTREAMkernel) {
893     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n");CHKERRQ(ierr);
894     exit(-3000);
895   } else {
896     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tPass\n");CHKERRQ(ierr);
897   }
898 
899   /* Initialize memory on the device */
900   set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
901   set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);
902 
903   STREAM_Scale_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
904   ierr              = cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
905   ierr              = cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
906   errorSTREAMkernel = STREAM_Scale_verify_double(h_b, h_c, scalar, N);
907   if (errorSTREAMkernel) {
908     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n");CHKERRQ(ierr);
909     exit(-4000);
910   } else {
911     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tPass\n");CHKERRQ(ierr);
912   }
913 
914   /* Initialize memory on the device */
915   set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
916   set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
917   set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);
918 
919   STREAM_Add_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
920   ierr              = cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
921   ierr              = cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
922   ierr              = cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
923   errorSTREAMkernel = STREAM_Add_verify_double(h_a, h_b, h_c, N);
924   if (errorSTREAMkernel) {
925     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n");CHKERRQ(ierr);
926     exit(-5000);
927   } else {
928     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tPass\n");CHKERRQ(ierr);
929   }
930 
931   /* Initialize memory on the device */
932   set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
933   set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
934   set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);
935 
936   STREAM_Triad_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
937   ierr              = cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
938   ierr              = cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
939   ierr              = cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
940   errorSTREAMkernel = STREAM_Triad_verify_double(h_b, h_c, h_a, scalar, N);
941   if (errorSTREAMkernel) {
942     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n");CHKERRQ(ierr);
943     exit(-6000);
944   } else {
945     ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tPass\n");CHKERRQ(ierr);
946   }
947 
948   /* continue from here */
949   printResultsReadable(times);
950 
951   //clean up timers
952   ierr = cudaEventDestroy(stop);CHKERRQ(ierr);
953   ierr = cudaEventDestroy(start);CHKERRQ(ierr);
954 
955   /* Free memory on device */
956   ierr = cudaFree(d_a);CHKERRQ(ierr);
957   ierr = cudaFree(d_b);CHKERRQ(ierr);
958   ierr = cudaFree(d_c);CHKERRQ(ierr);
959   PetscFunctionReturn(0);
960 }
961 
962 ///////////////////////////////////////////////////////////////////////////
963 //Print Results to Screen and File
964 ///////////////////////////////////////////////////////////////////////////
/*
  Summarize the benchmark timings: for each of the 8 STREAM kernels, print
  the peak bandwidth (MB/s, computed from the fastest trial) together with
  the average, minimum, and maximum trial times in seconds.

  Input:
    times[j][k] - elapsed time in milliseconds for kernel j on trial k.
                  Trial k = 0 is treated as warm-up and excluded from the
                  statistics.
*/
PetscErrorCode printResultsReadable(float times[][NTIMES])
{
  PetscErrorCode ierr;
  PetscInt       j, k;
  /* Per-kernel statistics over the NTIMES-1 timed trials, in seconds */
  float          avgtime[8]          = {0., 0., 0., 0., 0., 0., 0., 0.};
  float          maxtime[8]          = {0., 0., 0., 0., 0., 0., 0., 0.};
  float          mintime[8]          = {1e30,1e30,1e30,1e30,1e30,1e30,1e30,1e30};
  /* String literals must bind to const char* in C++ */
  const char    *label[8]            = {"Copy:      ", "Copy Opt.: ", "Scale:     ", "Scale Opt: ", "Add:       ", "Add Opt:   ", "Triad:     ", "Triad Opt: "};
  /* Bytes moved per trial: Copy/Scale touch 2 arrays (1 read + 1 write),
     Add/Triad touch 3 arrays (2 reads + 1 write).
     NOTE(review): these use sizeof(float); when this printer reports the
     double-precision run (see the caller above) the byte counts understate
     bandwidth by 2x -- confirm whether a sizeof(double) variant is intended. */
  float          bytes_per_kernel[8] = {
    2. * sizeof(float) * N,
    2. * sizeof(float) * N,
    2. * sizeof(float) * N,
    2. * sizeof(float) * N,
    3. * sizeof(float) * N,
    3. * sizeof(float) * N,
    3. * sizeof(float) * N,
    3. * sizeof(float) * N
  };

  PetscFunctionBegin;
  /* --- SUMMARY --- accumulate statistics, converting ms -> s; skip the
     first iteration (warm-up) as in the reference STREAM implementation */
  for (k = 1; k < NTIMES; ++k) {
    for (j = 0; j < 8; ++j) {
      avgtime[j] = avgtime[j] + (1.e-03f * times[j][k]);
      mintime[j] = MIN(mintime[j], (1.e-03f * times[j][k]));
      maxtime[j] = MAX(maxtime[j], (1.e-03f * times[j][k]));
    }
  }

  ierr = PetscPrintf(PETSC_COMM_SELF, "Function    Rate (MB/s)    Avg time      Min time      Max time\n");CHKERRQ(ierr);

  for (j = 0; j < 8; ++j) {
    avgtime[j] = avgtime[j]/(float)(NTIMES-1);
    /* Rate is taken from the fastest (minimum-time) trial */
    ierr       = PetscPrintf(PETSC_COMM_SELF, "%s%11.4f  %11.6f  %12.6f  %12.6f\n", label[j], 1.0E-06 * bytes_per_kernel[j]/mintime[j], avgtime[j], mintime[j], maxtime[j]);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1001