xref: /petsc/src/benchmarks/streams/CUDAVersion.cu (revision c4762a1b19cd2af06abeed90e8f9d34fb975dd94)
1 /*
2   STREAM benchmark implementation in CUDA.
3 
4     COPY:       a(i) = b(i)
5     SCALE:      a(i) = q*b(i)
6     SUM:        a(i) = b(i) + c(i)
7     TRIAD:      a(i) = b(i) + q*c(i)
8 
9   It measures the memory system on the device.
10   The implementation is in double precision with a single option.
11 
12   Code based on the code developed by John D. McCalpin
13   http://www.cs.virginia.edu/stream/FTP/Code/stream.c
14 
15   Written by: Massimiliano Fatica, NVIDIA Corporation
16   Modified by: Douglas Enright (dpephd-nvidia@yahoo.com), 1 December 2010
17   Extensive Revisions, 4 December 2010
18   Modified for PETSc by: Matthew G. Knepley 14 Aug 2011
19 
20   User interface motivated by bandwidthTest NVIDIA SDK example.
21 */
22 static char help[] = "Double-Precision STREAM Benchmark implementation in CUDA\n Performs Copy, Scale, Add, and Triad double-precision kernels\n\n";
23 
#include <float.h>

#include <petscconf.h>
#include <petscsys.h>
#include <petsctime.h>
27 
28 #define N        10000000
29 #define NTIMES   10
30 
31 # ifndef MIN
32 # define MIN(x,y) ((x)<(y) ? (x) : (y))
33 # endif
34 # ifndef MAX
35 # define MAX(x,y) ((x)>(y) ? (x) : (y))
36 # endif
37 
38 const float  flt_eps = 1.192092896e-07f;
39 const double dbl_eps = 2.2204460492503131e-16;
40 
/* Fill a(i) = value for all i in [0, len) using a grid-stride loop,
   so any launch configuration covers the whole vector. */
__global__ void set_array(float *a,  float value, size_t len)
{
  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += blockDim.x * gridDim.x) a[i] = value;
}
49 
/* Double-precision variant of set_array: a(i) = value via a grid-stride loop. */
__global__ void set_array_double(double *a,  double value, size_t len)
{
  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += blockDim.x * gridDim.x) a[i] = value;
}
58 
/* STREAM COPY kernel: b(i) = a(i), grid-stride loop. */
__global__ void STREAM_Copy(float *a, float *b, size_t len)
{
  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += blockDim.x * gridDim.x) b[i] = a[i];
}
67 
/* STREAM COPY kernel, double precision: b(i) = a(i), grid-stride loop. */
__global__ void STREAM_Copy_double(double *a, double *b, size_t len)
{
  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += blockDim.x * gridDim.x) b[i] = a[i];
}
76 
/* One-element-per-thread COPY: b(i) = a(i).
   If the launch supplies fewer than len threads the kernel is a no-op
   (the whole warp returns before touching memory). */
__global__ void STREAM_Copy_Optimized(float *a, float *b, size_t len)
{
  const size_t i = threadIdx.x + blockIdx.x * blockDim.x;

  if (blockDim.x * gridDim.x >= len && i < len) b[i] = a[i];
}
87 
/* One-element-per-thread COPY, double precision: b(i) = a(i).
   No-op unless the thread index space covers all len elements. */
__global__ void STREAM_Copy_Optimized_double(double *a, double *b, size_t len)
{
  const size_t i = threadIdx.x + blockIdx.x * blockDim.x;

  if (blockDim.x * gridDim.x >= len && i < len) b[i] = a[i];
}
98 
/* STREAM SCALE kernel: b(i) = scale * a(i), grid-stride loop. */
__global__ void STREAM_Scale(float *a, float *b, float scale,  size_t len)
{
  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += blockDim.x * gridDim.x) b[i] = scale * a[i];
}
107 
/* STREAM SCALE kernel, double precision: b(i) = scale * a(i), grid-stride loop. */
__global__ void STREAM_Scale_double(double *a, double *b, double scale,  size_t len)
{
  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += blockDim.x * gridDim.x) b[i] = scale * a[i];
}
116 
/* One-element-per-thread SCALE: b(i) = scale * a(i).
   No-op unless the thread index space covers all len elements. */
__global__ void STREAM_Scale_Optimized(float *a, float *b, float scale,  size_t len)
{
  const size_t i = threadIdx.x + blockIdx.x * blockDim.x;

  if (blockDim.x * gridDim.x >= len && i < len) b[i] = scale * a[i];
}
127 
/* One-element-per-thread SCALE, double precision: b(i) = scale * a(i).
   No-op unless the thread index space covers all len elements. */
__global__ void STREAM_Scale_Optimized_double(double *a, double *b, double scale,  size_t len)
{
  const size_t i = threadIdx.x + blockIdx.x * blockDim.x;

  if (blockDim.x * gridDim.x >= len && i < len) b[i] = scale * a[i];
}
138 
/* STREAM ADD kernel: c(i) = a(i) + b(i), grid-stride loop. */
__global__ void STREAM_Add(float *a, float *b, float *c,  size_t len)
{
  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += blockDim.x * gridDim.x) c[i] = a[i] + b[i];
}
147 
/* STREAM ADD kernel, double precision: c(i) = a(i) + b(i), grid-stride loop. */
__global__ void STREAM_Add_double(double *a, double *b, double *c,  size_t len)
{
  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += blockDim.x * gridDim.x) c[i] = a[i] + b[i];
}
156 
/* One-element-per-thread ADD: c(i) = a(i) + b(i).
   No-op unless the thread index space covers all len elements. */
__global__ void STREAM_Add_Optimized(float *a, float *b, float *c,  size_t len)
{
  const size_t i = threadIdx.x + blockIdx.x * blockDim.x;

  if (blockDim.x * gridDim.x >= len && i < len) c[i] = a[i] + b[i];
}
167 
/* One-element-per-thread ADD, double precision: c(i) = a(i) + b(i).
   No-op unless the thread index space covers all len elements. */
__global__ void STREAM_Add_Optimized_double(double *a, double *b, double *c,  size_t len)
{
  const size_t i = threadIdx.x + blockIdx.x * blockDim.x;

  if (blockDim.x * gridDim.x >= len && i < len) c[i] = a[i] + b[i];
}
178 
/* STREAM TRIAD kernel: c(i) = a(i) + scalar * b(i), grid-stride loop. */
__global__ void STREAM_Triad(float *a, float *b, float *c, float scalar, size_t len)
{
  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += blockDim.x * gridDim.x) c[i] = a[i] + scalar*b[i];
}
187 
/* STREAM TRIAD kernel, double precision: c(i) = a(i) + scalar * b(i), grid-stride loop. */
__global__ void STREAM_Triad_double(double *a, double *b, double *c, double scalar, size_t len)
{
  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += blockDim.x * gridDim.x) c[i] = a[i] + scalar*b[i];
}
196 
/* One-element-per-thread TRIAD: c(i) = a(i) + scalar * b(i).
   No-op unless the thread index space covers all len elements. */
__global__ void STREAM_Triad_Optimized(float *a, float *b, float *c, float scalar, size_t len)
{
  const size_t i = threadIdx.x + blockIdx.x * blockDim.x;

  if (blockDim.x * gridDim.x >= len && i < len) c[i] = a[i] + scalar*b[i];
}
207 
/* One-element-per-thread TRIAD, double precision: c(i) = a(i) + scalar * b(i).
   No-op unless the thread index space covers all len elements. */
__global__ void STREAM_Triad_Optimized_double(double *a, double *b, double *c, double scalar, size_t len)
{
  const size_t i = threadIdx.x + blockIdx.x * blockDim.x;

  if (blockDim.x * gridDim.x >= len && i < len) c[i] = a[i] + scalar*b[i];
}
218 
219 /* Host side verification routines */
/*
  Host-side verification of the COPY kernel: expected b(i) == a(i).

  Returns true (i.e. "different") as soon as some element's relative error
  exceeds 2 ULPs, false when every element agrees. Uses the standard
  FLT_EPSILON instead of a hand-typed epsilon constant.
*/
bool STREAM_Copy_verify(float *a, float *b, size_t len)
{
  for (size_t idx = 0; idx < len; ++idx) {
    const float expectedResult     = a[idx];
    const float diffResultExpected = b[idx] - expectedResult;
    /* element-wise relative error measured in units of machine epsilon (ULPs) */
    const float relErrorULPS       = (fabsf(diffResultExpected)/fabsf(expectedResult))/FLT_EPSILON;
    if (relErrorULPS > 2.f) return true;
  }
  return false;
}
235 
/*
  Host-side verification of the double-precision COPY kernel: expected b(i) == a(i).

  Returns true as soon as some element's relative error exceeds 2 ULPs.
  Bug fix: the original used fabsf(), which rounds the double operands
  through float and destroys the double-precision error measurement;
  use fabs() and the standard DBL_EPSILON.
*/
bool STREAM_Copy_verify_double(double *a, double *b, size_t len)
{
  for (size_t idx = 0; idx < len; ++idx) {
    const double expectedResult     = a[idx];
    const double diffResultExpected = b[idx] - expectedResult;
    /* element-wise relative error measured in units of machine epsilon (ULPs) */
    const double relErrorULPS       = (fabs(diffResultExpected)/fabs(expectedResult))/DBL_EPSILON;
    if (relErrorULPS > 2.) return true;
  }
  return false;
}
251 
/*
  Host-side verification of the SCALE kernel: expected b(i) == scale * a(i).

  Returns true as soon as some element's relative error exceeds 2 ULPs.
  Uses the standard FLT_EPSILON instead of a hand-typed epsilon constant.
*/
bool STREAM_Scale_verify(float *a, float *b, float scale, size_t len)
{
  for (size_t idx = 0; idx < len; ++idx) {
    const float expectedResult     = scale*a[idx];
    const float diffResultExpected = b[idx] - expectedResult;
    /* element-wise relative error measured in units of machine epsilon (ULPs) */
    const float relErrorULPS       = (fabsf(diffResultExpected)/fabsf(expectedResult))/FLT_EPSILON;
    if (relErrorULPS > 2.f) return true;
  }
  return false;
}
267 
/*
  Host-side verification of the double-precision SCALE kernel:
  expected b(i) == scale * a(i).

  Returns true as soon as some element's relative error exceeds 2 ULPs.
  Bug fixes: the original divided by the FLOAT epsilon (flt_eps) instead of
  the double epsilon, making the check about 10^9 times too lenient, and it
  used fabsf() which rounds the double operands through float. Use fabs()
  and the standard DBL_EPSILON.
*/
bool STREAM_Scale_verify_double(double *a, double *b, double scale, size_t len)
{
  for (size_t idx = 0; idx < len; ++idx) {
    const double expectedResult     = scale*a[idx];
    const double diffResultExpected = b[idx] - expectedResult;
    /* element-wise relative error measured in units of machine epsilon (ULPs) */
    const double relErrorULPS       = (fabs(diffResultExpected)/fabs(expectedResult))/DBL_EPSILON;
    if (relErrorULPS > 2.) return true;
  }
  return false;
}
283 
/*
  Host-side verification of the ADD kernel: expected c(i) == a(i) + b(i).

  Returns true as soon as some element's relative error exceeds 2 ULPs.
  Uses the standard FLT_EPSILON instead of a hand-typed epsilon constant.
*/
bool STREAM_Add_verify(float *a, float *b, float *c, size_t len)
{
  for (size_t idx = 0; idx < len; ++idx) {
    const float expectedResult     = a[idx] + b[idx];
    const float diffResultExpected = c[idx] - expectedResult;
    /* element-wise relative error measured in units of machine epsilon (ULPs) */
    const float relErrorULPS       = (fabsf(diffResultExpected)/fabsf(expectedResult))/FLT_EPSILON;
    if (relErrorULPS > 2.f) return true;
  }
  return false;
}
299 
/*
  Host-side verification of the double-precision ADD kernel:
  expected c(i) == a(i) + b(i).

  Returns true as soon as some element's relative error exceeds 2 ULPs.
  Bug fixes: the original divided by the FLOAT epsilon (flt_eps) instead of
  the double epsilon, and used fabsf() on double operands. Use fabs() and
  the standard DBL_EPSILON.
*/
bool STREAM_Add_verify_double(double *a, double *b, double *c, size_t len)
{
  for (size_t idx = 0; idx < len; ++idx) {
    const double expectedResult     = a[idx] + b[idx];
    const double diffResultExpected = c[idx] - expectedResult;
    /* element-wise relative error measured in units of machine epsilon (ULPs) */
    const double relErrorULPS       = (fabs(diffResultExpected)/fabs(expectedResult))/DBL_EPSILON;
    if (relErrorULPS > 2.) return true;
  }
  return false;
}
315 
/*
  Host-side verification of the TRIAD kernel: expected c(i) == a(i) + scalar * b(i).

  Returns true as soon as some element's relative error exceeds 3 ULPs
  (one ULP more slack than the other checks, since TRIAD performs two
  rounded operations). Uses the standard FLT_EPSILON instead of a
  hand-typed epsilon constant.
*/
bool STREAM_Triad_verify(float *a, float *b, float *c, float scalar, size_t len)
{
  for (size_t idx = 0; idx < len; ++idx) {
    const float expectedResult     = a[idx] + scalar*b[idx];
    const float diffResultExpected = c[idx] - expectedResult;
    /* element-wise relative error measured in units of machine epsilon (ULPs) */
    const float relErrorULPS       = (fabsf(diffResultExpected)/fabsf(expectedResult))/FLT_EPSILON;
    if (relErrorULPS > 3.f) return true;
  }
  return false;
}
331 
/*
  Host-side verification of the double-precision TRIAD kernel:
  expected c(i) == a(i) + scalar * b(i).

  Returns true as soon as some element's relative error exceeds 3 ULPs.
  Bug fixes: the original divided by the FLOAT epsilon (flt_eps) instead of
  the double epsilon, and used fabsf() on double operands. Use fabs() and
  the standard DBL_EPSILON.
*/
bool STREAM_Triad_verify_double(double *a, double *b, double *c, double scalar, size_t len)
{
  for (size_t idx = 0; idx < len; ++idx) {
    const double expectedResult     = a[idx] + scalar*b[idx];
    const double diffResultExpected = c[idx] - expectedResult;
    /* element-wise relative error measured in units of machine epsilon (ULPs) */
    const double relErrorULPS       = (fabs(diffResultExpected)/fabs(expectedResult))/DBL_EPSILON;
    if (relErrorULPS > 3.) return true;
  }
  return false;
}
347 
348 /* forward declarations */
349 PetscErrorCode setupStream(PetscInt device, PetscBool runDouble, PetscBool cpuTiming);
350 PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming);
351 PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming);
352 PetscErrorCode printResultsReadable(float times[][NTIMES], size_t);
353 
/*
  Entry point: parses -device and -double options, runs the STREAM
  benchmark via setupStream(), and reports PASSES/FAILED.

  Bug fixes:
  - the result line was printed only inside `if (ierr)`, where the
    `(ierr == 0) ? "PASSES" : "FAILED"` ternary was always FAILED; success
    printed nothing. The setup result is now kept in its own variable and
    the verdict is always printed.
  - CHKERRQ was used on cudaSetDeviceFlags() before PetscInitialize();
    error macros are not usable until PETSc is initialized, so fail with a
    plain return instead.
  - PetscOptionsEnd() result is now checked like the other option calls.
*/
int main(int argc, char *argv[])
{
  PetscInt        device    = 0;
  PetscBool       runDouble = PETSC_TRUE;
  const PetscBool cpuTiming = PETSC_TRUE; // must be true
  PetscErrorCode  ierr, streamErr;

  /* must precede any CUDA context creation so blocking sync is in effect */
  ierr = cudaSetDeviceFlags(cudaDeviceBlockingSync);if (ierr) return ierr;

  ierr = PetscInitialize(&argc, &argv, 0, help);if (ierr) return ierr;

  ierr = PetscOptionsBegin(PETSC_COMM_WORLD, "", "STREAM Benchmark Options", "STREAM");CHKERRQ(ierr);
  ierr = PetscOptionsBoundedInt("-device", "Specify the CUDA device to be used", "STREAM", device, &device, NULL,0);CHKERRQ(ierr);
  ierr = PetscOptionsBool("-double",    "Also run double precision tests",   "STREAM", runDouble, &runDouble, NULL);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);

  /* run the benchmark and always report the outcome */
  streamErr = setupStream(device, runDouble, cpuTiming);
  ierr = PetscPrintf(PETSC_COMM_SELF, "\n[streamBenchmark] - results:\t%s\n\n", streamErr ? "FAILED" : "PASSES");CHKERRQ(ierr);

  ierr = PetscFinalize();
  return ierr;
}
377 
378 ///////////////////////////////////////////////////////////////////////////////
379 //Run the appropriate tests
380 ///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// setupStream - validates the requested CUDA device, picks a block size for
// the detected architecture, and dispatches to the float or double benchmark.
//
//   deviceNum - CUDA device index requested on the command line (falls back
//               to device 0 if out of range)
//   runDouble - run the double-precision benchmark instead of single
//   cpuTiming - forwarded to runStream/runStreamDouble as bDontUseGPUTiming
//
// Returns 0 on success, -1000 when no CUDA device exists, -1 when device
// properties cannot be queried or double precision is unsupported.
///////////////////////////////////////////////////////////////////////////////
PetscErrorCode setupStream(PetscInt deviceNum, PetscBool runDouble, PetscBool cpuTiming)
{
  PetscInt       iNumThreadsPerBlock = 128;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  // Check device
  {
    int deviceCount;

    cudaGetDeviceCount(&deviceCount);
    if (deviceCount == 0) {
      ierr = PetscPrintf(PETSC_COMM_SELF, "!!!!!No devices found!!!!!\n");CHKERRQ(ierr);
      return -1000;
    }

    if (deviceNum >= deviceCount || deviceNum < 0) {
      /* cast: PetscInt may be 64-bit while %d expects int */
      ierr      = PetscPrintf(PETSC_COMM_SELF, "\n!!!!!Invalid GPU number %d given hence default gpu %d will be used !!!!!\n", (int)deviceNum, 0);CHKERRQ(ierr);
      deviceNum = 0;
    }
  }

  cudaSetDevice(deviceNum);
  cudaDeviceProp deviceProp;
  if (cudaGetDeviceProperties(&deviceProp, deviceNum) != cudaSuccess) {
    /* bug fix: the original format string had a %d with no matching argument */
    ierr = PetscPrintf(PETSC_COMM_SELF, " Unable to determine device %d properties, exiting\n", (int)deviceNum);CHKERRQ(ierr);
    return -1;
  }

  /* double precision requires compute capability >= 1.3 */
  if (runDouble && deviceProp.major == 1 && deviceProp.minor < 3) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " Unable to run double-precision STREAM benchmark on a compute capability GPU less than 1.3\n");CHKERRQ(ierr);
    return -1;
  }
  if (deviceProp.major == 2 && deviceProp.minor == 1) iNumThreadsPerBlock = 192; /* GF104 architecture / 48 CUDA Cores per MP */
  else iNumThreadsPerBlock = 128; /* GF100 architecture / 32 CUDA Cores per MP */

  if (runDouble) {
    ierr = runStreamDouble(iNumThreadsPerBlock, cpuTiming);CHKERRQ(ierr);
  } else {
    ierr = runStream(iNumThreadsPerBlock, cpuTiming);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
425 
426 ///////////////////////////////////////////////////////////////////////////
427 // runStream
428 ///////////////////////////////////////////////////////////////////////////
/*
  runStream - times the single-precision STREAM kernels and verifies their output.

  For each of NTIMES repetitions it launches the eight kernel variants and
  records CPU wall-clock time (milliseconds) in times[row][k]:
    row 0: STREAM_Copy            row 1: STREAM_Copy_Optimized
    row 2: STREAM_Scale           row 3: STREAM_Scale_Optimized
    row 4: STREAM_Add             row 5: STREAM_Add_Optimized
    row 6: STREAM_Triad           row 7: STREAM_Triad_Optimized
  After timing, each kernel is re-run once and its result is checked on the
  host; a mismatch prints a message and exits the process.

  iNumThreadsPerBlock - threads per block for every launch
  bDontUseGPUTiming   - when true, CPU timing fills times[][]; this is the
                        only path that writes times[][] (the GPU-event timing
                        code is commented out), so the caller must pass true.

  NOTE(review): kernel launches and cudaStreamSynchronize(NULL) results are
  not error-checked here; launch failures would surface only through the
  later cudaMemcpy calls.
*/
PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming)
{
  float          *d_a, *d_b, *d_c;
  int            k;
  float          times[8][NTIMES]; /* one row per kernel variant, one column per repetition */
  float          scalar;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Allocate memory on device */
  ierr = cudaMalloc((void**)&d_a, sizeof(float)*N);CHKERRQ(ierr);
  ierr = cudaMalloc((void**)&d_b, sizeof(float)*N);CHKERRQ(ierr);
  ierr = cudaMalloc((void**)&d_c, sizeof(float)*N);CHKERRQ(ierr);

  /* Compute execution configuration */

  dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */
  dim3 dimGrid(N/dimBlock.x); /* (N/dimBlock.x,1,1) */
  if (N % dimBlock.x != 0) dimGrid.x+=1; /* round up so the grid covers all N elements */

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  /* --- MAIN LOOP --- repeat test cases NTIMES times --- */
  PetscLogDouble cpuTimer = 0.0;

  scalar=3.0f;
  for (k = 0; k < NTIMES; ++k) {
    /* row 0: COPY */
    PetscTimeSubtract(&cpuTimer);
    STREAM_Copy<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    cudaStreamSynchronize(NULL);
    ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRQ(ierr);
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[0][k] = cpuTimer*1.e3; // millisec

    /* row 1: COPY, one element per thread */
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Copy_Optimized<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    cudaStreamSynchronize(NULL);
    ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[1][k] = cpuTimer*1.e3;

    /* row 2: SCALE */
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Scale<<<dimGrid,dimBlock>>>(d_b, d_c, scalar,  N);
    cudaStreamSynchronize(NULL);
    ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[2][k] = cpuTimer*1.e3;

    /* row 3: SCALE, one element per thread */
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Scale_Optimized<<<dimGrid,dimBlock>>>(d_b, d_c, scalar,  N);
    cudaStreamSynchronize(NULL);
    ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[3][k] = cpuTimer*1.e3;

    /* row 4: ADD (GPU-event timing left commented out) */
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    // ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);
    STREAM_Add<<<dimGrid,dimBlock>>>(d_a, d_b, d_c,  N);
    cudaStreamSynchronize(NULL);
    ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRQ(ierr);    // ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
    // ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[4][k] = cpuTimer*1.e3;
    else {
      // ierr = cudaEventElapsedTime(&times[4][k], start, stop);CHKERRQ(ierr);
    }

    /* row 5: ADD, one element per thread */
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Add_Optimized<<<dimGrid,dimBlock>>>(d_a, d_b, d_c,  N);
    cudaStreamSynchronize(NULL);
    ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[5][k] = cpuTimer*1.e3;

    /* row 6: TRIAD */
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Triad<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar,  N);
    cudaStreamSynchronize(NULL);
    ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[6][k] = cpuTimer*1.e3;

    /* row 7: TRIAD, one element per thread */
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Triad_Optimized<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar,  N);
    cudaStreamSynchronize(NULL);
    ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[7][k] = cpuTimer*1.e3;
  }

  if (1) { /* verify kernels */
  float *h_a, *h_b, *h_c;
  bool  errorSTREAMkernel = true;

  if ((h_a = (float*)calloc(N, sizeof(float))) == (float*)NULL) {
    printf("Unable to allocate array h_a, exiting ...\n");
    exit(1);
  }
  if ((h_b = (float*)calloc(N, sizeof(float))) == (float*)NULL) {
    printf("Unable to allocate array h_b, exiting ...\n");
    exit(1);
  }

  if ((h_c = (float*)calloc(N, sizeof(float))) == (float*)NULL) {
    printf("Unalbe to allocate array h_c, exiting ...\n");
    exit(1);
  }

  /*
   * perform kernel, copy device memory into host memory and verify each
   * device kernel output
   */

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  STREAM_Copy<<<dimGrid,dimBlock>>>(d_a, d_c, N);
  ierr              = cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr              = cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n");CHKERRQ(ierr);
    exit(-2000);
  }

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  STREAM_Copy_Optimized<<<dimGrid,dimBlock>>>(d_a, d_c, N);
  ierr              = cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr              = cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n");CHKERRQ(ierr);
    exit(-3000);
  }

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  STREAM_Scale<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
  ierr              = cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr              = cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Scale_verify(h_b, h_c, scalar, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n");CHKERRQ(ierr);
    exit(-4000);
  }

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  STREAM_Add<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
  ierr              = cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr              = cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr              = cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Add_verify(h_a, h_b, h_c, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n");CHKERRQ(ierr);
    exit(-5000);
  }

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  /* TRIAD writes its result into d_a: a = b + scalar*c */
  STREAM_Triad<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
  ierr              = cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr              = cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr              = cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Triad_verify(h_b, h_c, h_a, scalar, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n");CHKERRQ(ierr);
    exit(-6000);
  }

  free(h_a);
  free(h_b);
  free(h_c);
  }
  /* continue from here */
  printResultsReadable(times, sizeof(float));

  /* Free memory on device */
  ierr = cudaFree(d_a);CHKERRQ(ierr);
  ierr = cudaFree(d_b);CHKERRQ(ierr);
  ierr = cudaFree(d_c);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}
644 
/*
  runStreamDouble - double-precision twin of runStream: times the eight
  STREAM kernel variants and verifies their output on the host.

  times[row][k] layout (milliseconds, CPU wall clock):
    row 0: Copy    row 1: Copy_Optimized    row 2: Scale  row 3: Scale_Optimized
    row 4: Add     row 5: Add_Optimized     row 6: Triad  row 7: Triad_Optimized

  iNumThreadsPerBlock - threads per block for every launch
  bDontUseGPUTiming   - when true, CPU timing fills times[][]; this is the
                        only path that writes times[][], so the caller must
                        pass true.

  Consistency fixes vs the float version (runStream):
  - the Copy and Copy_Optimized sections previously called PetscTimeAdd()
    only inside `if (bDontUseGPUTiming)`, leaving the timer un-finalized in
    the other branch; the timer is now always finalized like every other
    section.
  - the Scale verification section re-initializes d_a as well, matching the
    float path's full re-initialization.
*/
PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming)
{
  double         *d_a, *d_b, *d_c;
  int            k;
  float          times[8][NTIMES]; /* one row per kernel variant, one column per repetition */
  double         scalar;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Allocate memory on device */
  ierr = cudaMalloc((void**)&d_a, sizeof(double)*N);CHKERRQ(ierr);
  ierr = cudaMalloc((void**)&d_b, sizeof(double)*N);CHKERRQ(ierr);
  ierr = cudaMalloc((void**)&d_c, sizeof(double)*N);CHKERRQ(ierr);

  /* Compute execution configuration */

  dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */
  dim3 dimGrid(N/dimBlock.x); /* (N/dimBlock.x,1,1) */
  if (N % dimBlock.x != 0) dimGrid.x+=1; /* round up so the grid covers all N elements */

  /* Initialize memory on the device */
  set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
  set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
  set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

  /* --- MAIN LOOP --- repeat test cases NTIMES times --- */
  PetscLogDouble cpuTimer = 0.0;

  scalar=3.0;
  for (k = 0; k < NTIMES; ++k) {
    /* row 0: COPY */
    PetscTimeSubtract(&cpuTimer);
    STREAM_Copy_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    cudaStreamSynchronize(NULL);
    ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[0][k] = cpuTimer*1.e3; // millisec

    /* row 1: COPY, one element per thread */
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Copy_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    cudaStreamSynchronize(NULL);
    ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[1][k] = cpuTimer*1.e3;

    /* row 2: SCALE */
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Scale_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar,  N);
    cudaStreamSynchronize(NULL);
    ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[2][k] = cpuTimer*1.e3;

    /* row 3: SCALE, one element per thread */
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Scale_Optimized_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar,  N);
    cudaStreamSynchronize(NULL);
    ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[3][k] = cpuTimer*1.e3;

    /* row 4: ADD */
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Add_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c,  N);
    cudaStreamSynchronize(NULL);
    ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[4][k] = cpuTimer*1.e3;

    /* row 5: ADD, one element per thread */
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Add_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c,  N);
    cudaStreamSynchronize(NULL);
    ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[5][k] = cpuTimer*1.e3;

    /* row 6: TRIAD */
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Triad_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar,  N);
    cudaStreamSynchronize(NULL);
    ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[6][k] = cpuTimer*1.e3;

    /* row 7: TRIAD, one element per thread */
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Triad_Optimized_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar,  N);
    cudaStreamSynchronize(NULL);
    ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRQ(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[7][k] = cpuTimer*1.e3;
  }

  if (1) { /* verify kernels */
  double *h_a, *h_b, *h_c;
  bool   errorSTREAMkernel = true;

  if ((h_a = (double*)calloc(N, sizeof(double))) == (double*)NULL) {
    printf("Unable to allocate array h_a, exiting ...\n");
    exit(1);
  }
  if ((h_b = (double*)calloc(N, sizeof(double))) == (double*)NULL) {
    printf("Unable to allocate array h_b, exiting ...\n");
    exit(1);
  }

  if ((h_c = (double*)calloc(N, sizeof(double))) == (double*)NULL) {
    printf("Unalbe to allocate array h_c, exiting ...\n");
    exit(1);
  }

  /*
   * perform kernel, copy device memory into host memory and verify each
   * device kernel output
   */

  /* Initialize memory on the device */
  set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
  set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
  set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

  STREAM_Copy_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
  ierr              = cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr              = cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n");CHKERRQ(ierr);
    exit(-2000);
  }

  /* Initialize memory on the device */
  set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
  set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
  set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

  STREAM_Copy_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
  ierr              = cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr              = cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n");CHKERRQ(ierr);
    exit(-3000);
  }

  /* Initialize memory on the device (d_a included, matching the float path) */
  set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
  set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
  set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

  STREAM_Scale_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
  ierr              = cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr              = cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Scale_verify_double(h_b, h_c, scalar, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n");CHKERRQ(ierr);
    exit(-4000);
  }

  /* Initialize memory on the device */
  set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
  set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
  set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

  STREAM_Add_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
  ierr              = cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr              = cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr              = cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Add_verify_double(h_a, h_b, h_c, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n");CHKERRQ(ierr);
    exit(-5000);
  }

  /* Initialize memory on the device */
  set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
  set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
  set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

  /* TRIAD writes its result into d_a: a = b + scalar*c */
  STREAM_Triad_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
  ierr              = cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr              = cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  ierr              = cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
  errorSTREAMkernel = STREAM_Triad_verify_double(h_b, h_c, h_a, scalar, N);
  if (errorSTREAMkernel) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n");CHKERRQ(ierr);
    exit(-6000);
  }

  free(h_a);
  free(h_b);
  free(h_c);
  }
  /* continue from here */
  printResultsReadable(times,sizeof(double));

  /* Free memory on device */
  ierr = cudaFree(d_a);CHKERRQ(ierr);
  ierr = cudaFree(d_b);CHKERRQ(ierr);
  ierr = cudaFree(d_c);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}
859 
860 ///////////////////////////////////////////////////////////////////////////
861 //Print Results to Screen and File
862 ///////////////////////////////////////////////////////////////////////////
/*
  printResultsReadable - Summarize kernel timings and report the memory bandwidth.

  Input Parameters:
.  times - times[j][k] is the elapsed time in MILLISECONDS of kernel j on
           iteration k; kernel index order matches the measurement loops
           (Copy, Copy Opt., Scale, Scale Opt., Add, Add Opt., Triad, Triad Opt.)
.  bsize - size in bytes of one array element (sizeof(float) or sizeof(double))

  Rank 0 prints the Triad-Optimized bandwidth (MB/s) summed over all MPI ranks.
  A single-process run writes its rate to the file "flops"; a multi-process run
  reads that file back to report the parallel speedup.
*/
PetscErrorCode printResultsReadable(float times[][NTIMES], const size_t bsize)
{
  PetscErrorCode ierr;
  PetscInt       j, k;
  float          avgtime[8]          = {0., 0., 0., 0., 0., 0., 0., 0.};
  float          maxtime[8]          = {0., 0., 0., 0., 0., 0., 0., 0.};
  float          mintime[8]          = {1e30,1e30,1e30,1e30,1e30,1e30,1e30,1e30};
  /* Bytes moved per kernel invocation: Copy/Scale touch two arrays (one read,
     one write), Add/Triad touch three (two reads, one write). */
  const float    bytes_per_kernel[8] = {
    2. * bsize * N,
    2. * bsize * N,
    2. * bsize * N,
    2. * bsize * N,
    3. * bsize * N,
    3. * bsize * N,
    3. * bsize * N,
    3. * bsize * N
  };
  double         rate,irate;
  int            rank,size;

  PetscFunctionBegin;
  ierr = MPI_Comm_rank(MPI_COMM_WORLD,&rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(MPI_COMM_WORLD,&size);CHKERRQ(ierr);
  /* --- SUMMARY ---
     As in the reference STREAM benchmark (stream.c), the first iteration is a
     warm-up and is excluded from the statistics; starting at k = 1 makes the
     sum consistent with the NTIMES-1 divisor used for the average below
     (previously the loop started at k = 0, over-counting one iteration). */
  for (k = 1; k < NTIMES; ++k) {
    for (j = 0; j < 8; ++j) {
      avgtime[j] = avgtime[j] + (1.e-03f * times[j][k]); /* millisec --> sec */
      mintime[j] = MIN(mintime[j], (1.e-03f * times[j][k]));
      maxtime[j] = MAX(maxtime[j], (1.e-03f * times[j][k]));
    }
  }
  for (j = 0; j < 8; ++j) {
    avgtime[j] = avgtime[j]/(float)(NTIMES-1);
  }
  /* Report only the Triad-Optimized kernel (index 7) using its best time;
     per-rank rates are summed onto rank 0. */
  j     = 7;
  irate = 1.0E-06 * bytes_per_kernel[j]/mintime[j];
  ierr  = MPI_Reduce(&irate,&rate,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);CHKERRQ(ierr);
  if (!rank) {
    FILE *fd;
    if (size == 1) {
      /* Uniprocess run: record the rate so later parallel runs can report speedup. */
      printf("%d %11.4f   Rate (MB/s)\n",size, rate);
      fd = fopen("flops","w");
      if (fd) {
        fprintf(fd,"%g\n",rate);
        fclose(fd);
      }
    } else {
      /* Parallel run: compare against the uniprocess rate stored in "flops".
         Guard against a missing/corrupt file, which previously dereferenced a
         NULL FILE* or divided by an uninitialized value. */
      double prate;
      fd = fopen("flops","r");
      if (fd && fscanf(fd,"%lg",&prate) == 1 && prate > 0.0) {
        printf("%d %11.4f   Rate (MB/s) %g \n", size, rate, rate/prate);
      } else {
        printf("%d %11.4f   Rate (MB/s)\n", size, rate);
      }
      if (fd) fclose(fd);
    }
  }
  PetscFunctionReturn(0);
}
918