xref: /petsc/src/benchmarks/streams/CUDAVersion.cu (revision b122ec5aa1bd4469eb4e0673542fb7de3f411254)
1 /*
2   STREAM benchmark implementation in CUDA.
3 
4     COPY:       a(i) = b(i)
5     SCALE:      a(i) = q*b(i)
6     SUM:        a(i) = b(i) + c(i)
7     TRIAD:      a(i) = b(i) + q*c(i)
8 
9   It measures the memory system on the device.
10   The implementation is in double precision with a single option.
11 
12   Code based on the code developed by John D. McCalpin
13   http://www.cs.virginia.edu/stream/FTP/Code/stream.c
14 
15   Written by: Massimiliano Fatica, NVIDIA Corporation
16   Modified by: Douglas Enright (dpephd-nvidia@yahoo.com), 1 December 2010
17   Extensive Revisions, 4 December 2010
18   Modified for PETSc by: Matthew G. Knepley 14 Aug 2011
19 
20   User interface motivated by bandwidthTest NVIDIA SDK example.
21 */
22 static char help[] = "Double-Precision STREAM Benchmark implementation in CUDA\n Performs Copy, Scale, Add, and Triad double-precision kernels\n\n";
23 
24 #include <petscconf.h>
25 #include <petscsys.h>
26 #include <petsctime.h>
27 #include <petscdevice.h>
28 
29 #define N        10000000
30 #define NTIMES   10
31 
32 # ifndef MIN
33 # define MIN(x,y) ((x)<(y) ? (x) : (y))
34 # endif
35 # ifndef MAX
36 # define MAX(x,y) ((x)>(y) ? (x) : (y))
37 # endif
38 
39 const float  flt_eps = 1.192092896e-07f;
40 const double dbl_eps = 2.2204460492503131e-16;
41 
__global__ void set_array(float *a,  float value, size_t len)
{
  /* Fill a[0..len) with value using a grid-stride loop, so any grid size is valid. */
  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += blockDim.x * gridDim.x) a[i] = value;
}
50 
__global__ void set_array_double(double *a,  double value, size_t len)
{
  /* Fill a[0..len) with value using a grid-stride loop, so any grid size is valid. */
  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += blockDim.x * gridDim.x) a[i] = value;
}
59 
__global__ void STREAM_Copy(float *a, float *b, size_t len)
{
  /* STREAM Copy kernel: b(i) = a(i), grid-stride loop. */
  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += blockDim.x * gridDim.x) b[i] = a[i];
}
68 
__global__ void STREAM_Copy_double(double *a, double *b, size_t len)
{
  /* STREAM Copy kernel (double): b(i) = a(i), grid-stride loop. */
  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += blockDim.x * gridDim.x) b[i] = a[i];
}
77 
__global__ void STREAM_Copy_Optimized(float *a, float *b, size_t len)
{
  /*
   * One element per thread: b(i) = a(i).
   * The whole grid must cover the vector, else return without doing anything
   * (tail elements would otherwise be silently skipped).
   * The thread-count product is widened to size_t before comparing with len:
   * the original 32-bit unsigned product could wrap for very large grids and
   * make this guard pass (or fail) spuriously.
   */
  if ((size_t)blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  if (idx < len) b[idx] = a[idx];
}
88 
__global__ void STREAM_Copy_Optimized_double(double *a, double *b, size_t len)
{
  /*
   * One element per thread: b(i) = a(i).
   * The whole grid must cover the vector, else return without doing anything.
   * The thread-count product is widened to size_t before comparing with len:
   * the original 32-bit unsigned product could wrap for very large grids.
   */
  if ((size_t)blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  if (idx < len) b[idx] = a[idx];
}
99 
__global__ void STREAM_Scale(float *a, float *b, float scale,  size_t len)
{
  /* STREAM Scale kernel: b(i) = scale * a(i), grid-stride loop. */
  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += blockDim.x * gridDim.x) b[i] = scale * a[i];
}
108 
__global__ void STREAM_Scale_double(double *a, double *b, double scale,  size_t len)
{
  /* STREAM Scale kernel (double): b(i) = scale * a(i), grid-stride loop. */
  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += blockDim.x * gridDim.x) b[i] = scale * a[i];
}
117 
__global__ void STREAM_Scale_Optimized(float *a, float *b, float scale,  size_t len)
{
  /*
   * One element per thread: b(i) = scale * a(i).
   * The whole grid must cover the vector, else return without doing anything.
   * The thread-count product is widened to size_t before comparing with len:
   * the original 32-bit unsigned product could wrap for very large grids.
   */
  if ((size_t)blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  if (idx < len) b[idx] = scale * a[idx];
}
128 
__global__ void STREAM_Scale_Optimized_double(double *a, double *b, double scale,  size_t len)
{
  /*
   * One element per thread: b(i) = scale * a(i).
   * The whole grid must cover the vector, else return without doing anything.
   * The thread-count product is widened to size_t before comparing with len:
   * the original 32-bit unsigned product could wrap for very large grids.
   */
  if ((size_t)blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  if (idx < len) b[idx] = scale * a[idx];
}
139 
__global__ void STREAM_Add(float *a, float *b, float *c,  size_t len)
{
  /* STREAM Add kernel: c(i) = a(i) + b(i), grid-stride loop. */
  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += blockDim.x * gridDim.x) c[i] = a[i] + b[i];
}
148 
__global__ void STREAM_Add_double(double *a, double *b, double *c,  size_t len)
{
  /* STREAM Add kernel (double): c(i) = a(i) + b(i), grid-stride loop. */
  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += blockDim.x * gridDim.x) c[i] = a[i] + b[i];
}
157 
__global__ void STREAM_Add_Optimized(float *a, float *b, float *c,  size_t len)
{
  /*
   * One element per thread: c(i) = a(i) + b(i).
   * The whole grid must cover the vector, else return without doing anything.
   * The thread-count product is widened to size_t before comparing with len:
   * the original 32-bit unsigned product could wrap for very large grids.
   */
  if ((size_t)blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  if (idx < len) c[idx] = a[idx] + b[idx];
}
168 
__global__ void STREAM_Add_Optimized_double(double *a, double *b, double *c,  size_t len)
{
  /*
   * One element per thread: c(i) = a(i) + b(i).
   * The whole grid must cover the vector, else return without doing anything.
   * The thread-count product is widened to size_t before comparing with len:
   * the original 32-bit unsigned product could wrap for very large grids.
   */
  if ((size_t)blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  if (idx < len) c[idx] = a[idx] + b[idx];
}
179 
__global__ void STREAM_Triad(float *a, float *b, float *c, float scalar, size_t len)
{
  /* STREAM Triad kernel: c(i) = a(i) + scalar * b(i), grid-stride loop. */
  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += blockDim.x * gridDim.x) c[i] = a[i] + scalar * b[i];
}
188 
__global__ void STREAM_Triad_double(double *a, double *b, double *c, double scalar, size_t len)
{
  /* STREAM Triad kernel (double): c(i) = a(i) + scalar * b(i), grid-stride loop. */
  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += blockDim.x * gridDim.x) c[i] = a[i] + scalar * b[i];
}
197 
__global__ void STREAM_Triad_Optimized(float *a, float *b, float *c, float scalar, size_t len)
{
  /*
   * One element per thread: c(i) = a(i) + scalar * b(i).
   * The whole grid must cover the vector, else return without doing anything.
   * The thread-count product is widened to size_t before comparing with len:
   * the original 32-bit unsigned product could wrap for very large grids.
   */
  if ((size_t)blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  if (idx < len) c[idx] = a[idx] + scalar * b[idx];
}
208 
__global__ void STREAM_Triad_Optimized_double(double *a, double *b, double *c, double scalar, size_t len)
{
  /*
   * One element per thread: c(i) = a(i) + scalar * b(i).
   * The whole grid must cover the vector, else return without doing anything.
   * The thread-count product is widened to size_t before comparing with len:
   * the original 32-bit unsigned product could wrap for very large grids.
   */
  if ((size_t)blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  if (idx < len) c[idx] = a[idx] + scalar * b[idx];
}
219 
220 /* Host side verification routines */
/* Host check of the Copy kernel: returns true on the first element of b whose
   relative error vs. a (measured in float ulps) exceeds 2. */
bool STREAM_Copy_verify(float *a, float *b, size_t len)
{
  for (size_t i = 0; i < len; ++i) {
    const float expected    = a[i];
    const float relErrorULPS = (fabsf(b[i] - expected)/fabsf(expected))/flt_eps;
    if (relErrorULPS > 2.f) return true; /* mismatch found */
  }
  return false;
}
236 
/* Host check of the double Copy kernel: returns true on the first element of b
   whose relative error vs. a (measured in double ulps) exceeds 2. */
bool STREAM_Copy_verify_double(double *a, double *b, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult     = a[idx];
    double diffResultExpected = (b[idx] - expectedResult);
    /* fabs, not fabsf: fabsf would round the operands to float and destroy the
       double-precision resolution this check is supposed to have */
    double relErrorULPS       = (fabs(diffResultExpected)/fabs(expectedResult))/dbl_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.);
  }

  return bDifferent;
}
252 
/* Host check of the Scale kernel: returns true on the first element of b whose
   relative error vs. scale*a (measured in float ulps) exceeds 2. */
bool STREAM_Scale_verify(float *a, float *b, float scale, size_t len)
{
  for (size_t i = 0; i < len; ++i) {
    const float expected    = scale*a[i];
    const float relErrorULPS = (fabsf(b[i] - expected)/fabsf(expected))/flt_eps;
    if (relErrorULPS > 2.f) return true; /* mismatch found */
  }
  return false;
}
268 
/* Host check of the double Scale kernel: returns true on the first element of b
   whose relative error vs. scale*a (measured in double ulps) exceeds 2. */
bool STREAM_Scale_verify_double(double *a, double *b, double scale, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult     = scale*a[idx];
    double diffResultExpected = (b[idx] - expectedResult);
    /* fabs + dbl_eps, not fabsf + flt_eps: the original used the float versions,
       which rounds operands to float and scales the error by the wrong epsilon */
    double relErrorULPS       = (fabs(diffResultExpected)/fabs(expectedResult))/dbl_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.);
  }

  return bDifferent;
}
284 
/* Host check of the Add kernel: returns true on the first element of c whose
   relative error vs. a+b (measured in float ulps) exceeds 2. */
bool STREAM_Add_verify(float *a, float *b, float *c, size_t len)
{
  for (size_t i = 0; i < len; ++i) {
    const float expected    = a[i] + b[i];
    const float relErrorULPS = (fabsf(c[i] - expected)/fabsf(expected))/flt_eps;
    if (relErrorULPS > 2.f) return true; /* mismatch found */
  }
  return false;
}
300 
/* Host check of the double Add kernel: returns true on the first element of c
   whose relative error vs. a+b (measured in double ulps) exceeds 2. */
bool STREAM_Add_verify_double(double *a, double *b, double *c, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult     = a[idx] + b[idx];
    double diffResultExpected = (c[idx] - expectedResult);
    /* fabs + dbl_eps, not fabsf + flt_eps: the original used the float versions,
       which rounds operands to float and scales the error by the wrong epsilon */
    double relErrorULPS       = (fabs(diffResultExpected)/fabs(expectedResult))/dbl_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.);
  }

  return bDifferent;
}
316 
/* Host check of the Triad kernel: returns true on the first element of c whose
   relative error vs. a+scalar*b (measured in float ulps) exceeds 3
   (one extra ulp of slack for the fused multiply-add). */
bool STREAM_Triad_verify(float *a, float *b, float *c, float scalar, size_t len)
{
  for (size_t i = 0; i < len; ++i) {
    const float expected    = a[i] + scalar*b[i];
    const float relErrorULPS = (fabsf(c[i] - expected)/fabsf(expected))/flt_eps;
    if (relErrorULPS > 3.f) return true; /* mismatch found */
  }
  return false;
}
332 
/* Host check of the double Triad kernel: returns true on the first element of c
   whose relative error vs. a+scalar*b (measured in double ulps) exceeds 3. */
bool STREAM_Triad_verify_double(double *a, double *b, double *c, double scalar, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult     = a[idx] + scalar*b[idx];
    double diffResultExpected = (c[idx] - expectedResult);
    /* fabs + dbl_eps, not fabsf + flt_eps: the original used the float versions,
       which rounds operands to float and scales the error by the wrong epsilon */
    double relErrorULPS       = (fabs(diffResultExpected)/fabs(expectedResult))/dbl_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 3.);
  }

  return bDifferent;
}
348 
349 /* forward declarations */
350 PetscErrorCode setupStream(PetscInt device, PetscBool runDouble, PetscBool cpuTiming);
351 PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming);
352 PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming);
353 PetscErrorCode printResultsReadable(float times[][NTIMES], size_t);
354 
int main(int argc, char *argv[])
{
  PetscInt        device    = 0;
  PetscBool       runDouble = PETSC_TRUE;
  const PetscBool cpuTiming = PETSC_TRUE; // must be true: the CUDA-event timing path is disabled
  PetscErrorCode  ierr;

  /* Blocking sync so the CPU timers measure the full kernel duration;
     must be set before any CUDA context is created. */
  CHKERRCUDA(cudaSetDeviceFlags(cudaDeviceBlockingSync));

  CHKERRQ(PetscInitialize(&argc, &argv, 0, help));

  ierr = PetscOptionsBegin(PETSC_COMM_WORLD, "", "STREAM Benchmark Options", "STREAM");CHKERRQ(ierr);
  CHKERRQ(PetscOptionsBoundedInt("-device", "Specify the CUDA device to be used", "STREAM", device, &device, NULL,0));
  CHKERRQ(PetscOptionsBool("-double",    "Also run double precision tests",   "STREAM", runDouble, &runDouble, NULL));
  ierr = PetscOptionsEnd();CHKERRQ(ierr);

  /* Report the outcome unconditionally: the original only printed inside
     "if (ierr)", which made the PASSES branch unreachable and produced no
     output at all on success. */
  ierr = setupStream(device, runDouble, cpuTiming);
  CHKERRQ(PetscPrintf(PETSC_COMM_SELF, "\n[streamBenchmark] - results:\t%s\n\n", (ierr == 0) ? "PASSES" : "FAILED"));
  CHKERRQ(PetscFinalize());
  return 0;
}
378 
379 ///////////////////////////////////////////////////////////////////////////////
380 //Run the appropriate tests
381 ///////////////////////////////////////////////////////////////////////////////
/* Validate the requested CUDA device, choose a block size appropriate for its
   architecture, and dispatch the float and/or double benchmark runs. */
PetscErrorCode setupStream(PetscInt deviceNum, PetscBool runDouble, PetscBool cpuTiming)
{
  PetscInt iNumThreadsPerBlock = 128;

  PetscFunctionBegin;
  // Check device
  {
    int deviceCount;

    CHKERRCUDA(cudaGetDeviceCount(&deviceCount)); /* was unchecked */
    if (deviceCount == 0) {
      CHKERRQ(PetscPrintf(PETSC_COMM_SELF, "!!!!!No devices found!!!!!\n"));
      return -1000;
    }

    if (deviceNum >= deviceCount || deviceNum < 0) {
      CHKERRQ(PetscPrintf(PETSC_COMM_SELF, "\n!!!!!Invalid GPU number %d given hence default gpu %d will be used !!!!!\n", (int)deviceNum, 0));
      deviceNum = 0;
    }
  }

  CHKERRCUDA(cudaSetDevice(deviceNum)); /* was unchecked */
  cudaDeviceProp deviceProp;
  if (cudaGetDeviceProperties(&deviceProp, deviceNum) != cudaSuccess) {
    /* the original format string had %d but no argument (read garbage) */
    CHKERRQ(PetscPrintf(PETSC_COMM_SELF, " Unable to determine device %d properties, exiting\n", (int)deviceNum));
    return -1;
  }

  if (runDouble && deviceProp.major == 1 && deviceProp.minor < 3) {
    CHKERRQ(PetscPrintf(PETSC_COMM_SELF, " Unable to run double-precision STREAM benchmark on a compute capability GPU less than 1.3\n"));
    return -1;
  }
  if (deviceProp.major == 2 && deviceProp.minor == 1) iNumThreadsPerBlock = 192; /* GF104 architecture / 48 CUDA Cores per MP */
  else iNumThreadsPerBlock = 128; /* GF100 architecture / 32 CUDA Cores per MP */

  /* cpuTiming is forwarded as bDontUseGPUTiming (CPU timers vs. CUDA events) */
  if (runDouble) {
    CHKERRQ(runStreamDouble(iNumThreadsPerBlock, cpuTiming));
  } else {
    CHKERRQ(runStream(iNumThreadsPerBlock, cpuTiming));
  }
  PetscFunctionReturn(0);
}
426 
427 ///////////////////////////////////////////////////////////////////////////
428 // runStream
429 ///////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////
// runStream - single-precision benchmark: times the 8 kernels NTIMES times,
// verifies each kernel against a host reference, then prints the results.
///////////////////////////////////////////////////////////////////////////
PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming)
{
  float *d_a, *d_b, *d_c;   /* device vectors */
  int    k;
  float  times[8][NTIMES];  /* per-kernel, per-repetition elapsed times (ms) */
  float  scalar;

  PetscFunctionBegin;
  /* Allocate memory on device */
  CHKERRCUDA(cudaMalloc((void**)&d_a, sizeof(float)*N));
  CHKERRCUDA(cudaMalloc((void**)&d_b, sizeof(float)*N));
  CHKERRCUDA(cudaMalloc((void**)&d_c, sizeof(float)*N));

  /* Compute execution configuration: 1-D grid with enough blocks to cover N */
  dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */
  dim3 dimGrid(N/dimBlock.x);         /* (N/dimBlock.x,1,1) */
  if (N % dimBlock.x != 0) dimGrid.x += 1; /* round up for the tail */

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  /* --- MAIN LOOP --- repeat test cases NTIMES times ---
     Per-kernel pattern: reset CPU timer, launch, synchronize the stream
     (launches are asynchronous), barrier so all ranks time the same interval,
     then record the elapsed time in milliseconds.
     NOTE: the original had a live cudaEventRecord(stop, 0) in the Add section
     referencing an undeclared 'stop'; the CUDA-event timing path is disabled,
     so bDontUseGPUTiming must be PETSC_TRUE. */
  PetscLogDouble cpuTimer = 0.0;

  scalar = 3.0f;
  for (k = 0; k < NTIMES; ++k) {
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Copy<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    CHKERRCUDA(cudaStreamSynchronize(NULL));
    CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[0][k] = cpuTimer*1.e3; // millisec

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Copy_Optimized<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    CHKERRCUDA(cudaStreamSynchronize(NULL));
    CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[1][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Scale<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
    CHKERRCUDA(cudaStreamSynchronize(NULL));
    CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[2][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Scale_Optimized<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
    CHKERRCUDA(cudaStreamSynchronize(NULL));
    CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[3][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Add<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
    CHKERRCUDA(cudaStreamSynchronize(NULL));
    CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[4][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Add_Optimized<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
    CHKERRCUDA(cudaStreamSynchronize(NULL));
    CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[5][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Triad<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
    CHKERRCUDA(cudaStreamSynchronize(NULL));
    CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[6][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Triad_Optimized<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
    CHKERRCUDA(cudaStreamSynchronize(NULL));
    CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[7][k] = cpuTimer*1.e3;
  }

  { /* verify kernels: re-run each on fresh data, copy device memory back to
       the host and compare against a host-side reference */
    float *h_a, *h_b, *h_c;
    bool  errorSTREAMkernel = true;

    if ((h_a = (float*)calloc(N, sizeof(float))) == (float*)NULL) {
      printf("Unable to allocate array h_a, exiting ...\n");
      exit(1);
    }
    if ((h_b = (float*)calloc(N, sizeof(float))) == (float*)NULL) {
      printf("Unable to allocate array h_b, exiting ...\n");
      exit(1);
    }
    if ((h_c = (float*)calloc(N, sizeof(float))) == (float*)NULL) {
      printf("Unable to allocate array h_c, exiting ...\n"); /* typo "Unalbe" fixed */
      exit(1);
    }

    /* Initialize memory on the device */
    set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

    STREAM_Copy<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    CHKERRCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost));
    CHKERRCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
    if (errorSTREAMkernel) {
      CHKERRQ(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n"));
      exit(-2000);
    }

    /* Initialize memory on the device */
    set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

    STREAM_Copy_Optimized<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    CHKERRCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost));
    CHKERRCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
    if (errorSTREAMkernel) {
      CHKERRQ(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n"));
      exit(-3000);
    }

    /* Initialize memory on the device */
    set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

    STREAM_Scale<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
    CHKERRCUDA(cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost));
    CHKERRCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Scale_verify(h_b, h_c, scalar, N);
    if (errorSTREAMkernel) {
      CHKERRQ(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n"));
      exit(-4000);
    }

    /* Initialize memory on the device */
    set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

    STREAM_Add<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
    CHKERRCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost));
    CHKERRCUDA(cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost));
    CHKERRCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Add_verify(h_a, h_b, h_c, N);
    if (errorSTREAMkernel) {
      CHKERRQ(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n"));
      exit(-5000);
    }

    /* Initialize memory on the device */
    set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

    STREAM_Triad<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
    CHKERRCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost));
    CHKERRCUDA(cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost));
    CHKERRCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Triad_verify(h_b, h_c, h_a, scalar, N);
    if (errorSTREAMkernel) {
      CHKERRQ(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n"));
      exit(-6000);
    }

    free(h_a);
    free(h_b);
    free(h_c);
  }

  printResultsReadable(times, sizeof(float));

  /* Free memory on device */
  CHKERRCUDA(cudaFree(d_a));
  CHKERRCUDA(cudaFree(d_b));
  CHKERRCUDA(cudaFree(d_c));

  PetscFunctionReturn(0);
}
646 
/* Double-precision benchmark: times the 8 kernels NTIMES times, verifies each
   against a host reference, then prints the results. Mirrors runStream. */
PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming)
{
  double *d_a, *d_b, *d_c;  /* device vectors */
  int     k;
  float   times[8][NTIMES]; /* per-kernel, per-repetition elapsed times (ms) */
  double  scalar;

  PetscFunctionBegin;
  /* Allocate memory on device */
  CHKERRCUDA(cudaMalloc((void**)&d_a, sizeof(double)*N));
  CHKERRCUDA(cudaMalloc((void**)&d_b, sizeof(double)*N));
  CHKERRCUDA(cudaMalloc((void**)&d_c, sizeof(double)*N));

  /* Compute execution configuration: 1-D grid with enough blocks to cover N */
  dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */
  dim3 dimGrid(N/dimBlock.x);         /* (N/dimBlock.x,1,1) */
  if (N % dimBlock.x != 0) dimGrid.x += 1; /* round up for the tail */

  /* Initialize memory on the device */
  set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
  set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
  set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

  /* --- MAIN LOOP --- repeat test cases NTIMES times ---
     The timing pattern is identical for all 8 kernels (the original applied
     PetscTimeAdd conditionally for the first two only, which left the timer
     inconsistent between sections). */
  PetscLogDouble cpuTimer = 0.0;

  scalar = 3.0;
  for (k = 0; k < NTIMES; ++k) {
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Copy_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    CHKERRCUDA(cudaStreamSynchronize(NULL));
    CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[0][k] = cpuTimer*1.e3; // millisec

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Copy_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    CHKERRCUDA(cudaStreamSynchronize(NULL));
    CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[1][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Scale_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
    CHKERRCUDA(cudaStreamSynchronize(NULL));
    CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[2][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Scale_Optimized_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
    CHKERRCUDA(cudaStreamSynchronize(NULL));
    CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[3][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Add_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
    CHKERRCUDA(cudaStreamSynchronize(NULL));
    CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[4][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Add_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
    CHKERRCUDA(cudaStreamSynchronize(NULL));
    CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[5][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Triad_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
    CHKERRCUDA(cudaStreamSynchronize(NULL));
    CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[6][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Triad_Optimized_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
    CHKERRCUDA(cudaStreamSynchronize(NULL));
    CHKERRMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[7][k] = cpuTimer*1.e3;
  }

  { /* verify kernels: re-run each on fresh data, copy device memory back to
       the host and compare against a host-side reference */
    double *h_a, *h_b, *h_c;
    bool   errorSTREAMkernel = true;

    if ((h_a = (double*)calloc(N, sizeof(double))) == (double*)NULL) {
      printf("Unable to allocate array h_a, exiting ...\n");
      exit(1);
    }
    if ((h_b = (double*)calloc(N, sizeof(double))) == (double*)NULL) {
      printf("Unable to allocate array h_b, exiting ...\n");
      exit(1);
    }
    if ((h_c = (double*)calloc(N, sizeof(double))) == (double*)NULL) {
      printf("Unable to allocate array h_c, exiting ...\n"); /* typo "Unalbe" fixed */
      exit(1);
    }

    /* Initialize memory on the device */
    set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
    set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

    STREAM_Copy_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    CHKERRCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost));
    CHKERRCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N);
    if (errorSTREAMkernel) {
      CHKERRQ(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n"));
      exit(-2000);
    }

    /* Initialize memory on the device */
    set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
    set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

    STREAM_Copy_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    CHKERRCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost));
    CHKERRCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N);
    if (errorSTREAMkernel) {
      CHKERRQ(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n"));
      exit(-3000);
    }

    /* Initialize memory on the device (d_a re-init restored for consistency
       with the float path; Scale itself only touches d_b and d_c) */
    set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
    set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

    STREAM_Scale_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
    CHKERRCUDA(cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost));
    CHKERRCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Scale_verify_double(h_b, h_c, scalar, N);
    if (errorSTREAMkernel) {
      CHKERRQ(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n"));
      exit(-4000);
    }

    /* Initialize memory on the device */
    set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
    set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

    STREAM_Add_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
    CHKERRCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost));
    CHKERRCUDA(cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost));
    CHKERRCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Add_verify_double(h_a, h_b, h_c, N);
    if (errorSTREAMkernel) {
      CHKERRQ(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n"));
      exit(-5000);
    }

    /* Initialize memory on the device */
    set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
    set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

    STREAM_Triad_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
    CHKERRCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost));
    CHKERRCUDA(cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost));
    CHKERRCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Triad_verify_double(h_b, h_c, h_a, scalar, N);
    if (errorSTREAMkernel) {
      CHKERRQ(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n"));
      exit(-6000);
    }

    free(h_a);
    free(h_b);
    free(h_c);
  }

  printResultsReadable(times, sizeof(double));

  /* Free memory on device */
  CHKERRCUDA(cudaFree(d_a));
  CHKERRCUDA(cudaFree(d_b));
  CHKERRCUDA(cudaFree(d_c));

  PetscFunctionReturn(0);
}
861 
862 ///////////////////////////////////////////////////////////////////////////
863 //Print Results to Screen and File
864 ///////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////
// printResultsReadable - summarize timings and report the Triad-Optimized
// bandwidth, summed over MPI ranks. bsize is the element size in bytes
// (sizeof(float) or sizeof(double)) used to compute bytes moved per kernel.
///////////////////////////////////////////////////////////////////////////
PetscErrorCode printResultsReadable(float times[][NTIMES], const size_t bsize)
{
  PetscInt    j, k;
  float       avgtime[8]          = {0., 0., 0., 0., 0., 0., 0., 0.};
  float       maxtime[8]          = {0., 0., 0., 0., 0., 0., 0., 0.};
  float       mintime[8]          = {1e30,1e30,1e30,1e30,1e30,1e30,1e30,1e30};
  /* Kernels 0-3 (Copy/Scale variants) move 2 vectors; 4-7 (Add/Triad) move 3. */
  const float bytes_per_kernel[8] = {
    2. * bsize * N,
    2. * bsize * N,
    2. * bsize * N,
    2. * bsize * N,
    3. * bsize * N,
    3. * bsize * N,
    3. * bsize * N,
    3. * bsize * N
  };
  double      rate,irate;
  int         rank,size;

  PetscFunctionBegin;
  CHKERRMPI(MPI_Comm_rank(MPI_COMM_WORLD,&rank));
  CHKERRMPI(MPI_Comm_size(MPI_COMM_WORLD,&size));
  /* --- SUMMARY --- skip the first (warm-up) iteration, per STREAM convention;
     the original summed all NTIMES samples yet divided by NTIMES-1 */
  for (k = 1; k < NTIMES; ++k) {
    for (j = 0; j < 8; ++j) {
      avgtime[j] = avgtime[j] + (1.e-03f * times[j][k]); // millisec --> sec
      mintime[j] = MIN(mintime[j], (1.e-03f * times[j][k]));
      maxtime[j] = MAX(maxtime[j], (1.e-03f * times[j][k]));
    }
  }
  for (j = 0; j < 8; ++j) avgtime[j] = avgtime[j]/(float)(NTIMES-1);

  /* Report kernel 7 (Triad Optimized): best-case bandwidth in MB/s, summed
     across ranks (was an unchecked MPI_Reduce) */
  j     = 7;
  irate = 1.0E-06 * bytes_per_kernel[j]/mintime[j];
  CHKERRMPI(MPI_Reduce(&irate,&rate,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD));
  if (rank == 0) {
    FILE *fd;
    if (size == 1) {
      printf("%d %11.4f   Rate (MB/s)\n",size, rate);
      /* record the single-rank rate so multi-rank runs can report speedup */
      fd = fopen("flops","w");
      if (fd) {
        fprintf(fd,"%g\n",rate);
        fclose(fd);
      }
    } else {
      double prate;
      fd = fopen("flops","r");
      /* the original dereferenced fd and used prate without checking either */
      if (fd && fscanf(fd,"%lg",&prate) == 1) {
        printf("%d %11.4f   Rate (MB/s) %g \n", size, rate, rate/prate);
      } else {
        printf("%d %11.4f   Rate (MB/s)\n", size, rate);
      }
      if (fd) fclose(fd);
    }
  }

  PetscFunctionReturn(0);
}
920