xref: /petsc/src/benchmarks/streams/CUDAVersion.cu (revision d52a580b706c59ca78066c1e38754e45b6b56e2b)
1 /*
2   STREAM benchmark implementation in CUDA.
3 
4     COPY:       a(i) = b(i)
5     SCALE:      a(i) = q*b(i)
6     SUM:        a(i) = b(i) + c(i)
7     TRIAD:      a(i) = b(i) + q*c(i)
8 
9   It measures the memory system on the device.
10   The implementation is in double precision with a single option.
11 
12   Code based on the code developed by John D. McCalpin
13   http://www.cs.virginia.edu/stream/FTP/Code/stream.c
14 
15   Written by: Massimiliano Fatica, NVIDIA Corporation
16   Modified by: Douglas Enright (dpephd-nvidia@yahoo.com), 1 December 2010
17   Extensive Revisions, 4 December 2010
18   Modified for PETSc by: Matthew G. Knepley 14 Aug 2011
19 
20   User interface motivated by bandwidthTest NVIDIA SDK example.
21 */
22 static char help[] = "Double-Precision STREAM Benchmark implementation in CUDA\n Performs Copy, Scale, Add, and Triad double-precision kernels\n\n";
23 
24 #include <petscconf.h>
25 #include <petscsys.h>
26 #include <petsctime.h>
27 #include <petscdevice_cuda.h>
28 
29 #define N      10000000
30 #define NTIMES 10
31 
32 #if !defined(MIN)
33   #define MIN(x, y) ((x) < (y) ? (x) : (y))
34 #endif
35 #if !defined(MAX)
36   #define MAX(x, y) ((x) > (y) ? (x) : (y))
37 #endif
38 
39 const float  flt_eps = 1.192092896e-07f;
40 const double dbl_eps = 2.2204460492503131e-16;
41 
/*
  Fill kernel (single precision): a[i] = value for i in [0, len).
  Grid-stride loop, so any launch configuration covers the vector.
  Index arithmetic is widened to size_t before multiplying so the
  block/grid product cannot overflow unsigned int on huge launches.
*/
__global__ void set_array(float *a, float value, size_t len)
{
  const size_t stride = (size_t)blockDim.x * gridDim.x;

  for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; idx < len; idx += stride) a[idx] = value;
}
50 
/*
  Fill kernel (double precision): a[i] = value for i in [0, len).
  Grid-stride loop; index arithmetic widened to size_t so the
  block/grid product cannot overflow unsigned int on huge launches.
*/
__global__ void set_array_double(double *a, double value, size_t len)
{
  const size_t stride = (size_t)blockDim.x * gridDim.x;

  for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; idx < len; idx += stride) a[idx] = value;
}
59 
/*
  STREAM Copy kernel (single precision): b[i] = a[i] for i in [0, len).
  Grid-stride loop; index arithmetic widened to size_t so the
  block/grid product cannot overflow unsigned int on huge launches.
*/
__global__ void STREAM_Copy(float *a, float *b, size_t len)
{
  const size_t stride = (size_t)blockDim.x * gridDim.x;

  for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; idx < len; idx += stride) b[idx] = a[idx];
}
68 
/*
  STREAM Copy kernel (double precision): b[i] = a[i] for i in [0, len).
  Grid-stride loop; index arithmetic widened to size_t so the
  block/grid product cannot overflow unsigned int on huge launches.
*/
__global__ void STREAM_Copy_double(double *a, double *b, size_t len)
{
  const size_t stride = (size_t)blockDim.x * gridDim.x;

  for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; idx < len; idx += stride) b[idx] = a[idx];
}
77 
/*
  STREAM Copy, one-element-per-thread variant (single precision).
  If the thread index space cannot cover the vector the kernel is a
  no-op (every thread returns immediately). The coverage test and the
  index are computed in size_t so the unsigned-int products cannot
  overflow on huge launches.
*/
__global__ void STREAM_Copy_Optimized(float *a, float *b, size_t len)
{
  if ((size_t)blockDim.x * gridDim.x < len) return;
  const size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < len) b[idx] = a[idx];
}
88 
/*
  STREAM Copy, one-element-per-thread variant (double precision).
  No-op when the thread index space cannot cover the vector; size_t
  arithmetic avoids unsigned-int overflow on huge launches.
*/
__global__ void STREAM_Copy_Optimized_double(double *a, double *b, size_t len)
{
  if ((size_t)blockDim.x * gridDim.x < len) return;
  const size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < len) b[idx] = a[idx];
}
99 
/*
  STREAM Scale kernel (single precision): b[i] = scale * a[i].
  Grid-stride loop; index arithmetic widened to size_t so the
  block/grid product cannot overflow unsigned int on huge launches.
*/
__global__ void STREAM_Scale(float *a, float *b, float scale, size_t len)
{
  const size_t stride = (size_t)blockDim.x * gridDim.x;

  for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; idx < len; idx += stride) b[idx] = scale * a[idx];
}
108 
/*
  STREAM Scale kernel (double precision): b[i] = scale * a[i].
  Grid-stride loop; index arithmetic widened to size_t so the
  block/grid product cannot overflow unsigned int on huge launches.
*/
__global__ void STREAM_Scale_double(double *a, double *b, double scale, size_t len)
{
  const size_t stride = (size_t)blockDim.x * gridDim.x;

  for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; idx < len; idx += stride) b[idx] = scale * a[idx];
}
117 
/*
  STREAM Scale, one-element-per-thread variant (single precision):
  b[i] = scale * a[i]. No-op when the thread index space cannot cover
  the vector; size_t arithmetic avoids unsigned-int overflow.
*/
__global__ void STREAM_Scale_Optimized(float *a, float *b, float scale, size_t len)
{
  if ((size_t)blockDim.x * gridDim.x < len) return;
  const size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < len) b[idx] = scale * a[idx];
}
128 
/*
  STREAM Scale, one-element-per-thread variant (double precision):
  b[i] = scale * a[i]. No-op when the thread index space cannot cover
  the vector; size_t arithmetic avoids unsigned-int overflow.
*/
__global__ void STREAM_Scale_Optimized_double(double *a, double *b, double scale, size_t len)
{
  if ((size_t)blockDim.x * gridDim.x < len) return;
  const size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < len) b[idx] = scale * a[idx];
}
139 
/*
  STREAM Add kernel (single precision): c[i] = a[i] + b[i].
  Grid-stride loop; index arithmetic widened to size_t so the
  block/grid product cannot overflow unsigned int on huge launches.
*/
__global__ void STREAM_Add(float *a, float *b, float *c, size_t len)
{
  const size_t stride = (size_t)blockDim.x * gridDim.x;

  for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; idx < len; idx += stride) c[idx] = a[idx] + b[idx];
}
148 
/*
  STREAM Add kernel (double precision): c[i] = a[i] + b[i].
  Grid-stride loop; index arithmetic widened to size_t so the
  block/grid product cannot overflow unsigned int on huge launches.
*/
__global__ void STREAM_Add_double(double *a, double *b, double *c, size_t len)
{
  const size_t stride = (size_t)blockDim.x * gridDim.x;

  for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; idx < len; idx += stride) c[idx] = a[idx] + b[idx];
}
157 
/*
  STREAM Add, one-element-per-thread variant (single precision):
  c[i] = a[i] + b[i]. No-op when the thread index space cannot cover
  the vector; size_t arithmetic avoids unsigned-int overflow.
*/
__global__ void STREAM_Add_Optimized(float *a, float *b, float *c, size_t len)
{
  if ((size_t)blockDim.x * gridDim.x < len) return;
  const size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < len) c[idx] = a[idx] + b[idx];
}
168 
/*
  STREAM Add, one-element-per-thread variant (double precision):
  c[i] = a[i] + b[i]. No-op when the thread index space cannot cover
  the vector; size_t arithmetic avoids unsigned-int overflow.
*/
__global__ void STREAM_Add_Optimized_double(double *a, double *b, double *c, size_t len)
{
  if ((size_t)blockDim.x * gridDim.x < len) return;
  const size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < len) c[idx] = a[idx] + b[idx];
}
179 
/*
  STREAM Triad kernel (single precision): c[i] = a[i] + scalar * b[i].
  Grid-stride loop; index arithmetic widened to size_t so the
  block/grid product cannot overflow unsigned int on huge launches.
*/
__global__ void STREAM_Triad(float *a, float *b, float *c, float scalar, size_t len)
{
  const size_t stride = (size_t)blockDim.x * gridDim.x;

  for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; idx < len; idx += stride) c[idx] = a[idx] + scalar * b[idx];
}
188 
/*
  STREAM Triad kernel (double precision): c[i] = a[i] + scalar * b[i].
  Grid-stride loop; index arithmetic widened to size_t so the
  block/grid product cannot overflow unsigned int on huge launches.
*/
__global__ void STREAM_Triad_double(double *a, double *b, double *c, double scalar, size_t len)
{
  const size_t stride = (size_t)blockDim.x * gridDim.x;

  for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; idx < len; idx += stride) c[idx] = a[idx] + scalar * b[idx];
}
197 
/*
  STREAM Triad, one-element-per-thread variant (single precision):
  c[i] = a[i] + scalar * b[i]. No-op when the thread index space cannot
  cover the vector; size_t arithmetic avoids unsigned-int overflow.
*/
__global__ void STREAM_Triad_Optimized(float *a, float *b, float *c, float scalar, size_t len)
{
  if ((size_t)blockDim.x * gridDim.x < len) return;
  const size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < len) c[idx] = a[idx] + scalar * b[idx];
}
208 
/*
  STREAM Triad, one-element-per-thread variant (double precision):
  c[i] = a[i] + scalar * b[i]. No-op when the thread index space cannot
  cover the vector; size_t arithmetic avoids unsigned-int overflow.
*/
__global__ void STREAM_Triad_Optimized_double(double *a, double *b, double *c, double scalar, size_t len)
{
  if ((size_t)blockDim.x * gridDim.x < len) return;
  const size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < len) c[idx] = a[idx] + scalar * b[idx];
}
219 
220 /* Host side verification routines */
/*
  Host-side verification of STREAM Copy (single precision).
  Returns true if any element of b differs from a by more than 2 ULPs
  in relative terms, false when every element agrees.
*/
bool STREAM_Copy_verify(float *a, float *b, size_t len)
{
  for (size_t idx = 0; idx < len; ++idx) {
    const float expectedResult = a[idx];
    /* element-wise relative error in units of float machine epsilon */
    const float relErrorULPS = (fabsf(b[idx] - expectedResult) / fabsf(expectedResult)) / flt_eps;
    if (relErrorULPS > 2.f) return true;
  }
  return false;
}
236 
/*
  Host-side verification of STREAM Copy (double precision).
  Returns true if any element of b differs from a by more than 2 ULPs
  in relative terms. Uses fabs() — the original called fabsf(), which
  silently truncated the double operands to float.
*/
bool STREAM_Copy_verify_double(double *a, double *b, size_t len)
{
  for (size_t idx = 0; idx < len; ++idx) {
    const double expectedResult = a[idx];
    /* element-wise relative error in units of double machine epsilon */
    const double relErrorULPS = (fabs(b[idx] - expectedResult) / fabs(expectedResult)) / dbl_eps;
    if (relErrorULPS > 2.) return true;
  }
  return false;
}
252 
/*
  Host-side verification of STREAM Scale (single precision).
  Returns true if any element of b differs from scale * a[i] by more
  than 2 ULPs in relative terms, false when every element agrees.
*/
bool STREAM_Scale_verify(float *a, float *b, float scale, size_t len)
{
  for (size_t idx = 0; idx < len; ++idx) {
    const float expectedResult = scale * a[idx];
    /* element-wise relative error in units of float machine epsilon */
    const float relErrorULPS = (fabsf(b[idx] - expectedResult) / fabsf(expectedResult)) / flt_eps;
    if (relErrorULPS > 2.f) return true;
  }
  return false;
}
268 
/*
  Host-side verification of STREAM Scale (double precision).
  Returns true if any element of b differs from scale * a[i] by more
  than 2 ULPs in relative terms. Fixes two bugs in the original: it
  called fabsf() on doubles (truncation to float) and normalized by
  flt_eps instead of dbl_eps, making the tolerance ~1e9x too lenient.
*/
bool STREAM_Scale_verify_double(double *a, double *b, double scale, size_t len)
{
  for (size_t idx = 0; idx < len; ++idx) {
    const double expectedResult = scale * a[idx];
    /* element-wise relative error in units of double machine epsilon */
    const double relErrorULPS = (fabs(b[idx] - expectedResult) / fabs(expectedResult)) / dbl_eps;
    if (relErrorULPS > 2.) return true;
  }
  return false;
}
284 
/*
  Host-side verification of STREAM Add (single precision).
  Returns true if any element of c differs from a[i] + b[i] by more
  than 2 ULPs in relative terms, false when every element agrees.
*/
bool STREAM_Add_verify(float *a, float *b, float *c, size_t len)
{
  for (size_t idx = 0; idx < len; ++idx) {
    const float expectedResult = a[idx] + b[idx];
    /* element-wise relative error in units of float machine epsilon */
    const float relErrorULPS = (fabsf(c[idx] - expectedResult) / fabsf(expectedResult)) / flt_eps;
    if (relErrorULPS > 2.f) return true;
  }
  return false;
}
300 
/*
  Host-side verification of STREAM Add (double precision).
  Returns true if any element of c differs from a[i] + b[i] by more
  than 2 ULPs in relative terms. Fixes two bugs in the original: it
  called fabsf() on doubles (truncation to float) and normalized by
  flt_eps instead of dbl_eps, making the tolerance ~1e9x too lenient.
*/
bool STREAM_Add_verify_double(double *a, double *b, double *c, size_t len)
{
  for (size_t idx = 0; idx < len; ++idx) {
    const double expectedResult = a[idx] + b[idx];
    /* element-wise relative error in units of double machine epsilon */
    const double relErrorULPS = (fabs(c[idx] - expectedResult) / fabs(expectedResult)) / dbl_eps;
    if (relErrorULPS > 2.) return true;
  }
  return false;
}
316 
/*
  Host-side verification of STREAM Triad (single precision).
  Returns true if any element of c differs from a[i] + scalar * b[i]
  by more than 3 ULPs in relative terms (Triad allows one extra ULP
  for the fused multiply-add), false when every element agrees.
*/
bool STREAM_Triad_verify(float *a, float *b, float *c, float scalar, size_t len)
{
  for (size_t idx = 0; idx < len; ++idx) {
    const float expectedResult = a[idx] + scalar * b[idx];
    /* element-wise relative error in units of float machine epsilon */
    const float relErrorULPS = (fabsf(c[idx] - expectedResult) / fabsf(expectedResult)) / flt_eps;
    if (relErrorULPS > 3.f) return true;
  }
  return false;
}
332 
/*
  Host-side verification of STREAM Triad (double precision).
  Returns true if any element of c differs from a[i] + scalar * b[i]
  by more than 3 ULPs in relative terms. Fixes two bugs in the
  original: it called fabsf() on doubles (truncation to float) and
  normalized by flt_eps instead of dbl_eps, making the tolerance
  ~1e9x too lenient.
*/
bool STREAM_Triad_verify_double(double *a, double *b, double *c, double scalar, size_t len)
{
  for (size_t idx = 0; idx < len; ++idx) {
    const double expectedResult = a[idx] + scalar * b[idx];
    /* element-wise relative error in units of double machine epsilon */
    const double relErrorULPS = (fabs(c[idx] - expectedResult) / fabs(expectedResult)) / dbl_eps;
    if (relErrorULPS > 3.) return true;
  }
  return false;
}
348 
349 /* forward declarations */
350 PetscErrorCode setupStream(PetscInt device, PetscBool runDouble, PetscBool cpuTiming);
351 PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming);
352 PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming);
353 PetscErrorCode printResultsReadable(float times[][NTIMES], size_t);
354 
/*
  Benchmark driver: parses options, runs the STREAM kernels on the
  selected CUDA device via setupStream(), and reports PASS/FAIL.

  Options:
    -device <n> : CUDA device to use (default 0)
    -double     : run the double-precision tests (default true)
*/
int main(int argc, char *argv[])
{
  PetscInt        device    = 0;
  PetscBool       runDouble = PETSC_TRUE;
  const PetscBool cpuTiming = PETSC_TRUE; // must be true
  PetscErrorCode  ierr;

  /* set before PetscInitialize() so the flag applies to the CUDA context */
  PetscCallCUDA(cudaSetDeviceFlags(cudaDeviceBlockingSync));

  PetscCall(PetscInitialize(&argc, &argv, 0, help));

  PetscOptionsBegin(PETSC_COMM_WORLD, "", "STREAM Benchmark Options", "STREAM");
  PetscCall(PetscOptionsBoundedInt("-device", "Specify the CUDA device to be used", "STREAM", device, &device, NULL, 0));
  PetscCall(PetscOptionsBool("-double", "Also run double precision tests", "STREAM", runDouble, &runDouble, NULL));
  PetscOptionsEnd();

  ierr = setupStream(device, runDouble, cpuTiming);
  /* Report the outcome unconditionally: the original guarded this print
     with "if (ierr)", which made the "PASSES" branch unreachable. */
  PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n[streamBenchmark] - results:\t%s\n\n", (ierr == 0) ? "PASSES" : "FAILED"));
  PetscCall(PetscFinalize());
  return 0;
}
376 
377 ///////////////////////////////////////////////////////////////////////////////
378 //Run the appropriate tests
379 ///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
//Run the appropriate tests
///////////////////////////////////////////////////////////////////////////////
/*
  Validate the requested CUDA device, pick a block size appropriate for
  the architecture, and dispatch the single- or double-precision run.

  deviceNum - requested CUDA device (falls back to 0 if out of range)
  runDouble - run the double-precision benchmark instead of single
  cpuTiming - forwarded to runStream*/
PetscErrorCode setupStream(PetscInt deviceNum, PetscBool runDouble, PetscBool cpuTiming)
{
  PetscInt       iNumThreadsPerBlock = 128;
  int            deviceCount;
  cudaDeviceProp deviceProp;

  PetscFunctionBegin;
  /* Check device availability (original left this call unchecked) */
  PetscCallCUDA(cudaGetDeviceCount(&deviceCount));
  if (deviceCount == 0) {
    PetscCall(PetscPrintf(PETSC_COMM_SELF, "!!!!!No devices found!!!!!\n"));
    PetscFunctionReturn(PETSC_ERR_SUP);
  }
  if (deviceNum >= deviceCount || deviceNum < 0) {
    /* cast: PetscInt may be 64-bit, %d expects int */
    PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n!!!!!Invalid GPU number %d given hence default gpu %d will be used !!!!!\n", (int)deviceNum, 0));
    deviceNum = 0;
  }

  PetscCallCUDA(cudaSetDevice((int)deviceNum));
  if (cudaGetDeviceProperties(&deviceProp, (int)deviceNum) != cudaSuccess) {
    /* the original passed no argument for the %d specifier (undefined behavior) */
    PetscCall(PetscPrintf(PETSC_COMM_SELF, " Unable to determine device %d properties, exiting\n", (int)deviceNum));
    PetscFunctionReturn(PETSC_ERR_SUP);
  }

  if (runDouble && deviceProp.major == 1 && deviceProp.minor < 3) {
    PetscCall(PetscPrintf(PETSC_COMM_SELF, " Unable to run double-precision STREAM benchmark on a compute capability GPU less than 1.3\n"));
    PetscFunctionReturn(PETSC_ERR_SUP);
  }
  if (deviceProp.major == 2 && deviceProp.minor == 1) iNumThreadsPerBlock = 192; /* GF104 architecture / 48 CUDA Cores per MP */
  else iNumThreadsPerBlock = 128;                                                /* GF100 architecture / 32 CUDA Cores per MP */

  if (runDouble) PetscCall(runStreamDouble(iNumThreadsPerBlock, cpuTiming));
  else PetscCall(runStream(iNumThreadsPerBlock, cpuTiming));
  PetscFunctionReturn(PETSC_SUCCESS);
}
420 
421 ///////////////////////////////////////////////////////////////////////////
422 // runStream
423 ///////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////
// runStream
///////////////////////////////////////////////////////////////////////////
/*
  Run the single-precision STREAM benchmark: time each of the eight
  kernels (Copy, Copy-Opt, Scale, Scale-Opt, Add, Add-Opt, Triad,
  Triad-Opt) NTIMES times, verify each kernel once against a host
  reference, then print a bandwidth summary.

  iNumThreadsPerBlock - threads per block for every kernel launch
  bDontUseGPUTiming   - PETSC_TRUE => record CPU wall-clock timings;
                        this is the only timing mode recorded here
                        (GPU-event timing remains disabled)
*/
PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming)
{
  float *d_a, *d_b, *d_c;
  int    k;
  float  times[8][NTIMES];
  float  scalar;

  PetscFunctionBegin;
  /* Allocate memory on device */
  PetscCallCUDA(cudaMalloc((void **)&d_a, sizeof(float) * N));
  PetscCallCUDA(cudaMalloc((void **)&d_b, sizeof(float) * N));
  PetscCallCUDA(cudaMalloc((void **)&d_c, sizeof(float) * N));

  /* Compute execution configuration: 1-D grid, rounded up to cover N */
  dim3 dimBlock(iNumThreadsPerBlock);
  dim3 dimGrid(N / dimBlock.x);
  if (N % dimBlock.x != 0) dimGrid.x += 1;

  /* Initialize memory on the device */
  set_array<<<dimGrid, dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid, dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid, dimBlock>>>(d_c, .5f, N);

  /* --- MAIN LOOP --- repeat test cases NTIMES times ---
     Each case: reset timer, launch, synchronize the device and all MPI
     ranks, accumulate elapsed wall time, record it in milliseconds.
     The stray cudaEventRecord(stop, 0) on an undeclared event that the
     original had in the Add case has been removed. */
  PetscLogDouble cpuTimer = 0.0;

  scalar = 3.0f;
  for (k = 0; k < NTIMES; ++k) {
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Copy<<<dimGrid, dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[0][k] = cpuTimer * 1.e3; // millisec

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Copy_Optimized<<<dimGrid, dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[1][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Scale<<<dimGrid, dimBlock>>>(d_b, d_c, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[2][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Scale_Optimized<<<dimGrid, dimBlock>>>(d_b, d_c, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[3][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Add<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[4][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Add_Optimized<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[5][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Triad<<<dimGrid, dimBlock>>>(d_b, d_c, d_a, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[6][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Triad_Optimized<<<dimGrid, dimBlock>>>(d_b, d_c, d_a, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[7][k] = cpuTimer * 1.e3;
  }

  if (1) { /* verify kernels: rerun each once and compare on the host */
    float *h_a, *h_b, *h_c;
    bool   errorSTREAMkernel = true;

    if ((h_a = (float *)calloc(N, sizeof(float))) == (float *)NULL) {
      printf("Unable to allocate array h_a, exiting ...\n");
      exit(1);
    }
    if ((h_b = (float *)calloc(N, sizeof(float))) == (float *)NULL) {
      printf("Unable to allocate array h_b, exiting ...\n");
      exit(1);
    }
    if ((h_c = (float *)calloc(N, sizeof(float))) == (float *)NULL) {
      printf("Unable to allocate array h_c, exiting ...\n");
      exit(1);
    }

    /*
     * perform kernel, copy device memory into host memory and verify each
     * device kernel output (cudaMemcpy is blocking, so no explicit sync
     * is needed between launch and copy)
     */
    set_array<<<dimGrid, dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid, dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid, dimBlock>>>(d_c, .5f, N);

    STREAM_Copy<<<dimGrid, dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n"));
      exit(-2000);
    }

    set_array<<<dimGrid, dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid, dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid, dimBlock>>>(d_c, .5f, N);

    STREAM_Copy_Optimized<<<dimGrid, dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n"));
      exit(-3000);
    }

    set_array<<<dimGrid, dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid, dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid, dimBlock>>>(d_c, .5f, N);

    STREAM_Scale<<<dimGrid, dimBlock>>>(d_b, d_c, scalar, N);
    PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Scale_verify(h_b, h_c, scalar, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n"));
      exit(-4000);
    }

    set_array<<<dimGrid, dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid, dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid, dimBlock>>>(d_c, .5f, N);

    STREAM_Add<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Add_verify(h_a, h_b, h_c, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n"));
      exit(-5000);
    }

    set_array<<<dimGrid, dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid, dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid, dimBlock>>>(d_c, .5f, N);

    STREAM_Triad<<<dimGrid, dimBlock>>>(d_b, d_c, d_a, scalar, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Triad_verify(h_b, h_c, h_a, scalar, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n"));
      exit(-6000);
    }

    free(h_a);
    free(h_b);
    free(h_c);
  }

  printResultsReadable(times, sizeof(float));

  /* Free memory on device */
  PetscCallCUDA(cudaFree(d_a));
  PetscCallCUDA(cudaFree(d_b));
  PetscCallCUDA(cudaFree(d_c));
  PetscFunctionReturn(PETSC_SUCCESS);
}
638 
/*
  Run the double-precision STREAM benchmark; mirror of runStream().
  Times the eight *_double kernels NTIMES times, verifies each once
  against a host reference, then prints a bandwidth summary.

  iNumThreadsPerBlock - threads per block for every kernel launch
  bDontUseGPUTiming   - PETSC_TRUE => record CPU wall-clock timings;
                        this is the only timing mode recorded here

  Changes vs. original: cudaStreamSynchronize is error-checked; the
  first two timing blocks now stop the timer unconditionally, matching
  the other six cases and the single-precision version; the Scale
  verification re-initializes d_a like the float version does.
*/
PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming)
{
  double *d_a, *d_b, *d_c;
  int     k;
  float   times[8][NTIMES];
  double  scalar;

  PetscFunctionBegin;
  /* Allocate memory on device */
  PetscCallCUDA(cudaMalloc((void **)&d_a, sizeof(double) * N));
  PetscCallCUDA(cudaMalloc((void **)&d_b, sizeof(double) * N));
  PetscCallCUDA(cudaMalloc((void **)&d_c, sizeof(double) * N));

  /* Compute execution configuration: 1-D grid, rounded up to cover N */
  dim3 dimBlock(iNumThreadsPerBlock);
  dim3 dimGrid(N / dimBlock.x);
  if (N % dimBlock.x != 0) dimGrid.x += 1;

  /* Initialize memory on the device */
  set_array_double<<<dimGrid, dimBlock>>>(d_a, 2., N);
  set_array_double<<<dimGrid, dimBlock>>>(d_b, .5, N);
  set_array_double<<<dimGrid, dimBlock>>>(d_c, .5, N);

  /* --- MAIN LOOP --- repeat test cases NTIMES times ---
     Each case: reset timer, launch, synchronize the device and all MPI
     ranks, accumulate elapsed wall time, record it in milliseconds. */
  PetscLogDouble cpuTimer = 0.0;

  scalar = 3.0;
  for (k = 0; k < NTIMES; ++k) {
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Copy_double<<<dimGrid, dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[0][k] = cpuTimer * 1.e3; // millisec

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Copy_Optimized_double<<<dimGrid, dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[1][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Scale_double<<<dimGrid, dimBlock>>>(d_b, d_c, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[2][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Scale_Optimized_double<<<dimGrid, dimBlock>>>(d_b, d_c, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[3][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Add_double<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[4][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Add_Optimized_double<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[5][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Triad_double<<<dimGrid, dimBlock>>>(d_b, d_c, d_a, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[6][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Triad_Optimized_double<<<dimGrid, dimBlock>>>(d_b, d_c, d_a, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[7][k] = cpuTimer * 1.e3;
  }

  if (1) { /* verify kernels: rerun each once and compare on the host */
    double *h_a, *h_b, *h_c;
    bool    errorSTREAMkernel = true;

    if ((h_a = (double *)calloc(N, sizeof(double))) == (double *)NULL) {
      printf("Unable to allocate array h_a, exiting ...\n");
      exit(1);
    }
    if ((h_b = (double *)calloc(N, sizeof(double))) == (double *)NULL) {
      printf("Unable to allocate array h_b, exiting ...\n");
      exit(1);
    }
    if ((h_c = (double *)calloc(N, sizeof(double))) == (double *)NULL) {
      printf("Unable to allocate array h_c, exiting ...\n");
      exit(1);
    }

    /*
     * perform kernel, copy device memory into host memory and verify each
     * device kernel output (cudaMemcpy is blocking, so no explicit sync
     * is needed between launch and copy)
     */
    set_array_double<<<dimGrid, dimBlock>>>(d_a, 2., N);
    set_array_double<<<dimGrid, dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid, dimBlock>>>(d_c, .5, N);

    STREAM_Copy_double<<<dimGrid, dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n"));
      exit(-2000);
    }

    set_array_double<<<dimGrid, dimBlock>>>(d_a, 2., N);
    set_array_double<<<dimGrid, dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid, dimBlock>>>(d_c, .5, N);

    STREAM_Copy_Optimized_double<<<dimGrid, dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n"));
      exit(-3000);
    }

    set_array_double<<<dimGrid, dimBlock>>>(d_a, 2., N);
    set_array_double<<<dimGrid, dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid, dimBlock>>>(d_c, .5, N);

    STREAM_Scale_double<<<dimGrid, dimBlock>>>(d_b, d_c, scalar, N);
    PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Scale_verify_double(h_b, h_c, scalar, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n"));
      exit(-4000);
    }

    set_array_double<<<dimGrid, dimBlock>>>(d_a, 2., N);
    set_array_double<<<dimGrid, dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid, dimBlock>>>(d_c, .5, N);

    STREAM_Add_double<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Add_verify_double(h_a, h_b, h_c, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n"));
      exit(-5000);
    }

    set_array_double<<<dimGrid, dimBlock>>>(d_a, 2., N);
    set_array_double<<<dimGrid, dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid, dimBlock>>>(d_c, .5, N);

    STREAM_Triad_double<<<dimGrid, dimBlock>>>(d_b, d_c, d_a, scalar, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Triad_verify_double(h_b, h_c, h_a, scalar, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n"));
      exit(-6000);
    }

    free(h_a);
    free(h_b);
    free(h_c);
  }

  printResultsReadable(times, sizeof(double));

  /* Free memory on device */
  PetscCallCUDA(cudaFree(d_a));
  PetscCallCUDA(cudaFree(d_b));
  PetscCallCUDA(cudaFree(d_c));
  PetscFunctionReturn(PETSC_SUCCESS);
}
851 
852 ///////////////////////////////////////////////////////////////////////////
853 //Print Results to Screen and File
854 ///////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////
//Print Results to Screen and File
///////////////////////////////////////////////////////////////////////////
/*
  Summarize the timing data: compute per-kernel avg/min/max times, then
  report the Triad-Optimized (index 7) bandwidth summed over all ranks.
  With one rank the rate is also written to the file "flops"; with more
  ranks the single-rank rate is read back from "flops" for comparison.

  times - times[kernel][rep] in milliseconds, 8 kernels x NTIMES reps
  bsize - element size in bytes (sizeof(float) or sizeof(double))
*/
PetscErrorCode printResultsReadable(float times[][NTIMES], const size_t bsize)
{
  PetscInt    j, k;
  float       avgtime[8] = {0., 0., 0., 0., 0., 0., 0., 0.};
  float       maxtime[8] = {0., 0., 0., 0., 0., 0., 0., 0.};
  float       mintime[8] = {1e30, 1e30, 1e30, 1e30, 1e30, 1e30, 1e30, 1e30};
  /* Copy/Scale move 2 vectors per element, Add/Triad move 3 */
  const float bytes_per_kernel[8] = {2. * bsize * N, 2. * bsize * N, 2. * bsize * N, 2. * bsize * N, 3. * bsize * N, 3. * bsize * N, 3. * bsize * N, 3. * bsize * N};
  double      rate, irate;
  int         rank, size;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_rank(MPI_COMM_WORLD, &rank));
  PetscCallMPI(MPI_Comm_size(MPI_COMM_WORLD, &size));
  /* --- SUMMARY --- accumulate per-kernel statistics in seconds */
  for (k = 0; k < NTIMES; ++k) {
    for (j = 0; j < (PetscInt)PETSC_STATIC_ARRAY_LENGTH(avgtime); ++j) {
      avgtime[j] = avgtime[j] + (1.e-03f * times[j][k]); // millisec --> sec
      mintime[j] = MIN(mintime[j], (1.e-03f * times[j][k]));
      maxtime[j] = MAX(maxtime[j], (1.e-03f * times[j][k]));
    }
  }
  /* NOTE(review): the sum above includes the first (warm-up) repetition
     yet divides by NTIMES-1, per classic STREAM convention of discarding
     the first pass — confirm the warm-up should also be excluded from
     the sum */
  for (j = 0; j < (PetscInt)PETSC_STATIC_ARRAY_LENGTH(avgtime); ++j) avgtime[j] = avgtime[j] / (float)(NTIMES - 1);
  /* only the Triad-Optimized kernel (j == 7) is reported */
  j     = 7;
  irate = 1.0E-06 * bytes_per_kernel[j] / mintime[j];
  /* original stored this result in an unchecked ierr */
  PetscCallMPI(MPI_Reduce(&irate, &rate, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD));
  if (rank == 0) {
    FILE *fd;
    if (size == 1) {
      printf("%d %11.4f   Rate (MB/s)\n", size, rate);
      fd = fopen("flops", "w");
      if (fd) { /* original dereferenced fd without a NULL check */
        fprintf(fd, "%g\n", rate);
        fclose(fd);
      }
    } else {
      double prate = 0.0;
      fd = fopen("flops", "r");
      if (fd) {
        if (fscanf(fd, "%lg", &prate) != 1) prate = 0.0; /* original ignored the fscanf result */
        fclose(fd);
      }
      printf("%d %11.4f   Rate (MB/s) %g\n", size, rate, rate / prate);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
899