xref: /petsc/src/benchmarks/streams/CUDAVersion.cu (revision 3307d110e72ee4e6d2468971620073eb5ff93529)
1 /*
2   STREAM benchmark implementation in CUDA.
3 
4     COPY:       a(i) = b(i)
5     SCALE:      a(i) = q*b(i)
6     SUM:        a(i) = b(i) + c(i)
7     TRIAD:      a(i) = b(i) + q*c(i)
8 
9   It measures the memory system on the device.
10   The implementation is in double precision with a single option.
11 
12   Code based on the code developed by John D. McCalpin
13   http://www.cs.virginia.edu/stream/FTP/Code/stream.c
14 
15   Written by: Massimiliano Fatica, NVIDIA Corporation
16   Modified by: Douglas Enright (dpephd-nvidia@yahoo.com), 1 December 2010
17   Extensive Revisions, 4 December 2010
18   Modified for PETSc by: Matthew G. Knepley 14 Aug 2011
19 
20   User interface motivated by bandwidthTest NVIDIA SDK example.
21 */
/* Usage text printed by PetscInitialize() when the program is run with -help */
static char help[] = "Double-Precision STREAM Benchmark implementation in CUDA\n Performs Copy, Scale, Add, and Triad double-precision kernels\n\n";
23 
#include <float.h> /* FLT_EPSILON / DBL_EPSILON for the host-side verification routines */
#include <petscconf.h>
#include <petscsys.h>
#include <petsctime.h>
#include <petscdevice.h>
28 
/* Vector length (elements per array) and number of benchmark repetitions */
#define N        10000000
#define NTIMES   10

# ifndef MIN
# define MIN(x,y) ((x)<(y) ? (x) : (y))
# endif
# ifndef MAX
# define MAX(x,y) ((x)>(y) ? (x) : (y))
# endif

/* Machine epsilon for float and double; verification error is expressed in
   units of these (i.e. ULP-like relative error). */
const float  flt_eps = 1.192092896e-07f;
const double dbl_eps = 2.2204460492503131e-16;
41 
/* Fill a[0..len) with `value` using a grid-stride loop, so any launch
   configuration covers the whole vector. */
__global__ void set_array(float *a,  float value, size_t len)
{
  const size_t stride = blockDim.x * gridDim.x;
  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += stride) a[i] = value;
}
50 
/* Double-precision variant of set_array: grid-stride fill of a[0..len). */
__global__ void set_array_double(double *a,  double value, size_t len)
{
  const size_t stride = blockDim.x * gridDim.x;
  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += stride) a[i] = value;
}
59 
/* STREAM "copy" kernel: b[i] = a[i], grid-stride loop over all len elements. */
__global__ void STREAM_Copy(float *a, float *b, size_t len)
{
  const size_t stride = blockDim.x * gridDim.x;
  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += stride) b[i] = a[i];
}
68 
/* Double-precision STREAM "copy": b[i] = a[i], grid-stride loop. */
__global__ void STREAM_Copy_double(double *a, double *b, size_t len)
{
  const size_t stride = blockDim.x * gridDim.x;
  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += stride) b[i] = a[i];
}
77 
/* One-element-per-thread copy. If the launch supplies fewer than len threads
   the kernel deliberately does nothing rather than copy a partial vector. */
__global__ void STREAM_Copy_Optimized(float *a, float *b, size_t len)
{
  if (blockDim.x * gridDim.x < len) return;
  const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < len) b[i] = a[i];
}
88 
/* Double-precision one-element-per-thread copy; no-op unless the grid
   supplies at least len threads (see STREAM_Copy_Optimized). */
__global__ void STREAM_Copy_Optimized_double(double *a, double *b, size_t len)
{
  if (blockDim.x * gridDim.x < len) return;
  const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < len) b[i] = a[i];
}
99 
/* STREAM "scale" kernel: b[i] = scale * a[i], grid-stride loop. */
__global__ void STREAM_Scale(float *a, float *b, float scale,  size_t len)
{
  const size_t stride = blockDim.x * gridDim.x;
  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += stride) b[i] = scale * a[i];
}
108 
/* Double-precision STREAM "scale": b[i] = scale * a[i], grid-stride loop. */
__global__ void STREAM_Scale_double(double *a, double *b, double scale,  size_t len)
{
  const size_t stride = blockDim.x * gridDim.x;
  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += stride) b[i] = scale * a[i];
}
117 
/* One-element-per-thread scale; no-op unless the grid supplies at least len
   threads, so a partial result is never produced. */
__global__ void STREAM_Scale_Optimized(float *a, float *b, float scale,  size_t len)
{
  if (blockDim.x * gridDim.x < len) return;
  const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < len) b[i] = scale * a[i];
}
128 
/* Double-precision one-element-per-thread scale; no-op unless the grid
   supplies at least len threads. */
__global__ void STREAM_Scale_Optimized_double(double *a, double *b, double scale,  size_t len)
{
  if (blockDim.x * gridDim.x < len) return;
  const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < len) b[i] = scale * a[i];
}
139 
/* STREAM "add" kernel: c[i] = a[i] + b[i], grid-stride loop. */
__global__ void STREAM_Add(float *a, float *b, float *c,  size_t len)
{
  const size_t stride = blockDim.x * gridDim.x;
  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += stride) c[i] = a[i] + b[i];
}
148 
/* Double-precision STREAM "add": c[i] = a[i] + b[i], grid-stride loop. */
__global__ void STREAM_Add_double(double *a, double *b, double *c,  size_t len)
{
  const size_t stride = blockDim.x * gridDim.x;
  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += stride) c[i] = a[i] + b[i];
}
157 
/* One-element-per-thread add; no-op unless the grid supplies at least len
   threads. */
__global__ void STREAM_Add_Optimized(float *a, float *b, float *c,  size_t len)
{
  if (blockDim.x * gridDim.x < len) return;
  const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < len) c[i] = a[i] + b[i];
}
168 
/* Double-precision one-element-per-thread add; no-op unless the grid
   supplies at least len threads. */
__global__ void STREAM_Add_Optimized_double(double *a, double *b, double *c,  size_t len)
{
  if (blockDim.x * gridDim.x < len) return;
  const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < len) c[i] = a[i] + b[i];
}
179 
/* STREAM "triad" kernel: c[i] = a[i] + scalar * b[i], grid-stride loop. */
__global__ void STREAM_Triad(float *a, float *b, float *c, float scalar, size_t len)
{
  const size_t stride = blockDim.x * gridDim.x;
  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += stride) c[i] = a[i] + scalar * b[i];
}
188 
/* Double-precision STREAM "triad": c[i] = a[i] + scalar * b[i]. */
__global__ void STREAM_Triad_double(double *a, double *b, double *c, double scalar, size_t len)
{
  const size_t stride = blockDim.x * gridDim.x;
  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += stride) c[i] = a[i] + scalar * b[i];
}
197 
/* One-element-per-thread triad; no-op unless the grid supplies at least len
   threads. */
__global__ void STREAM_Triad_Optimized(float *a, float *b, float *c, float scalar, size_t len)
{
  if (blockDim.x * gridDim.x < len) return;
  const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < len) c[i] = a[i] + scalar * b[i];
}
208 
/* Double-precision one-element-per-thread triad; no-op unless the grid
   supplies at least len threads. */
__global__ void STREAM_Triad_Optimized_double(double *a, double *b, double *c, double scalar, size_t len)
{
  if (blockDim.x * gridDim.x < len) return;
  const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < len) c[i] = a[i] + scalar * b[i];
}
219 
220 /* Host side verification routines */
/* Host check of the copy kernel: returns true iff some b[i] differs from a[i]
   by more than 2 units of flt_eps in relative terms. */
bool STREAM_Copy_verify(float *a, float *b, size_t len)
{
  for (size_t i = 0; i < len; i++) {
    const float expected = a[i];
    const float relErrorULPS = (fabsf(b[i] - expected)/fabsf(expected))/flt_eps;
    if (relErrorULPS > 2.f) return true; /* mismatch found */
  }
  return false;
}
236 
/*
 * Host check of the double copy kernel: returns true iff some b[idx] differs
 * from a[idx] by more than 2 units of double epsilon in relative terms.
 * Fixed: use fabs() instead of fabsf(), which truncated the double operands
 * to float and destroyed the double-precision tolerance.
 */
bool STREAM_Copy_verify_double(double *a, double *b, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult     = a[idx];
    double diffResultExpected = (b[idx] - expectedResult);
    double relErrorULPS       = (fabs(diffResultExpected)/fabs(expectedResult))/DBL_EPSILON;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.);
  }

  return bDifferent;
}
252 
/* Host check of the scale kernel: returns true iff some b[i] differs from
   scale*a[i] by more than 2 units of flt_eps in relative terms. */
bool STREAM_Scale_verify(float *a, float *b, float scale, size_t len)
{
  for (size_t i = 0; i < len; i++) {
    const float expected = scale*a[i];
    const float relErrorULPS = (fabsf(b[i] - expected)/fabsf(expected))/flt_eps;
    if (relErrorULPS > 2.f) return true; /* mismatch found */
  }
  return false;
}
268 
/*
 * Host check of the double scale kernel: returns true iff some b[idx] differs
 * from scale*a[idx] by more than 2 units of double epsilon in relative terms.
 * Fixed: use fabs() instead of fabsf() (which truncated doubles to float),
 * and divide by double epsilon instead of flt_eps — the float epsilon made
 * the tolerance ~2^29 times looser than intended for double data.
 */
bool STREAM_Scale_verify_double(double *a, double *b, double scale, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult     = scale*a[idx];
    double diffResultExpected = (b[idx] - expectedResult);
    double relErrorULPS       = (fabs(diffResultExpected)/fabs(expectedResult))/DBL_EPSILON;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.);
  }

  return bDifferent;
}
284 
/* Host check of the add kernel: returns true iff some c[i] differs from
   a[i]+b[i] by more than 2 units of flt_eps in relative terms. */
bool STREAM_Add_verify(float *a, float *b, float *c, size_t len)
{
  for (size_t i = 0; i < len; i++) {
    const float expected = a[i] + b[i];
    const float relErrorULPS = (fabsf(c[i] - expected)/fabsf(expected))/flt_eps;
    if (relErrorULPS > 2.f) return true; /* mismatch found */
  }
  return false;
}
300 
/*
 * Host check of the double add kernel: returns true iff some c[idx] differs
 * from a[idx]+b[idx] by more than 2 units of double epsilon in relative terms.
 * Fixed: fabs() instead of fabsf(), and double epsilon instead of flt_eps
 * (same defects as STREAM_Scale_verify_double).
 */
bool STREAM_Add_verify_double(double *a, double *b, double *c, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult     = a[idx] + b[idx];
    double diffResultExpected = (c[idx] - expectedResult);
    double relErrorULPS       = (fabs(diffResultExpected)/fabs(expectedResult))/DBL_EPSILON;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.);
  }

  return bDifferent;
}
316 
/* Host check of the triad kernel: returns true iff some c[i] differs from
   a[i]+scalar*b[i] by more than 3 units of flt_eps in relative terms
   (looser bound than copy/scale/add: triad does two rounded operations). */
bool STREAM_Triad_verify(float *a, float *b, float *c, float scalar, size_t len)
{
  for (size_t i = 0; i < len; i++) {
    const float expected = a[i] + scalar*b[i];
    const float relErrorULPS = (fabsf(c[i] - expected)/fabsf(expected))/flt_eps;
    if (relErrorULPS > 3.f) return true; /* mismatch found */
  }
  return false;
}
332 
/*
 * Host check of the double triad kernel: returns true iff some c[idx] differs
 * from a[idx]+scalar*b[idx] by more than 3 units of double epsilon in
 * relative terms. Fixed: fabs() instead of fabsf(), and double epsilon
 * instead of flt_eps (same defects as STREAM_Scale_verify_double).
 */
bool STREAM_Triad_verify_double(double *a, double *b, double *c, double scalar, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult     = a[idx] + scalar*b[idx];
    double diffResultExpected = (c[idx] - expectedResult);
    double relErrorULPS       = (fabs(diffResultExpected)/fabs(expectedResult))/DBL_EPSILON;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 3.);
  }

  return bDifferent;
}
348 
349 /* forward declarations */
350 PetscErrorCode setupStream(PetscInt device, PetscBool runDouble, PetscBool cpuTiming);
351 PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming);
352 PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming);
353 PetscErrorCode printResultsReadable(float times[][NTIMES], size_t);
354 
/*
 * Entry point: parse -device / -double options, run the benchmark, and report
 * PASSES/FAILED. Fixed: the result line was printed only when setupStream
 * returned nonzero, which made the "PASSES" branch unreachable; it is now
 * printed unconditionally.
 */
int main(int argc, char *argv[])
{
  PetscInt        device    = 0;
  PetscBool       runDouble = PETSC_TRUE;
  const PetscBool cpuTiming = PETSC_TRUE; // must be true: GPU-event timing is disabled in run*Stream
  PetscErrorCode  ierr;

  /* Block the host thread (instead of spinning) while waiting on the device */
  PetscCallCUDA(cudaSetDeviceFlags(cudaDeviceBlockingSync));

  PetscCall(PetscInitialize(&argc, &argv, 0, help));

  PetscOptionsBegin(PETSC_COMM_WORLD, "", "STREAM Benchmark Options", "STREAM");
  PetscCall(PetscOptionsBoundedInt("-device", "Specify the CUDA device to be used", "STREAM", device, &device, NULL,0));
  PetscCall(PetscOptionsBool("-double",    "Also run double precision tests",   "STREAM", runDouble, &runDouble, NULL));
  PetscOptionsEnd();

  ierr = setupStream(device, runDouble, cpuTiming);
  PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n[streamBenchmark] - results:\t%s\n\n", (ierr == 0) ? "PASSES" : "FAILED"));
  PetscCall(PetscFinalize());
  return 0;
}
378 
379 ///////////////////////////////////////////////////////////////////////////////
380 //Run the appropriate tests
381 ///////////////////////////////////////////////////////////////////////////////
/*
 * Select and validate the CUDA device, choose a block size based on the
 * device's compute capability, and dispatch to the float or double benchmark.
 * Fixes: the "Unable to determine device properties" message used a %d format
 * with no argument (undefined behavior); cudaGetDeviceCount/cudaSetDevice
 * return codes are now checked.
 */
PetscErrorCode setupStream(PetscInt deviceNum, PetscBool runDouble, PetscBool cpuTiming)
{
  PetscInt iNumThreadsPerBlock = 128;

  PetscFunctionBegin;
  /* Check that at least one device exists and that deviceNum is in range */
  {
    int deviceCount;

    PetscCallCUDA(cudaGetDeviceCount(&deviceCount));
    if (deviceCount == 0) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, "!!!!!No devices found!!!!!\n"));
      return -1000;
    }

    if (deviceNum >= deviceCount || deviceNum < 0) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n!!!!!Invalid GPU number %d given hence default gpu %d will be used !!!!!\n", (int)deviceNum, 0));
      deviceNum = 0;
    }
  }

  PetscCallCUDA(cudaSetDevice(deviceNum));
  cudaDeviceProp deviceProp;
  if (cudaGetDeviceProperties(&deviceProp, deviceNum) != cudaSuccess) {
    PetscCall(PetscPrintf(PETSC_COMM_SELF, " Unable to determine device %d properties, exiting\n", (int)deviceNum));
    return -1;
  }

  /* Double precision requires compute capability >= 1.3 */
  if (runDouble && deviceProp.major == 1 && deviceProp.minor < 3) {
    PetscCall(PetscPrintf(PETSC_COMM_SELF, " Unable to run double-precision STREAM benchmark on a compute capability GPU less than 1.3\n"));
    return -1;
  }
  if (deviceProp.major == 2 && deviceProp.minor == 1) iNumThreadsPerBlock = 192; /* GF104 architecture / 48 CUDA Cores per MP */
  else iNumThreadsPerBlock = 128; /* GF100 architecture / 32 CUDA Cores per MP */

  if (runDouble) {
    PetscCall(runStreamDouble(iNumThreadsPerBlock, cpuTiming));
  } else {
    PetscCall(runStream(iNumThreadsPerBlock, cpuTiming));
  }
  PetscFunctionReturn(0);
}
425 
426 ///////////////////////////////////////////////////////////////////////////
427 // runStream
428 ///////////////////////////////////////////////////////////////////////////
/*
 * Single-precision STREAM benchmark: times the 8 kernels NTIMES times,
 * verifies each kernel against a host recomputation, and prints the results.
 * Fixes relative to the original:
 *  - removed a stray PetscCallCUDA(cudaEventRecord(stop, 0)) that referenced
 *    an undeclared event (its companion event calls were commented out);
 *  - cpuTimer is reset before the first timed section of every iteration, so
 *    times[0][k] for k > 0 no longer includes leftover time from the last
 *    section of the previous iteration;
 *  - cudaStreamSynchronize return codes are checked.
 */
PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming)
{
  float *d_a, *d_b, *d_c;
  int    k;
  float  times[8][NTIMES]; /* [kernel][repetition] elapsed time in milliseconds */
  float  scalar;

  PetscFunctionBegin;
  /* Allocate memory on device */
  PetscCallCUDA(cudaMalloc((void**)&d_a, sizeof(float)*N));
  PetscCallCUDA(cudaMalloc((void**)&d_b, sizeof(float)*N));
  PetscCallCUDA(cudaMalloc((void**)&d_c, sizeof(float)*N));

  /* Compute execution configuration: 1-D grid covering all N elements */
  dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */
  dim3 dimGrid(N/dimBlock.x);         /* (N/dimBlock.x,1,1) */
  if (N % dimBlock.x != 0) dimGrid.x += 1; /* extra block for the partial tail */

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  /* --- MAIN LOOP --- repeat test cases NTIMES times ---
     Each section: reset the CPU timer, launch, synchronize, store elapsed ms. */
  PetscLogDouble cpuTimer = 0.0;

  scalar = 3.0f;
  for (k = 0; k < NTIMES; ++k) {
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Copy<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[0][k] = cpuTimer*1.e3; /* sec -> millisec */

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Copy_Optimized<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[1][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Scale<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[2][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Scale_Optimized<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[3][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Add<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[4][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Add_Optimized<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[5][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Triad<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[6][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Triad_Optimized<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[7][k] = cpuTimer*1.e3;
  }

  { /* verify kernels: run each once, copy results back, compare on the host */
    float *h_a, *h_b, *h_c;
    bool  errorSTREAMkernel = true;

    if ((h_a = (float*)calloc(N, sizeof(float))) == (float*)NULL) {
      printf("Unable to allocate array h_a, exiting ...\n");
      exit(1);
    }
    if ((h_b = (float*)calloc(N, sizeof(float))) == (float*)NULL) {
      printf("Unable to allocate array h_b, exiting ...\n");
      exit(1);
    }
    if ((h_c = (float*)calloc(N, sizeof(float))) == (float*)NULL) {
      printf("Unable to allocate array h_c, exiting ...\n");
      exit(1);
    }

    /* STREAM_Copy */
    set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

    STREAM_Copy<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n"));
      exit(-2000);
    }

    /* STREAM_Copy_Optimized */
    set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

    STREAM_Copy_Optimized<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n"));
      exit(-3000);
    }

    /* STREAM_Scale */
    set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

    STREAM_Scale<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
    PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Scale_verify(h_b, h_c, scalar, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n"));
      exit(-4000);
    }

    /* STREAM_Add */
    set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

    STREAM_Add<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Add_verify(h_a, h_b, h_c, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n"));
      exit(-5000);
    }

    /* STREAM_Triad (note argument order: result lands in d_a) */
    set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

    STREAM_Triad<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Triad_verify(h_b, h_c, h_a, scalar, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n"));
      exit(-6000);
    }

    free(h_a);
    free(h_b);
    free(h_c);
  }

  PetscCall(printResultsReadable(times, sizeof(float)));

  /* Free memory on device */
  PetscCallCUDA(cudaFree(d_a));
  PetscCallCUDA(cudaFree(d_b));
  PetscCallCUDA(cudaFree(d_c));

  PetscFunctionReturn(0);
}
644 
/*
 * Double-precision STREAM benchmark; mirrors runStream with the *_double
 * kernels and verifiers. Fixes relative to the original:
 *  - cpuTimer is reset before the first timed section of every iteration, so
 *    times[0][k] for k > 0 no longer includes leftover time from the last
 *    section of the previous iteration;
 *  - the first two sections now use the same timing pattern as the rest
 *    (PetscTimeAdd outside the bDontUseGPUTiming guard), matching runStream;
 *  - cudaStreamSynchronize return codes are checked.
 */
PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming)
{
  double *d_a, *d_b, *d_c;
  int     k;
  float   times[8][NTIMES]; /* [kernel][repetition] elapsed time in milliseconds */
  double  scalar;

  PetscFunctionBegin;
  /* Allocate memory on device */
  PetscCallCUDA(cudaMalloc((void**)&d_a, sizeof(double)*N));
  PetscCallCUDA(cudaMalloc((void**)&d_b, sizeof(double)*N));
  PetscCallCUDA(cudaMalloc((void**)&d_c, sizeof(double)*N));

  /* Compute execution configuration: 1-D grid covering all N elements */
  dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */
  dim3 dimGrid(N/dimBlock.x);         /* (N/dimBlock.x,1,1) */
  if (N % dimBlock.x != 0) dimGrid.x += 1; /* extra block for the partial tail */

  /* Initialize memory on the device */
  set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
  set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
  set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

  /* --- MAIN LOOP --- repeat test cases NTIMES times ---
     Each section: reset the CPU timer, launch, synchronize, store elapsed ms. */
  PetscLogDouble cpuTimer = 0.0;

  scalar = 3.0;
  for (k = 0; k < NTIMES; ++k) {
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Copy_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[0][k] = cpuTimer*1.e3; /* sec -> millisec */

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Copy_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[1][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Scale_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[2][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Scale_Optimized_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[3][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Add_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[4][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Add_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[5][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Triad_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[6][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Triad_Optimized_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[7][k] = cpuTimer*1.e3;
  }

  { /* verify kernels: run each once, copy results back, compare on the host */
    double *h_a, *h_b, *h_c;
    bool   errorSTREAMkernel = true;

    if ((h_a = (double*)calloc(N, sizeof(double))) == (double*)NULL) {
      printf("Unable to allocate array h_a, exiting ...\n");
      exit(1);
    }
    if ((h_b = (double*)calloc(N, sizeof(double))) == (double*)NULL) {
      printf("Unable to allocate array h_b, exiting ...\n");
      exit(1);
    }
    if ((h_c = (double*)calloc(N, sizeof(double))) == (double*)NULL) {
      printf("Unable to allocate array h_c, exiting ...\n");
      exit(1);
    }

    /* STREAM_Copy_double */
    set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
    set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

    STREAM_Copy_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n"));
      exit(-2000);
    }

    /* STREAM_Copy_Optimized_double */
    set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
    set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

    STREAM_Copy_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n"));
      exit(-3000);
    }

    /* STREAM_Scale_double (reads only b; a need not be re-initialized) */
    set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

    STREAM_Scale_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
    PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Scale_verify_double(h_b, h_c, scalar, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n"));
      exit(-4000);
    }

    /* STREAM_Add_double */
    set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
    set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

    STREAM_Add_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Add_verify_double(h_a, h_b, h_c, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n"));
      exit(-5000);
    }

    /* STREAM_Triad_double (note argument order: result lands in d_a) */
    set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
    set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

    STREAM_Triad_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Triad_verify_double(h_b, h_c, h_a, scalar, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n"));
      exit(-6000);
    }

    free(h_a);
    free(h_b);
    free(h_c);
  }

  PetscCall(printResultsReadable(times, sizeof(double)));

  /* Free memory on device */
  PetscCallCUDA(cudaFree(d_a));
  PetscCallCUDA(cudaFree(d_b));
  PetscCallCUDA(cudaFree(d_c));

  PetscFunctionReturn(0);
}
858 
859 ///////////////////////////////////////////////////////////////////////////
860 //Print Results to Screen and File
861 ///////////////////////////////////////////////////////////////////////////
/*
 * Summarize the timing table and print the Triad-Optimized bandwidth, summed
 * across MPI ranks. A single-rank run writes its rate to the file "flops";
 * multi-rank runs read that file back to report parallel speedup.
 * Fixes: fopen() results are NULL-checked (a missing "flops" file previously
 * segfaulted the parallel path), the fscanf() return value is checked, and
 * the MPI_Reduce status is no longer discarded into an unused variable.
 */
PetscErrorCode printResultsReadable(float times[][NTIMES], const size_t bsize)
{
  PetscInt    j, k;
  float       avgtime[8] = {0., 0., 0., 0., 0., 0., 0., 0.};
  float       maxtime[8] = {0., 0., 0., 0., 0., 0., 0., 0.};
  float       mintime[8] = {1e30,1e30,1e30,1e30,1e30,1e30,1e30,1e30};
  // char     *label[8]  = {"Copy:      ", "Copy Opt.: ", "Scale:     ", "Scale Opt: ", "Add:       ", "Add Opt:   ", "Triad:     ", "Triad Opt: "};
  /* Bytes moved per invocation: copy/scale touch 2 vectors, add/triad touch 3 */
  const float bytes_per_kernel[8] = {
    2. * bsize * N,
    2. * bsize * N,
    2. * bsize * N,
    2. * bsize * N,
    3. * bsize * N,
    3. * bsize * N,
    3. * bsize * N,
    3. * bsize * N
  };
  double      rate, irate;
  int         rank, size;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_rank(MPI_COMM_WORLD,&rank));
  PetscCallMPI(MPI_Comm_size(MPI_COMM_WORLD,&size));
  /* --- SUMMARY --- per-kernel min/avg/max over the NTIMES repetitions */
  for (k = 0; k < NTIMES; ++k) {
    for (j = 0; j < 8; ++j) {
      avgtime[j] = avgtime[j] + (1.e-03f * times[j][k]); // millisec --> sec
      mintime[j] = MIN(mintime[j], (1.e-03f * times[j][k]));
      maxtime[j] = MAX(maxtime[j], (1.e-03f * times[j][k]));
    }
  }
  /* NOTE(review): the sum above includes all NTIMES reps but is divided by
     NTIMES-1 (STREAM convention discounts the warm-up rep); kept as-is since
     avgtime is not currently reported. */
  for (j = 0; j < 8; ++j) {
    avgtime[j] = avgtime[j]/(float)(NTIMES-1);
  }
  /* Report only the Triad-Optimized kernel (j = 7), best (minimum) time */
  j = 7;
  irate = 1.0E-06 * bytes_per_kernel[j]/mintime[j];
  PetscCallMPI(MPI_Reduce(&irate,&rate,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD));
  if (rank == 0) {
    FILE *fd;
    if (size == 1) {
      printf("%d %11.4f   Rate (MB/s)\n",size, rate);
      fd = fopen("flops","w");
      if (fd) { /* record the serial rate for later speedup comparison */
        fprintf(fd,"%g\n",rate);
        fclose(fd);
      }
    } else {
      double prate;
      fd = fopen("flops","r"); /* baseline written by a previous 1-rank run */
      if (fd && fscanf(fd,"%lg",&prate) == 1) {
        printf("%d %11.4f   Rate (MB/s) %g \n", size, rate, rate/prate);
      } else {
        printf("%d %11.4f   Rate (MB/s)\n", size, rate); /* no baseline available */
      }
      if (fd) fclose(fd);
    }
  }

  PetscFunctionReturn(0);
}
917