xref: /petsc/src/benchmarks/streams/CUDAVersion.cu (revision a69119a591a03a9d906b29c0a4e9802e4d7c9795)
1 /*
2   STREAM benchmark implementation in CUDA.
3 
4     COPY:       a(i) = b(i)
5     SCALE:      a(i) = q*b(i)
6     SUM:        a(i) = b(i) + c(i)
7     TRIAD:      a(i) = b(i) + q*c(i)
8 
9   It measures the memory system on the device.
10   The implementation is in double precision with a single option.
11 
12   Code based on the code developed by John D. McCalpin
13   http://www.cs.virginia.edu/stream/FTP/Code/stream.c
14 
15   Written by: Massimiliano Fatica, NVIDIA Corporation
16   Modified by: Douglas Enright (dpephd-nvidia@yahoo.com), 1 December 2010
17   Extensive Revisions, 4 December 2010
18   Modified for PETSc by: Matthew G. Knepley 14 Aug 2011
19 
20   User interface motivated by bandwidthTest NVIDIA SDK example.
21 */
/* Help text shown by PETSc's -help option. */
static char help[] = "Double-Precision STREAM Benchmark implementation in CUDA\n Performs Copy, Scale, Add, and Triad double-precision kernels\n\n";
23 
#include <petscconf.h>
#include <petscsys.h>
#include <petsctime.h>
#include <petscdevice.h>
#include <float.h>
#include <math.h>
28 
29 #define N        10000000
30 #define NTIMES   10
31 
32 # ifndef MIN
33 # define MIN(x,y) ((x)<(y) ? (x) : (y))
34 # endif
35 # ifndef MAX
36 # define MAX(x,y) ((x)>(y) ? (x) : (y))
37 # endif
38 
/* Machine epsilons used as the ULP scale in the host-side verification routines
   (numerically identical to FLT_EPSILON and DBL_EPSILON). */
const float  flt_eps = 1.192092896e-07f;
const double dbl_eps = 2.2204460492503131e-16;
41 
/* Fill a[0..len) with 'value'; grid-stride loop, so any launch configuration covers the vector. */
__global__ void set_array(float *a,  float value, size_t len)
{
  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += blockDim.x * gridDim.x) a[i] = value;
}
50 
/* Fill a[0..len) with 'value' (double-precision variant); grid-stride loop. */
__global__ void set_array_double(double *a,  double value, size_t len)
{
  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += blockDim.x * gridDim.x) a[i] = value;
}
59 
/* STREAM COPY: b(i) = a(i); grid-stride loop, valid for any grid size. */
__global__ void STREAM_Copy(float *a, float *b, size_t len)
{
  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += blockDim.x * gridDim.x) b[i] = a[i];
}
68 
/* STREAM COPY, double precision: b(i) = a(i); grid-stride loop. */
__global__ void STREAM_Copy_double(double *a, double *b, size_t len)
{
  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += blockDim.x * gridDim.x) b[i] = a[i];
}
77 
/*
  STREAM COPY, one element per thread (no stride loop).
  Precondition: the launch must supply at least len threads in total;
  otherwise the kernel returns without doing any work.
*/
__global__ void STREAM_Copy_Optimized(float *a, float *b, size_t len)
{
  /* bug fix: evaluate the thread-count guard in size_t — blockDim.x*gridDim.x
     is a 32-bit unsigned product that can wrap before the comparison with the
     64-bit len */
  if ((size_t)blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  if (idx < len) b[idx] = a[idx];
}
88 
/*
  STREAM COPY (double), one element per thread (no stride loop).
  Precondition: the launch must supply at least len threads in total;
  otherwise the kernel returns without doing any work.
*/
__global__ void STREAM_Copy_Optimized_double(double *a, double *b, size_t len)
{
  /* bug fix: guard computed in size_t to avoid 32-bit overflow of
     blockDim.x*gridDim.x before the comparison with len */
  if ((size_t)blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  if (idx < len) b[idx] = a[idx];
}
99 
/* STREAM SCALE: b(i) = scale * a(i); grid-stride loop. */
__global__ void STREAM_Scale(float *a, float *b, float scale,  size_t len)
{
  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += blockDim.x * gridDim.x) b[i] = scale * a[i];
}
108 
/* STREAM SCALE, double precision: b(i) = scale * a(i); grid-stride loop. */
__global__ void STREAM_Scale_double(double *a, double *b, double scale,  size_t len)
{
  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += blockDim.x * gridDim.x) b[i] = scale * a[i];
}
117 
/*
  STREAM SCALE, one element per thread (no stride loop).
  Precondition: the launch must supply at least len threads in total;
  otherwise the kernel returns without doing any work.
*/
__global__ void STREAM_Scale_Optimized(float *a, float *b, float scale,  size_t len)
{
  /* bug fix: guard computed in size_t to avoid 32-bit overflow of
     blockDim.x*gridDim.x before the comparison with len */
  if ((size_t)blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  if (idx < len) b[idx] = scale * a[idx];
}
128 
/*
  STREAM SCALE (double), one element per thread (no stride loop).
  Precondition: the launch must supply at least len threads in total;
  otherwise the kernel returns without doing any work.
*/
__global__ void STREAM_Scale_Optimized_double(double *a, double *b, double scale,  size_t len)
{
  /* bug fix: guard computed in size_t to avoid 32-bit overflow of
     blockDim.x*gridDim.x before the comparison with len */
  if ((size_t)blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  if (idx < len) b[idx] = scale * a[idx];
}
139 
/* STREAM ADD: c(i) = a(i) + b(i); grid-stride loop. */
__global__ void STREAM_Add(float *a, float *b, float *c,  size_t len)
{
  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += blockDim.x * gridDim.x) c[i] = a[i] + b[i];
}
148 
/* STREAM ADD, double precision: c(i) = a(i) + b(i); grid-stride loop. */
__global__ void STREAM_Add_double(double *a, double *b, double *c,  size_t len)
{
  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += blockDim.x * gridDim.x) c[i] = a[i] + b[i];
}
157 
/*
  STREAM ADD, one element per thread (no stride loop).
  Precondition: the launch must supply at least len threads in total;
  otherwise the kernel returns without doing any work.
*/
__global__ void STREAM_Add_Optimized(float *a, float *b, float *c,  size_t len)
{
  /* bug fix: guard computed in size_t to avoid 32-bit overflow of
     blockDim.x*gridDim.x before the comparison with len */
  if ((size_t)blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  if (idx < len) c[idx] = a[idx] + b[idx];
}
168 
/*
  STREAM ADD (double), one element per thread (no stride loop).
  Precondition: the launch must supply at least len threads in total;
  otherwise the kernel returns without doing any work.
*/
__global__ void STREAM_Add_Optimized_double(double *a, double *b, double *c,  size_t len)
{
  /* bug fix: guard computed in size_t to avoid 32-bit overflow of
     blockDim.x*gridDim.x before the comparison with len */
  if ((size_t)blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  if (idx < len) c[idx] = a[idx] + b[idx];
}
179 
/* STREAM TRIAD: c(i) = a(i) + scalar * b(i); grid-stride loop. */
__global__ void STREAM_Triad(float *a, float *b, float *c, float scalar, size_t len)
{
  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += blockDim.x * gridDim.x) c[i] = a[i] + scalar * b[i];
}
188 
/* STREAM TRIAD, double precision: c(i) = a(i) + scalar * b(i); grid-stride loop. */
__global__ void STREAM_Triad_double(double *a, double *b, double *c, double scalar, size_t len)
{
  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += blockDim.x * gridDim.x) c[i] = a[i] + scalar * b[i];
}
197 
/*
  STREAM TRIAD, one element per thread (no stride loop).
  Precondition: the launch must supply at least len threads in total;
  otherwise the kernel returns without doing any work.
*/
__global__ void STREAM_Triad_Optimized(float *a, float *b, float *c, float scalar, size_t len)
{
  /* bug fix: guard computed in size_t to avoid 32-bit overflow of
     blockDim.x*gridDim.x before the comparison with len */
  if ((size_t)blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  if (idx < len) c[idx] = a[idx] + scalar * b[idx];
}
208 
/*
  STREAM TRIAD (double), one element per thread (no stride loop).
  Precondition: the launch must supply at least len threads in total;
  otherwise the kernel returns without doing any work.
*/
__global__ void STREAM_Triad_Optimized_double(double *a, double *b, double *c, double scalar, size_t len)
{
  /* bug fix: guard computed in size_t to avoid 32-bit overflow of
     blockDim.x*gridDim.x before the comparison with len */
  if ((size_t)blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
  if (idx < len) c[idx] = a[idx] + scalar * b[idx];
}
219 
220 /* Host side verification routines */
/*
  Host-side check of the device Copy kernel: b must equal a element-wise.
  Returns true as soon as ANY element differs by more than 2 ULPs (an error
  was detected), false when the arrays agree.  Uses the standard FLT_EPSILON
  (numerically identical to the file-local flt_eps).
*/
bool STREAM_Copy_verify(float *a, float *b, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    float expectedResult     = a[idx];
    float diffResultExpected = (b[idx] - expectedResult);
    /* element-wise relative error expressed in units of float epsilon (ULPs) */
    float relErrorULPS       = (fabsf(diffResultExpected)/fabsf(expectedResult))/FLT_EPSILON;
    bDifferent = (relErrorULPS > 2.f);
  }

  return bDifferent;
}
236 
/*
  Host-side check of the device Copy kernel (double precision).
  Returns true as soon as ANY element differs by more than 2 ULPs.

  bug fix: the absolute values were taken with fabsf(), which rounds the
  double operands to float and destroys the precision this double-precision
  check is supposed to measure; use fabs() and the standard DBL_EPSILON.
*/
bool STREAM_Copy_verify_double(double *a, double *b, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult     = a[idx];
    double diffResultExpected = (b[idx] - expectedResult);
    /* element-wise relative error expressed in units of double epsilon (ULPs) */
    double relErrorULPS       = (fabs(diffResultExpected)/fabs(expectedResult))/DBL_EPSILON;
    bDifferent = (relErrorULPS > 2.);
  }

  return bDifferent;
}
252 
/*
  Host-side check of the device Scale kernel: b must equal scale*a element-wise.
  Returns true as soon as ANY element differs by more than 2 ULPs.  Uses the
  standard FLT_EPSILON (numerically identical to the file-local flt_eps).
*/
bool STREAM_Scale_verify(float *a, float *b, float scale, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    float expectedResult     = scale*a[idx];
    float diffResultExpected = (b[idx] - expectedResult);
    /* element-wise relative error expressed in units of float epsilon (ULPs) */
    float relErrorULPS       = (fabsf(diffResultExpected)/fabsf(expectedResult))/FLT_EPSILON;
    bDifferent = (relErrorULPS > 2.f);
  }

  return bDifferent;
}
268 
/*
  Host-side check of the device Scale kernel (double precision): b must equal
  scale*a element-wise.  Returns true as soon as ANY element differs by more
  than 2 ULPs.

  bug fixes: (1) the relative error was divided by the FLOAT epsilon
  (flt_eps) instead of the double epsilon, making the tolerance ~2^45 times
  looser than intended; (2) fabsf() rounded the double operands to float.
*/
bool STREAM_Scale_verify_double(double *a, double *b, double scale, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult     = scale*a[idx];
    double diffResultExpected = (b[idx] - expectedResult);
    /* element-wise relative error expressed in units of double epsilon (ULPs) */
    double relErrorULPS       = (fabs(diffResultExpected)/fabs(expectedResult))/DBL_EPSILON;
    bDifferent = (relErrorULPS > 2.);
  }

  return bDifferent;
}
284 
/*
  Host-side check of the device Add kernel: c must equal a+b element-wise.
  Returns true as soon as ANY element differs by more than 2 ULPs.  Uses the
  standard FLT_EPSILON (numerically identical to the file-local flt_eps).
*/
bool STREAM_Add_verify(float *a, float *b, float *c, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    float expectedResult     = a[idx] + b[idx];
    float diffResultExpected = (c[idx] - expectedResult);
    /* element-wise relative error expressed in units of float epsilon (ULPs) */
    float relErrorULPS       = (fabsf(diffResultExpected)/fabsf(expectedResult))/FLT_EPSILON;
    bDifferent = (relErrorULPS > 2.f);
  }

  return bDifferent;
}
300 
/*
  Host-side check of the device Add kernel (double precision): c must equal
  a+b element-wise.  Returns true as soon as ANY element differs by more than
  2 ULPs.

  bug fixes: (1) the relative error was divided by the FLOAT epsilon
  (flt_eps) instead of the double epsilon, making the tolerance ~2^45 times
  looser than intended; (2) fabsf() rounded the double operands to float.
*/
bool STREAM_Add_verify_double(double *a, double *b, double *c, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult     = a[idx] + b[idx];
    double diffResultExpected = (c[idx] - expectedResult);
    /* element-wise relative error expressed in units of double epsilon (ULPs) */
    double relErrorULPS       = (fabs(diffResultExpected)/fabs(expectedResult))/DBL_EPSILON;
    bDifferent = (relErrorULPS > 2.);
  }

  return bDifferent;
}
316 
/*
  Host-side check of the device Triad kernel: c must equal a + scalar*b
  element-wise.  Returns true as soon as ANY element differs by more than
  3 ULPs (slightly looser than Copy/Scale/Add because the device may fuse
  the multiply-add differently than the host).  Uses the standard
  FLT_EPSILON (numerically identical to the file-local flt_eps).
*/
bool STREAM_Triad_verify(float *a, float *b, float *c, float scalar, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    float expectedResult     = a[idx] + scalar*b[idx];
    float diffResultExpected = (c[idx] - expectedResult);
    /* element-wise relative error expressed in units of float epsilon (ULPs) */
    float relErrorULPS       = (fabsf(diffResultExpected)/fabsf(expectedResult))/FLT_EPSILON;
    bDifferent = (relErrorULPS > 3.f);
  }

  return bDifferent;
}
332 
/*
  Host-side check of the device Triad kernel (double precision): c must equal
  a + scalar*b element-wise.  Returns true as soon as ANY element differs by
  more than 3 ULPs.

  bug fixes: (1) the relative error was divided by the FLOAT epsilon
  (flt_eps) instead of the double epsilon, making the tolerance ~2^45 times
  looser than intended; (2) fabsf() rounded the double operands to float.
*/
bool STREAM_Triad_verify_double(double *a, double *b, double *c, double scalar, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult     = a[idx] + scalar*b[idx];
    double diffResultExpected = (c[idx] - expectedResult);
    /* element-wise relative error expressed in units of double epsilon (ULPs) */
    double relErrorULPS       = (fabs(diffResultExpected)/fabs(expectedResult))/DBL_EPSILON;
    bDifferent = (relErrorULPS > 3.);
  }

  return bDifferent;
}
348 
349 /* forward declarations */
350 PetscErrorCode setupStream(PetscInt device, PetscBool runDouble, PetscBool cpuTiming);
351 PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming);
352 PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming);
353 PetscErrorCode printResultsReadable(float times[][NTIMES], size_t);
354 
/*
  Entry point: parse -device / -double options and run the benchmark.
  cudaSetDeviceFlags must be called before any context is created, hence
  before PetscInitialize.
*/
int main(int argc, char *argv[])
{
  PetscInt        device    = 0;
  PetscBool       runDouble = PETSC_TRUE;
  const PetscBool cpuTiming = PETSC_TRUE; /* must be true: the GPU event-timing path is disabled in runStream */
  PetscErrorCode  ierr;

  PetscCallCUDA(cudaSetDeviceFlags(cudaDeviceBlockingSync));

  PetscCall(PetscInitialize(&argc, &argv, 0, help));

  PetscOptionsBegin(PETSC_COMM_WORLD, "", "STREAM Benchmark Options", "STREAM");
  PetscCall(PetscOptionsBoundedInt("-device", "Specify the CUDA device to be used", "STREAM", device, &device, NULL,0));
  PetscCall(PetscOptionsBool("-double",    "Also run double precision tests",   "STREAM", runDouble, &runDouble, NULL));
  PetscOptionsEnd();

  ierr = setupStream(device, runDouble, cpuTiming);
  /* bug fix: the result line was printed only when ierr != 0, so "PASSES"
     was unreachable and the message always read FAILED; report every run */
  PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n[streamBenchmark] - results:\t%s\n\n", (ierr == 0) ? "PASSES" : "FAILED"));
  PetscCall(PetscFinalize());
  return 0;
}
378 
379 ///////////////////////////////////////////////////////////////////////////////
380 //Run the appropriate tests
381 ///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// Run the appropriate tests
///////////////////////////////////////////////////////////////////////////////
/*
  setupStream - validate/select the CUDA device and dispatch the benchmark.

  deviceNum - requested CUDA device index (falls back to device 0 if invalid)
  runDouble - run the double-precision benchmark instead of single precision
  cpuTiming - forwarded as bDontUseGPUTiming to the run routines
*/
PetscErrorCode setupStream(PetscInt deviceNum, PetscBool runDouble, PetscBool cpuTiming)
{
  PetscInt iNumThreadsPerBlock = 128;

  PetscFunctionBegin;
  /* Check that a usable device exists and that the requested index is valid */
  {
    int deviceCount;

    PetscCallCUDA(cudaGetDeviceCount(&deviceCount)); /* bug fix: return code was ignored */
    if (deviceCount == 0) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, "!!!!!No devices found!!!!!\n"));
      PetscFunctionReturn(-1000);
    }

    if (deviceNum >= deviceCount || deviceNum < 0) {
      /* bug fix: cast PetscInt to int to match the %d format */
      PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n!!!!!Invalid GPU number %d given hence default gpu %d will be used !!!!!\n", (int)deviceNum, 0));
      deviceNum = 0;
    }
  }

  PetscCallCUDA(cudaSetDevice((int)deviceNum)); /* bug fix: return code was ignored */
  cudaDeviceProp deviceProp;
  if (cudaGetDeviceProperties(&deviceProp, (int)deviceNum) != cudaSuccess) {
    /* bug fix: the deviceNum argument for %d was missing (undefined behavior) */
    PetscCall(PetscPrintf(PETSC_COMM_SELF, " Unable to determine device %d properties, exiting\n", (int)deviceNum));
    PetscFunctionReturn(-1);
  }

  /* Double precision requires compute capability >= 1.3 */
  if (runDouble && deviceProp.major == 1 && deviceProp.minor < 3) {
    PetscCall(PetscPrintf(PETSC_COMM_SELF, " Unable to run double-precision STREAM benchmark on a compute capability GPU less than 1.3\n"));
    PetscFunctionReturn(-1);
  }
  if (deviceProp.major == 2 && deviceProp.minor == 1) iNumThreadsPerBlock = 192; /* GF104 architecture / 48 CUDA Cores per MP */
  else iNumThreadsPerBlock = 128;                                                /* GF100 architecture / 32 CUDA Cores per MP */

  if (runDouble) PetscCall(runStreamDouble(iNumThreadsPerBlock, cpuTiming));
  else PetscCall(runStream(iNumThreadsPerBlock, cpuTiming));
  PetscFunctionReturn(0);
}
422 
423 ///////////////////////////////////////////////////////////////////////////
424 // runStream
425 ///////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////
// runStream
///////////////////////////////////////////////////////////////////////////
/*
  runStream - single-precision STREAM benchmark driver.

  Times the Copy/Scale/Add/Triad kernels (plain and "Optimized" variants)
  NTIMES times each, then re-runs each plain kernel once and verifies the
  device result against a host-side reference before reporting bandwidth
  through printResultsReadable().

  iNumThreadsPerBlock - threads per block for every kernel launch
  bDontUseGPUTiming   - use CPU wall-clock timing; the CUDA-event timing path
                        is commented out, so this must be PETSC_TRUE for
                        times[] to receive measurements
*/
PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming)
{
  float *d_a, *d_b, *d_c;
  int    k;
  float  times[8][NTIMES] = {}; /* zeroed so printResultsReadable never sees garbage */
  float  scalar;

  PetscFunctionBegin;
  /* Allocate the three work vectors on the device */
  PetscCallCUDA(cudaMalloc((void**)&d_a, sizeof(float)*N));
  PetscCallCUDA(cudaMalloc((void**)&d_b, sizeof(float)*N));
  PetscCallCUDA(cudaMalloc((void**)&d_c, sizeof(float)*N));

  /* Execution configuration: round the grid up so all N elements are covered */
  dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */
  dim3 dimGrid(N/dimBlock.x);         /* (N/dimBlock.x,1,1) */
  if (N % dimBlock.x != 0) dimGrid.x += 1;

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  /* --- MAIN LOOP --- repeat test cases NTIMES times --- */
  PetscLogDouble cpuTimer = 0.0;

  scalar = 3.0f;
  for (k = 0; k < NTIMES; ++k) {
    /* 0: Copy */
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Copy<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL)); /* launches are asynchronous: wait before stopping the timer */
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[0][k] = cpuTimer*1.e3; /* sec -> millisec */

    /* 1: Copy (optimized) */
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Copy_Optimized<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[1][k] = cpuTimer*1.e3;

    /* 2: Scale */
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Scale<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[2][k] = cpuTimer*1.e3;

    /* 3: Scale (optimized) */
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Scale_Optimized<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[3][k] = cpuTimer*1.e3;

    /* 4: Add.  bug fix: removed a stray PetscCallCUDA(cudaEventRecord(stop,0))
       left over from the commented-out event-timing path; 'stop' was never
       declared, so this did not even compile */
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Add<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[4][k] = cpuTimer*1.e3;

    /* 5: Add (optimized) */
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Add_Optimized<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[5][k] = cpuTimer*1.e3;

    /* 6: Triad */
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Triad<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[6][k] = cpuTimer*1.e3;

    /* 7: Triad (optimized) */
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Triad_Optimized<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[7][k] = cpuTimer*1.e3;
  }

  { /* verify each plain kernel once against a host-side reference */
    float *h_a, *h_b, *h_c;
    bool   errorSTREAMkernel = true;

    if ((h_a = (float*)calloc(N, sizeof(float))) == NULL) {
      printf("Unable to allocate array h_a, exiting ...\n");
      exit(1);
    }
    if ((h_b = (float*)calloc(N, sizeof(float))) == NULL) {
      printf("Unable to allocate array h_b, exiting ...\n");
      exit(1);
    }
    if ((h_c = (float*)calloc(N, sizeof(float))) == NULL) {
      printf("Unable to allocate array h_c, exiting ...\n"); /* bug fix: "Unalbe" typo */
      exit(1);
    }

    /* Copy */
    set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);
    STREAM_Copy<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n"));
      exit(-2000);
    }

    /* Copy (optimized) */
    set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);
    STREAM_Copy_Optimized<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n"));
      exit(-3000);
    }

    /* Scale */
    set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);
    STREAM_Scale<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
    PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Scale_verify(h_b, h_c, scalar, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n"));
      exit(-4000);
    }

    /* Add */
    set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);
    STREAM_Add<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Add_verify(h_a, h_b, h_c, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n"));
      exit(-5000);
    }

    /* Triad (writes into d_a) */
    set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);
    STREAM_Triad<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Triad_verify(h_b, h_c, h_a, scalar, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n"));
      exit(-6000);
    }

    free(h_a);
    free(h_b);
    free(h_c);
  }

  PetscCall(printResultsReadable(times, sizeof(float)));

  /* Free memory on device */
  PetscCallCUDA(cudaFree(d_a));
  PetscCallCUDA(cudaFree(d_b));
  PetscCallCUDA(cudaFree(d_c));

  PetscFunctionReturn(0);
}
641 
/*
  runStreamDouble - double-precision STREAM benchmark driver.

  Mirrors runStream(): times the Copy/Scale/Add/Triad kernels (plain and
  "Optimized" variants) NTIMES times each, verifies each plain kernel once
  against a host-side reference, then reports bandwidth through
  printResultsReadable().

  iNumThreadsPerBlock - threads per block for every kernel launch
  bDontUseGPUTiming   - use CPU wall-clock timing; must be PETSC_TRUE for
                        times[] to receive measurements
*/
PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming)
{
  double *d_a, *d_b, *d_c;
  int     k;
  float   times[8][NTIMES] = {}; /* zeroed so printResultsReadable never sees garbage */
  double  scalar;

  PetscFunctionBegin;
  /* Allocate the three work vectors on the device */
  PetscCallCUDA(cudaMalloc((void**)&d_a, sizeof(double)*N));
  PetscCallCUDA(cudaMalloc((void**)&d_b, sizeof(double)*N));
  PetscCallCUDA(cudaMalloc((void**)&d_c, sizeof(double)*N));

  /* Execution configuration: round the grid up so all N elements are covered */
  dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */
  dim3 dimGrid(N/dimBlock.x);         /* (N/dimBlock.x,1,1) */
  if (N % dimBlock.x != 0) dimGrid.x += 1;

  /* Initialize memory on the device */
  set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
  set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
  set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

  /* --- MAIN LOOP --- repeat test cases NTIMES times --- */
  PetscLogDouble cpuTimer = 0.0;

  scalar = 3.0;
  for (k = 0; k < NTIMES; ++k) {
    /* 0: Copy.  consistency fix: PetscTimeAdd is now outside the if, matching
       the other seven sections and the single-precision path */
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Copy_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL)); /* launches are asynchronous: wait before stopping the timer */
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[0][k] = cpuTimer*1.e3; /* sec -> millisec */

    /* 1: Copy (optimized) */
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Copy_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[1][k] = cpuTimer*1.e3;

    /* 2: Scale */
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Scale_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[2][k] = cpuTimer*1.e3;

    /* 3: Scale (optimized) */
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Scale_Optimized_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[3][k] = cpuTimer*1.e3;

    /* 4: Add */
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Add_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[4][k] = cpuTimer*1.e3;

    /* 5: Add (optimized) */
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Add_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[5][k] = cpuTimer*1.e3;

    /* 6: Triad */
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Triad_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[6][k] = cpuTimer*1.e3;

    /* 7: Triad (optimized) */
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Triad_Optimized_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[7][k] = cpuTimer*1.e3;
  }

  { /* verify each plain kernel once against a host-side reference */
    double *h_a, *h_b, *h_c;
    bool    errorSTREAMkernel = true;

    if ((h_a = (double*)calloc(N, sizeof(double))) == NULL) {
      printf("Unable to allocate array h_a, exiting ...\n");
      exit(1);
    }
    if ((h_b = (double*)calloc(N, sizeof(double))) == NULL) {
      printf("Unable to allocate array h_b, exiting ...\n");
      exit(1);
    }
    if ((h_c = (double*)calloc(N, sizeof(double))) == NULL) {
      printf("Unable to allocate array h_c, exiting ...\n"); /* bug fix: "Unalbe" typo */
      exit(1);
    }

    /* Copy */
    set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
    set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);
    STREAM_Copy_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n"));
      exit(-2000);
    }

    /* Copy (optimized) */
    set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
    set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);
    STREAM_Copy_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n"));
      exit(-3000);
    }

    /* Scale.  consistency fix: also re-initialize d_a here, matching the
       single-precision path (harmless for the Scale check itself) */
    set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
    set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);
    STREAM_Scale_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
    PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Scale_verify_double(h_b, h_c, scalar, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n"));
      exit(-4000);
    }

    /* Add */
    set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
    set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);
    STREAM_Add_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Add_verify_double(h_a, h_b, h_c, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n"));
      exit(-5000);
    }

    /* Triad (writes into d_a) */
    set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
    set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);
    STREAM_Triad_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Triad_verify_double(h_b, h_c, h_a, scalar, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n"));
      exit(-6000);
    }

    free(h_a);
    free(h_b);
    free(h_c);
  }

  PetscCall(printResultsReadable(times, sizeof(double)));

  /* Free memory on device */
  PetscCallCUDA(cudaFree(d_a));
  PetscCallCUDA(cudaFree(d_b));
  PetscCallCUDA(cudaFree(d_c));

  PetscFunctionReturn(0);
}
855 
856 ///////////////////////////////////////////////////////////////////////////
857 //Print Results to Screen and File
858 ///////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////
// Print Results to Screen and File
///////////////////////////////////////////////////////////////////////////
/*
  printResultsReadable - reduce and report the measured STREAM bandwidth.

  Only kernel 7 (optimized Triad) is reported: each rank's best rate in MB/s
  (from the minimum time over NTIMES runs) is summed across MPI ranks.  A
  single-rank run writes its rate to the file "flops"; a multi-rank run reads
  that file back to report the parallel speedup.

  times - times[kernel][iteration] in milliseconds
  bsize - bytes per element of the run (sizeof(float) or sizeof(double))
*/
PetscErrorCode printResultsReadable(float times[][NTIMES], const size_t bsize)
{
  PetscInt    j, k;
  float       avgtime[8] = {0., 0., 0., 0., 0., 0., 0., 0.};
  float       maxtime[8] = {0., 0., 0., 0., 0., 0., 0., 0.};
  float       mintime[8] = {1e30,1e30,1e30,1e30,1e30,1e30,1e30,1e30};
  const float bytes_per_kernel[8] = {
    2.f * bsize * N, /* Copy:  1 read + 1 write per element */
    2.f * bsize * N, /* Copy Opt. */
    2.f * bsize * N, /* Scale */
    2.f * bsize * N, /* Scale Opt. */
    3.f * bsize * N, /* Add:   2 reads + 1 write per element */
    3.f * bsize * N, /* Add Opt. */
    3.f * bsize * N, /* Triad */
    3.f * bsize * N  /* Triad Opt. */
  };
  double      rate, irate;
  int         rank, size;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_rank(MPI_COMM_WORLD,&rank));
  PetscCallMPI(MPI_Comm_size(MPI_COMM_WORLD,&size));
  /* --- SUMMARY --- accumulate avg/min/max per kernel, converting ms -> s */
  for (k = 0; k < NTIMES; ++k) {
    for (j = 0; j < 8; ++j) {
      avgtime[j] = avgtime[j] + (1.e-03f * times[j][k]);
      mintime[j] = MIN(mintime[j], (1.e-03f * times[j][k]));
      maxtime[j] = MAX(maxtime[j], (1.e-03f * times[j][k]));
    }
  }
  /* NOTE(review): the sum above includes all NTIMES iterations yet divides by
     NTIMES-1 (classic STREAM drops the first iteration from the average);
     kept as-is since only mintime feeds the reported rate */
  for (j = 0; j < 8; ++j) avgtime[j] = avgtime[j]/(float)(NTIMES-1);

  j     = 7;                                        /* report the optimized Triad kernel */
  irate = 1.0E-06 * bytes_per_kernel[j]/mintime[j]; /* bytes/s -> MB/s */
  /* bug fix: the MPI_Reduce return code was assigned to ierr and ignored */
  PetscCallMPI(MPI_Reduce(&irate,&rate,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD));
  if (rank == 0) {
    FILE *fd;
    if (size == 1) {
      printf("%d %11.4f   Rate (MB/s)\n",size, rate);
      fd = fopen("flops","w");
      if (fd) { /* bug fix: guard against fopen failure before writing */
        fprintf(fd,"%g\n",rate);
        fclose(fd);
      }
    } else {
      double prate = 0.0;
      fd = fopen("flops","r");
      if (fd) { /* bug fix: fopen/fscanf results were unchecked (NULL deref / garbage prate) */
        if (fscanf(fd,"%lg",&prate) != 1) prate = 0.0;
        fclose(fd);
      }
      printf("%d %11.4f   Rate (MB/s) %g \n", size, rate, rate/prate);
    }
  }

  PetscFunctionReturn(0);
}
914