xref: /petsc/src/benchmarks/streams/CUDAVersion.cu (revision fbf9dbe564678ed6eff1806adbc4c4f01b9743f4)
1 /*
2   STREAM benchmark implementation in CUDA.
3 
4     COPY:       a(i) = b(i)
5     SCALE:      a(i) = q*b(i)
6     SUM:        a(i) = b(i) + c(i)
7     TRIAD:      a(i) = b(i) + q*c(i)
8 
9   It measures the memory system on the device.
10   The implementation is in double precision with a single option.
11 
12   Code based on the code developed by John D. McCalpin
13   http://www.cs.virginia.edu/stream/FTP/Code/stream.c
14 
15   Written by: Massimiliano Fatica, NVIDIA Corporation
16   Modified by: Douglas Enright (dpephd-nvidia@yahoo.com), 1 December 2010
17   Extensive Revisions, 4 December 2010
18   Modified for PETSc by: Matthew G. Knepley 14 Aug 2011
19 
20   User interface motivated by bandwidthTest NVIDIA SDK example.
21 */
22 static char help[] = "Double-Precision STREAM Benchmark implementation in CUDA\n Performs Copy, Scale, Add, and Triad double-precision kernels\n\n";
23 
#include <petscconf.h>
#include <petscsys.h>
#include <petsctime.h>
#include <petscdevice_cuda.h>

#include <float.h>  /* FLT_EPSILON, DBL_EPSILON for host-side verification */
#include <math.h>   /* fabs, fabsf */
#include <stddef.h> /* size_t */
28 
/* Problem size (elements per vector) and number of benchmark repetitions */
#define N      10000000
#define NTIMES 10

#ifndef MIN
#define MIN(x, y) ((x) < (y) ? (x) : (y))
#endif
#ifndef MAX
#define MAX(x, y) ((x) > (y) ? (x) : (y))
#endif

/* Hand-rolled machine epsilons used by the host-side verification routines;
   numerically identical to FLT_EPSILON / DBL_EPSILON from <float.h>. */
const float  flt_eps = 1.192092896e-07f;
const double dbl_eps = 2.2204460492503131e-16;
41 
/* Fill a[0..len) with 'value' via a grid-stride loop (correct for any launch size). */
__global__ void set_array(float *a, float value, size_t len) {
  /* compute stride/index in size_t: the unsigned 32-bit products can overflow on very large launches */
  const size_t stride = (size_t)gridDim.x * blockDim.x;
  for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; idx < len; idx += stride) a[idx] = value;
}
49 
/* Fill a[0..len) with 'value' via a grid-stride loop (double-precision twin of set_array). */
__global__ void set_array_double(double *a, double value, size_t len) {
  /* size_t arithmetic avoids 32-bit overflow of the stride/index products */
  const size_t stride = (size_t)gridDim.x * blockDim.x;
  for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; idx < len; idx += stride) a[idx] = value;
}
57 
/* COPY kernel: b(i) = a(i), grid-stride loop over len elements. */
__global__ void STREAM_Copy(float *a, float *b, size_t len) {
  /* size_t arithmetic avoids 32-bit overflow of the stride/index products */
  const size_t stride = (size_t)gridDim.x * blockDim.x;
  for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; idx < len; idx += stride) b[idx] = a[idx];
}
65 
/* COPY kernel (double): b(i) = a(i), grid-stride loop over len elements. */
__global__ void STREAM_Copy_double(double *a, double *b, size_t len) {
  /* size_t arithmetic avoids 32-bit overflow of the stride/index products */
  const size_t stride = (size_t)gridDim.x * blockDim.x;
  for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; idx < len; idx += stride) b[idx] = a[idx];
}
73 
/*
  One-element-per-thread COPY: b(i) = a(i).
  Requires the launch to cover the whole vector; if the thread index space is
  smaller than len the kernel deliberately does nothing (a partial copy would
  corrupt the timing comparison).
*/
__global__ void STREAM_Copy_Optimized(float *a, float *b, size_t len) {
  /* cast to size_t so the thread-count product cannot wrap in unsigned 32-bit math */
  if ((size_t)blockDim.x * gridDim.x < len) return;
  const size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < len) b[idx] = a[idx];
}
83 
/*
  One-element-per-thread COPY (double): b(i) = a(i).
  Does nothing unless the thread index space covers the whole vector.
*/
__global__ void STREAM_Copy_Optimized_double(double *a, double *b, size_t len) {
  /* cast to size_t so the thread-count product cannot wrap in unsigned 32-bit math */
  if ((size_t)blockDim.x * gridDim.x < len) return;
  const size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < len) b[idx] = a[idx];
}
93 
/* SCALE kernel: b(i) = scale * a(i), grid-stride loop over len elements. */
__global__ void STREAM_Scale(float *a, float *b, float scale, size_t len) {
  /* size_t arithmetic avoids 32-bit overflow of the stride/index products */
  const size_t stride = (size_t)gridDim.x * blockDim.x;
  for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; idx < len; idx += stride) b[idx] = scale * a[idx];
}
101 
/* SCALE kernel (double): b(i) = scale * a(i), grid-stride loop over len elements. */
__global__ void STREAM_Scale_double(double *a, double *b, double scale, size_t len) {
  /* size_t arithmetic avoids 32-bit overflow of the stride/index products */
  const size_t stride = (size_t)gridDim.x * blockDim.x;
  for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; idx < len; idx += stride) b[idx] = scale * a[idx];
}
109 
/*
  One-element-per-thread SCALE: b(i) = scale * a(i).
  Does nothing unless the thread index space covers the whole vector.
*/
__global__ void STREAM_Scale_Optimized(float *a, float *b, float scale, size_t len) {
  /* cast to size_t so the thread-count product cannot wrap in unsigned 32-bit math */
  if ((size_t)blockDim.x * gridDim.x < len) return;
  const size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < len) b[idx] = scale * a[idx];
}
119 
/*
  One-element-per-thread SCALE (double): b(i) = scale * a(i).
  Does nothing unless the thread index space covers the whole vector.
*/
__global__ void STREAM_Scale_Optimized_double(double *a, double *b, double scale, size_t len) {
  /* cast to size_t so the thread-count product cannot wrap in unsigned 32-bit math */
  if ((size_t)blockDim.x * gridDim.x < len) return;
  const size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < len) b[idx] = scale * a[idx];
}
129 
/* ADD kernel: c(i) = a(i) + b(i), grid-stride loop over len elements. */
__global__ void STREAM_Add(float *a, float *b, float *c, size_t len) {
  /* size_t arithmetic avoids 32-bit overflow of the stride/index products */
  const size_t stride = (size_t)gridDim.x * blockDim.x;
  for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; idx < len; idx += stride) c[idx] = a[idx] + b[idx];
}
137 
/* ADD kernel (double): c(i) = a(i) + b(i), grid-stride loop over len elements. */
__global__ void STREAM_Add_double(double *a, double *b, double *c, size_t len) {
  /* size_t arithmetic avoids 32-bit overflow of the stride/index products */
  const size_t stride = (size_t)gridDim.x * blockDim.x;
  for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; idx < len; idx += stride) c[idx] = a[idx] + b[idx];
}
145 
/*
  One-element-per-thread ADD: c(i) = a(i) + b(i).
  Does nothing unless the thread index space covers the whole vector.
*/
__global__ void STREAM_Add_Optimized(float *a, float *b, float *c, size_t len) {
  /* cast to size_t so the thread-count product cannot wrap in unsigned 32-bit math */
  if ((size_t)blockDim.x * gridDim.x < len) return;
  const size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < len) c[idx] = a[idx] + b[idx];
}
155 
/*
  One-element-per-thread ADD (double): c(i) = a(i) + b(i).
  Does nothing unless the thread index space covers the whole vector.
*/
__global__ void STREAM_Add_Optimized_double(double *a, double *b, double *c, size_t len) {
  /* cast to size_t so the thread-count product cannot wrap in unsigned 32-bit math */
  if ((size_t)blockDim.x * gridDim.x < len) return;
  const size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < len) c[idx] = a[idx] + b[idx];
}
165 
/* TRIAD kernel: c(i) = a(i) + scalar * b(i), grid-stride loop over len elements. */
__global__ void STREAM_Triad(float *a, float *b, float *c, float scalar, size_t len) {
  /* size_t arithmetic avoids 32-bit overflow of the stride/index products */
  const size_t stride = (size_t)gridDim.x * blockDim.x;
  for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; idx < len; idx += stride) c[idx] = a[idx] + scalar * b[idx];
}
173 
/* TRIAD kernel (double): c(i) = a(i) + scalar * b(i), grid-stride loop over len elements. */
__global__ void STREAM_Triad_double(double *a, double *b, double *c, double scalar, size_t len) {
  /* size_t arithmetic avoids 32-bit overflow of the stride/index products */
  const size_t stride = (size_t)gridDim.x * blockDim.x;
  for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; idx < len; idx += stride) c[idx] = a[idx] + scalar * b[idx];
}
181 
/*
  One-element-per-thread TRIAD: c(i) = a(i) + scalar * b(i).
  Does nothing unless the thread index space covers the whole vector.
*/
__global__ void STREAM_Triad_Optimized(float *a, float *b, float *c, float scalar, size_t len) {
  /* cast to size_t so the thread-count product cannot wrap in unsigned 32-bit math */
  if ((size_t)blockDim.x * gridDim.x < len) return;
  const size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < len) c[idx] = a[idx] + scalar * b[idx];
}
191 
/*
  One-element-per-thread TRIAD (double): c(i) = a(i) + scalar * b(i).
  Does nothing unless the thread index space covers the whole vector.
*/
__global__ void STREAM_Triad_Optimized_double(double *a, double *b, double *c, double scalar, size_t len) {
  /* cast to size_t so the thread-count product cannot wrap in unsigned 32-bit math */
  if ((size_t)blockDim.x * gridDim.x < len) return;
  const size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < len) c[idx] = a[idx] + scalar * b[idx];
}
201 
202 /* Host side verification routines */
/*
  Host-side check of the COPY kernel: b[] should equal a[] element-wise.
  Returns true as soon as any element's relative error exceeds 2 ULPs
  (measured in units of FLT_EPSILON), false if all elements agree.
*/
bool STREAM_Copy_verify(float *a, float *b, size_t len) {
  bool bDifferent = false;

  for (size_t idx = 0; idx < len && !bDifferent; idx++) {
    const float expectedResult     = a[idx];
    const float diffResultExpected = b[idx] - expectedResult;
    /* element-wise relative error; FLT_EPSILON replaces the hand-rolled flt_eps (same value) */
    const float relErrorULPS = (fabsf(diffResultExpected) / fabsf(expectedResult)) / FLT_EPSILON;
    bDifferent               = (relErrorULPS > 2.f);
  }
  return bDifferent;
}
217 
/*
  Host-side check of the double COPY kernel: b[] should equal a[] element-wise.
  Returns true as soon as any element's relative error exceeds 2 ULPs
  (units of DBL_EPSILON).

  Bug fix vs. the original: uses fabs() instead of fabsf(), which rounded the
  double difference through single precision.
*/
bool STREAM_Copy_verify_double(double *a, double *b, size_t len) {
  bool bDifferent = false;

  for (size_t idx = 0; idx < len && !bDifferent; idx++) {
    const double expectedResult     = a[idx];
    const double diffResultExpected = b[idx] - expectedResult;
    /* DBL_EPSILON has the same value as the file-level dbl_eps constant */
    const double relErrorULPS = (fabs(diffResultExpected) / fabs(expectedResult)) / DBL_EPSILON;
    bDifferent                = (relErrorULPS > 2.);
  }
  return bDifferent;
}
232 
/*
  Host-side check of the SCALE kernel: b[] should equal scale * a[].
  Returns true as soon as any element's relative error exceeds 2 ULPs
  (units of FLT_EPSILON).
*/
bool STREAM_Scale_verify(float *a, float *b, float scale, size_t len) {
  bool bDifferent = false;

  for (size_t idx = 0; idx < len && !bDifferent; idx++) {
    const float expectedResult     = scale * a[idx];
    const float diffResultExpected = b[idx] - expectedResult;
    /* element-wise relative error; FLT_EPSILON replaces the hand-rolled flt_eps (same value) */
    const float relErrorULPS = (fabsf(diffResultExpected) / fabsf(expectedResult)) / FLT_EPSILON;
    bDifferent               = (relErrorULPS > 2.f);
  }
  return bDifferent;
}
247 
/*
  Host-side check of the double SCALE kernel: b[] should equal scale * a[].
  Returns true as soon as any element's relative error exceeds 2 ULPs.

  Bug fixes vs. the original: fabs() instead of fabsf() (which rounded the
  double difference through float), and DBL_EPSILON instead of the *float*
  epsilon — the original tolerance was ~10^9 too loose to ever flag an error.
*/
bool STREAM_Scale_verify_double(double *a, double *b, double scale, size_t len) {
  bool bDifferent = false;

  for (size_t idx = 0; idx < len && !bDifferent; idx++) {
    const double expectedResult     = scale * a[idx];
    const double diffResultExpected = b[idx] - expectedResult;
    const double relErrorULPS       = (fabs(diffResultExpected) / fabs(expectedResult)) / DBL_EPSILON;
    bDifferent                      = (relErrorULPS > 2.);
  }
  return bDifferent;
}
262 
/*
  Host-side check of the ADD kernel: c[] should equal a[] + b[].
  Returns true as soon as any element's relative error exceeds 2 ULPs
  (units of FLT_EPSILON).
*/
bool STREAM_Add_verify(float *a, float *b, float *c, size_t len) {
  bool bDifferent = false;

  for (size_t idx = 0; idx < len && !bDifferent; idx++) {
    const float expectedResult     = a[idx] + b[idx];
    const float diffResultExpected = c[idx] - expectedResult;
    /* element-wise relative error; FLT_EPSILON replaces the hand-rolled flt_eps (same value) */
    const float relErrorULPS = (fabsf(diffResultExpected) / fabsf(expectedResult)) / FLT_EPSILON;
    bDifferent               = (relErrorULPS > 2.f);
  }
  return bDifferent;
}
277 
/*
  Host-side check of the double ADD kernel: c[] should equal a[] + b[].
  Returns true as soon as any element's relative error exceeds 2 ULPs.

  Bug fixes vs. the original: fabs() instead of fabsf(), and DBL_EPSILON
  instead of the *float* epsilon (which made the tolerance ~10^9 too loose).
*/
bool STREAM_Add_verify_double(double *a, double *b, double *c, size_t len) {
  bool bDifferent = false;

  for (size_t idx = 0; idx < len && !bDifferent; idx++) {
    const double expectedResult     = a[idx] + b[idx];
    const double diffResultExpected = c[idx] - expectedResult;
    const double relErrorULPS       = (fabs(diffResultExpected) / fabs(expectedResult)) / DBL_EPSILON;
    bDifferent                      = (relErrorULPS > 2.);
  }
  return bDifferent;
}
292 
/*
  Host-side check of the TRIAD kernel: c[] should equal a[] + scalar * b[].
  Returns true as soon as any element's relative error exceeds 3 ULPs
  (units of FLT_EPSILON; one extra ULP allowed for the fused multiply-add).
*/
bool STREAM_Triad_verify(float *a, float *b, float *c, float scalar, size_t len) {
  bool bDifferent = false;

  for (size_t idx = 0; idx < len && !bDifferent; idx++) {
    const float expectedResult     = a[idx] + scalar * b[idx];
    const float diffResultExpected = c[idx] - expectedResult;
    /* element-wise relative error; FLT_EPSILON replaces the hand-rolled flt_eps (same value) */
    const float relErrorULPS = (fabsf(diffResultExpected) / fabsf(expectedResult)) / FLT_EPSILON;
    bDifferent               = (relErrorULPS > 3.f);
  }
  return bDifferent;
}
307 
/*
  Host-side check of the double TRIAD kernel: c[] should equal a[] + scalar * b[].
  Returns true as soon as any element's relative error exceeds 3 ULPs.

  Bug fixes vs. the original: fabs() instead of fabsf(), and DBL_EPSILON
  instead of the *float* epsilon (which made the tolerance ~10^9 too loose).
*/
bool STREAM_Triad_verify_double(double *a, double *b, double *c, double scalar, size_t len) {
  bool bDifferent = false;

  for (size_t idx = 0; idx < len && !bDifferent; idx++) {
    const double expectedResult     = a[idx] + scalar * b[idx];
    const double diffResultExpected = c[idx] - expectedResult;
    const double relErrorULPS       = (fabs(diffResultExpected) / fabs(expectedResult)) / DBL_EPSILON;
    bDifferent                      = (relErrorULPS > 3.);
  }
  return bDifferent;
}
322 
323 /* forward declarations */
324 PetscErrorCode setupStream(PetscInt device, PetscBool runDouble, PetscBool cpuTiming);
325 PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming);
326 PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming);
327 PetscErrorCode printResultsReadable(float times[][NTIMES], size_t);
328 
/*
  Entry point: parses -device and -double options, runs the benchmark via
  setupStream(), and reports PASSES/FAILED from its return code.
*/
int main(int argc, char *argv[]) {
  PetscInt        device    = 0;
  PetscBool       runDouble = PETSC_TRUE;
  const PetscBool cpuTiming = PETSC_TRUE; // must be true
  PetscErrorCode  ierr;

  PetscCallCUDA(cudaSetDeviceFlags(cudaDeviceBlockingSync));

  PetscCall(PetscInitialize(&argc, &argv, 0, help));

  PetscOptionsBegin(PETSC_COMM_WORLD, "", "STREAM Benchmark Options", "STREAM");
  PetscCall(PetscOptionsBoundedInt("-device", "Specify the CUDA device to be used", "STREAM", device, &device, NULL, 0));
  PetscCall(PetscOptionsBool("-double", "Also run double precision tests", "STREAM", runDouble, &runDouble, NULL));
  PetscOptionsEnd();

  ierr = setupStream(device, runDouble, cpuTiming);
  /* Bug fix: report unconditionally. The original printed only when ierr != 0,
     so the "PASSES" branch of the ternary was unreachable. */
  PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n[streamBenchmark] - results:\t%s\n\n", (ierr == 0) ? "PASSES" : "FAILED"));
  PetscCall(PetscFinalize());
  return 0;
}
349 
350 ///////////////////////////////////////////////////////////////////////////////
351 //Run the appropriate tests
352 ///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// Validate the requested device, pick a block size for its architecture, and
// dispatch to the float or double benchmark driver.
///////////////////////////////////////////////////////////////////////////////
PetscErrorCode setupStream(PetscInt deviceNum, PetscBool runDouble, PetscBool cpuTiming) {
  PetscInt iNumThreadsPerBlock = 128;

  PetscFunctionBegin;
  // Check device
  {
    int deviceCount;

    /* deliberately unchecked: a missing driver is reported through deviceCount == 0 below */
    cudaGetDeviceCount(&deviceCount);
    if (deviceCount == 0) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, "!!!!!No devices found!!!!!\n"));
      return -1000;
    }

    if (deviceNum >= deviceCount || deviceNum < 0) {
      /* cast: PetscInt may be 64-bit while %d expects int */
      PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n!!!!!Invalid GPU number %d given hence default gpu %d will be used !!!!!\n", (int)deviceNum, 0));
      deviceNum = 0;
    }
  }

  PetscCallCUDA(cudaSetDevice((int)deviceNum));
  cudaDeviceProp deviceProp;
  if (cudaGetDeviceProperties(&deviceProp, (int)deviceNum) != cudaSuccess) {
    /* bug fix: the format string had a %d but no argument was supplied (undefined behavior) */
    PetscCall(PetscPrintf(PETSC_COMM_SELF, " Unable to determine device %d properties, exiting\n", (int)deviceNum));
    return -1;
  }

  /* double precision requires compute capability >= 1.3 */
  if (runDouble && deviceProp.major == 1 && deviceProp.minor < 3) {
    PetscCall(PetscPrintf(PETSC_COMM_SELF, " Unable to run double-precision STREAM benchmark on a compute capability GPU less than 1.3\n"));
    return -1;
  }
  if (deviceProp.major == 2 && deviceProp.minor == 1) iNumThreadsPerBlock = 192; /* GF104 architecture / 48 CUDA Cores per MP */
  else iNumThreadsPerBlock = 128;                                                /* GF100 architecture / 32 CUDA Cores per MP */

  if (runDouble) PetscCall(runStreamDouble(iNumThreadsPerBlock, cpuTiming));
  else PetscCall(runStream(iNumThreadsPerBlock, cpuTiming));
  PetscFunctionReturn(PETSC_SUCCESS);
}
392 
393 ///////////////////////////////////////////////////////////////////////////
394 // runStream
395 ///////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////
// runStream
//
// Times the eight single-precision STREAM kernels NTIMES each using the CPU
// wall clock (an MPI barrier aligns ranks before each measurement), then
// verifies every kernel against a host reference and prints bandwidth.
// times[0..7][k] = Copy, Copy Opt, Scale, Scale Opt, Add, Add Opt, Triad,
// Triad Opt (milliseconds for repetition k).
///////////////////////////////////////////////////////////////////////////
PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming) {
  float *d_a, *d_b, *d_c;
  int    k;
  float  times[8][NTIMES];
  float  scalar;

  PetscFunctionBegin;
  /* Allocate memory on device */
  PetscCallCUDA(cudaMalloc((void **)&d_a, sizeof(float) * N));
  PetscCallCUDA(cudaMalloc((void **)&d_b, sizeof(float) * N));
  PetscCallCUDA(cudaMalloc((void **)&d_c, sizeof(float) * N));

  /* Compute execution configuration: 1-D grid rounded up to cover all N elements */
  dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */
  dim3 dimGrid(N / dimBlock.x);       /* (N/dimBlock.x,1,1) */
  if (N % dimBlock.x != 0) dimGrid.x += 1;

  /* Initialize memory on the device */
  set_array<<<dimGrid, dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid, dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid, dimBlock>>>(d_c, .5f, N);

  /* --- MAIN LOOP --- repeat test cases NTIMES times --- */
  PetscLogDouble cpuTimer = 0.0;

  scalar = 3.0f;
  for (k = 0; k < NTIMES; ++k) {
    /* bug fix: reset the accumulator here — previously the Triad-Opt time of
       iteration k-1 leaked into the Copy measurement of iteration k */
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Copy<<<dimGrid, dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[0][k] = cpuTimer * 1.e3; // millisec

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Copy_Optimized<<<dimGrid, dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[1][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Scale<<<dimGrid, dimBlock>>>(d_b, d_c, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[2][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Scale_Optimized<<<dimGrid, dimBlock>>>(d_b, d_c, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[3][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Add<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    /* bug fix: a stray cudaEventRecord(stop, 0) was removed here — 'stop' was
       never declared (leftover from disabled cudaEvent-based timing) */
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[4][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Add_Optimized<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[5][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Triad<<<dimGrid, dimBlock>>>(d_b, d_c, d_a, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[6][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Triad_Optimized<<<dimGrid, dimBlock>>>(d_b, d_c, d_a, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[7][k] = cpuTimer * 1.e3;
  }

  if (1) { /* verify kernels: re-run each once and compare against a host reference */
    float *h_a, *h_b, *h_c;
    bool   errorSTREAMkernel = true;

    if ((h_a = (float *)calloc(N, sizeof(float))) == (float *)NULL) {
      printf("Unable to allocate array h_a, exiting ...\n");
      exit(1);
    }
    if ((h_b = (float *)calloc(N, sizeof(float))) == (float *)NULL) {
      printf("Unable to allocate array h_b, exiting ...\n");
      exit(1);
    }

    if ((h_c = (float *)calloc(N, sizeof(float))) == (float *)NULL) {
      printf("Unable to allocate array h_c, exiting ...\n");
      exit(1);
    }

    /*
     * perform kernel, copy device memory into host memory and verify each
     * device kernel output
     */

    /* Initialize memory on the device */
    set_array<<<dimGrid, dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid, dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid, dimBlock>>>(d_c, .5f, N);

    STREAM_Copy<<<dimGrid, dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n"));
      exit(-2000);
    }

    /* Initialize memory on the device */
    set_array<<<dimGrid, dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid, dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid, dimBlock>>>(d_c, .5f, N);

    STREAM_Copy_Optimized<<<dimGrid, dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n"));
      exit(-3000);
    }

    /* Initialize memory on the device */
    set_array<<<dimGrid, dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid, dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid, dimBlock>>>(d_c, .5f, N);

    STREAM_Scale<<<dimGrid, dimBlock>>>(d_b, d_c, scalar, N);
    PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Scale_verify(h_b, h_c, scalar, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n"));
      exit(-4000);
    }

    /* Initialize memory on the device */
    set_array<<<dimGrid, dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid, dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid, dimBlock>>>(d_c, .5f, N);

    STREAM_Add<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Add_verify(h_a, h_b, h_c, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n"));
      exit(-5000);
    }

    /* Initialize memory on the device */
    set_array<<<dimGrid, dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid, dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid, dimBlock>>>(d_c, .5f, N);

    STREAM_Triad<<<dimGrid, dimBlock>>>(d_b, d_c, d_a, scalar, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Triad_verify(h_b, h_c, h_a, scalar, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n"));
      exit(-6000);
    }

    free(h_a);
    free(h_b);
    free(h_c);
  }

  printResultsReadable(times, sizeof(float));

  /* Free memory on device */
  PetscCallCUDA(cudaFree(d_a));
  PetscCallCUDA(cudaFree(d_b));
  PetscCallCUDA(cudaFree(d_c));

  PetscFunctionReturn(PETSC_SUCCESS);
}
610 
///////////////////////////////////////////////////////////////////////////
// runStreamDouble
//
// Double-precision twin of runStream(): times the eight STREAM kernels NTIMES
// each with the CPU wall clock, verifies them against host references, and
// prints bandwidth. times[0..7][k] = Copy, Copy Opt, Scale, Scale Opt, Add,
// Add Opt, Triad, Triad Opt (milliseconds for repetition k).
///////////////////////////////////////////////////////////////////////////
PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming) {
  double *d_a, *d_b, *d_c;
  int     k;
  float   times[8][NTIMES];
  double  scalar;

  PetscFunctionBegin;
  /* Allocate memory on device */
  PetscCallCUDA(cudaMalloc((void **)&d_a, sizeof(double) * N));
  PetscCallCUDA(cudaMalloc((void **)&d_b, sizeof(double) * N));
  PetscCallCUDA(cudaMalloc((void **)&d_c, sizeof(double) * N));

  /* Compute execution configuration: 1-D grid rounded up to cover all N elements */
  dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */
  dim3 dimGrid(N / dimBlock.x);       /* (N/dimBlock.x,1,1) */
  if (N % dimBlock.x != 0) dimGrid.x += 1;

  /* Initialize memory on the device */
  set_array_double<<<dimGrid, dimBlock>>>(d_a, 2., N);
  set_array_double<<<dimGrid, dimBlock>>>(d_b, .5, N);
  set_array_double<<<dimGrid, dimBlock>>>(d_c, .5, N);

  /* --- MAIN LOOP --- repeat test cases NTIMES times --- */
  PetscLogDouble cpuTimer = 0.0;

  scalar = 3.0;
  for (k = 0; k < NTIMES; ++k) {
    /* bug fix: reset the accumulator here — previously the Triad-Opt time of
       iteration k-1 leaked into the Copy measurement of iteration k */
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Copy_double<<<dimGrid, dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[0][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Copy_Optimized_double<<<dimGrid, dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[1][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Scale_double<<<dimGrid, dimBlock>>>(d_b, d_c, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[2][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Scale_Optimized_double<<<dimGrid, dimBlock>>>(d_b, d_c, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[3][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Add_double<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[4][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Add_Optimized_double<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[5][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Triad_double<<<dimGrid, dimBlock>>>(d_b, d_c, d_a, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[6][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Triad_Optimized_double<<<dimGrid, dimBlock>>>(d_b, d_c, d_a, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[7][k] = cpuTimer * 1.e3;
  }

  if (1) { /* verify kernels: re-run each once and compare against a host reference */
    double *h_a, *h_b, *h_c;
    bool    errorSTREAMkernel = true;

    if ((h_a = (double *)calloc(N, sizeof(double))) == (double *)NULL) {
      printf("Unable to allocate array h_a, exiting ...\n");
      exit(1);
    }
    if ((h_b = (double *)calloc(N, sizeof(double))) == (double *)NULL) {
      printf("Unable to allocate array h_b, exiting ...\n");
      exit(1);
    }

    if ((h_c = (double *)calloc(N, sizeof(double))) == (double *)NULL) {
      printf("Unable to allocate array h_c, exiting ...\n");
      exit(1);
    }

    /*
     * perform kernel, copy device memory into host memory and verify each
     * device kernel output
     */

    /* Initialize memory on the device */
    set_array_double<<<dimGrid, dimBlock>>>(d_a, 2., N);
    set_array_double<<<dimGrid, dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid, dimBlock>>>(d_c, .5, N);

    STREAM_Copy_double<<<dimGrid, dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n"));
      exit(-2000);
    }

    /* Initialize memory on the device */
    set_array_double<<<dimGrid, dimBlock>>>(d_a, 2., N);
    set_array_double<<<dimGrid, dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid, dimBlock>>>(d_c, .5, N);

    STREAM_Copy_Optimized_double<<<dimGrid, dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n"));
      exit(-3000);
    }

    /* Initialize memory on the device (d_a reset restored for consistency with the float path) */
    set_array_double<<<dimGrid, dimBlock>>>(d_a, 2., N);
    set_array_double<<<dimGrid, dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid, dimBlock>>>(d_c, .5, N);

    STREAM_Scale_double<<<dimGrid, dimBlock>>>(d_b, d_c, scalar, N);
    PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Scale_verify_double(h_b, h_c, scalar, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n"));
      exit(-4000);
    }

    /* Initialize memory on the device */
    set_array_double<<<dimGrid, dimBlock>>>(d_a, 2., N);
    set_array_double<<<dimGrid, dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid, dimBlock>>>(d_c, .5, N);

    STREAM_Add_double<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Add_verify_double(h_a, h_b, h_c, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n"));
      exit(-5000);
    }

    /* Initialize memory on the device */
    set_array_double<<<dimGrid, dimBlock>>>(d_a, 2., N);
    set_array_double<<<dimGrid, dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid, dimBlock>>>(d_c, .5, N);

    STREAM_Triad_double<<<dimGrid, dimBlock>>>(d_b, d_c, d_a, scalar, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Triad_verify_double(h_b, h_c, h_a, scalar, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n"));
      exit(-6000);
    }

    free(h_a);
    free(h_b);
    free(h_c);
  }

  printResultsReadable(times, sizeof(double));

  /* Free memory on device */
  PetscCallCUDA(cudaFree(d_a));
  PetscCallCUDA(cudaFree(d_b));
  PetscCallCUDA(cudaFree(d_c));

  PetscFunctionReturn(PETSC_SUCCESS);
}
823 
824 ///////////////////////////////////////////////////////////////////////////
825 //Print Results to Screen and File
826 ///////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////
// Print Results to Screen and File
//
// times[j][k] is the time (ms) of kernel j in repetition k; bsize is the
// element size (sizeof(float) or sizeof(double)) used to compute bytes moved.
// The Triad-Optimized bandwidth (based on the per-rank minimum time) is summed
// over MPI ranks; rank 0 prints it. A single-rank run records its rate in the
// file "flops" so a later multi-rank run can report the parallel speedup.
///////////////////////////////////////////////////////////////////////////
PetscErrorCode printResultsReadable(float times[][NTIMES], const size_t bsize) {
  PetscInt    j, k;
  float       avgtime[8] = {0., 0., 0., 0., 0., 0., 0., 0.};
  float       maxtime[8] = {0., 0., 0., 0., 0., 0., 0., 0.};
  float       mintime[8] = {1e30, 1e30, 1e30, 1e30, 1e30, 1e30, 1e30, 1e30};
  /* bytes moved per invocation: 2 vectors for Copy/Scale, 3 for Add/Triad */
  const float bytes_per_kernel[8] = {2. * bsize * N, 2. * bsize * N, 2. * bsize * N, 2. * bsize * N, 3. * bsize * N, 3. * bsize * N, 3. * bsize * N, 3. * bsize * N};
  double      rate, irate;
  int         rank, size;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_rank(MPI_COMM_WORLD, &rank));
  PetscCallMPI(MPI_Comm_size(MPI_COMM_WORLD, &size));
  /* --- SUMMARY --- skip the first (warm-up) repetition, per STREAM convention;
     this also makes the NTIMES-1 divisor below consistent with the sum
     (the original summed all NTIMES values yet divided by NTIMES-1) */
  for (k = 1; k < NTIMES; ++k) {
    for (j = 0; j < 8; ++j) {
      avgtime[j] = avgtime[j] + (1.e-03f * times[j][k]); // millisec --> sec
      mintime[j] = MIN(mintime[j], (1.e-03f * times[j][k]));
      maxtime[j] = MAX(maxtime[j], (1.e-03f * times[j][k]));
    }
  }
  for (j = 0; j < 8; ++j) avgtime[j] = avgtime[j] / (float)(NTIMES - 1);
  j     = 7; /* report the Triad-Optimized kernel */
  irate = 1.0E-06 * bytes_per_kernel[j] / mintime[j];
  PetscCallMPI(MPI_Reduce(&irate, &rate, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD));
  if (rank == 0) {
    FILE *fd;
    if (size == 1) {
      printf("%d %11.4f   Rate (MB/s)\n", size, rate);
      fd = fopen("flops", "w"); /* record the single-rank rate for later speedup reports */
      if (fd) {
        fprintf(fd, "%g\n", rate);
        fclose(fd);
      }
    } else {
      double prate;
      fd = fopen("flops", "r");
      /* guard fopen/fscanf: the original divided by an uninitialized prate when
         no prior single-rank run had written the "flops" file */
      if (fd) {
        if (fscanf(fd, "%lg", &prate) == 1) printf("%d %11.4f   Rate (MB/s) %g \n", size, rate, rate / prate);
        fclose(fd);
      }
    }
  }

  PetscFunctionReturn(PETSC_SUCCESS);
}
870