1 /*
2 STREAM benchmark implementation in CUDA.
3
4 COPY: a(i) = b(i)
5 SCALE: a(i) = q*b(i)
6 SUM: a(i) = b(i) + c(i)
7 TRIAD: a(i) = b(i) + q*c(i)
8
9 It measures the memory system on the device.
10 The implementation is in double precision with a single option.
11
12 Code based on the code developed by John D. McCalpin
13 http://www.cs.virginia.edu/stream/FTP/Code/stream.c
14
15 Written by: Massimiliano Fatica, NVIDIA Corporation
16 Modified by: Douglas Enright (dpephd-nvidia@yahoo.com), 1 December 2010
17 Extensive Revisions, 4 December 2010
18 Modified for PETSc by: Matthew G. Knepley 14 Aug 2011
19
20 User interface motivated by bandwidthTest NVIDIA SDK example.
21 */
22 static char help[] = "Double-Precision STREAM Benchmark implementation in CUDA\n Performs Copy, Scale, Add, and Triad double-precision kernels\n\n";
23
24 #include <petscconf.h>
25 #include <petscsys.h>
26 #include <petsctime.h>
27 #include <petscdevice_cuda.h>
28
29 #define N 10000000
30 #define NTIMES 10
31
32 #if !defined(MIN)
33 #define MIN(x, y) ((x) < (y) ? (x) : (y))
34 #endif
35 #if !defined(MAX)
36 #define MAX(x, y) ((x) > (y) ? (x) : (y))
37 #endif
38
39 const float flt_eps = 1.192092896e-07f;
40 const double dbl_eps = 2.2204460492503131e-16;
41
/* Fill the single-precision device array a[0..len) with `value`.
   Grid-stride loop: correct for any launch configuration. */
__global__ void set_array(float *a, float value, size_t len)
{
  const size_t stride = blockDim.x * gridDim.x;

  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += stride) a[i] = value;
}
50
/* Fill the double-precision device array a[0..len) with `value`.
   Grid-stride loop: correct for any launch configuration. */
__global__ void set_array_double(double *a, double value, size_t len)
{
  const size_t stride = blockDim.x * gridDim.x;

  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += stride) a[i] = value;
}
59
STREAM_Copy(float * a,float * b,size_t len)60 __global__ void STREAM_Copy(float *a, float *b, size_t len)
61 {
62 size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
63 while (idx < len) {
64 b[idx] = a[idx];
65 idx += blockDim.x * gridDim.x;
66 }
67 }
68
/* STREAM COPY kernel (double precision): b[i] = a[i].
   Grid-stride loop, so any grid size covers all len elements. */
__global__ void STREAM_Copy_double(double *a, double *b, size_t len)
{
  const size_t stride = blockDim.x * gridDim.x;

  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += stride) b[i] = a[i];
}
77
/* STREAM COPY kernel, one-element-per-thread variant (single precision).
   Requires the launch to supply at least `len` threads; a smaller launch
   leaves the output untouched (whole-kernel no-op). */
__global__ void STREAM_Copy_Optimized(float *a, float *b, size_t len)
{
  if (blockDim.x * gridDim.x >= len) {
    const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < len) b[i] = a[i];
  }
}
88
/* STREAM COPY kernel, one-element-per-thread variant (double precision).
   Requires the launch to supply at least `len` threads; a smaller launch
   leaves the output untouched (whole-kernel no-op). */
__global__ void STREAM_Copy_Optimized_double(double *a, double *b, size_t len)
{
  if (blockDim.x * gridDim.x >= len) {
    const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < len) b[i] = a[i];
  }
}
99
/* STREAM SCALE kernel (single precision): b[i] = scale * a[i].
   Grid-stride loop, so any grid size covers all len elements. */
__global__ void STREAM_Scale(float *a, float *b, float scale, size_t len)
{
  const size_t stride = blockDim.x * gridDim.x;

  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += stride) b[i] = scale * a[i];
}
108
/* STREAM SCALE kernel (double precision): b[i] = scale * a[i].
   Grid-stride loop, so any grid size covers all len elements. */
__global__ void STREAM_Scale_double(double *a, double *b, double scale, size_t len)
{
  const size_t stride = blockDim.x * gridDim.x;

  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += stride) b[i] = scale * a[i];
}
117
/* STREAM SCALE kernel, one-element-per-thread variant (single precision).
   Requires the launch to supply at least `len` threads; a smaller launch
   leaves the output untouched (whole-kernel no-op). */
__global__ void STREAM_Scale_Optimized(float *a, float *b, float scale, size_t len)
{
  if (blockDim.x * gridDim.x >= len) {
    const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < len) b[i] = scale * a[i];
  }
}
128
/* STREAM SCALE kernel, one-element-per-thread variant (double precision).
   Requires the launch to supply at least `len` threads; a smaller launch
   leaves the output untouched (whole-kernel no-op). */
__global__ void STREAM_Scale_Optimized_double(double *a, double *b, double scale, size_t len)
{
  if (blockDim.x * gridDim.x >= len) {
    const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < len) b[i] = scale * a[i];
  }
}
139
/* STREAM ADD kernel (single precision): c[i] = a[i] + b[i].
   Grid-stride loop, so any grid size covers all len elements. */
__global__ void STREAM_Add(float *a, float *b, float *c, size_t len)
{
  const size_t stride = blockDim.x * gridDim.x;

  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += stride) c[i] = a[i] + b[i];
}
148
/* STREAM ADD kernel (double precision): c[i] = a[i] + b[i].
   Grid-stride loop, so any grid size covers all len elements. */
__global__ void STREAM_Add_double(double *a, double *b, double *c, size_t len)
{
  const size_t stride = blockDim.x * gridDim.x;

  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += stride) c[i] = a[i] + b[i];
}
157
/* STREAM ADD kernel, one-element-per-thread variant (single precision).
   Requires the launch to supply at least `len` threads; a smaller launch
   leaves the output untouched (whole-kernel no-op). */
__global__ void STREAM_Add_Optimized(float *a, float *b, float *c, size_t len)
{
  if (blockDim.x * gridDim.x >= len) {
    const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < len) c[i] = a[i] + b[i];
  }
}
168
/* STREAM ADD kernel, one-element-per-thread variant (double precision).
   Requires the launch to supply at least `len` threads; a smaller launch
   leaves the output untouched (whole-kernel no-op). */
__global__ void STREAM_Add_Optimized_double(double *a, double *b, double *c, size_t len)
{
  if (blockDim.x * gridDim.x >= len) {
    const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < len) c[i] = a[i] + b[i];
  }
}
179
/* STREAM TRIAD kernel (single precision): c[i] = a[i] + scalar * b[i].
   Grid-stride loop, so any grid size covers all len elements. */
__global__ void STREAM_Triad(float *a, float *b, float *c, float scalar, size_t len)
{
  const size_t stride = blockDim.x * gridDim.x;

  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += stride) c[i] = a[i] + scalar * b[i];
}
188
/* STREAM TRIAD kernel (double precision): c[i] = a[i] + scalar * b[i].
   Grid-stride loop, so any grid size covers all len elements. */
__global__ void STREAM_Triad_double(double *a, double *b, double *c, double scalar, size_t len)
{
  const size_t stride = blockDim.x * gridDim.x;

  for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < len; i += stride) c[i] = a[i] + scalar * b[i];
}
197
/* STREAM TRIAD kernel, one-element-per-thread variant (single precision).
   Requires the launch to supply at least `len` threads; a smaller launch
   leaves the output untouched (whole-kernel no-op). */
__global__ void STREAM_Triad_Optimized(float *a, float *b, float *c, float scalar, size_t len)
{
  if (blockDim.x * gridDim.x >= len) {
    const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < len) c[i] = a[i] + scalar * b[i];
  }
}
208
/* STREAM TRIAD kernel, one-element-per-thread variant (double precision).
   Requires the launch to supply at least `len` threads; a smaller launch
   leaves the output untouched (whole-kernel no-op). */
__global__ void STREAM_Triad_Optimized_double(double *a, double *b, double *c, double scalar, size_t len)
{
  if (blockDim.x * gridDim.x >= len) {
    const size_t i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < len) c[i] = a[i] + scalar * b[i];
  }
}
219
220 /* Host side verification routines */
/* Host-side check of the COPY result: returns true iff some element of b
   differs from a by more than 2 ulps of relative error (stops at the first
   mismatch). */
bool STREAM_Copy_verify(float *a, float *b, size_t len)
{
  for (size_t i = 0; i < len; i++) {
    const float expected = a[i];
    const float errULPS  = (fabsf(b[i] - expected) / fabsf(expected)) / flt_eps;
    if (errULPS > 2.f) return true; /* element-wise relative error too large */
  }
  return false;
}
236
/* Host-side check of the double-precision COPY result: returns true iff some
   element of b differs from a by more than 2 ulps of relative error (stops at
   the first mismatch). */
bool STREAM_Copy_verify_double(double *a, double *b, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult     = a[idx];
    double diffResultExpected = (b[idx] - expectedResult);
    /* use double-precision fabs(): fabsf() truncated the operands to float,
       hiding any error below single precision */
    double relErrorULPS = (fabs(diffResultExpected) / fabs(expectedResult)) / dbl_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.);
  }

  return bDifferent;
}
252
/* Host-side check of the SCALE result: returns true iff some element of b
   differs from scale*a by more than 2 ulps of relative error (stops at the
   first mismatch). */
bool STREAM_Scale_verify(float *a, float *b, float scale, size_t len)
{
  for (size_t i = 0; i < len; i++) {
    const float expected = scale * a[i];
    const float errULPS  = (fabsf(b[i] - expected) / fabsf(expected)) / flt_eps;
    if (errULPS > 2.f) return true; /* element-wise relative error too large */
  }
  return false;
}
268
/* Host-side check of the double-precision SCALE result: returns true iff some
   element of b differs from scale*a by more than 2 ulps of relative error
   (stops at the first mismatch). */
bool STREAM_Scale_verify_double(double *a, double *b, double scale, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult     = scale * a[idx];
    double diffResultExpected = (b[idx] - expectedResult);
    /* use double-precision fabs() and the double-precision machine epsilon:
       the previous fabsf()/flt_eps combination measured ulps on the wrong
       scale (~1e9 too loose) and truncated the operands to float */
    double relErrorULPS = (fabs(diffResultExpected) / fabs(expectedResult)) / dbl_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.);
  }

  return bDifferent;
}
284
/* Host-side check of the ADD result: returns true iff some element of c
   differs from a+b by more than 2 ulps of relative error (stops at the first
   mismatch). */
bool STREAM_Add_verify(float *a, float *b, float *c, size_t len)
{
  for (size_t i = 0; i < len; i++) {
    const float expected = a[i] + b[i];
    const float errULPS  = (fabsf(c[i] - expected) / fabsf(expected)) / flt_eps;
    if (errULPS > 2.f) return true; /* element-wise relative error too large */
  }
  return false;
}
300
/* Host-side check of the double-precision ADD result: returns true iff some
   element of c differs from a+b by more than 2 ulps of relative error (stops
   at the first mismatch). */
bool STREAM_Add_verify_double(double *a, double *b, double *c, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult     = a[idx] + b[idx];
    double diffResultExpected = (c[idx] - expectedResult);
    /* use double-precision fabs() and the double-precision machine epsilon:
       the previous fabsf()/flt_eps combination measured ulps on the wrong
       scale (~1e9 too loose) and truncated the operands to float */
    double relErrorULPS = (fabs(diffResultExpected) / fabs(expectedResult)) / dbl_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.);
  }

  return bDifferent;
}
316
/* Host-side check of the TRIAD result: returns true iff some element of c
   differs from a + scalar*b by more than 3 ulps of relative error (stops at
   the first mismatch; TRIAD allows one extra ulp for the fused multiply-add). */
bool STREAM_Triad_verify(float *a, float *b, float *c, float scalar, size_t len)
{
  for (size_t i = 0; i < len; i++) {
    const float expected = a[i] + scalar * b[i];
    const float errULPS  = (fabsf(c[i] - expected) / fabsf(expected)) / flt_eps;
    if (errULPS > 3.f) return true; /* element-wise relative error too large */
  }
  return false;
}
332
/* Host-side check of the double-precision TRIAD result: returns true iff some
   element of c differs from a + scalar*b by more than 3 ulps of relative
   error (stops at the first mismatch). */
bool STREAM_Triad_verify_double(double *a, double *b, double *c, double scalar, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult     = a[idx] + scalar * b[idx];
    double diffResultExpected = (c[idx] - expectedResult);
    /* use double-precision fabs() and the double-precision machine epsilon:
       the previous fabsf()/flt_eps combination measured ulps on the wrong
       scale (~1e9 too loose) and truncated the operands to float */
    double relErrorULPS = (fabs(diffResultExpected) / fabs(expectedResult)) / dbl_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 3.);
  }

  return bDifferent;
}
348
349 /* forward declarations */
350 PetscErrorCode setupStream(PetscInt device, PetscBool runDouble, PetscBool cpuTiming);
351 PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming);
352 PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming);
353 PetscErrorCode printResultsReadable(float times[][NTIMES], size_t);
354
/* Entry point: parse options (-device, -double), run the benchmark, and
   report PASSES/FAILED based on setupStream's return code. */
int main(int argc, char *argv[])
{
  PetscInt        device    = 0;
  PetscBool       runDouble = PETSC_TRUE;
  const PetscBool cpuTiming = PETSC_TRUE; // must be true
  PetscErrorCode  ierr;

  PetscCallCUDA(cudaSetDeviceFlags(cudaDeviceBlockingSync));

  PetscCall(PetscInitialize(&argc, &argv, 0, help));

  PetscOptionsBegin(PETSC_COMM_WORLD, "", "STREAM Benchmark Options", "STREAM");
  PetscCall(PetscOptionsBoundedInt("-device", "Specify the CUDA device to be used", "STREAM", device, &device, NULL, 0));
  PetscCall(PetscOptionsBool("-double", "Also run double precision tests", "STREAM", runDouble, &runDouble, NULL));
  PetscOptionsEnd();

  ierr = setupStream(device, runDouble, cpuTiming);
  /* report unconditionally: the old `if (ierr)` guard made the "PASSES"
     branch of the ternary unreachable */
  PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n[streamBenchmark] - results:\t%s\n\n", (ierr == 0) ? "PASSES" : "FAILED"));
  PetscCall(PetscFinalize());
  return 0;
}
376
377 ///////////////////////////////////////////////////////////////////////////////
378 //Run the appropriate tests
379 ///////////////////////////////////////////////////////////////////////////////
/* Select and validate the CUDA device, pick a block size based on the
   device's compute capability, then dispatch to the float or double
   benchmark driver.

   Returns 0 on success, a negative code when no usable device is found or
   the device cannot run the requested precision. */
PetscErrorCode setupStream(PetscInt deviceNum, PetscBool runDouble, PetscBool cpuTiming)
{
  PetscInt iNumThreadsPerBlock = 128;

  PetscFunctionBegin;
  // Check device
  {
    int deviceCount;

    PetscCallCUDA(cudaGetDeviceCount(&deviceCount)); /* was unchecked */
    if (deviceCount == 0) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, "!!!!!No devices found!!!!!\n"));
      return -1000;
    }

    if (deviceNum >= deviceCount || deviceNum < 0) {
      /* cast: PetscInt may be 64-bit while %d expects int */
      PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n!!!!!Invalid GPU number %d given hence default gpu %d will be used !!!!!\n", (int)deviceNum, 0));
      deviceNum = 0;
    }
  }

  PetscCallCUDA(cudaSetDevice((int)deviceNum)); /* was unchecked */
  // PetscCall(PetscPrintf(PETSC_COMM_SELF, "Running on...\n\n"));
  cudaDeviceProp deviceProp;
  if (cudaGetDeviceProperties(&deviceProp, (int)deviceNum) != cudaSuccess) {
    /* the format string previously had a %d with no matching argument (UB) */
    PetscCall(PetscPrintf(PETSC_COMM_SELF, " Unable to determine device %d properties, exiting\n", (int)deviceNum));
    return -1;
  }

  if (runDouble && deviceProp.major == 1 && deviceProp.minor < 3) {
    PetscCall(PetscPrintf(PETSC_COMM_SELF, " Unable to run double-precision STREAM benchmark on a compute capability GPU less than 1.3\n"));
    return -1;
  }
  if (deviceProp.major == 2 && deviceProp.minor == 1) iNumThreadsPerBlock = 192; /* GF104 architecture / 48 CUDA Cores per MP */
  else iNumThreadsPerBlock = 128;                                               /* GF100 architecture / 32 CUDA Cores per MP */

  if (runDouble) PetscCall(runStreamDouble(iNumThreadsPerBlock, cpuTiming));
  else PetscCall(runStream(iNumThreadsPerBlock, cpuTiming));
  PetscFunctionReturn(PETSC_SUCCESS);
}
420
421 ///////////////////////////////////////////////////////////////////////////
422 // runStream
423 ///////////////////////////////////////////////////////////////////////////
/*
  Run the single-precision STREAM kernels (Copy/Scale/Add/Triad, plain and
  one-element-per-thread variants), time each one NTIMES times with the CPU
  timer, verify every kernel's result on the host, and print a bandwidth
  summary.

  Input Parameters:
+ iNumThreadsPerBlock - threads per block for every kernel launch
- bDontUseGPUTiming   - when PETSC_TRUE, store CPU wall-clock timings in times[][]
                        (the caller always passes PETSC_TRUE; GPU-event timing
                        was never wired up)
*/
PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming)
{
  float *d_a, *d_b, *d_c;
  int    k;
  float  times[8][NTIMES];
  float  scalar;

  PetscFunctionBegin;
  /* Allocate memory on device */
  PetscCallCUDA(cudaMalloc((void **)&d_a, sizeof(float) * N));
  PetscCallCUDA(cudaMalloc((void **)&d_b, sizeof(float) * N));
  PetscCallCUDA(cudaMalloc((void **)&d_c, sizeof(float) * N));

  /* Compute execution configuration: 1-D grid covering all N elements */
  dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */
  dim3 dimGrid(N / dimBlock.x);       /* (N/dimBlock.x,1,1) */
  if (N % dimBlock.x != 0) dimGrid.x += 1; /* round up for a partial last block */

  /* Initialize memory on the device */
  set_array<<<dimGrid, dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid, dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid, dimBlock>>>(d_c, .5f, N);

  /* --- MAIN LOOP --- repeat test cases NTIMES times --- */
  PetscLogDouble cpuTimer = 0.0;

  scalar = 3.0f;
  for (k = 0; k < NTIMES; ++k) {
    /* reset before the first measurement: previously the timer carried the
       last iteration's Triad time over into times[0][k] for k > 0 */
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Copy<<<dimGrid, dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL)); /* sync was previously unchecked */
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[0][k] = cpuTimer * 1.e3; // millisec

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Copy_Optimized<<<dimGrid, dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[1][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Scale<<<dimGrid, dimBlock>>>(d_b, d_c, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[2][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Scale_Optimized<<<dimGrid, dimBlock>>>(d_b, d_c, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[3][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Add<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    /* the stray cudaEventRecord(stop, 0) here referenced an undeclared
       variable (the cudaEvent_t pair was never created); CPU timing only */
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[4][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Add_Optimized<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[5][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Triad<<<dimGrid, dimBlock>>>(d_b, d_c, d_a, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[6][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Triad_Optimized<<<dimGrid, dimBlock>>>(d_b, d_c, d_a, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[7][k] = cpuTimer * 1.e3;
  }

  if (1) { /* verify kernels */
    float *h_a, *h_b, *h_c;
    bool   errorSTREAMkernel = true;

    if ((h_a = (float *)calloc(N, sizeof(float))) == (float *)NULL) {
      printf("Unable to allocate array h_a, exiting ...\n");
      exit(1);
    }
    if ((h_b = (float *)calloc(N, sizeof(float))) == (float *)NULL) {
      printf("Unable to allocate array h_b, exiting ...\n");
      exit(1);
    }
    if ((h_c = (float *)calloc(N, sizeof(float))) == (float *)NULL) {
      printf("Unable to allocate array h_c, exiting ...\n"); /* typo fixed: "Unalbe" */
      exit(1);
    }

    /*
     * perform kernel, copy device memory into host memory and verify each
     * device kernel output
     */

    /* Initialize memory on the device */
    set_array<<<dimGrid, dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid, dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid, dimBlock>>>(d_c, .5f, N);

    STREAM_Copy<<<dimGrid, dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n"));
      exit(-2000);
    }

    /* Initialize memory on the device */
    set_array<<<dimGrid, dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid, dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid, dimBlock>>>(d_c, .5f, N);

    STREAM_Copy_Optimized<<<dimGrid, dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n"));
      exit(-3000);
    }

    /* Initialize memory on the device */
    set_array<<<dimGrid, dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid, dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid, dimBlock>>>(d_c, .5f, N);

    STREAM_Scale<<<dimGrid, dimBlock>>>(d_b, d_c, scalar, N);
    PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Scale_verify(h_b, h_c, scalar, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n"));
      exit(-4000);
    }

    /* Initialize memory on the device */
    set_array<<<dimGrid, dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid, dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid, dimBlock>>>(d_c, .5f, N);

    STREAM_Add<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Add_verify(h_a, h_b, h_c, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n"));
      exit(-5000);
    }

    /* Initialize memory on the device */
    set_array<<<dimGrid, dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid, dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid, dimBlock>>>(d_c, .5f, N);

    /* Triad writes into d_a: c_out = b + scalar*c */
    STREAM_Triad<<<dimGrid, dimBlock>>>(d_b, d_c, d_a, scalar, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Triad_verify(h_b, h_c, h_a, scalar, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n"));
      exit(-6000);
    }

    free(h_a);
    free(h_b);
    free(h_c);
  }

  printResultsReadable(times, sizeof(float));

  /* Free memory on device */
  PetscCallCUDA(cudaFree(d_a));
  PetscCallCUDA(cudaFree(d_b));
  PetscCallCUDA(cudaFree(d_c));
  PetscFunctionReturn(PETSC_SUCCESS);
}
638
/*
  Run the double-precision STREAM kernels (Copy/Scale/Add/Triad, plain and
  one-element-per-thread variants), time each one NTIMES times with the CPU
  timer, verify every kernel's result on the host, and print a bandwidth
  summary.

  Input Parameters:
+ iNumThreadsPerBlock - threads per block for every kernel launch
- bDontUseGPUTiming   - when PETSC_TRUE, store CPU wall-clock timings in times[][]
                        (the caller always passes PETSC_TRUE)
*/
PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming)
{
  double *d_a, *d_b, *d_c;
  int     k;
  float   times[8][NTIMES];
  double  scalar;

  PetscFunctionBegin;
  /* Allocate memory on device */
  PetscCallCUDA(cudaMalloc((void **)&d_a, sizeof(double) * N));
  PetscCallCUDA(cudaMalloc((void **)&d_b, sizeof(double) * N));
  PetscCallCUDA(cudaMalloc((void **)&d_c, sizeof(double) * N));

  /* Compute execution configuration: 1-D grid covering all N elements */
  dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */
  dim3 dimGrid(N / dimBlock.x);       /* (N/dimBlock.x,1,1) */
  if (N % dimBlock.x != 0) dimGrid.x += 1; /* round up for a partial last block */

  /* Initialize memory on the device */
  set_array_double<<<dimGrid, dimBlock>>>(d_a, 2., N);
  set_array_double<<<dimGrid, dimBlock>>>(d_b, .5, N);
  set_array_double<<<dimGrid, dimBlock>>>(d_c, .5, N);

  /* --- MAIN LOOP --- repeat test cases NTIMES times --- */
  PetscLogDouble cpuTimer = 0.0;

  scalar = 3.0;
  for (k = 0; k < NTIMES; ++k) {
    /* reset before the first measurement: previously the timer carried the
       last iteration's Triad time over into times[0][k] for k > 0 */
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Copy_double<<<dimGrid, dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL)); /* sync was previously unchecked */
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms; timer is now always stopped, matching
    //the pattern used for the remaining kernels and by runStream()
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[0][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Copy_Optimized_double<<<dimGrid, dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[1][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Scale_double<<<dimGrid, dimBlock>>>(d_b, d_c, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[2][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Scale_Optimized_double<<<dimGrid, dimBlock>>>(d_b, d_c, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[3][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Add_double<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[4][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Add_Optimized_double<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[5][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Triad_double<<<dimGrid, dimBlock>>>(d_b, d_c, d_a, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[6][k] = cpuTimer * 1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Triad_Optimized_double<<<dimGrid, dimBlock>>>(d_b, d_c, d_a, scalar, N);
    PetscCallCUDA(cudaStreamSynchronize(NULL));
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[7][k] = cpuTimer * 1.e3;
  }

  if (1) { /* verify kernels */
    double *h_a, *h_b, *h_c;
    bool    errorSTREAMkernel = true;

    if ((h_a = (double *)calloc(N, sizeof(double))) == (double *)NULL) {
      printf("Unable to allocate array h_a, exiting ...\n");
      exit(1);
    }
    if ((h_b = (double *)calloc(N, sizeof(double))) == (double *)NULL) {
      printf("Unable to allocate array h_b, exiting ...\n");
      exit(1);
    }
    if ((h_c = (double *)calloc(N, sizeof(double))) == (double *)NULL) {
      printf("Unable to allocate array h_c, exiting ...\n"); /* typo fixed: "Unalbe" */
      exit(1);
    }

    /*
     * perform kernel, copy device memory into host memory and verify each
     * device kernel output
     */

    /* Initialize memory on the device */
    set_array_double<<<dimGrid, dimBlock>>>(d_a, 2., N);
    set_array_double<<<dimGrid, dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid, dimBlock>>>(d_c, .5, N);

    STREAM_Copy_double<<<dimGrid, dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n"));
      exit(-2000);
    }

    /* Initialize memory on the device */
    set_array_double<<<dimGrid, dimBlock>>>(d_a, 2., N);
    set_array_double<<<dimGrid, dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid, dimBlock>>>(d_c, .5, N);

    STREAM_Copy_Optimized_double<<<dimGrid, dimBlock>>>(d_a, d_c, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n"));
      exit(-3000);
    }

    /* Initialize memory on the device (d_a re-init added for consistency
       with the single-precision path; Scale itself touches only b and c) */
    set_array_double<<<dimGrid, dimBlock>>>(d_a, 2., N);
    set_array_double<<<dimGrid, dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid, dimBlock>>>(d_c, .5, N);

    STREAM_Scale_double<<<dimGrid, dimBlock>>>(d_b, d_c, scalar, N);
    PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Scale_verify_double(h_b, h_c, scalar, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n"));
      exit(-4000);
    }

    /* Initialize memory on the device */
    set_array_double<<<dimGrid, dimBlock>>>(d_a, 2., N);
    set_array_double<<<dimGrid, dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid, dimBlock>>>(d_c, .5, N);

    STREAM_Add_double<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Add_verify_double(h_a, h_b, h_c, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n"));
      exit(-5000);
    }

    /* Initialize memory on the device */
    set_array_double<<<dimGrid, dimBlock>>>(d_a, 2., N);
    set_array_double<<<dimGrid, dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid, dimBlock>>>(d_c, .5, N);

    /* Triad writes into d_a: c_out = b + scalar*c */
    STREAM_Triad_double<<<dimGrid, dimBlock>>>(d_b, d_c, d_a, scalar, N);
    PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
    errorSTREAMkernel = STREAM_Triad_verify_double(h_b, h_c, h_a, scalar, N);
    if (errorSTREAMkernel) {
      PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n"));
      exit(-6000);
    }

    free(h_a);
    free(h_b);
    free(h_c);
  }

  printResultsReadable(times, sizeof(double));

  /* Free memory on device */
  PetscCallCUDA(cudaFree(d_a));
  PetscCallCUDA(cudaFree(d_b));
  PetscCallCUDA(cudaFree(d_c));
  PetscFunctionReturn(PETSC_SUCCESS);
}
851
852 ///////////////////////////////////////////////////////////////////////////
853 //Print Results to Screen and File
854 ///////////////////////////////////////////////////////////////////////////
/* Summarize the timing table: compute per-kernel avg/min/max over the NTIMES
   iterations, reduce the Triad-Optimized bandwidth across MPI ranks, and
   print the aggregate rate (also writing/reading the "flops" file used to
   compare single-rank vs multi-rank runs).

   Input Parameters:
+ times - times[kernel][iteration] in milliseconds (8 kernels)
- bsize - bytes per element (sizeof(float) or sizeof(double)) */
PetscErrorCode printResultsReadable(float times[][NTIMES], const size_t bsize)
{
  PetscInt    j, k;
  float       avgtime[8] = {0., 0., 0., 0., 0., 0., 0., 0.};
  float       maxtime[8] = {0., 0., 0., 0., 0., 0., 0., 0.};
  float       mintime[8] = {1e30, 1e30, 1e30, 1e30, 1e30, 1e30, 1e30, 1e30};
  /* bytes moved per kernel invocation: Copy/Scale touch 2 vectors, Add/Triad 3 */
  const float bytes_per_kernel[8] = {2. * bsize * N, 2. * bsize * N, 2. * bsize * N, 2. * bsize * N, 3. * bsize * N, 3. * bsize * N, 3. * bsize * N, 3. * bsize * N};
  double      rate = 0.0, irate;
  int         rank, size;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_rank(MPI_COMM_WORLD, &rank));
  PetscCallMPI(MPI_Comm_size(MPI_COMM_WORLD, &size));
  /* --- SUMMARY --- */
  for (k = 0; k < NTIMES; ++k) {
    for (j = 0; j < (PetscInt)PETSC_STATIC_ARRAY_LENGTH(avgtime); ++j) {
      /* skip the first (warm-up) iteration in the average, as in the original
         STREAM benchmark; the divisor below (NTIMES - 1) already assumed this */
      if (k > 0) avgtime[j] = avgtime[j] + (1.e-03f * times[j][k]); // millisec --> sec
      mintime[j] = MIN(mintime[j], (1.e-03f * times[j][k]));
      maxtime[j] = MAX(maxtime[j], (1.e-03f * times[j][k]));
    }
  }
  for (j = 0; j < (PetscInt)PETSC_STATIC_ARRAY_LENGTH(avgtime); ++j) avgtime[j] = avgtime[j] / (float)(NTIMES - 1);
  j = 7; /* report the Triad Optimized kernel */
  irate = 1.0E-06 * bytes_per_kernel[j] / mintime[j]; /* MB/s for this rank */
  PetscCallMPI(MPI_Reduce(&irate, &rate, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD)); /* result was previously ignored */
  if (rank == 0) {
    FILE *fd;
    if (size == 1) {
      printf("%d %11.4f Rate (MB/s)\n", size, rate);
      fd = fopen("flops", "w");
      if (fd) { /* best effort: skip the file if it cannot be created */
        fprintf(fd, "%g\n", rate);
        fclose(fd);
      }
    } else {
      double prate = 0.0;
      fd = fopen("flops", "r");
      if (fd) { /* previously a missing "flops" file crashed on NULL fd */
        if (fscanf(fd, "%lg", &prate) != 1) prate = 0.0;
        fclose(fd);
      }
      printf("%d %11.4f Rate (MB/s) %g\n", size, rate, rate / prate);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
899