/*
  STREAM benchmark implementation in CUDA.

    COPY:  a(i) = b(i)
    SCALE: a(i) = q*b(i)
    SUM:   a(i) = b(i) + c(i)
    TRIAD: a(i) = b(i) + q*c(i)

  It measures the memory system on the device.
  The implementation is in double precision with a single option.

  Code based on the code developed by John D. McCalpin
  http://www.cs.virginia.edu/stream/FTP/Code/stream.c

  Written by: Massimiliano Fatica, NVIDIA Corporation
  Modified by: Douglas Enright (dpephd-nvidia@yahoo.com), 1 December 2010
  Extensive Revisions, 4 December 2010
  Modified for PETSc by: Matthew G. Knepley 14 Aug 2011

  User interface motivated by bandwidthTest NVIDIA SDK example.
*/
static char help[] = "Double-Precision STREAM Benchmark implementation in CUDA\n Performs Copy, Scale, Add, and Triad double-precision kernels\n\n";

#include <petscconf.h>
#include <petscsys.h>
#include <petsctime.h>

#define N      10000000 /* vector length */
#define NTIMES 10       /* timed repetitions of each kernel */

#ifndef MIN
#define MIN(x,y) ((x)<(y) ? (x) : (y))
#endif
#ifndef MAX
#define MAX(x,y) ((x)>(y) ? (x) : (y))
#endif

/* Machine epsilons: verification error below is expressed in units of eps (ULPs). */
const float  flt_eps = 1.192092896e-07f;
const double dbl_eps = 2.2204460492503131e-16;

/* Fill a[0..len) with value. Grid-stride loop: correct for any launch size. */
__global__ void set_array(float *a, float value, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    a[idx] = value;
    idx += blockDim.x * gridDim.x;
  }
}

/* Double-precision variant of set_array(). */
__global__ void set_array_double(double *a, double value, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    a[idx] = value;
    idx += blockDim.x * gridDim.x;
  }
}

/* STREAM Copy: b(i) = a(i); grid-stride loop. */
__global__ void STREAM_Copy(float *a, float *b, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    b[idx] = a[idx];
    idx += blockDim.x * gridDim.x;
  }
}

/* Double-precision STREAM Copy; grid-stride loop. */
__global__ void STREAM_Copy_double(double *a, double *b, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    b[idx] = a[idx];
    idx += blockDim.x * gridDim.x;
  }
}

/* STREAM Copy, one-element-per-thread variant (no loop overhead).
   Requires the launch to cover the whole vector, else it is a no-op. */
__global__ void STREAM_Copy_Optimized(float *a, float *b, size_t len)
{
  /*
   * Ensure size of thread index space is as large as or greater than
   * vector index space else return.
   */
  if (blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < len) b[idx] = a[idx];
}

/* Double-precision one-element-per-thread Copy; see STREAM_Copy_Optimized. */
__global__ void STREAM_Copy_Optimized_double(double *a, double *b, size_t len)
{
  if (blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < len) b[idx] = a[idx];
}

/* STREAM Scale: b(i) = scale*a(i); grid-stride loop. */
__global__ void STREAM_Scale(float *a, float *b, float scale, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    b[idx] = scale * a[idx];
    idx += blockDim.x * gridDim.x;
  }
}

/* Double-precision STREAM Scale; grid-stride loop. */
__global__ void STREAM_Scale_double(double *a, double *b, double scale, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    b[idx] = scale * a[idx];
    idx += blockDim.x * gridDim.x;
  }
}

/* One-element-per-thread Scale; launch must cover the vector, else no-op. */
__global__ void STREAM_Scale_Optimized(float *a, float *b, float scale, size_t len)
{
  if (blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < len) b[idx] = scale * a[idx];
}

/* Double-precision one-element-per-thread Scale. */
__global__ void STREAM_Scale_Optimized_double(double *a, double *b, double scale, size_t len)
{
  if (blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < len) b[idx] = scale * a[idx];
}

/* STREAM Add: c(i) = a(i) + b(i); grid-stride loop. */
__global__ void STREAM_Add(float *a, float *b, float *c, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    c[idx] = a[idx] + b[idx];
    idx += blockDim.x * gridDim.x;
  }
}

/* Double-precision STREAM Add; grid-stride loop. */
__global__ void STREAM_Add_double(double *a, double *b, double *c, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    c[idx] = a[idx] + b[idx];
    idx += blockDim.x * gridDim.x;
  }
}

/* One-element-per-thread Add; launch must cover the vector, else no-op. */
__global__ void STREAM_Add_Optimized(float *a, float *b, float *c, size_t len)
{
  if (blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < len) c[idx] = a[idx] + b[idx];
}

/* Double-precision one-element-per-thread Add. */
__global__ void STREAM_Add_Optimized_double(double *a, double *b, double *c, size_t len)
{
  if (blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < len) c[idx] = a[idx] + b[idx];
}

/* STREAM Triad: c(i) = a(i) + scalar*b(i); grid-stride loop. */
__global__ void STREAM_Triad(float *a, float *b, float *c, float scalar, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    c[idx] = a[idx] + scalar * b[idx];
    idx += blockDim.x * gridDim.x;
  }
}

/* Double-precision STREAM Triad; grid-stride loop. */
__global__ void STREAM_Triad_double(double *a, double *b, double *c, double scalar, size_t len)
{
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  while (idx < len) {
    c[idx] = a[idx] + scalar * b[idx];
    idx += blockDim.x * gridDim.x;
  }
}

/* One-element-per-thread Triad; launch must cover the vector, else no-op. */
__global__ void STREAM_Triad_Optimized(float *a, float *b, float *c, float scalar, size_t len)
{
  if (blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < len) c[idx] = a[idx] + scalar * b[idx];
}

/* Double-precision one-element-per-thread Triad. */
__global__ void STREAM_Triad_Optimized_double(double *a, double *b, double *c, double scalar, size_t len)
{
  if (blockDim.x * gridDim.x < len) return;
  size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < len) c[idx] = a[idx] + scalar * b[idx];
}

/* Host side verification routines.
   Each returns true when some element differs from the expected value by
   more than the stated number of ULPs (relative error / machine epsilon).
   Inputs are initialized so that expected results are never zero. */
bool STREAM_Copy_verify(float *a, float *b, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    float expectedResult     = a[idx];
    float diffResultExpected = (b[idx] - expectedResult);
    float relErrorULPS       = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.f);
  }
  return bDifferent;
}

bool STREAM_Copy_verify_double(double *a, double *b, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult     = a[idx];
    double diffResultExpected = (b[idx] - expectedResult);
    /* fixed: use fabs(), not fabsf(), on double operands */
    double relErrorULPS       = (fabs(diffResultExpected)/fabs(expectedResult))/dbl_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.);
  }
  return bDifferent;
}

bool STREAM_Scale_verify(float *a, float *b, float scale, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    float expectedResult     = scale * a[idx];
    float diffResultExpected = (b[idx] - expectedResult);
    float relErrorULPS       = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.f);
  }
  return bDifferent;
}

bool STREAM_Scale_verify_double(double *a, double *b, double scale, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult     = scale * a[idx];
    double diffResultExpected = (b[idx] - expectedResult);
    /* fixed: was fabsf()/flt_eps — wrong precision and wrong epsilon for doubles */
    double relErrorULPS       = (fabs(diffResultExpected)/fabs(expectedResult))/dbl_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.);
  }
  return bDifferent;
}

bool STREAM_Add_verify(float *a, float *b, float *c, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    float expectedResult     = a[idx] + b[idx];
    float diffResultExpected = (c[idx] - expectedResult);
    float relErrorULPS       = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.f);
  }
  return bDifferent;
}

bool STREAM_Add_verify_double(double *a, double *b, double *c, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult     = a[idx] + b[idx];
    double diffResultExpected = (c[idx] - expectedResult);
    /* fixed: was fabsf()/flt_eps — wrong precision and wrong epsilon for doubles */
    double relErrorULPS       = (fabs(diffResultExpected)/fabs(expectedResult))/dbl_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 2.);
  }
  return bDifferent;
}

bool STREAM_Triad_verify(float *a, float *b, float *c, float scalar, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    float expectedResult     = a[idx] + scalar * b[idx];
    float diffResultExpected = (c[idx] - expectedResult);
    float relErrorULPS       = (fabsf(diffResultExpected)/fabsf(expectedResult))/flt_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 3.f);
  }
  return bDifferent;
}

bool STREAM_Triad_verify_double(double *a, double *b, double *c, double scalar, size_t len)
{
  size_t idx;
  bool   bDifferent = false;

  for (idx = 0; idx < len && !bDifferent; idx++) {
    double expectedResult     = a[idx] + scalar * b[idx];
    double diffResultExpected = (c[idx] - expectedResult);
    /* fixed: was fabsf()/flt_eps — wrong precision and wrong epsilon for doubles */
    double relErrorULPS       = (fabs(diffResultExpected)/fabs(expectedResult))/dbl_eps;
    /* element-wise relative error determination */
    bDifferent = (relErrorULPS > 3.);
  }
  return bDifferent;
}

/* forward declarations */
PetscErrorCode setupStream(PetscInt device, PetscBool runDouble, PetscBool cpuTiming);
PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming);
PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming);
PetscErrorCode printResultsReadable(float times[][NTIMES], size_t);

int main(int argc, char *argv[])
{
  PetscInt        device    = 0;
  PetscBool       runDouble = PETSC_TRUE;
  const PetscBool cpuTiming = PETSC_TRUE; // must be true
  PetscErrorCode  ierr, streamErr;

  /* Must happen before any other CUDA call; CHKERRQ is unsafe before
     PetscInitialize, so report the failure with a plain return. */
  if (cudaSetDeviceFlags(cudaDeviceBlockingSync) != cudaSuccess) return -1;

  ierr = PetscInitialize(&argc, &argv, 0, help);if (ierr) return ierr;

  ierr = PetscOptionsBegin(PETSC_COMM_WORLD, "", "STREAM Benchmark Options", "STREAM");CHKERRQ(ierr);
  ierr = PetscOptionsBoundedInt("-device", "Specify the CUDA device to be used", "STREAM", device, &device, NULL,0);CHKERRQ(ierr);
  ierr = PetscOptionsBool("-double", "Also run double precision tests", "STREAM", runDouble, &runDouble, NULL);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);

  /* Keep the benchmark status in its own variable so the subsequent
     PetscPrintf/PetscFinalize calls cannot clobber the failure code. */
  streamErr = setupStream(device, runDouble, cpuTiming);
  if (streamErr) {
    /* only reached on failure, so the verdict is always FAILED */
    ierr = PetscPrintf(PETSC_COMM_SELF, "\n[streamBenchmark] - results:\t%s\n\n", "FAILED");CHKERRQ(ierr);
  }
  ierr = PetscFinalize();
  return streamErr ? (int)streamErr : (int)ierr;
}

///////////////////////////////////////////////////////////////////////////////
// Run the appropriate tests
//
// Validates the requested device, picks a block size suited to the
// architecture, and dispatches to the single- or double-precision driver.
///////////////////////////////////////////////////////////////////////////////
PetscErrorCode setupStream(PetscInt deviceNum, PetscBool runDouble, PetscBool cpuTiming)
{
  PetscInt       iNumThreadsPerBlock = 128;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  // Check device
  {
    int deviceCount;

    if (cudaGetDeviceCount(&deviceCount) != cudaSuccess) deviceCount = 0;
    if (deviceCount == 0) {
      ierr = PetscPrintf(PETSC_COMM_SELF, "!!!!!No devices found!!!!!\n");CHKERRQ(ierr);
      PetscFunctionReturn(-1000);
    }

    if (deviceNum >= deviceCount || deviceNum < 0) {
      ierr = PetscPrintf(PETSC_COMM_SELF, "\n!!!!!Invalid GPU number %d given hence default gpu %d will be used !!!!!\n", (int)deviceNum, 0);CHKERRQ(ierr);
      deviceNum = 0;
    }
  }

  cudaSetDevice(deviceNum);
  cudaDeviceProp deviceProp;
  if (cudaGetDeviceProperties(&deviceProp, deviceNum) != cudaSuccess) {
    /* fixed: the %d previously had no matching argument (undefined behavior) */
    ierr = PetscPrintf(PETSC_COMM_SELF, " Unable to determine device %d properties, exiting\n", (int)deviceNum);CHKERRQ(ierr);
    PetscFunctionReturn(-1);
  }

  /* native double precision requires compute capability >= 1.3 */
  if (runDouble && deviceProp.major == 1 && deviceProp.minor < 3) {
    ierr = PetscPrintf(PETSC_COMM_SELF, " Unable to run double-precision STREAM benchmark on a compute capability GPU less than 1.3\n");CHKERRQ(ierr);
    PetscFunctionReturn(-1);
  }
  if (deviceProp.major == 2 && deviceProp.minor == 1) iNumThreadsPerBlock = 192; /* GF104 architecture / 48 CUDA Cores per MP */
  else                                                iNumThreadsPerBlock = 128; /* GF100 architecture / 32 CUDA Cores per MP */

  if (runDouble) {
    ierr = runStreamDouble(iNumThreadsPerBlock, cpuTiming);CHKERRQ(ierr);
  } else {
    ierr = runStream(iNumThreadsPerBlock, cpuTiming);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
///////////////////////////////////////////////////////////////////////////
// runStream
//
// Times the four single-precision STREAM kernels plus their "Optimized"
// variants NTIMES times each (CPU wall-clock timing), verifies every kernel
// against a host reference, and reports bandwidth via printResultsReadable().
///////////////////////////////////////////////////////////////////////////
PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming)
{
  float *d_a, *d_b, *d_c;
  int k;
  float times[8][NTIMES]; /* [test][iteration], ms; only filled when bDontUseGPUTiming */
  float scalar;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Allocate memory on device */
  ierr = cudaMalloc((void**)&d_a, sizeof(float)*N);CHKERRQ(ierr);
  ierr = cudaMalloc((void**)&d_b, sizeof(float)*N);CHKERRQ(ierr);
  ierr = cudaMalloc((void**)&d_c, sizeof(float)*N);CHKERRQ(ierr);

  /* Compute execution configuration: one thread per element, rounded up */
  dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */
  dim3 dimGrid(N/dimBlock.x);         /* (N/dimBlock.x,1,1) */
  if (N % dimBlock.x != 0) dimGrid.x += 1;

  /* Initialize memory on the device */
  set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
  set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
  set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

  /* --- MAIN LOOP --- repeat test cases NTIMES times --- */
  PetscLogDouble cpuTimer = 0.0;

  scalar = 3.0f;
  for (k = 0; k < NTIMES; ++k) {
    /* reset before the first test too: previously iterations k>0 inherited
       the timer value left over from the last test of iteration k-1 */
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Copy<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    cudaStreamSynchronize(NULL);
    ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRMPI(ierr);
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[0][k] = cpuTimer*1.e3; // millisec

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Copy_Optimized<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    cudaStreamSynchronize(NULL);
    ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRMPI(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[1][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Scale<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
    cudaStreamSynchronize(NULL);
    ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRMPI(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[2][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Scale_Optimized<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
    cudaStreamSynchronize(NULL);
    ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRMPI(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[3][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Add<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
    cudaStreamSynchronize(NULL);
    ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRMPI(ierr);
    /* fixed: removed a stray live `cudaEventRecord(stop, 0)` here that
       referenced an undeclared event left over from GPU-event timing;
       CUDA event timing would need start/stop events created first:
       // ierr = cudaEventRecord(start, 0);CHKERRQ(ierr);  (before launch)
       // ierr = cudaEventRecord(stop, 0);CHKERRQ(ierr);
       // ierr = cudaEventSynchronize(stop);CHKERRQ(ierr);
       // ierr = cudaEventElapsedTime(&times[4][k], start, stop);CHKERRQ(ierr); */
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[4][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Add_Optimized<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
    cudaStreamSynchronize(NULL);
    ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRMPI(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[5][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Triad<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
    cudaStreamSynchronize(NULL);
    ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRMPI(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[6][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Triad_Optimized<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
    cudaStreamSynchronize(NULL);
    ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRMPI(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[7][k] = cpuTimer*1.e3;
  }

  if (1) { /* verify kernels */
    float *h_a, *h_b, *h_c;
    bool errorSTREAMkernel = true;

    if ((h_a = (float*)calloc(N, sizeof(float))) == (float*)NULL) {
      printf("Unable to allocate array h_a, exiting ...\n");
      exit(1);
    }
    if ((h_b = (float*)calloc(N, sizeof(float))) == (float*)NULL) {
      printf("Unable to allocate array h_b, exiting ...\n");
      exit(1);
    }
    if ((h_c = (float*)calloc(N, sizeof(float))) == (float*)NULL) {
      printf("Unable to allocate array h_c, exiting ...\n");
      exit(1);
    }

    /*
     * perform kernel, copy device memory into host memory and verify each
     * device kernel output
     */

    /* Initialize memory on the device */
    set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

    STREAM_Copy<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    ierr = cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
    ierr = cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
    errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
    if (errorSTREAMkernel) {
      ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n");CHKERRQ(ierr);
      exit(-2000);
    }

    /* Initialize memory on the device */
    set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

    STREAM_Copy_Optimized<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    ierr = cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
    ierr = cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
    errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
    if (errorSTREAMkernel) {
      ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n");CHKERRQ(ierr);
      exit(-3000);
    }

    /* Initialize memory on the device */
    set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

    STREAM_Scale<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
    ierr = cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
    ierr = cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
    errorSTREAMkernel = STREAM_Scale_verify(h_b, h_c, scalar, N);
    if (errorSTREAMkernel) {
      ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n");CHKERRQ(ierr);
      exit(-4000);
    }

    /* Initialize memory on the device */
    set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

    STREAM_Add<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
    ierr = cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
    ierr = cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
    ierr = cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
    errorSTREAMkernel = STREAM_Add_verify(h_a, h_b, h_c, N);
    if (errorSTREAMkernel) {
      ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n");CHKERRQ(ierr);
      exit(-5000);
    }

    /* Initialize memory on the device */
    set_array<<<dimGrid,dimBlock>>>(d_a, 2.f, N);
    set_array<<<dimGrid,dimBlock>>>(d_b, .5f, N);
    set_array<<<dimGrid,dimBlock>>>(d_c, .5f, N);

    STREAM_Triad<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
    ierr = cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
    ierr = cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
    ierr = cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
    errorSTREAMkernel = STREAM_Triad_verify(h_b, h_c, h_a, scalar, N);
    if (errorSTREAMkernel) {
      ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n");CHKERRQ(ierr);
      exit(-6000);
    }

    free(h_a);
    free(h_b);
    free(h_c);
  }

  printResultsReadable(times, sizeof(float));

  /* Free memory on device */
  ierr = cudaFree(d_a);CHKERRQ(ierr);
  ierr = cudaFree(d_b);CHKERRQ(ierr);
  ierr = cudaFree(d_c);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}

///////////////////////////////////////////////////////////////////////////
// runStreamDouble
//
// Double-precision counterpart of runStream(); identical structure.
///////////////////////////////////////////////////////////////////////////
PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming)
{
  double *d_a, *d_b, *d_c;
  int k;
  float times[8][NTIMES]; /* [test][iteration], ms; only filled when bDontUseGPUTiming */
  double scalar;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Allocate memory on device */
  ierr = cudaMalloc((void**)&d_a, sizeof(double)*N);CHKERRQ(ierr);
  ierr = cudaMalloc((void**)&d_b, sizeof(double)*N);CHKERRQ(ierr);
  ierr = cudaMalloc((void**)&d_c, sizeof(double)*N);CHKERRQ(ierr);

  /* Compute execution configuration: one thread per element, rounded up */
  dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */
  dim3 dimGrid(N/dimBlock.x);         /* (N/dimBlock.x,1,1) */
  if (N % dimBlock.x != 0) dimGrid.x += 1;

  /* Initialize memory on the device */
  set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
  set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
  set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

  /* --- MAIN LOOP --- repeat test cases NTIMES times --- */
  PetscLogDouble cpuTimer = 0.0;

  scalar = 3.0;
  for (k = 0; k < NTIMES; ++k) {
    /* reset before the first test too: previously iterations k>0 inherited
       the timer value left over from the last test of iteration k-1 */
    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Copy_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    cudaStreamSynchronize(NULL);
    ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRMPI(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[0][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Copy_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    cudaStreamSynchronize(NULL);
    ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRMPI(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[1][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Scale_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
    cudaStreamSynchronize(NULL);
    ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRMPI(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[2][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Scale_Optimized_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
    cudaStreamSynchronize(NULL);
    ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRMPI(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[3][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Add_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
    cudaStreamSynchronize(NULL);
    ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRMPI(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[4][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Add_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
    cudaStreamSynchronize(NULL);
    ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRMPI(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[5][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Triad_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
    cudaStreamSynchronize(NULL);
    ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRMPI(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[6][k] = cpuTimer*1.e3;

    cpuTimer = 0.0;
    PetscTimeSubtract(&cpuTimer);
    STREAM_Triad_Optimized_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
    cudaStreamSynchronize(NULL);
    ierr = MPI_Barrier(MPI_COMM_WORLD);CHKERRMPI(ierr);
    //get the total elapsed time in ms
    PetscTimeAdd(&cpuTimer);
    if (bDontUseGPUTiming) times[7][k] = cpuTimer*1.e3;
  }

  if (1) { /* verify kernels */
    double *h_a, *h_b, *h_c;
    bool errorSTREAMkernel = true;

    if ((h_a = (double*)calloc(N, sizeof(double))) == (double*)NULL) {
      printf("Unable to allocate array h_a, exiting ...\n");
      exit(1);
    }
    if ((h_b = (double*)calloc(N, sizeof(double))) == (double*)NULL) {
      printf("Unable to allocate array h_b, exiting ...\n");
      exit(1);
    }
    if ((h_c = (double*)calloc(N, sizeof(double))) == (double*)NULL) {
      printf("Unable to allocate array h_c, exiting ...\n");
      exit(1);
    }

    /*
     * perform kernel, copy device memory into host memory and verify each
     * device kernel output
     */

    /* Initialize memory on the device */
    set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
    set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

    STREAM_Copy_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    ierr = cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
    ierr = cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
    errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N);
    if (errorSTREAMkernel) {
      ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n");CHKERRQ(ierr);
      exit(-2000);
    }

    /* Initialize memory on the device */
    set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
    set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

    STREAM_Copy_Optimized_double<<<dimGrid,dimBlock>>>(d_a, d_c, N);
    ierr = cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
    ierr = cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
    errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N);
    if (errorSTREAMkernel) {
      ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n");CHKERRQ(ierr);
      exit(-3000);
    }

    /* Initialize memory on the device (Scale only reads b and writes c) */
    set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

    STREAM_Scale_double<<<dimGrid,dimBlock>>>(d_b, d_c, scalar, N);
    ierr = cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
    ierr = cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
    errorSTREAMkernel = STREAM_Scale_verify_double(h_b, h_c, scalar, N);
    if (errorSTREAMkernel) {
      ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n");CHKERRQ(ierr);
      exit(-4000);
    }

    /* Initialize memory on the device */
    set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
    set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

    STREAM_Add_double<<<dimGrid,dimBlock>>>(d_a, d_b, d_c, N);
    ierr = cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
    ierr = cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
    ierr = cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
    errorSTREAMkernel = STREAM_Add_verify_double(h_a, h_b, h_c, N);
    if (errorSTREAMkernel) {
      ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n");CHKERRQ(ierr);
      exit(-5000);
    }

    /* Initialize memory on the device */
    set_array_double<<<dimGrid,dimBlock>>>(d_a, 2., N);
    set_array_double<<<dimGrid,dimBlock>>>(d_b, .5, N);
    set_array_double<<<dimGrid,dimBlock>>>(d_c, .5, N);

    STREAM_Triad_double<<<dimGrid,dimBlock>>>(d_b, d_c, d_a, scalar, N);
    ierr = cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
    ierr = cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
    ierr = cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost);CHKERRQ(ierr);
    errorSTREAMkernel = STREAM_Triad_verify_double(h_b, h_c, h_a, scalar, N);
    if (errorSTREAMkernel) {
      ierr = PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n");CHKERRQ(ierr);
      exit(-6000);
    }

    free(h_a);
    free(h_b);
    free(h_c);
  }

  printResultsReadable(times, sizeof(double));

  /* Free memory on device */
  ierr = cudaFree(d_a);CHKERRQ(ierr);
  ierr = cudaFree(d_b);CHKERRQ(ierr);
  ierr = cudaFree(d_c);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}

///////////////////////////////////////////////////////////////////////////
// Print Results to Screen and File
//
// times[j][k] holds the k-th timed sample (ms) of test j; bsize is the
// element size (sizeof(float) or sizeof(double)). Only the Triad-Optimized
// minimum time (test 7) contributes to the reported bandwidth; per-rank
// rates are summed onto rank 0. Rank 0 writes the single-rank rate to the
// file "flops" and, in parallel runs, reads it back as the scaling baseline.
///////////////////////////////////////////////////////////////////////////
PetscErrorCode printResultsReadable(float times[][NTIMES], const size_t bsize)
{
  PetscErrorCode ierr;
  PetscInt j, k;
  float avgtime[8] = {0., 0., 0., 0., 0., 0., 0., 0.};
  float maxtime[8] = {0., 0., 0., 0., 0., 0., 0., 0.};
  float mintime[8] = {1e30,1e30,1e30,1e30,1e30,1e30,1e30,1e30};
  /* bytes moved per kernel invocation: copy/scale touch 2 vectors, add/triad 3 */
  const float bytes_per_kernel[8] = {
    2. * bsize * N,
    2. * bsize * N,
    2. * bsize * N,
    2. * bsize * N,
    3. * bsize * N,
    3. * bsize * N,
    3. * bsize * N,
    3. * bsize * N
  };
  double rate,irate;
  int rank,size;

  PetscFunctionBegin;
  ierr = MPI_Comm_rank(MPI_COMM_WORLD,&rank);CHKERRMPI(ierr);
  ierr = MPI_Comm_size(MPI_COMM_WORLD,&size);CHKERRMPI(ierr);
  /* --- SUMMARY --- skip the first (cold-start) iteration, per STREAM
     convention; this matches the pre-existing NTIMES-1 divisor below, which
     previously divided a sum over all NTIMES samples. */
  for (k = 1; k < NTIMES; ++k) {
    for (j = 0; j < 8; ++j) {
      avgtime[j] = avgtime[j] + (1.e-03f * times[j][k]); // millisec --> sec
      mintime[j] = MIN(mintime[j], (1.e-03f * times[j][k]));
      maxtime[j] = MAX(maxtime[j], (1.e-03f * times[j][k]));
    }
  }
  for (j = 0; j < 8; ++j) {
    avgtime[j] = avgtime[j]/(float)(NTIMES-1);
  }
  /* report only the Triad-Optimized kernel's best-case bandwidth */
  j = 7;
  irate = 1.0E-06 * bytes_per_kernel[j]/mintime[j];
  ierr = MPI_Reduce(&irate,&rate,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);CHKERRMPI(ierr);
  if (!rank) {
    FILE *fd;
    if (size == 1) {
      printf("%d %11.4f Rate (MB/s)\n",size, rate);
      fd = fopen("flops","w");
      if (fd) { /* baseline file is best-effort; do not fail the benchmark */
        fprintf(fd,"%g\n",rate);
        fclose(fd);
      }
    } else {
      double prate = rate; /* fall back to ratio 1.0 when no baseline exists */
      fd = fopen("flops","r");
      if (fd) {
        if (fscanf(fd,"%lg",&prate) != 1) prate = rate;
        fclose(fd);
      }
      printf("%d %11.4f Rate (MB/s) %g \n", size, rate, rate/prate);
    }
  }

  PetscFunctionReturn(0);
}