1403adfb6SMatthew G Knepley /* 2403adfb6SMatthew G Knepley STREAM benchmark implementation in CUDA. 3403adfb6SMatthew G Knepley 4403adfb6SMatthew G Knepley COPY: a(i) = b(i) 5403adfb6SMatthew G Knepley SCALE: a(i) = q*b(i) 6403adfb6SMatthew G Knepley SUM: a(i) = b(i) + c(i) 7403adfb6SMatthew G Knepley TRIAD: a(i) = b(i) + q*c(i) 8403adfb6SMatthew G Knepley 9403adfb6SMatthew G Knepley It measures the memory system on the device. 1019816777SMark The implementation is in double precision with a single option. 11403adfb6SMatthew G Knepley 12403adfb6SMatthew G Knepley Code based on the code developed by John D. McCalpin 13403adfb6SMatthew G Knepley http://www.cs.virginia.edu/stream/FTP/Code/stream.c 14403adfb6SMatthew G Knepley 15403adfb6SMatthew G Knepley Written by: Massimiliano Fatica, NVIDIA Corporation 16403adfb6SMatthew G Knepley Modified by: Douglas Enright (dpephd-nvidia@yahoo.com), 1 December 2010 17403adfb6SMatthew G Knepley Extensive Revisions, 4 December 2010 18403adfb6SMatthew G Knepley Modified for PETSc by: Matthew G. Knepley 14 Aug 2011 19403adfb6SMatthew G Knepley 20403adfb6SMatthew G Knepley User interface motivated by bandwidthTest NVIDIA SDK example. 21403adfb6SMatthew G Knepley */ 2219816777SMark static char help[] = "Double-Precision STREAM Benchmark implementation in CUDA\n Performs Copy, Scale, Add, and Triad double-precision kernels\n\n"; 23403adfb6SMatthew G Knepley 24403adfb6SMatthew G Knepley #include <petscconf.h> 25403adfb6SMatthew G Knepley #include <petscsys.h> 26403adfb6SMatthew G Knepley #include <petsctime.h> 270e6b6b59SJacob Faibussowitsch #include <petscdevice_cuda.h> 28403adfb6SMatthew G Knepley 2919816777SMark #define N 10000000 30403adfb6SMatthew G Knepley #define NTIMES 10 31403adfb6SMatthew G Knepley 32403adfb6SMatthew G Knepley #ifndef MIN 33403adfb6SMatthew G Knepley #define MIN(x, y) ((x) < (y) ? (x) : (y)) 34403adfb6SMatthew G Knepley #endif 35403adfb6SMatthew G Knepley #ifndef MAX 36403adfb6SMatthew G Knepley #define MAX(x, y) ((x) > (y) ? (x) : (y)) 37403adfb6SMatthew G Knepley #endif 38403adfb6SMatthew G Knepley 39403adfb6SMatthew G Knepley const float flt_eps = 1.192092896e-07f; 40caccb7e3SMatthew G Knepley const double dbl_eps = 2.2204460492503131e-16; 41403adfb6SMatthew G Knepley 420e6b6b59SJacob Faibussowitsch __global__ void set_array(float *a, float value, size_t len) { 43403adfb6SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 44403adfb6SMatthew G Knepley while (idx < len) { 45403adfb6SMatthew G Knepley a[idx] = value; 46403adfb6SMatthew G Knepley idx += blockDim.x * gridDim.x; 47403adfb6SMatthew G Knepley } 48403adfb6SMatthew G Knepley } 49403adfb6SMatthew G Knepley 500e6b6b59SJacob Faibussowitsch __global__ void set_array_double(double *a, double value, size_t len) { 51caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 52caccb7e3SMatthew G Knepley while (idx < len) { 53caccb7e3SMatthew G Knepley a[idx] = value; 54caccb7e3SMatthew G Knepley idx += blockDim.x * gridDim.x; 55caccb7e3SMatthew G Knepley } 56caccb7e3SMatthew G Knepley } 57caccb7e3SMatthew G Knepley 580e6b6b59SJacob Faibussowitsch __global__ void STREAM_Copy(float *a, float *b, size_t len) { 59403adfb6SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 60403adfb6SMatthew G Knepley while (idx < len) { 61403adfb6SMatthew G Knepley b[idx] = a[idx]; 62403adfb6SMatthew G Knepley idx += blockDim.x * gridDim.x; 63403adfb6SMatthew G Knepley } 64403adfb6SMatthew G Knepley } 65403adfb6SMatthew G Knepley 660e6b6b59SJacob Faibussowitsch __global__ void STREAM_Copy_double(double *a, double *b, size_t len) { 67caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 68caccb7e3SMatthew G Knepley while (idx < len) { 69caccb7e3SMatthew G Knepley b[idx] = a[idx]; 70caccb7e3SMatthew G Knepley idx += blockDim.x * gridDim.x; 71caccb7e3SMatthew G Knepley } 72caccb7e3SMatthew G Knepley } 73caccb7e3SMatthew G Knepley 740e6b6b59SJacob Faibussowitsch __global__ void STREAM_Copy_Optimized(float *a, float *b, size_t len) { 75403adfb6SMatthew G Knepley /* 76403adfb6SMatthew G Knepley * Ensure size of thread index space is as large as or greater than 77403adfb6SMatthew G Knepley * vector index space else return. 78403adfb6SMatthew G Knepley */ 79403adfb6SMatthew G Knepley if (blockDim.x * gridDim.x < len) return; 80403adfb6SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 81403adfb6SMatthew G Knepley if (idx < len) b[idx] = a[idx]; 82403adfb6SMatthew G Knepley } 83403adfb6SMatthew G Knepley 840e6b6b59SJacob Faibussowitsch __global__ void STREAM_Copy_Optimized_double(double *a, double *b, size_t len) { 85caccb7e3SMatthew G Knepley /* 86caccb7e3SMatthew G Knepley * Ensure size of thread index space is as large as or greater than 87caccb7e3SMatthew G Knepley * vector index space else return. 88caccb7e3SMatthew G Knepley */ 89caccb7e3SMatthew G Knepley if (blockDim.x * gridDim.x < len) return; 90caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 91caccb7e3SMatthew G Knepley if (idx < len) b[idx] = a[idx]; 92caccb7e3SMatthew G Knepley } 93caccb7e3SMatthew G Knepley 940e6b6b59SJacob Faibussowitsch __global__ void STREAM_Scale(float *a, float *b, float scale, size_t len) { 95403adfb6SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 96403adfb6SMatthew G Knepley while (idx < len) { 97403adfb6SMatthew G Knepley b[idx] = scale * a[idx]; 98403adfb6SMatthew G Knepley idx += blockDim.x * gridDim.x; 99403adfb6SMatthew G Knepley } 100403adfb6SMatthew G Knepley } 101403adfb6SMatthew G Knepley 1020e6b6b59SJacob Faibussowitsch __global__ void STREAM_Scale_double(double *a, double *b, double scale, size_t len) { 103caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 104caccb7e3SMatthew G Knepley while (idx < len) { 105caccb7e3SMatthew G Knepley b[idx] = scale * a[idx]; 106caccb7e3SMatthew G Knepley idx += blockDim.x * gridDim.x; 107caccb7e3SMatthew G Knepley } 108caccb7e3SMatthew G Knepley } 109caccb7e3SMatthew G Knepley 1100e6b6b59SJacob Faibussowitsch __global__ void STREAM_Scale_Optimized(float *a, float *b, float scale, size_t len) { 111caccb7e3SMatthew G Knepley /* 112caccb7e3SMatthew G Knepley * Ensure size of thread index space is as large as or greater than 113caccb7e3SMatthew G Knepley * vector index space else return. 114caccb7e3SMatthew G Knepley */ 115caccb7e3SMatthew G Knepley if (blockDim.x * gridDim.x < len) return; 116caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 117caccb7e3SMatthew G Knepley if (idx < len) b[idx] = scale * a[idx]; 118caccb7e3SMatthew G Knepley } 119caccb7e3SMatthew G Knepley 1200e6b6b59SJacob Faibussowitsch __global__ void STREAM_Scale_Optimized_double(double *a, double *b, double scale, size_t len) { 121caccb7e3SMatthew G Knepley /* 122caccb7e3SMatthew G Knepley * Ensure size of thread index space is as large as or greater than 123caccb7e3SMatthew G Knepley * vector index space else return. 124caccb7e3SMatthew G Knepley */ 125caccb7e3SMatthew G Knepley if (blockDim.x * gridDim.x < len) return; 126caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 127caccb7e3SMatthew G Knepley if (idx < len) b[idx] = scale * a[idx]; 128caccb7e3SMatthew G Knepley } 129caccb7e3SMatthew G Knepley 1300e6b6b59SJacob Faibussowitsch __global__ void STREAM_Add(float *a, float *b, float *c, size_t len) { 131403adfb6SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 132403adfb6SMatthew G Knepley while (idx < len) { 133403adfb6SMatthew G Knepley c[idx] = a[idx] + b[idx]; 134403adfb6SMatthew G Knepley idx += blockDim.x * gridDim.x; 135403adfb6SMatthew G Knepley } 136403adfb6SMatthew G Knepley } 137403adfb6SMatthew G Knepley 1380e6b6b59SJacob Faibussowitsch __global__ void STREAM_Add_double(double *a, double *b, double *c, size_t len) { 139caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 140caccb7e3SMatthew G Knepley while (idx < len) { 141caccb7e3SMatthew G Knepley c[idx] = a[idx] + b[idx]; 142caccb7e3SMatthew G Knepley idx += blockDim.x * gridDim.x; 143caccb7e3SMatthew G Knepley } 144caccb7e3SMatthew G Knepley } 145caccb7e3SMatthew G Knepley 1460e6b6b59SJacob Faibussowitsch __global__ void STREAM_Add_Optimized(float *a, float *b, float *c, size_t len) { 147caccb7e3SMatthew G Knepley /* 148caccb7e3SMatthew G Knepley * Ensure size of thread index space is as large as or greater than 149caccb7e3SMatthew G Knepley * vector index space else return. 150caccb7e3SMatthew G Knepley */ 151caccb7e3SMatthew G Knepley if (blockDim.x * gridDim.x < len) return; 152caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 153caccb7e3SMatthew G Knepley if (idx < len) c[idx] = a[idx] + b[idx]; 154caccb7e3SMatthew G Knepley } 155caccb7e3SMatthew G Knepley 1560e6b6b59SJacob Faibussowitsch __global__ void STREAM_Add_Optimized_double(double *a, double *b, double *c, size_t len) { 157caccb7e3SMatthew G Knepley /* 158caccb7e3SMatthew G Knepley * Ensure size of thread index space is as large as or greater than 159caccb7e3SMatthew G Knepley * vector index space else return. 160caccb7e3SMatthew G Knepley */ 161caccb7e3SMatthew G Knepley if (blockDim.x * gridDim.x < len) return; 162caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 163caccb7e3SMatthew G Knepley if (idx < len) c[idx] = a[idx] + b[idx]; 164caccb7e3SMatthew G Knepley } 165caccb7e3SMatthew G Knepley 1660e6b6b59SJacob Faibussowitsch __global__ void STREAM_Triad(float *a, float *b, float *c, float scalar, size_t len) { 167403adfb6SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 168403adfb6SMatthew G Knepley while (idx < len) { 169403adfb6SMatthew G Knepley c[idx] = a[idx] + scalar * b[idx]; 170403adfb6SMatthew G Knepley idx += blockDim.x * gridDim.x; 171403adfb6SMatthew G Knepley } 172403adfb6SMatthew G Knepley } 173403adfb6SMatthew G Knepley 1740e6b6b59SJacob Faibussowitsch __global__ void STREAM_Triad_double(double *a, double *b, double *c, double scalar, size_t len) { 175caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 176caccb7e3SMatthew G Knepley while (idx < len) { 177caccb7e3SMatthew G Knepley c[idx] = a[idx] + scalar * b[idx]; 178caccb7e3SMatthew G Knepley idx += blockDim.x * gridDim.x; 179caccb7e3SMatthew G Knepley } 180caccb7e3SMatthew G Knepley } 181caccb7e3SMatthew G Knepley 1820e6b6b59SJacob Faibussowitsch __global__ void STREAM_Triad_Optimized(float *a, float *b, float *c, float scalar, size_t len) { 183caccb7e3SMatthew G Knepley /* 184caccb7e3SMatthew G Knepley * Ensure size of thread index space is as large as or greater than 185caccb7e3SMatthew G Knepley * vector index space else return. 186caccb7e3SMatthew G Knepley */ 187caccb7e3SMatthew G Knepley if (blockDim.x * gridDim.x < len) return; 188caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 189caccb7e3SMatthew G Knepley if (idx < len) c[idx] = a[idx] + scalar * b[idx]; 190caccb7e3SMatthew G Knepley } 191caccb7e3SMatthew G Knepley 1920e6b6b59SJacob Faibussowitsch __global__ void STREAM_Triad_Optimized_double(double *a, double *b, double *c, double scalar, size_t len) { 193caccb7e3SMatthew G Knepley /* 194caccb7e3SMatthew G Knepley * Ensure size of thread index space is as large as or greater than 195caccb7e3SMatthew G Knepley * vector index space else return. 196caccb7e3SMatthew G Knepley */ 197caccb7e3SMatthew G Knepley if (blockDim.x * gridDim.x < len) return; 198caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x; 199caccb7e3SMatthew G Knepley if (idx < len) c[idx] = a[idx] + scalar * b[idx]; 200caccb7e3SMatthew G Knepley } 201caccb7e3SMatthew G Knepley 202403adfb6SMatthew G Knepley /* Host side verification routines */ 2030e6b6b59SJacob Faibussowitsch bool STREAM_Copy_verify(float *a, float *b, size_t len) { 204403adfb6SMatthew G Knepley size_t idx; 205403adfb6SMatthew G Knepley bool bDifferent = false; 206403adfb6SMatthew G Knepley 207403adfb6SMatthew G Knepley for (idx = 0; idx < len && !bDifferent; idx++) { 208403adfb6SMatthew G Knepley float expectedResult = a[idx]; 209403adfb6SMatthew G Knepley float diffResultExpected = (b[idx] - expectedResult); 210403adfb6SMatthew G Knepley float relErrorULPS = (fabsf(diffResultExpected) / fabsf(expectedResult)) / flt_eps; 211403adfb6SMatthew G Knepley /* element-wise relative error determination */ 212403adfb6SMatthew G Knepley bDifferent = (relErrorULPS > 2.f); 213403adfb6SMatthew G Knepley } 214403adfb6SMatthew G Knepley 215403adfb6SMatthew G Knepley return bDifferent; 216403adfb6SMatthew G Knepley } 217403adfb6SMatthew G Knepley 2180e6b6b59SJacob Faibussowitsch bool STREAM_Copy_verify_double(double *a, double *b, size_t len) { 219caccb7e3SMatthew G Knepley size_t idx; 220caccb7e3SMatthew G Knepley bool bDifferent = false; 221caccb7e3SMatthew G Knepley 222caccb7e3SMatthew G Knepley for (idx = 0; idx < len && !bDifferent; idx++) { 223caccb7e3SMatthew G Knepley double expectedResult = a[idx]; 224caccb7e3SMatthew G Knepley double diffResultExpected = (b[idx] - expectedResult); 22519816777SMark double relErrorULPS = (fabsf(diffResultExpected) / fabsf(expectedResult)) / dbl_eps; 226caccb7e3SMatthew G Knepley /* element-wise relative error determination */ 227caccb7e3SMatthew G Knepley bDifferent = (relErrorULPS > 2.); 228caccb7e3SMatthew G Knepley } 229caccb7e3SMatthew G Knepley 230caccb7e3SMatthew G Knepley return bDifferent; 231caccb7e3SMatthew G Knepley } 232caccb7e3SMatthew G Knepley 2330e6b6b59SJacob Faibussowitsch bool STREAM_Scale_verify(float *a, float *b, float scale, size_t len) { 234403adfb6SMatthew G Knepley size_t idx; 235403adfb6SMatthew G Knepley bool bDifferent = false; 236403adfb6SMatthew G Knepley 237403adfb6SMatthew G Knepley for (idx = 0; idx < len && !bDifferent; idx++) { 238403adfb6SMatthew G Knepley float expectedResult = scale * a[idx]; 239403adfb6SMatthew G Knepley float diffResultExpected = (b[idx] - expectedResult); 240403adfb6SMatthew G Knepley float relErrorULPS = (fabsf(diffResultExpected) / fabsf(expectedResult)) / flt_eps; 241403adfb6SMatthew G Knepley /* element-wise relative error determination */ 242403adfb6SMatthew G Knepley bDifferent = (relErrorULPS > 2.f); 243403adfb6SMatthew G Knepley } 244403adfb6SMatthew G Knepley 245403adfb6SMatthew G Knepley return bDifferent; 246403adfb6SMatthew G Knepley } 247403adfb6SMatthew G Knepley 2480e6b6b59SJacob Faibussowitsch bool STREAM_Scale_verify_double(double *a, double *b, double scale, size_t len) { 249caccb7e3SMatthew G Knepley size_t idx; 250caccb7e3SMatthew G Knepley bool bDifferent = false; 251caccb7e3SMatthew G Knepley 252caccb7e3SMatthew G Knepley for (idx = 0; idx < len && !bDifferent; idx++) { 253caccb7e3SMatthew G Knepley double expectedResult = scale * a[idx]; 254caccb7e3SMatthew G Knepley double diffResultExpected = (b[idx] - expectedResult); 255caccb7e3SMatthew G Knepley double relErrorULPS = (fabsf(diffResultExpected) / fabsf(expectedResult)) / flt_eps; 256caccb7e3SMatthew G Knepley /* element-wise relative error determination */ 257caccb7e3SMatthew G Knepley bDifferent = (relErrorULPS > 2.); 258caccb7e3SMatthew G Knepley } 259caccb7e3SMatthew G Knepley 260caccb7e3SMatthew G Knepley return bDifferent; 261caccb7e3SMatthew G Knepley } 262caccb7e3SMatthew G Knepley 2630e6b6b59SJacob Faibussowitsch bool STREAM_Add_verify(float *a, float *b, float *c, size_t len) { 264403adfb6SMatthew G Knepley size_t idx; 265403adfb6SMatthew G Knepley bool bDifferent = false; 266403adfb6SMatthew G Knepley 267403adfb6SMatthew G Knepley for (idx = 0; idx < len && !bDifferent; idx++) { 268403adfb6SMatthew G Knepley float expectedResult = a[idx] + b[idx]; 269403adfb6SMatthew G Knepley float diffResultExpected = (c[idx] - expectedResult); 270403adfb6SMatthew G Knepley float relErrorULPS = (fabsf(diffResultExpected) / fabsf(expectedResult)) / flt_eps; 271403adfb6SMatthew G Knepley /* element-wise relative error determination */ 272403adfb6SMatthew G Knepley bDifferent = (relErrorULPS > 2.f); 273403adfb6SMatthew G Knepley } 274403adfb6SMatthew G Knepley 275403adfb6SMatthew G Knepley return bDifferent; 276403adfb6SMatthew G Knepley } 277403adfb6SMatthew G Knepley 2780e6b6b59SJacob Faibussowitsch bool STREAM_Add_verify_double(double *a, double *b, double *c, size_t len) { 279caccb7e3SMatthew G Knepley size_t idx; 280caccb7e3SMatthew G Knepley bool bDifferent = false; 281caccb7e3SMatthew G Knepley 282caccb7e3SMatthew G Knepley for (idx = 0; idx < len && !bDifferent; idx++) { 283caccb7e3SMatthew G Knepley double expectedResult = a[idx] + b[idx]; 284caccb7e3SMatthew G Knepley double diffResultExpected = (c[idx] - expectedResult); 285caccb7e3SMatthew G Knepley double relErrorULPS = (fabsf(diffResultExpected) / fabsf(expectedResult)) / flt_eps; 286caccb7e3SMatthew G Knepley /* element-wise relative error determination */ 287caccb7e3SMatthew G Knepley bDifferent = (relErrorULPS > 2.); 288caccb7e3SMatthew G Knepley } 289caccb7e3SMatthew G Knepley 290caccb7e3SMatthew G Knepley return bDifferent; 291caccb7e3SMatthew G Knepley } 292caccb7e3SMatthew G Knepley 2930e6b6b59SJacob Faibussowitsch bool STREAM_Triad_verify(float *a, float *b, float *c, float scalar, size_t len) { 294403adfb6SMatthew G Knepley size_t idx; 295403adfb6SMatthew G Knepley bool bDifferent = false; 296403adfb6SMatthew G Knepley 297403adfb6SMatthew G Knepley for (idx = 0; idx < len && !bDifferent; idx++) { 298403adfb6SMatthew G Knepley float expectedResult = a[idx] + scalar * b[idx]; 299403adfb6SMatthew G Knepley float diffResultExpected = (c[idx] - expectedResult); 300403adfb6SMatthew G Knepley float relErrorULPS = (fabsf(diffResultExpected) / fabsf(expectedResult)) / flt_eps; 301403adfb6SMatthew G Knepley /* element-wise relative error determination */ 302403adfb6SMatthew G Knepley bDifferent = (relErrorULPS > 3.f); 303403adfb6SMatthew G Knepley } 304403adfb6SMatthew G Knepley 305403adfb6SMatthew G Knepley return bDifferent; 306403adfb6SMatthew G Knepley } 307403adfb6SMatthew G Knepley 3080e6b6b59SJacob Faibussowitsch bool STREAM_Triad_verify_double(double *a, double *b, double *c, double scalar, size_t len) { 309caccb7e3SMatthew G Knepley size_t idx; 310caccb7e3SMatthew G Knepley bool bDifferent = false; 311caccb7e3SMatthew G Knepley 312caccb7e3SMatthew G Knepley for (idx = 0; idx < len && !bDifferent; idx++) { 313caccb7e3SMatthew G Knepley double expectedResult = a[idx] + scalar * b[idx]; 314caccb7e3SMatthew G Knepley double diffResultExpected = (c[idx] - expectedResult); 315caccb7e3SMatthew G Knepley double relErrorULPS = (fabsf(diffResultExpected) / fabsf(expectedResult)) / flt_eps; 316caccb7e3SMatthew G Knepley /* element-wise relative error determination */ 317caccb7e3SMatthew G Knepley bDifferent = (relErrorULPS > 3.); 318caccb7e3SMatthew G Knepley } 319caccb7e3SMatthew G Knepley 320caccb7e3SMatthew G Knepley return bDifferent; 321caccb7e3SMatthew G Knepley } 322caccb7e3SMatthew G Knepley 323403adfb6SMatthew G Knepley /* forward declarations */ 324caccb7e3SMatthew G Knepley PetscErrorCode setupStream(PetscInt device, PetscBool runDouble, PetscBool cpuTiming); 325403adfb6SMatthew G Knepley PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming); 326caccb7e3SMatthew G Knepley PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming); 32719816777SMark PetscErrorCode printResultsReadable(float times[][NTIMES], size_t); 328403adfb6SMatthew G Knepley 3290e6b6b59SJacob Faibussowitsch int main(int argc, char *argv[]) { 330403adfb6SMatthew G Knepley PetscInt device = 0; 33119816777SMark PetscBool runDouble = PETSC_TRUE; 33219816777SMark const PetscBool cpuTiming = PETSC_TRUE; // must be true 333403adfb6SMatthew G Knepley PetscErrorCode ierr; 334403adfb6SMatthew G Knepley 3359566063dSJacob Faibussowitsch PetscCallCUDA(cudaSetDeviceFlags(cudaDeviceBlockingSync)); 33619816777SMark 3379566063dSJacob Faibussowitsch PetscCall(PetscInitialize(&argc, &argv, 0, help)); 338403adfb6SMatthew G Knepley 339d0609cedSBarry Smith PetscOptionsBegin(PETSC_COMM_WORLD, "", "STREAM Benchmark Options", "STREAM"); 3409566063dSJacob Faibussowitsch PetscCall(PetscOptionsBoundedInt("-device", "Specify the CUDA device to be used", "STREAM", device, &device, NULL, 0)); 3419566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-double", "Also run double precision tests", "STREAM", runDouble, &runDouble, NULL)); 342d0609cedSBarry Smith PetscOptionsEnd(); 343403adfb6SMatthew G Knepley 344caccb7e3SMatthew G Knepley ierr = setupStream(device, runDouble, cpuTiming); 3450e6b6b59SJacob Faibussowitsch if (ierr) { PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n[streamBenchmark] - results:\t%s\n\n", (ierr == 0) ? "PASSES" : "FAILED")); } 3469566063dSJacob Faibussowitsch PetscCall(PetscFinalize()); 347b122ec5aSJacob Faibussowitsch return 0; 348403adfb6SMatthew G Knepley } 349403adfb6SMatthew G Knepley 350403adfb6SMatthew G Knepley /////////////////////////////////////////////////////////////////////////////// 351403adfb6SMatthew G Knepley //Run the appropriate tests 352403adfb6SMatthew G Knepley /////////////////////////////////////////////////////////////////////////////// 3530e6b6b59SJacob Faibussowitsch PetscErrorCode setupStream(PetscInt deviceNum, PetscBool runDouble, PetscBool cpuTiming) { 354403adfb6SMatthew G Knepley PetscInt iNumThreadsPerBlock = 128; 355403adfb6SMatthew G Knepley 356403adfb6SMatthew G Knepley PetscFunctionBegin; 357403adfb6SMatthew G Knepley // Check device 358403adfb6SMatthew G Knepley { 359403adfb6SMatthew G Knepley int deviceCount; 360403adfb6SMatthew G Knepley 361403adfb6SMatthew G Knepley cudaGetDeviceCount(&deviceCount); 362403adfb6SMatthew G Knepley if (deviceCount == 0) { 3639566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF, "!!!!!No devices found!!!!!\n")); 364403adfb6SMatthew G Knepley return -1000; 365403adfb6SMatthew G Knepley } 366403adfb6SMatthew G Knepley 367403adfb6SMatthew G Knepley if (deviceNum >= deviceCount || deviceNum < 0) { 3689566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n!!!!!Invalid GPU number %d given hence default gpu %d will be used !!!!!\n", deviceNum, 0)); 369403adfb6SMatthew G Knepley deviceNum = 0; 370403adfb6SMatthew G Knepley } 371403adfb6SMatthew G Knepley } 372403adfb6SMatthew G Knepley 373403adfb6SMatthew G Knepley cudaSetDevice(deviceNum); 3749566063dSJacob Faibussowitsch // PetscCall(PetscPrintf(PETSC_COMM_SELF, "Running on...\n\n")); 375403adfb6SMatthew G Knepley cudaDeviceProp deviceProp; 37619816777SMark if (cudaGetDeviceProperties(&deviceProp, deviceNum) != cudaSuccess) { 3779566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF, " Unable to determine device %d properties, exiting\n")); 378403adfb6SMatthew G Knepley return -1; 379403adfb6SMatthew G Knepley } 380403adfb6SMatthew G Knepley 381caccb7e3SMatthew G Knepley if (runDouble && deviceProp.major == 1 && deviceProp.minor < 3) { 3829566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF, " Unable to run double-precision STREAM benchmark on a compute capability GPU less than 1.3\n")); 383caccb7e3SMatthew G Knepley return -1; 384caccb7e3SMatthew G Knepley } 3856f2b61bcSKarl Rupp if (deviceProp.major == 2 && deviceProp.minor == 1) iNumThreadsPerBlock = 192; /* GF104 architecture / 48 CUDA Cores per MP */ 3866f2b61bcSKarl Rupp else iNumThreadsPerBlock = 128; /* GF100 architecture / 32 CUDA Cores per MP */ 387403adfb6SMatthew G Knepley 3881baa6e33SBarry Smith if (runDouble) PetscCall(runStreamDouble(iNumThreadsPerBlock, cpuTiming)); 3891baa6e33SBarry Smith else PetscCall(runStream(iNumThreadsPerBlock, cpuTiming)); 3903ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 391403adfb6SMatthew G Knepley } 392403adfb6SMatthew G Knepley 393403adfb6SMatthew G Knepley /////////////////////////////////////////////////////////////////////////// 394403adfb6SMatthew G Knepley // runStream 395403adfb6SMatthew G Knepley /////////////////////////////////////////////////////////////////////////// 3960e6b6b59SJacob Faibussowitsch PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming) { 397403adfb6SMatthew G Knepley float *d_a, *d_b, *d_c; 398403adfb6SMatthew G Knepley int k; 399caccb7e3SMatthew G Knepley float times[8][NTIMES]; 400403adfb6SMatthew G Knepley float scalar; 401403adfb6SMatthew G Knepley 402403adfb6SMatthew G Knepley PetscFunctionBegin; 403403adfb6SMatthew G Knepley /* Allocate memory on device */ 4049566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&d_a, sizeof(float) * N)); 4059566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&d_b, sizeof(float) * N)); 4069566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&d_c, sizeof(float) * N)); 407403adfb6SMatthew G Knepley 408403adfb6SMatthew G Knepley /* Compute execution configuration */ 409403adfb6SMatthew G Knepley 410403adfb6SMatthew G Knepley dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */ 411403adfb6SMatthew G Knepley dim3 dimGrid(N / dimBlock.x); /* (N/dimBlock.x,1,1) */ 412403adfb6SMatthew G Knepley if (N % dimBlock.x != 0) dimGrid.x += 1; 413403adfb6SMatthew G Knepley 414403adfb6SMatthew G Knepley /* Initialize memory on the device */ 415403adfb6SMatthew G Knepley set_array<<<dimGrid, dimBlock>>>(d_a, 2.f, N); 416403adfb6SMatthew G Knepley set_array<<<dimGrid, dimBlock>>>(d_b, .5f, N); 417403adfb6SMatthew G Knepley set_array<<<dimGrid, dimBlock>>>(d_c, .5f, N); 418403adfb6SMatthew G Knepley 419403adfb6SMatthew G Knepley /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ 420403adfb6SMatthew G Knepley PetscLogDouble cpuTimer = 0.0; 421403adfb6SMatthew G Knepley 422403adfb6SMatthew G Knepley scalar = 3.0f; 423403adfb6SMatthew G Knepley for (k = 0; k < NTIMES; ++k) { 4248563dfccSBarry Smith PetscTimeSubtract(&cpuTimer); 425403adfb6SMatthew G Knepley STREAM_Copy<<<dimGrid, dimBlock>>>(d_a, d_c, N); 42619816777SMark cudaStreamSynchronize(NULL); 4279566063dSJacob Faibussowitsch PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD)); 4288563dfccSBarry Smith PetscTimeAdd(&cpuTimer); 42919816777SMark if (bDontUseGPUTiming) times[0][k] = cpuTimer * 1.e3; // millisec 430403adfb6SMatthew G Knepley 431403adfb6SMatthew G Knepley cpuTimer = 0.0; 4328563dfccSBarry Smith PetscTimeSubtract(&cpuTimer); 433403adfb6SMatthew G Knepley STREAM_Copy_Optimized<<<dimGrid, dimBlock>>>(d_a, d_c, N); 43419816777SMark cudaStreamSynchronize(NULL); 4359566063dSJacob Faibussowitsch PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD)); 436df3898eeSBarry Smith //get the total elapsed time in ms 4378563dfccSBarry Smith PetscTimeAdd(&cpuTimer); 43819816777SMark if (bDontUseGPUTiming) times[1][k] = cpuTimer * 1.e3; 439403adfb6SMatthew G Knepley 440403adfb6SMatthew G Knepley cpuTimer = 0.0; 4418563dfccSBarry Smith PetscTimeSubtract(&cpuTimer); 442403adfb6SMatthew G Knepley STREAM_Scale<<<dimGrid, dimBlock>>>(d_b, d_c, scalar, N); 44319816777SMark cudaStreamSynchronize(NULL); 4449566063dSJacob Faibussowitsch PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD)); 445df3898eeSBarry Smith //get the total elapsed time in ms 4468563dfccSBarry Smith PetscTimeAdd(&cpuTimer); 44719816777SMark if (bDontUseGPUTiming) times[2][k] = cpuTimer * 1.e3; 448403adfb6SMatthew G Knepley 449403adfb6SMatthew G Knepley cpuTimer = 0.0; 4508563dfccSBarry Smith PetscTimeSubtract(&cpuTimer); 451caccb7e3SMatthew G Knepley STREAM_Scale_Optimized<<<dimGrid, dimBlock>>>(d_b, d_c, scalar, N); 45219816777SMark cudaStreamSynchronize(NULL); 4539566063dSJacob Faibussowitsch PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD)); 454df3898eeSBarry Smith //get the total elapsed time in ms 4558563dfccSBarry Smith PetscTimeAdd(&cpuTimer); 45619816777SMark if (bDontUseGPUTiming) times[3][k] = cpuTimer * 1.e3; 457403adfb6SMatthew G Knepley 458403adfb6SMatthew G Knepley cpuTimer = 0.0; 4598563dfccSBarry Smith PetscTimeSubtract(&cpuTimer); 4609566063dSJacob Faibussowitsch // PetscCallCUDA(cudaEventRecord(start, 0)); 461caccb7e3SMatthew G Knepley STREAM_Add<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N); 46219816777SMark cudaStreamSynchronize(NULL); 4639566063dSJacob Faibussowitsch PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD)); 4649566063dSJacob Faibussowitsch PetscCallCUDA(cudaEventRecord(stop, 0)); 4659566063dSJacob Faibussowitsch // PetscCallCUDA(cudaEventSynchronize(stop)); 466df3898eeSBarry Smith //get the total elapsed time in ms 4678563dfccSBarry Smith PetscTimeAdd(&cpuTimer); 46819816777SMark if (bDontUseGPUTiming) times[4][k] = cpuTimer * 1.e3; 4696f2b61bcSKarl Rupp else { 4709566063dSJacob Faibussowitsch // PetscCallCUDA(cudaEventElapsedTime(×[4][k], start, stop)); 471403adfb6SMatthew G Knepley } 472403adfb6SMatthew G Knepley 473caccb7e3SMatthew G Knepley cpuTimer = 0.0; 4748563dfccSBarry Smith PetscTimeSubtract(&cpuTimer); 475caccb7e3SMatthew G Knepley STREAM_Add_Optimized<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N); 47619816777SMark cudaStreamSynchronize(NULL); 4779566063dSJacob Faibussowitsch PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD)); 478df3898eeSBarry Smith //get the total elapsed time in ms 4798563dfccSBarry Smith PetscTimeAdd(&cpuTimer); 48019816777SMark if (bDontUseGPUTiming) times[5][k] = cpuTimer * 1.e3; 481caccb7e3SMatthew G Knepley 482caccb7e3SMatthew G Knepley cpuTimer = 0.0; 4838563dfccSBarry Smith PetscTimeSubtract(&cpuTimer); 484caccb7e3SMatthew G Knepley STREAM_Triad<<<dimGrid, dimBlock>>>(d_b, d_c, d_a, scalar, N); 48519816777SMark cudaStreamSynchronize(NULL); 4869566063dSJacob Faibussowitsch PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD)); 487df3898eeSBarry Smith //get the total elapsed time in ms 4888563dfccSBarry Smith PetscTimeAdd(&cpuTimer); 48919816777SMark if (bDontUseGPUTiming) times[6][k] = cpuTimer * 1.e3; 490caccb7e3SMatthew G Knepley 491caccb7e3SMatthew G Knepley cpuTimer = 0.0; 4928563dfccSBarry Smith PetscTimeSubtract(&cpuTimer); 493caccb7e3SMatthew G Knepley STREAM_Triad_Optimized<<<dimGrid, dimBlock>>>(d_b, d_c, d_a, scalar, N); 49419816777SMark cudaStreamSynchronize(NULL); 4959566063dSJacob Faibussowitsch PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD)); 496df3898eeSBarry Smith //get the total elapsed time in ms 4978563dfccSBarry Smith PetscTimeAdd(&cpuTimer); 49819816777SMark if (bDontUseGPUTiming) times[7][k] = cpuTimer * 1.e3; 499caccb7e3SMatthew G Knepley } 500caccb7e3SMatthew G Knepley 50119816777SMark if (1) { /* verify kernels */ 502403adfb6SMatthew G Knepley float *h_a, *h_b, *h_c; 503403adfb6SMatthew G Knepley bool errorSTREAMkernel = true; 504403adfb6SMatthew G Knepley 505403adfb6SMatthew G Knepley if ((h_a = (float *)calloc(N, sizeof(float))) == (float *)NULL) { 506403adfb6SMatthew G Knepley printf("Unable to allocate array h_a, exiting ...\n"); 507403adfb6SMatthew G Knepley exit(1); 508403adfb6SMatthew G Knepley } 509403adfb6SMatthew G Knepley if ((h_b = (float *)calloc(N, sizeof(float))) == (float *)NULL) { 510403adfb6SMatthew G Knepley printf("Unable to allocate array h_b, exiting ...\n"); 511403adfb6SMatthew G Knepley exit(1); 512403adfb6SMatthew G Knepley } 513403adfb6SMatthew G Knepley 514403adfb6SMatthew G Knepley if ((h_c = (float *)calloc(N, sizeof(float))) == (float *)NULL) { 515403adfb6SMatthew G Knepley printf("Unalbe to allocate array h_c, exiting ...\n"); 516403adfb6SMatthew G Knepley exit(1); 517403adfb6SMatthew G Knepley } 518403adfb6SMatthew G Knepley 519403adfb6SMatthew G Knepley /* 520403adfb6SMatthew G Knepley * perform kernel, copy device memory into host memory and verify each 521403adfb6SMatthew G Knepley * device kernel output 522403adfb6SMatthew G Knepley */ 523403adfb6SMatthew G Knepley 524403adfb6SMatthew G Knepley /* Initialize memory on the device */ 525403adfb6SMatthew G Knepley set_array<<<dimGrid, dimBlock>>>(d_a, 2.f, N); 526403adfb6SMatthew G Knepley set_array<<<dimGrid, dimBlock>>>(d_b, .5f, N); 527403adfb6SMatthew G Knepley set_array<<<dimGrid, dimBlock>>>(d_c, .5f, N); 528403adfb6SMatthew G Knepley 529403adfb6SMatthew G Knepley STREAM_Copy<<<dimGrid, dimBlock>>>(d_a, d_c, N); 5309566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost)); 5319566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost)); 532403adfb6SMatthew G Knepley errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N); 533403adfb6SMatthew G Knepley if (errorSTREAMkernel) { 5349566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n")); 535403adfb6SMatthew G Knepley exit(-2000); 536403adfb6SMatthew G Knepley } 537403adfb6SMatthew G Knepley 538403adfb6SMatthew G Knepley /* Initialize memory on the device */ 539403adfb6SMatthew G Knepley set_array<<<dimGrid, dimBlock>>>(d_a, 2.f, N); 540403adfb6SMatthew G Knepley set_array<<<dimGrid, dimBlock>>>(d_b, .5f, N); 541403adfb6SMatthew G Knepley set_array<<<dimGrid, dimBlock>>>(d_c, .5f, N); 542403adfb6SMatthew G Knepley 543403adfb6SMatthew G Knepley STREAM_Copy_Optimized<<<dimGrid, dimBlock>>>(d_a, d_c, N); 5449566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost)); 5459566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost)); 546403adfb6SMatthew G Knepley errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N); 547403adfb6SMatthew G Knepley if (errorSTREAMkernel) { 5489566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n")); 549403adfb6SMatthew G Knepley exit(-3000); 550403adfb6SMatthew G Knepley } 551403adfb6SMatthew G Knepley 552403adfb6SMatthew G Knepley /* Initialize memory on the device */ 55319816777SMark set_array<<<dimGrid, dimBlock>>>(d_a, 2.f, N); 554403adfb6SMatthew G Knepley set_array<<<dimGrid, dimBlock>>>(d_b, .5f, N); 555403adfb6SMatthew G Knepley set_array<<<dimGrid, dimBlock>>>(d_c, .5f, N); 556403adfb6SMatthew G Knepley 557403adfb6SMatthew G Knepley STREAM_Scale<<<dimGrid, dimBlock>>>(d_b, d_c, scalar, N); 5589566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost)); 5599566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost)); 560403adfb6SMatthew G Knepley errorSTREAMkernel = STREAM_Scale_verify(h_b, h_c, scalar, N); 561403adfb6SMatthew G Knepley if (errorSTREAMkernel) { 5629566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n")); 563403adfb6SMatthew G Knepley exit(-4000); 564403adfb6SMatthew G Knepley } 565403adfb6SMatthew G Knepley 566403adfb6SMatthew G Knepley /* Initialize memory on the device */ 567403adfb6SMatthew G Knepley set_array<<<dimGrid, dimBlock>>>(d_a, 2.f, N); 568403adfb6SMatthew G Knepley set_array<<<dimGrid, dimBlock>>>(d_b, .5f, N); 569403adfb6SMatthew G Knepley set_array<<<dimGrid, dimBlock>>>(d_c, .5f, N); 570403adfb6SMatthew G Knepley 571403adfb6SMatthew G Knepley STREAM_Add<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N); 5729566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost)); 5739566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost)); 5749566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost)); 575403adfb6SMatthew G Knepley errorSTREAMkernel = STREAM_Add_verify(h_a, h_b, h_c, N); 576403adfb6SMatthew G Knepley if (errorSTREAMkernel) { 5779566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n")); 578403adfb6SMatthew G Knepley exit(-5000); 579403adfb6SMatthew G Knepley } 580403adfb6SMatthew G Knepley 581403adfb6SMatthew G Knepley /* Initialize memory on the device */ 582403adfb6SMatthew G Knepley set_array<<<dimGrid, dimBlock>>>(d_a, 2.f, N); 583403adfb6SMatthew G Knepley set_array<<<dimGrid, dimBlock>>>(d_b, .5f, N); 584403adfb6SMatthew G Knepley set_array<<<dimGrid, dimBlock>>>(d_c, .5f, N); 585403adfb6SMatthew G Knepley 586403adfb6SMatthew G Knepley STREAM_Triad<<<dimGrid, dimBlock>>>(d_b, d_c, d_a, scalar, N); 5879566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost)); 5889566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost)); 5899566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost)); 590403adfb6SMatthew G Knepley errorSTREAMkernel = STREAM_Triad_verify(h_b, h_c, h_a, scalar, N); 591403adfb6SMatthew G Knepley if (errorSTREAMkernel) { 5929566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n")); 593403adfb6SMatthew G Knepley exit(-6000); 594403adfb6SMatthew G Knepley } 595403adfb6SMatthew G Knepley 59619816777SMark free(h_a); 59719816777SMark free(h_b); 59819816777SMark free(h_c); 59919816777SMark } 600403adfb6SMatthew G Knepley /* continue from here */ 60119816777SMark printResultsReadable(times, sizeof(float)); 602403adfb6SMatthew G Knepley 603403adfb6SMatthew G Knepley /* Free memory on device */ 6049566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(d_a)); 6059566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(d_b)); 6069566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(d_c)); 6073ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 608403adfb6SMatthew G Knepley } 609403adfb6SMatthew G Knepley 6100e6b6b59SJacob Faibussowitsch PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming) { 611caccb7e3SMatthew G Knepley double *d_a, *d_b, *d_c; 612caccb7e3SMatthew G Knepley int k; 613caccb7e3SMatthew G Knepley float times[8][NTIMES]; 614caccb7e3SMatthew G Knepley double scalar; 615caccb7e3SMatthew G Knepley 616caccb7e3SMatthew G Knepley PetscFunctionBegin; 617caccb7e3SMatthew G Knepley /* Allocate memory on device */ 6189566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&d_a, sizeof(double) * N)); 6199566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&d_b, sizeof(double) * N)); 6209566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&d_c, sizeof(double) * N)); 621caccb7e3SMatthew G Knepley 622caccb7e3SMatthew G Knepley /* Compute execution configuration */ 623caccb7e3SMatthew G Knepley 624caccb7e3SMatthew G Knepley dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */ 625caccb7e3SMatthew G Knepley dim3 dimGrid(N / dimBlock.x); /* (N/dimBlock.x,1,1) */ 626caccb7e3SMatthew G Knepley if (N % dimBlock.x != 0) dimGrid.x += 1; 627caccb7e3SMatthew G Knepley 628caccb7e3SMatthew G Knepley /* Initialize memory on the device */ 629caccb7e3SMatthew G Knepley set_array_double<<<dimGrid, dimBlock>>>(d_a, 2., N); 630caccb7e3SMatthew G Knepley set_array_double<<<dimGrid, dimBlock>>>(d_b, .5, N); 631caccb7e3SMatthew G Knepley set_array_double<<<dimGrid, dimBlock>>>(d_c, .5, N); 632caccb7e3SMatthew G Knepley 633caccb7e3SMatthew G Knepley /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ 634caccb7e3SMatthew G Knepley PetscLogDouble cpuTimer = 0.0; 635caccb7e3SMatthew G Knepley 636caccb7e3SMatthew G Knepley scalar = 3.0; 637caccb7e3SMatthew G Knepley for (k = 0; k < NTIMES; ++k) { 6388563dfccSBarry Smith PetscTimeSubtract(&cpuTimer); 639caccb7e3SMatthew G Knepley STREAM_Copy_double<<<dimGrid, dimBlock>>>(d_a, d_c, N); 64019816777SMark cudaStreamSynchronize(NULL); 6419566063dSJacob Faibussowitsch PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD)); 642df3898eeSBarry Smith //get the total elapsed time in ms 643caccb7e3SMatthew G Knepley if (bDontUseGPUTiming) { 6448563dfccSBarry Smith PetscTimeAdd(&cpuTimer); 64519816777SMark times[0][k] = cpuTimer * 1.e3; 646caccb7e3SMatthew G Knepley } 647caccb7e3SMatthew G Knepley 648caccb7e3SMatthew G Knepley cpuTimer = 0.0; 6498563dfccSBarry Smith PetscTimeSubtract(&cpuTimer); 650caccb7e3SMatthew G Knepley STREAM_Copy_Optimized_double<<<dimGrid, dimBlock>>>(d_a, d_c, N); 65119816777SMark cudaStreamSynchronize(NULL); 6529566063dSJacob Faibussowitsch PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD)); 653df3898eeSBarry Smith //get the total elapsed time in ms 654caccb7e3SMatthew G Knepley if (bDontUseGPUTiming) { 6558563dfccSBarry Smith PetscTimeAdd(&cpuTimer); 65619816777SMark times[1][k] = cpuTimer * 1.e3; 657caccb7e3SMatthew G Knepley } 658caccb7e3SMatthew G Knepley 659caccb7e3SMatthew G Knepley cpuTimer = 0.0; 6608563dfccSBarry Smith PetscTimeSubtract(&cpuTimer); 661caccb7e3SMatthew G Knepley STREAM_Scale_double<<<dimGrid, dimBlock>>>(d_b, d_c, scalar, N); 66219816777SMark cudaStreamSynchronize(NULL); 6639566063dSJacob Faibussowitsch PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD)); 664df3898eeSBarry Smith //get the total elapsed time in ms 6658563dfccSBarry Smith PetscTimeAdd(&cpuTimer); 66619816777SMark if (bDontUseGPUTiming) times[2][k] = cpuTimer * 1.e3; 667caccb7e3SMatthew G Knepley 668caccb7e3SMatthew G Knepley cpuTimer = 0.0; 6698563dfccSBarry Smith PetscTimeSubtract(&cpuTimer); 670caccb7e3SMatthew G Knepley STREAM_Scale_Optimized_double<<<dimGrid, dimBlock>>>(d_b, d_c, scalar, N); 67119816777SMark cudaStreamSynchronize(NULL); 6729566063dSJacob Faibussowitsch PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD)); 673df3898eeSBarry Smith //get the total elapsed time in ms 6748563dfccSBarry Smith PetscTimeAdd(&cpuTimer); 67519816777SMark if (bDontUseGPUTiming) times[3][k] = cpuTimer * 1.e3; 676caccb7e3SMatthew G Knepley 677caccb7e3SMatthew G Knepley cpuTimer = 0.0; 6788563dfccSBarry Smith PetscTimeSubtract(&cpuTimer); 679caccb7e3SMatthew G Knepley STREAM_Add_double<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N); 68019816777SMark cudaStreamSynchronize(NULL); 6819566063dSJacob Faibussowitsch PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD)); 682df3898eeSBarry Smith //get the total elapsed time in ms 6838563dfccSBarry Smith PetscTimeAdd(&cpuTimer); 68419816777SMark if (bDontUseGPUTiming) times[4][k] = cpuTimer * 1.e3; 685caccb7e3SMatthew G Knepley 686caccb7e3SMatthew G Knepley cpuTimer = 0.0; 6878563dfccSBarry Smith PetscTimeSubtract(&cpuTimer); 688caccb7e3SMatthew G Knepley STREAM_Add_Optimized_double<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N); 68919816777SMark cudaStreamSynchronize(NULL); 6909566063dSJacob Faibussowitsch PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD)); 691df3898eeSBarry Smith //get the total elapsed time in ms 6928563dfccSBarry Smith PetscTimeAdd(&cpuTimer); 69319816777SMark if (bDontUseGPUTiming) times[5][k] = cpuTimer * 1.e3; 694caccb7e3SMatthew G Knepley 695caccb7e3SMatthew G Knepley cpuTimer = 0.0; 6968563dfccSBarry Smith PetscTimeSubtract(&cpuTimer); 697caccb7e3SMatthew G Knepley STREAM_Triad_double<<<dimGrid, dimBlock>>>(d_b, d_c, d_a, scalar, N); 69819816777SMark cudaStreamSynchronize(NULL); 6999566063dSJacob Faibussowitsch PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD)); 700df3898eeSBarry Smith //get the total elapsed time in ms 7018563dfccSBarry Smith PetscTimeAdd(&cpuTimer); 70219816777SMark if (bDontUseGPUTiming) times[6][k] = cpuTimer * 1.e3; 703caccb7e3SMatthew G Knepley 704caccb7e3SMatthew G Knepley cpuTimer = 0.0; 7058563dfccSBarry Smith PetscTimeSubtract(&cpuTimer); 706caccb7e3SMatthew G Knepley STREAM_Triad_Optimized_double<<<dimGrid, dimBlock>>>(d_b, d_c, d_a, scalar, N); 70719816777SMark cudaStreamSynchronize(NULL); 7089566063dSJacob Faibussowitsch PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD)); 709df3898eeSBarry Smith //get the total elapsed time in ms 7108563dfccSBarry Smith PetscTimeAdd(&cpuTimer); 71119816777SMark if (bDontUseGPUTiming) times[7][k] = cpuTimer * 1.e3; 712caccb7e3SMatthew G Knepley } 713caccb7e3SMatthew G Knepley 71419816777SMark if (1) { /* verify kernels */ 715caccb7e3SMatthew G Knepley double *h_a, *h_b, *h_c; 716caccb7e3SMatthew G Knepley bool errorSTREAMkernel = true; 717caccb7e3SMatthew G Knepley 718caccb7e3SMatthew G Knepley if ((h_a = (double *)calloc(N, sizeof(double))) == (double *)NULL) { 719caccb7e3SMatthew G Knepley printf("Unable to allocate array h_a, exiting ...\n"); 720caccb7e3SMatthew G Knepley exit(1); 721caccb7e3SMatthew G Knepley } 722caccb7e3SMatthew G Knepley if ((h_b = (double *)calloc(N, sizeof(double))) == (double *)NULL) { 723caccb7e3SMatthew G Knepley printf("Unable to allocate array h_b, exiting ...\n"); 724caccb7e3SMatthew G Knepley exit(1); 725caccb7e3SMatthew G Knepley } 726caccb7e3SMatthew G Knepley 727caccb7e3SMatthew G Knepley if ((h_c = (double *)calloc(N, sizeof(double))) == (double *)NULL) { 728caccb7e3SMatthew G Knepley printf("Unalbe to allocate array h_c, exiting ...\n"); 729caccb7e3SMatthew G Knepley exit(1); 730caccb7e3SMatthew G Knepley } 731caccb7e3SMatthew G Knepley 732caccb7e3SMatthew G Knepley /* 733caccb7e3SMatthew G Knepley * perform kernel, copy device memory into host memory and verify each 734caccb7e3SMatthew G Knepley * device kernel output 735caccb7e3SMatthew G Knepley */ 736caccb7e3SMatthew G Knepley 737caccb7e3SMatthew G Knepley /* Initialize memory on the device */ 738caccb7e3SMatthew G Knepley set_array_double<<<dimGrid, dimBlock>>>(d_a, 2., N); 739caccb7e3SMatthew G Knepley set_array_double<<<dimGrid, dimBlock>>>(d_b, .5, N); 740caccb7e3SMatthew G Knepley set_array_double<<<dimGrid, dimBlock>>>(d_c, .5, N); 741caccb7e3SMatthew G Knepley 742caccb7e3SMatthew G Knepley STREAM_Copy_double<<<dimGrid, dimBlock>>>(d_a, d_c, N); 7439566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost)); 7449566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost)); 745caccb7e3SMatthew G Knepley errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N); 746caccb7e3SMatthew G Knepley if (errorSTREAMkernel) { 7479566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n")); 748caccb7e3SMatthew G Knepley exit(-2000); 749caccb7e3SMatthew G Knepley } 750caccb7e3SMatthew G Knepley 751caccb7e3SMatthew G Knepley /* Initialize memory on the device */ 752caccb7e3SMatthew G Knepley set_array_double<<<dimGrid, dimBlock>>>(d_a, 2., N); 753caccb7e3SMatthew G Knepley set_array_double<<<dimGrid, dimBlock>>>(d_b, .5, N); 754caccb7e3SMatthew G Knepley set_array_double<<<dimGrid, dimBlock>>>(d_c, .5, N); 755caccb7e3SMatthew G Knepley 756caccb7e3SMatthew G Knepley STREAM_Copy_Optimized_double<<<dimGrid, dimBlock>>>(d_a, d_c, N); 7579566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost)); 7589566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost)); 759caccb7e3SMatthew G Knepley errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N); 760caccb7e3SMatthew G Knepley if (errorSTREAMkernel) { 7619566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n")); 762caccb7e3SMatthew G Knepley exit(-3000); 763caccb7e3SMatthew G Knepley } 764caccb7e3SMatthew G Knepley 765caccb7e3SMatthew G Knepley /* Initialize memory on the device */ 766caccb7e3SMatthew G Knepley set_array_double<<<dimGrid, dimBlock>>>(d_b, .5, N); 767caccb7e3SMatthew G Knepley set_array_double<<<dimGrid, dimBlock>>>(d_c, .5, N); 768caccb7e3SMatthew G Knepley 769caccb7e3SMatthew G Knepley STREAM_Scale_double<<<dimGrid, dimBlock>>>(d_b, d_c, scalar, N); 7709566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost)); 7719566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost)); 772caccb7e3SMatthew G Knepley errorSTREAMkernel = STREAM_Scale_verify_double(h_b, h_c, scalar, N); 773caccb7e3SMatthew G Knepley if (errorSTREAMkernel) { 7749566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n")); 775caccb7e3SMatthew G Knepley exit(-4000); 776caccb7e3SMatthew G Knepley } 777caccb7e3SMatthew G Knepley 778caccb7e3SMatthew G Knepley /* Initialize memory on the device */ 779caccb7e3SMatthew G Knepley set_array_double<<<dimGrid, dimBlock>>>(d_a, 2., N); 780caccb7e3SMatthew G Knepley set_array_double<<<dimGrid, dimBlock>>>(d_b, .5, N); 781caccb7e3SMatthew G Knepley set_array_double<<<dimGrid, dimBlock>>>(d_c, .5, N); 782caccb7e3SMatthew G Knepley 783caccb7e3SMatthew G Knepley STREAM_Add_double<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N); 7849566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost)); 7859566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost)); 7869566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost)); 787caccb7e3SMatthew G Knepley errorSTREAMkernel = STREAM_Add_verify_double(h_a, h_b, h_c, N); 788caccb7e3SMatthew G Knepley if (errorSTREAMkernel) { 7899566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n")); 790caccb7e3SMatthew G Knepley exit(-5000); 791caccb7e3SMatthew G Knepley } 792caccb7e3SMatthew G Knepley 793caccb7e3SMatthew G Knepley /* Initialize memory on the device */ 794caccb7e3SMatthew G Knepley set_array_double<<<dimGrid, dimBlock>>>(d_a, 2., N); 795caccb7e3SMatthew G Knepley set_array_double<<<dimGrid, dimBlock>>>(d_b, .5, N); 796caccb7e3SMatthew G Knepley set_array_double<<<dimGrid, dimBlock>>>(d_c, .5, N); 797caccb7e3SMatthew G Knepley 798caccb7e3SMatthew G Knepley STREAM_Triad_double<<<dimGrid, dimBlock>>>(d_b, d_c, d_a, scalar, N); 7999566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost)); 8009566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost)); 8019566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost)); 802caccb7e3SMatthew G Knepley errorSTREAMkernel = STREAM_Triad_verify_double(h_b, h_c, h_a, scalar, N); 803caccb7e3SMatthew G Knepley if (errorSTREAMkernel) { 8049566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n")); 805caccb7e3SMatthew G Knepley exit(-6000); 806caccb7e3SMatthew G Knepley } 807caccb7e3SMatthew G Knepley 80819816777SMark free(h_a); 80919816777SMark free(h_b); 81019816777SMark free(h_c); 81119816777SMark } 812caccb7e3SMatthew G Knepley /* continue from here */ 81319816777SMark printResultsReadable(times, sizeof(double)); 814caccb7e3SMatthew G Knepley 815caccb7e3SMatthew G Knepley /* Free memory on device */ 8169566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(d_a)); 8179566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(d_b)); 8189566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(d_c)); 8193ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 820caccb7e3SMatthew G Knepley } 821caccb7e3SMatthew G Knepley 822403adfb6SMatthew G Knepley /////////////////////////////////////////////////////////////////////////// 823403adfb6SMatthew G Knepley //Print Results to Screen and File 824403adfb6SMatthew G Knepley /////////////////////////////////////////////////////////////////////////// 8250e6b6b59SJacob Faibussowitsch PetscErrorCode printResultsReadable(float times[][NTIMES], const size_t bsize) { 826403adfb6SMatthew G Knepley PetscErrorCode ierr; 827403adfb6SMatthew G Knepley PetscInt j, k; 828caccb7e3SMatthew G Knepley float avgtime[8] = {0., 0., 0., 0., 0., 0., 0., 0.}; 829caccb7e3SMatthew G Knepley float maxtime[8] = {0., 0., 0., 0., 0., 0., 0., 0.}; 830caccb7e3SMatthew G Knepley float mintime[8] = {1e30, 1e30, 1e30, 1e30, 1e30, 1e30, 1e30, 1e30}; 83119816777SMark // char *label[8] = {"Copy: ", "Copy Opt.: ", "Scale: ", "Scale Opt: ", "Add: ", "Add Opt: ", "Triad: ", "Triad Opt: "}; 8320e6b6b59SJacob Faibussowitsch const float bytes_per_kernel[8] = {2. * bsize * N, 2. * bsize * N, 2. * bsize * N, 2. * bsize * N, 3. * bsize * N, 3. * bsize * N, 3. * bsize * N, 3. * bsize * N}; 83319816777SMark double rate, irate; 83419816777SMark int rank, size; 835*4d86920dSPierre Jolivet 836403adfb6SMatthew G Knepley PetscFunctionBegin; 8379566063dSJacob Faibussowitsch PetscCallMPI(MPI_Comm_rank(MPI_COMM_WORLD, &rank)); 8389566063dSJacob Faibussowitsch PetscCallMPI(MPI_Comm_size(MPI_COMM_WORLD, &size)); 839403adfb6SMatthew G Knepley /* --- SUMMARY --- */ 84019816777SMark for (k = 0; k < NTIMES; ++k) { 841caccb7e3SMatthew G Knepley for (j = 0; j < 8; ++j) { 84219816777SMark avgtime[j] = avgtime[j] + (1.e-03f * times[j][k]); // millisec --> sec 843403adfb6SMatthew G Knepley mintime[j] = MIN(mintime[j], (1.e-03f * times[j][k])); 844403adfb6SMatthew G Knepley maxtime[j] = MAX(maxtime[j], (1.e-03f * times[j][k])); 845403adfb6SMatthew G Knepley } 84619816777SMark } 8470e6b6b59SJacob Faibussowitsch for (j = 0; j < 8; ++j) { avgtime[j] = avgtime[j] / (float)(NTIMES - 1); } 84819816777SMark j = 7; 84919816777SMark irate = 1.0E-06 * bytes_per_kernel[j] / mintime[j]; 85019816777SMark ierr = MPI_Reduce(&irate, &rate, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); 851dd400576SPatrick Sanan if (rank == 0) { 85219816777SMark FILE *fd; 85319816777SMark if (size == 1) { 85419816777SMark printf("%d %11.4f Rate (MB/s)\n", size, rate); 85519816777SMark fd = fopen("flops", "w"); 85619816777SMark fprintf(fd, "%g\n", rate); 85719816777SMark fclose(fd); 85819816777SMark } else { 85919816777SMark double prate; 86019816777SMark fd = fopen("flops", "r"); 86119816777SMark fscanf(fd, "%lg", &prate); 86219816777SMark fclose(fd); 86319816777SMark printf("%d %11.4f Rate (MB/s) %g \n", size, rate, rate / prate); 86419816777SMark } 86519816777SMark } 8663ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 867403adfb6SMatthew G Knepley } 868