1403adfb6SMatthew G Knepley /*
2403adfb6SMatthew G Knepley STREAM benchmark implementation in CUDA.
3403adfb6SMatthew G Knepley
4403adfb6SMatthew G Knepley COPY: a(i) = b(i)
5403adfb6SMatthew G Knepley SCALE: a(i) = q*b(i)
6403adfb6SMatthew G Knepley SUM: a(i) = b(i) + c(i)
7403adfb6SMatthew G Knepley TRIAD: a(i) = b(i) + q*c(i)
8403adfb6SMatthew G Knepley
9403adfb6SMatthew G Knepley It measures the memory system on the device.
1019816777SMark The implementation is in double precision with a single option.
11403adfb6SMatthew G Knepley
12403adfb6SMatthew G Knepley Code based on the code developed by John D. McCalpin
13403adfb6SMatthew G Knepley http://www.cs.virginia.edu/stream/FTP/Code/stream.c
14403adfb6SMatthew G Knepley
15403adfb6SMatthew G Knepley Written by: Massimiliano Fatica, NVIDIA Corporation
16403adfb6SMatthew G Knepley Modified by: Douglas Enright (dpephd-nvidia@yahoo.com), 1 December 2010
17403adfb6SMatthew G Knepley Extensive Revisions, 4 December 2010
18403adfb6SMatthew G Knepley Modified for PETSc by: Matthew G. Knepley 14 Aug 2011
19403adfb6SMatthew G Knepley
20403adfb6SMatthew G Knepley User interface motivated by bandwidthTest NVIDIA SDK example.
21403adfb6SMatthew G Knepley */
2219816777SMark static char help[] = "Double-Precision STREAM Benchmark implementation in CUDA\n Performs Copy, Scale, Add, and Triad double-precision kernels\n\n";
23403adfb6SMatthew G Knepley
24403adfb6SMatthew G Knepley #include <petscconf.h>
25403adfb6SMatthew G Knepley #include <petscsys.h>
26403adfb6SMatthew G Knepley #include <petsctime.h>
270e6b6b59SJacob Faibussowitsch #include <petscdevice_cuda.h>
28403adfb6SMatthew G Knepley
2919816777SMark #define N 10000000
30403adfb6SMatthew G Knepley #define NTIMES 10
31403adfb6SMatthew G Knepley
32*beceaeb6SBarry Smith #if !defined(MIN)
33403adfb6SMatthew G Knepley #define MIN(x, y) ((x) < (y) ? (x) : (y))
34403adfb6SMatthew G Knepley #endif
35*beceaeb6SBarry Smith #if !defined(MAX)
36403adfb6SMatthew G Knepley #define MAX(x, y) ((x) > (y) ? (x) : (y))
37403adfb6SMatthew G Knepley #endif
38403adfb6SMatthew G Knepley
39403adfb6SMatthew G Knepley const float flt_eps = 1.192092896e-07f;
40caccb7e3SMatthew G Knepley const double dbl_eps = 2.2204460492503131e-16;
41403adfb6SMatthew G Knepley
set_array(float * a,float value,size_t len)4267595998SJunchao Zhang __global__ void set_array(float *a, float value, size_t len)
4367595998SJunchao Zhang {
44403adfb6SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
45403adfb6SMatthew G Knepley while (idx < len) {
46403adfb6SMatthew G Knepley a[idx] = value;
47403adfb6SMatthew G Knepley idx += blockDim.x * gridDim.x;
48403adfb6SMatthew G Knepley }
49403adfb6SMatthew G Knepley }
50403adfb6SMatthew G Knepley
set_array_double(double * a,double value,size_t len)5167595998SJunchao Zhang __global__ void set_array_double(double *a, double value, size_t len)
5267595998SJunchao Zhang {
53caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
54caccb7e3SMatthew G Knepley while (idx < len) {
55caccb7e3SMatthew G Knepley a[idx] = value;
56caccb7e3SMatthew G Knepley idx += blockDim.x * gridDim.x;
57caccb7e3SMatthew G Knepley }
58caccb7e3SMatthew G Knepley }
59caccb7e3SMatthew G Knepley
STREAM_Copy(float * a,float * b,size_t len)6067595998SJunchao Zhang __global__ void STREAM_Copy(float *a, float *b, size_t len)
6167595998SJunchao Zhang {
62403adfb6SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
63403adfb6SMatthew G Knepley while (idx < len) {
64403adfb6SMatthew G Knepley b[idx] = a[idx];
65403adfb6SMatthew G Knepley idx += blockDim.x * gridDim.x;
66403adfb6SMatthew G Knepley }
67403adfb6SMatthew G Knepley }
68403adfb6SMatthew G Knepley
STREAM_Copy_double(double * a,double * b,size_t len)6967595998SJunchao Zhang __global__ void STREAM_Copy_double(double *a, double *b, size_t len)
7067595998SJunchao Zhang {
71caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
72caccb7e3SMatthew G Knepley while (idx < len) {
73caccb7e3SMatthew G Knepley b[idx] = a[idx];
74caccb7e3SMatthew G Knepley idx += blockDim.x * gridDim.x;
75caccb7e3SMatthew G Knepley }
76caccb7e3SMatthew G Knepley }
77caccb7e3SMatthew G Knepley
STREAM_Copy_Optimized(float * a,float * b,size_t len)7867595998SJunchao Zhang __global__ void STREAM_Copy_Optimized(float *a, float *b, size_t len)
7967595998SJunchao Zhang {
80403adfb6SMatthew G Knepley /*
81403adfb6SMatthew G Knepley * Ensure size of thread index space is as large as or greater than
82403adfb6SMatthew G Knepley * vector index space else return.
83403adfb6SMatthew G Knepley */
84403adfb6SMatthew G Knepley if (blockDim.x * gridDim.x < len) return;
85403adfb6SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
86403adfb6SMatthew G Knepley if (idx < len) b[idx] = a[idx];
87403adfb6SMatthew G Knepley }
88403adfb6SMatthew G Knepley
STREAM_Copy_Optimized_double(double * a,double * b,size_t len)8967595998SJunchao Zhang __global__ void STREAM_Copy_Optimized_double(double *a, double *b, size_t len)
9067595998SJunchao Zhang {
91caccb7e3SMatthew G Knepley /*
92caccb7e3SMatthew G Knepley * Ensure size of thread index space is as large as or greater than
93caccb7e3SMatthew G Knepley * vector index space else return.
94caccb7e3SMatthew G Knepley */
95caccb7e3SMatthew G Knepley if (blockDim.x * gridDim.x < len) return;
96caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
97caccb7e3SMatthew G Knepley if (idx < len) b[idx] = a[idx];
98caccb7e3SMatthew G Knepley }
99caccb7e3SMatthew G Knepley
STREAM_Scale(float * a,float * b,float scale,size_t len)10067595998SJunchao Zhang __global__ void STREAM_Scale(float *a, float *b, float scale, size_t len)
10167595998SJunchao Zhang {
102403adfb6SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
103403adfb6SMatthew G Knepley while (idx < len) {
104403adfb6SMatthew G Knepley b[idx] = scale * a[idx];
105403adfb6SMatthew G Knepley idx += blockDim.x * gridDim.x;
106403adfb6SMatthew G Knepley }
107403adfb6SMatthew G Knepley }
108403adfb6SMatthew G Knepley
STREAM_Scale_double(double * a,double * b,double scale,size_t len)10967595998SJunchao Zhang __global__ void STREAM_Scale_double(double *a, double *b, double scale, size_t len)
11067595998SJunchao Zhang {
111caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
112caccb7e3SMatthew G Knepley while (idx < len) {
113caccb7e3SMatthew G Knepley b[idx] = scale * a[idx];
114caccb7e3SMatthew G Knepley idx += blockDim.x * gridDim.x;
115caccb7e3SMatthew G Knepley }
116caccb7e3SMatthew G Knepley }
117caccb7e3SMatthew G Knepley
STREAM_Scale_Optimized(float * a,float * b,float scale,size_t len)11867595998SJunchao Zhang __global__ void STREAM_Scale_Optimized(float *a, float *b, float scale, size_t len)
11967595998SJunchao Zhang {
120caccb7e3SMatthew G Knepley /*
121caccb7e3SMatthew G Knepley * Ensure size of thread index space is as large as or greater than
122caccb7e3SMatthew G Knepley * vector index space else return.
123caccb7e3SMatthew G Knepley */
124caccb7e3SMatthew G Knepley if (blockDim.x * gridDim.x < len) return;
125caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
126caccb7e3SMatthew G Knepley if (idx < len) b[idx] = scale * a[idx];
127caccb7e3SMatthew G Knepley }
128caccb7e3SMatthew G Knepley
STREAM_Scale_Optimized_double(double * a,double * b,double scale,size_t len)12967595998SJunchao Zhang __global__ void STREAM_Scale_Optimized_double(double *a, double *b, double scale, size_t len)
13067595998SJunchao Zhang {
131caccb7e3SMatthew G Knepley /*
132caccb7e3SMatthew G Knepley * Ensure size of thread index space is as large as or greater than
133caccb7e3SMatthew G Knepley * vector index space else return.
134caccb7e3SMatthew G Knepley */
135caccb7e3SMatthew G Knepley if (blockDim.x * gridDim.x < len) return;
136caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
137caccb7e3SMatthew G Knepley if (idx < len) b[idx] = scale * a[idx];
138caccb7e3SMatthew G Knepley }
139caccb7e3SMatthew G Knepley
STREAM_Add(float * a,float * b,float * c,size_t len)14067595998SJunchao Zhang __global__ void STREAM_Add(float *a, float *b, float *c, size_t len)
14167595998SJunchao Zhang {
142403adfb6SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
143403adfb6SMatthew G Knepley while (idx < len) {
144403adfb6SMatthew G Knepley c[idx] = a[idx] + b[idx];
145403adfb6SMatthew G Knepley idx += blockDim.x * gridDim.x;
146403adfb6SMatthew G Knepley }
147403adfb6SMatthew G Knepley }
148403adfb6SMatthew G Knepley
STREAM_Add_double(double * a,double * b,double * c,size_t len)14967595998SJunchao Zhang __global__ void STREAM_Add_double(double *a, double *b, double *c, size_t len)
15067595998SJunchao Zhang {
151caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
152caccb7e3SMatthew G Knepley while (idx < len) {
153caccb7e3SMatthew G Knepley c[idx] = a[idx] + b[idx];
154caccb7e3SMatthew G Knepley idx += blockDim.x * gridDim.x;
155caccb7e3SMatthew G Knepley }
156caccb7e3SMatthew G Knepley }
157caccb7e3SMatthew G Knepley
STREAM_Add_Optimized(float * a,float * b,float * c,size_t len)15867595998SJunchao Zhang __global__ void STREAM_Add_Optimized(float *a, float *b, float *c, size_t len)
15967595998SJunchao Zhang {
160caccb7e3SMatthew G Knepley /*
161caccb7e3SMatthew G Knepley * Ensure size of thread index space is as large as or greater than
162caccb7e3SMatthew G Knepley * vector index space else return.
163caccb7e3SMatthew G Knepley */
164caccb7e3SMatthew G Knepley if (blockDim.x * gridDim.x < len) return;
165caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
166caccb7e3SMatthew G Knepley if (idx < len) c[idx] = a[idx] + b[idx];
167caccb7e3SMatthew G Knepley }
168caccb7e3SMatthew G Knepley
STREAM_Add_Optimized_double(double * a,double * b,double * c,size_t len)16967595998SJunchao Zhang __global__ void STREAM_Add_Optimized_double(double *a, double *b, double *c, size_t len)
17067595998SJunchao Zhang {
171caccb7e3SMatthew G Knepley /*
172caccb7e3SMatthew G Knepley * Ensure size of thread index space is as large as or greater than
173caccb7e3SMatthew G Knepley * vector index space else return.
174caccb7e3SMatthew G Knepley */
175caccb7e3SMatthew G Knepley if (blockDim.x * gridDim.x < len) return;
176caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
177caccb7e3SMatthew G Knepley if (idx < len) c[idx] = a[idx] + b[idx];
178caccb7e3SMatthew G Knepley }
179caccb7e3SMatthew G Knepley
STREAM_Triad(float * a,float * b,float * c,float scalar,size_t len)18067595998SJunchao Zhang __global__ void STREAM_Triad(float *a, float *b, float *c, float scalar, size_t len)
18167595998SJunchao Zhang {
182403adfb6SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
183403adfb6SMatthew G Knepley while (idx < len) {
184403adfb6SMatthew G Knepley c[idx] = a[idx] + scalar * b[idx];
185403adfb6SMatthew G Knepley idx += blockDim.x * gridDim.x;
186403adfb6SMatthew G Knepley }
187403adfb6SMatthew G Knepley }
188403adfb6SMatthew G Knepley
STREAM_Triad_double(double * a,double * b,double * c,double scalar,size_t len)18967595998SJunchao Zhang __global__ void STREAM_Triad_double(double *a, double *b, double *c, double scalar, size_t len)
19067595998SJunchao Zhang {
191caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
192caccb7e3SMatthew G Knepley while (idx < len) {
193caccb7e3SMatthew G Knepley c[idx] = a[idx] + scalar * b[idx];
194caccb7e3SMatthew G Knepley idx += blockDim.x * gridDim.x;
195caccb7e3SMatthew G Knepley }
196caccb7e3SMatthew G Knepley }
197caccb7e3SMatthew G Knepley
STREAM_Triad_Optimized(float * a,float * b,float * c,float scalar,size_t len)19867595998SJunchao Zhang __global__ void STREAM_Triad_Optimized(float *a, float *b, float *c, float scalar, size_t len)
19967595998SJunchao Zhang {
200caccb7e3SMatthew G Knepley /*
201caccb7e3SMatthew G Knepley * Ensure size of thread index space is as large as or greater than
202caccb7e3SMatthew G Knepley * vector index space else return.
203caccb7e3SMatthew G Knepley */
204caccb7e3SMatthew G Knepley if (blockDim.x * gridDim.x < len) return;
205caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
206caccb7e3SMatthew G Knepley if (idx < len) c[idx] = a[idx] + scalar * b[idx];
207caccb7e3SMatthew G Knepley }
208caccb7e3SMatthew G Knepley
STREAM_Triad_Optimized_double(double * a,double * b,double * c,double scalar,size_t len)20967595998SJunchao Zhang __global__ void STREAM_Triad_Optimized_double(double *a, double *b, double *c, double scalar, size_t len)
21067595998SJunchao Zhang {
211caccb7e3SMatthew G Knepley /*
212caccb7e3SMatthew G Knepley * Ensure size of thread index space is as large as or greater than
213caccb7e3SMatthew G Knepley * vector index space else return.
214caccb7e3SMatthew G Knepley */
215caccb7e3SMatthew G Knepley if (blockDim.x * gridDim.x < len) return;
216caccb7e3SMatthew G Knepley size_t idx = threadIdx.x + blockIdx.x * blockDim.x;
217caccb7e3SMatthew G Knepley if (idx < len) c[idx] = a[idx] + scalar * b[idx];
218caccb7e3SMatthew G Knepley }
219caccb7e3SMatthew G Knepley
220403adfb6SMatthew G Knepley /* Host side verification routines */
STREAM_Copy_verify(float * a,float * b,size_t len)22167595998SJunchao Zhang bool STREAM_Copy_verify(float *a, float *b, size_t len)
22267595998SJunchao Zhang {
223403adfb6SMatthew G Knepley size_t idx;
224403adfb6SMatthew G Knepley bool bDifferent = false;
225403adfb6SMatthew G Knepley
226403adfb6SMatthew G Knepley for (idx = 0; idx < len && !bDifferent; idx++) {
227403adfb6SMatthew G Knepley float expectedResult = a[idx];
228403adfb6SMatthew G Knepley float diffResultExpected = (b[idx] - expectedResult);
229403adfb6SMatthew G Knepley float relErrorULPS = (fabsf(diffResultExpected) / fabsf(expectedResult)) / flt_eps;
230403adfb6SMatthew G Knepley /* element-wise relative error determination */
231403adfb6SMatthew G Knepley bDifferent = (relErrorULPS > 2.f);
232403adfb6SMatthew G Knepley }
233403adfb6SMatthew G Knepley
234403adfb6SMatthew G Knepley return bDifferent;
235403adfb6SMatthew G Knepley }
236403adfb6SMatthew G Knepley
STREAM_Copy_verify_double(double * a,double * b,size_t len)23767595998SJunchao Zhang bool STREAM_Copy_verify_double(double *a, double *b, size_t len)
23867595998SJunchao Zhang {
239caccb7e3SMatthew G Knepley size_t idx;
240caccb7e3SMatthew G Knepley bool bDifferent = false;
241caccb7e3SMatthew G Knepley
242caccb7e3SMatthew G Knepley for (idx = 0; idx < len && !bDifferent; idx++) {
243caccb7e3SMatthew G Knepley double expectedResult = a[idx];
244caccb7e3SMatthew G Knepley double diffResultExpected = (b[idx] - expectedResult);
24519816777SMark double relErrorULPS = (fabsf(diffResultExpected) / fabsf(expectedResult)) / dbl_eps;
246caccb7e3SMatthew G Knepley /* element-wise relative error determination */
247caccb7e3SMatthew G Knepley bDifferent = (relErrorULPS > 2.);
248caccb7e3SMatthew G Knepley }
249caccb7e3SMatthew G Knepley
250caccb7e3SMatthew G Knepley return bDifferent;
251caccb7e3SMatthew G Knepley }
252caccb7e3SMatthew G Knepley
STREAM_Scale_verify(float * a,float * b,float scale,size_t len)25367595998SJunchao Zhang bool STREAM_Scale_verify(float *a, float *b, float scale, size_t len)
25467595998SJunchao Zhang {
255403adfb6SMatthew G Knepley size_t idx;
256403adfb6SMatthew G Knepley bool bDifferent = false;
257403adfb6SMatthew G Knepley
258403adfb6SMatthew G Knepley for (idx = 0; idx < len && !bDifferent; idx++) {
259403adfb6SMatthew G Knepley float expectedResult = scale * a[idx];
260403adfb6SMatthew G Knepley float diffResultExpected = (b[idx] - expectedResult);
261403adfb6SMatthew G Knepley float relErrorULPS = (fabsf(diffResultExpected) / fabsf(expectedResult)) / flt_eps;
262403adfb6SMatthew G Knepley /* element-wise relative error determination */
263403adfb6SMatthew G Knepley bDifferent = (relErrorULPS > 2.f);
264403adfb6SMatthew G Knepley }
265403adfb6SMatthew G Knepley
266403adfb6SMatthew G Knepley return bDifferent;
267403adfb6SMatthew G Knepley }
268403adfb6SMatthew G Knepley
STREAM_Scale_verify_double(double * a,double * b,double scale,size_t len)26967595998SJunchao Zhang bool STREAM_Scale_verify_double(double *a, double *b, double scale, size_t len)
27067595998SJunchao Zhang {
271caccb7e3SMatthew G Knepley size_t idx;
272caccb7e3SMatthew G Knepley bool bDifferent = false;
273caccb7e3SMatthew G Knepley
274caccb7e3SMatthew G Knepley for (idx = 0; idx < len && !bDifferent; idx++) {
275caccb7e3SMatthew G Knepley double expectedResult = scale * a[idx];
276caccb7e3SMatthew G Knepley double diffResultExpected = (b[idx] - expectedResult);
277caccb7e3SMatthew G Knepley double relErrorULPS = (fabsf(diffResultExpected) / fabsf(expectedResult)) / flt_eps;
278caccb7e3SMatthew G Knepley /* element-wise relative error determination */
279caccb7e3SMatthew G Knepley bDifferent = (relErrorULPS > 2.);
280caccb7e3SMatthew G Knepley }
281caccb7e3SMatthew G Knepley
282caccb7e3SMatthew G Knepley return bDifferent;
283caccb7e3SMatthew G Knepley }
284caccb7e3SMatthew G Knepley
STREAM_Add_verify(float * a,float * b,float * c,size_t len)28567595998SJunchao Zhang bool STREAM_Add_verify(float *a, float *b, float *c, size_t len)
28667595998SJunchao Zhang {
287403adfb6SMatthew G Knepley size_t idx;
288403adfb6SMatthew G Knepley bool bDifferent = false;
289403adfb6SMatthew G Knepley
290403adfb6SMatthew G Knepley for (idx = 0; idx < len && !bDifferent; idx++) {
291403adfb6SMatthew G Knepley float expectedResult = a[idx] + b[idx];
292403adfb6SMatthew G Knepley float diffResultExpected = (c[idx] - expectedResult);
293403adfb6SMatthew G Knepley float relErrorULPS = (fabsf(diffResultExpected) / fabsf(expectedResult)) / flt_eps;
294403adfb6SMatthew G Knepley /* element-wise relative error determination */
295403adfb6SMatthew G Knepley bDifferent = (relErrorULPS > 2.f);
296403adfb6SMatthew G Knepley }
297403adfb6SMatthew G Knepley
298403adfb6SMatthew G Knepley return bDifferent;
299403adfb6SMatthew G Knepley }
300403adfb6SMatthew G Knepley
STREAM_Add_verify_double(double * a,double * b,double * c,size_t len)30167595998SJunchao Zhang bool STREAM_Add_verify_double(double *a, double *b, double *c, size_t len)
30267595998SJunchao Zhang {
303caccb7e3SMatthew G Knepley size_t idx;
304caccb7e3SMatthew G Knepley bool bDifferent = false;
305caccb7e3SMatthew G Knepley
306caccb7e3SMatthew G Knepley for (idx = 0; idx < len && !bDifferent; idx++) {
307caccb7e3SMatthew G Knepley double expectedResult = a[idx] + b[idx];
308caccb7e3SMatthew G Knepley double diffResultExpected = (c[idx] - expectedResult);
309caccb7e3SMatthew G Knepley double relErrorULPS = (fabsf(diffResultExpected) / fabsf(expectedResult)) / flt_eps;
310caccb7e3SMatthew G Knepley /* element-wise relative error determination */
311caccb7e3SMatthew G Knepley bDifferent = (relErrorULPS > 2.);
312caccb7e3SMatthew G Knepley }
313caccb7e3SMatthew G Knepley
314caccb7e3SMatthew G Knepley return bDifferent;
315caccb7e3SMatthew G Knepley }
316caccb7e3SMatthew G Knepley
STREAM_Triad_verify(float * a,float * b,float * c,float scalar,size_t len)31767595998SJunchao Zhang bool STREAM_Triad_verify(float *a, float *b, float *c, float scalar, size_t len)
31867595998SJunchao Zhang {
319403adfb6SMatthew G Knepley size_t idx;
320403adfb6SMatthew G Knepley bool bDifferent = false;
321403adfb6SMatthew G Knepley
322403adfb6SMatthew G Knepley for (idx = 0; idx < len && !bDifferent; idx++) {
323403adfb6SMatthew G Knepley float expectedResult = a[idx] + scalar * b[idx];
324403adfb6SMatthew G Knepley float diffResultExpected = (c[idx] - expectedResult);
325403adfb6SMatthew G Knepley float relErrorULPS = (fabsf(diffResultExpected) / fabsf(expectedResult)) / flt_eps;
326403adfb6SMatthew G Knepley /* element-wise relative error determination */
327403adfb6SMatthew G Knepley bDifferent = (relErrorULPS > 3.f);
328403adfb6SMatthew G Knepley }
329403adfb6SMatthew G Knepley
330403adfb6SMatthew G Knepley return bDifferent;
331403adfb6SMatthew G Knepley }
332403adfb6SMatthew G Knepley
STREAM_Triad_verify_double(double * a,double * b,double * c,double scalar,size_t len)33367595998SJunchao Zhang bool STREAM_Triad_verify_double(double *a, double *b, double *c, double scalar, size_t len)
33467595998SJunchao Zhang {
335caccb7e3SMatthew G Knepley size_t idx;
336caccb7e3SMatthew G Knepley bool bDifferent = false;
337caccb7e3SMatthew G Knepley
338caccb7e3SMatthew G Knepley for (idx = 0; idx < len && !bDifferent; idx++) {
339caccb7e3SMatthew G Knepley double expectedResult = a[idx] + scalar * b[idx];
340caccb7e3SMatthew G Knepley double diffResultExpected = (c[idx] - expectedResult);
341caccb7e3SMatthew G Knepley double relErrorULPS = (fabsf(diffResultExpected) / fabsf(expectedResult)) / flt_eps;
342caccb7e3SMatthew G Knepley /* element-wise relative error determination */
343caccb7e3SMatthew G Knepley bDifferent = (relErrorULPS > 3.);
344caccb7e3SMatthew G Knepley }
345caccb7e3SMatthew G Knepley
346caccb7e3SMatthew G Knepley return bDifferent;
347caccb7e3SMatthew G Knepley }
348caccb7e3SMatthew G Knepley
349403adfb6SMatthew G Knepley /* forward declarations */
350caccb7e3SMatthew G Knepley PetscErrorCode setupStream(PetscInt device, PetscBool runDouble, PetscBool cpuTiming);
351403adfb6SMatthew G Knepley PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming);
352caccb7e3SMatthew G Knepley PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming);
35319816777SMark PetscErrorCode printResultsReadable(float times[][NTIMES], size_t);
354403adfb6SMatthew G Knepley
main(int argc,char * argv[])35567595998SJunchao Zhang int main(int argc, char *argv[])
35667595998SJunchao Zhang {
357403adfb6SMatthew G Knepley PetscInt device = 0;
35819816777SMark PetscBool runDouble = PETSC_TRUE;
35919816777SMark const PetscBool cpuTiming = PETSC_TRUE; // must be true
360403adfb6SMatthew G Knepley PetscErrorCode ierr;
361403adfb6SMatthew G Knepley
3629566063dSJacob Faibussowitsch PetscCallCUDA(cudaSetDeviceFlags(cudaDeviceBlockingSync));
36319816777SMark
3649566063dSJacob Faibussowitsch PetscCall(PetscInitialize(&argc, &argv, 0, help));
365403adfb6SMatthew G Knepley
366d0609cedSBarry Smith PetscOptionsBegin(PETSC_COMM_WORLD, "", "STREAM Benchmark Options", "STREAM");
3679566063dSJacob Faibussowitsch PetscCall(PetscOptionsBoundedInt("-device", "Specify the CUDA device to be used", "STREAM", device, &device, NULL, 0));
3689566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-double", "Also run double precision tests", "STREAM", runDouble, &runDouble, NULL));
369d0609cedSBarry Smith PetscOptionsEnd();
370403adfb6SMatthew G Knepley
371caccb7e3SMatthew G Knepley ierr = setupStream(device, runDouble, cpuTiming);
3723a7d0413SPierre Jolivet if (ierr) PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n[streamBenchmark] - results:\t%s\n\n", (ierr == 0) ? "PASSES" : "FAILED"));
3739566063dSJacob Faibussowitsch PetscCall(PetscFinalize());
374b122ec5aSJacob Faibussowitsch return 0;
375403adfb6SMatthew G Knepley }
376403adfb6SMatthew G Knepley
377403adfb6SMatthew G Knepley ///////////////////////////////////////////////////////////////////////////////
378403adfb6SMatthew G Knepley //Run the appropriate tests
379403adfb6SMatthew G Knepley ///////////////////////////////////////////////////////////////////////////////
setupStream(PetscInt deviceNum,PetscBool runDouble,PetscBool cpuTiming)38067595998SJunchao Zhang PetscErrorCode setupStream(PetscInt deviceNum, PetscBool runDouble, PetscBool cpuTiming)
38167595998SJunchao Zhang {
382403adfb6SMatthew G Knepley PetscInt iNumThreadsPerBlock = 128;
383403adfb6SMatthew G Knepley
384403adfb6SMatthew G Knepley PetscFunctionBegin;
385403adfb6SMatthew G Knepley // Check device
386403adfb6SMatthew G Knepley {
387403adfb6SMatthew G Knepley int deviceCount;
388403adfb6SMatthew G Knepley
389403adfb6SMatthew G Knepley cudaGetDeviceCount(&deviceCount);
390403adfb6SMatthew G Knepley if (deviceCount == 0) {
3919566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF, "!!!!!No devices found!!!!!\n"));
392403adfb6SMatthew G Knepley return -1000;
393403adfb6SMatthew G Knepley }
394403adfb6SMatthew G Knepley
395403adfb6SMatthew G Knepley if (deviceNum >= deviceCount || deviceNum < 0) {
3969566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n!!!!!Invalid GPU number %d given hence default gpu %d will be used !!!!!\n", deviceNum, 0));
397403adfb6SMatthew G Knepley deviceNum = 0;
398403adfb6SMatthew G Knepley }
399403adfb6SMatthew G Knepley }
400403adfb6SMatthew G Knepley
401403adfb6SMatthew G Knepley cudaSetDevice(deviceNum);
4029566063dSJacob Faibussowitsch // PetscCall(PetscPrintf(PETSC_COMM_SELF, "Running on...\n\n"));
403403adfb6SMatthew G Knepley cudaDeviceProp deviceProp;
40419816777SMark if (cudaGetDeviceProperties(&deviceProp, deviceNum) != cudaSuccess) {
4059566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF, " Unable to determine device %d properties, exiting\n"));
406403adfb6SMatthew G Knepley return -1;
407403adfb6SMatthew G Knepley }
408403adfb6SMatthew G Knepley
409caccb7e3SMatthew G Knepley if (runDouble && deviceProp.major == 1 && deviceProp.minor < 3) {
4109566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF, " Unable to run double-precision STREAM benchmark on a compute capability GPU less than 1.3\n"));
411caccb7e3SMatthew G Knepley return -1;
412caccb7e3SMatthew G Knepley }
4136f2b61bcSKarl Rupp if (deviceProp.major == 2 && deviceProp.minor == 1) iNumThreadsPerBlock = 192; /* GF104 architecture / 48 CUDA Cores per MP */
4146f2b61bcSKarl Rupp else iNumThreadsPerBlock = 128; /* GF100 architecture / 32 CUDA Cores per MP */
415403adfb6SMatthew G Knepley
4161baa6e33SBarry Smith if (runDouble) PetscCall(runStreamDouble(iNumThreadsPerBlock, cpuTiming));
4171baa6e33SBarry Smith else PetscCall(runStream(iNumThreadsPerBlock, cpuTiming));
4183ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
419403adfb6SMatthew G Knepley }
420403adfb6SMatthew G Knepley
421403adfb6SMatthew G Knepley ///////////////////////////////////////////////////////////////////////////
422403adfb6SMatthew G Knepley // runStream
423403adfb6SMatthew G Knepley ///////////////////////////////////////////////////////////////////////////
runStream(const PetscInt iNumThreadsPerBlock,PetscBool bDontUseGPUTiming)42467595998SJunchao Zhang PetscErrorCode runStream(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming)
42567595998SJunchao Zhang {
426403adfb6SMatthew G Knepley float *d_a, *d_b, *d_c;
427403adfb6SMatthew G Knepley int k;
428caccb7e3SMatthew G Knepley float times[8][NTIMES];
429403adfb6SMatthew G Knepley float scalar;
430403adfb6SMatthew G Knepley
431403adfb6SMatthew G Knepley PetscFunctionBegin;
432403adfb6SMatthew G Knepley /* Allocate memory on device */
4339566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&d_a, sizeof(float) * N));
4349566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&d_b, sizeof(float) * N));
4359566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&d_c, sizeof(float) * N));
436403adfb6SMatthew G Knepley
437403adfb6SMatthew G Knepley /* Compute execution configuration */
438403adfb6SMatthew G Knepley
439403adfb6SMatthew G Knepley dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */
440403adfb6SMatthew G Knepley dim3 dimGrid(N / dimBlock.x); /* (N/dimBlock.x,1,1) */
441403adfb6SMatthew G Knepley if (N % dimBlock.x != 0) dimGrid.x += 1;
442403adfb6SMatthew G Knepley
443403adfb6SMatthew G Knepley /* Initialize memory on the device */
444403adfb6SMatthew G Knepley set_array<<<dimGrid, dimBlock>>>(d_a, 2.f, N);
445403adfb6SMatthew G Knepley set_array<<<dimGrid, dimBlock>>>(d_b, .5f, N);
446403adfb6SMatthew G Knepley set_array<<<dimGrid, dimBlock>>>(d_c, .5f, N);
447403adfb6SMatthew G Knepley
448403adfb6SMatthew G Knepley /* --- MAIN LOOP --- repeat test cases NTIMES times --- */
449403adfb6SMatthew G Knepley PetscLogDouble cpuTimer = 0.0;
450403adfb6SMatthew G Knepley
451403adfb6SMatthew G Knepley scalar = 3.0f;
452403adfb6SMatthew G Knepley for (k = 0; k < NTIMES; ++k) {
4538563dfccSBarry Smith PetscTimeSubtract(&cpuTimer);
454403adfb6SMatthew G Knepley STREAM_Copy<<<dimGrid, dimBlock>>>(d_a, d_c, N);
45519816777SMark cudaStreamSynchronize(NULL);
4569566063dSJacob Faibussowitsch PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
4578563dfccSBarry Smith PetscTimeAdd(&cpuTimer);
45819816777SMark if (bDontUseGPUTiming) times[0][k] = cpuTimer * 1.e3; // millisec
459403adfb6SMatthew G Knepley
460403adfb6SMatthew G Knepley cpuTimer = 0.0;
4618563dfccSBarry Smith PetscTimeSubtract(&cpuTimer);
462403adfb6SMatthew G Knepley STREAM_Copy_Optimized<<<dimGrid, dimBlock>>>(d_a, d_c, N);
46319816777SMark cudaStreamSynchronize(NULL);
4649566063dSJacob Faibussowitsch PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
465df3898eeSBarry Smith //get the total elapsed time in ms
4668563dfccSBarry Smith PetscTimeAdd(&cpuTimer);
46719816777SMark if (bDontUseGPUTiming) times[1][k] = cpuTimer * 1.e3;
468403adfb6SMatthew G Knepley
469403adfb6SMatthew G Knepley cpuTimer = 0.0;
4708563dfccSBarry Smith PetscTimeSubtract(&cpuTimer);
471403adfb6SMatthew G Knepley STREAM_Scale<<<dimGrid, dimBlock>>>(d_b, d_c, scalar, N);
47219816777SMark cudaStreamSynchronize(NULL);
4739566063dSJacob Faibussowitsch PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
474df3898eeSBarry Smith //get the total elapsed time in ms
4758563dfccSBarry Smith PetscTimeAdd(&cpuTimer);
47619816777SMark if (bDontUseGPUTiming) times[2][k] = cpuTimer * 1.e3;
477403adfb6SMatthew G Knepley
478403adfb6SMatthew G Knepley cpuTimer = 0.0;
4798563dfccSBarry Smith PetscTimeSubtract(&cpuTimer);
480caccb7e3SMatthew G Knepley STREAM_Scale_Optimized<<<dimGrid, dimBlock>>>(d_b, d_c, scalar, N);
48119816777SMark cudaStreamSynchronize(NULL);
4829566063dSJacob Faibussowitsch PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
483df3898eeSBarry Smith //get the total elapsed time in ms
4848563dfccSBarry Smith PetscTimeAdd(&cpuTimer);
48519816777SMark if (bDontUseGPUTiming) times[3][k] = cpuTimer * 1.e3;
486403adfb6SMatthew G Knepley
487403adfb6SMatthew G Knepley cpuTimer = 0.0;
4888563dfccSBarry Smith PetscTimeSubtract(&cpuTimer);
4899566063dSJacob Faibussowitsch // PetscCallCUDA(cudaEventRecord(start, 0));
490caccb7e3SMatthew G Knepley STREAM_Add<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N);
49119816777SMark cudaStreamSynchronize(NULL);
4929566063dSJacob Faibussowitsch PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
4939566063dSJacob Faibussowitsch PetscCallCUDA(cudaEventRecord(stop, 0));
4949566063dSJacob Faibussowitsch // PetscCallCUDA(cudaEventSynchronize(stop));
495df3898eeSBarry Smith //get the total elapsed time in ms
4968563dfccSBarry Smith PetscTimeAdd(&cpuTimer);
49719816777SMark if (bDontUseGPUTiming) times[4][k] = cpuTimer * 1.e3;
4986f2b61bcSKarl Rupp else {
4999566063dSJacob Faibussowitsch // PetscCallCUDA(cudaEventElapsedTime(×[4][k], start, stop));
500403adfb6SMatthew G Knepley }
501403adfb6SMatthew G Knepley
502caccb7e3SMatthew G Knepley cpuTimer = 0.0;
5038563dfccSBarry Smith PetscTimeSubtract(&cpuTimer);
504caccb7e3SMatthew G Knepley STREAM_Add_Optimized<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N);
50519816777SMark cudaStreamSynchronize(NULL);
5069566063dSJacob Faibussowitsch PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
507df3898eeSBarry Smith //get the total elapsed time in ms
5088563dfccSBarry Smith PetscTimeAdd(&cpuTimer);
50919816777SMark if (bDontUseGPUTiming) times[5][k] = cpuTimer * 1.e3;
510caccb7e3SMatthew G Knepley
511caccb7e3SMatthew G Knepley cpuTimer = 0.0;
5128563dfccSBarry Smith PetscTimeSubtract(&cpuTimer);
513caccb7e3SMatthew G Knepley STREAM_Triad<<<dimGrid, dimBlock>>>(d_b, d_c, d_a, scalar, N);
51419816777SMark cudaStreamSynchronize(NULL);
5159566063dSJacob Faibussowitsch PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
516df3898eeSBarry Smith //get the total elapsed time in ms
5178563dfccSBarry Smith PetscTimeAdd(&cpuTimer);
51819816777SMark if (bDontUseGPUTiming) times[6][k] = cpuTimer * 1.e3;
519caccb7e3SMatthew G Knepley
520caccb7e3SMatthew G Knepley cpuTimer = 0.0;
5218563dfccSBarry Smith PetscTimeSubtract(&cpuTimer);
522caccb7e3SMatthew G Knepley STREAM_Triad_Optimized<<<dimGrid, dimBlock>>>(d_b, d_c, d_a, scalar, N);
52319816777SMark cudaStreamSynchronize(NULL);
5249566063dSJacob Faibussowitsch PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
525df3898eeSBarry Smith //get the total elapsed time in ms
5268563dfccSBarry Smith PetscTimeAdd(&cpuTimer);
52719816777SMark if (bDontUseGPUTiming) times[7][k] = cpuTimer * 1.e3;
528caccb7e3SMatthew G Knepley }
529caccb7e3SMatthew G Knepley
53019816777SMark if (1) { /* verify kernels */
531403adfb6SMatthew G Knepley float *h_a, *h_b, *h_c;
532403adfb6SMatthew G Knepley bool errorSTREAMkernel = true;
533403adfb6SMatthew G Knepley
534403adfb6SMatthew G Knepley if ((h_a = (float *)calloc(N, sizeof(float))) == (float *)NULL) {
535403adfb6SMatthew G Knepley printf("Unable to allocate array h_a, exiting ...\n");
536403adfb6SMatthew G Knepley exit(1);
537403adfb6SMatthew G Knepley }
538403adfb6SMatthew G Knepley if ((h_b = (float *)calloc(N, sizeof(float))) == (float *)NULL) {
539403adfb6SMatthew G Knepley printf("Unable to allocate array h_b, exiting ...\n");
540403adfb6SMatthew G Knepley exit(1);
541403adfb6SMatthew G Knepley }
542403adfb6SMatthew G Knepley
543403adfb6SMatthew G Knepley if ((h_c = (float *)calloc(N, sizeof(float))) == (float *)NULL) {
544403adfb6SMatthew G Knepley printf("Unalbe to allocate array h_c, exiting ...\n");
545403adfb6SMatthew G Knepley exit(1);
546403adfb6SMatthew G Knepley }
547403adfb6SMatthew G Knepley
548403adfb6SMatthew G Knepley /*
549403adfb6SMatthew G Knepley * perform kernel, copy device memory into host memory and verify each
550403adfb6SMatthew G Knepley * device kernel output
551403adfb6SMatthew G Knepley */
552403adfb6SMatthew G Knepley
553403adfb6SMatthew G Knepley /* Initialize memory on the device */
554403adfb6SMatthew G Knepley set_array<<<dimGrid, dimBlock>>>(d_a, 2.f, N);
555403adfb6SMatthew G Knepley set_array<<<dimGrid, dimBlock>>>(d_b, .5f, N);
556403adfb6SMatthew G Knepley set_array<<<dimGrid, dimBlock>>>(d_c, .5f, N);
557403adfb6SMatthew G Knepley
558403adfb6SMatthew G Knepley STREAM_Copy<<<dimGrid, dimBlock>>>(d_a, d_c, N);
5599566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost));
5609566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
561403adfb6SMatthew G Knepley errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
562403adfb6SMatthew G Knepley if (errorSTREAMkernel) {
5639566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n"));
564403adfb6SMatthew G Knepley exit(-2000);
565403adfb6SMatthew G Knepley }
566403adfb6SMatthew G Knepley
567403adfb6SMatthew G Knepley /* Initialize memory on the device */
568403adfb6SMatthew G Knepley set_array<<<dimGrid, dimBlock>>>(d_a, 2.f, N);
569403adfb6SMatthew G Knepley set_array<<<dimGrid, dimBlock>>>(d_b, .5f, N);
570403adfb6SMatthew G Knepley set_array<<<dimGrid, dimBlock>>>(d_c, .5f, N);
571403adfb6SMatthew G Knepley
572403adfb6SMatthew G Knepley STREAM_Copy_Optimized<<<dimGrid, dimBlock>>>(d_a, d_c, N);
5739566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost));
5749566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
575403adfb6SMatthew G Knepley errorSTREAMkernel = STREAM_Copy_verify(h_a, h_c, N);
576403adfb6SMatthew G Knepley if (errorSTREAMkernel) {
5779566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n"));
578403adfb6SMatthew G Knepley exit(-3000);
579403adfb6SMatthew G Knepley }
580403adfb6SMatthew G Knepley
581403adfb6SMatthew G Knepley /* Initialize memory on the device */
58219816777SMark set_array<<<dimGrid, dimBlock>>>(d_a, 2.f, N);
583403adfb6SMatthew G Knepley set_array<<<dimGrid, dimBlock>>>(d_b, .5f, N);
584403adfb6SMatthew G Knepley set_array<<<dimGrid, dimBlock>>>(d_c, .5f, N);
585403adfb6SMatthew G Knepley
586403adfb6SMatthew G Knepley STREAM_Scale<<<dimGrid, dimBlock>>>(d_b, d_c, scalar, N);
5879566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost));
5889566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
589403adfb6SMatthew G Knepley errorSTREAMkernel = STREAM_Scale_verify(h_b, h_c, scalar, N);
590403adfb6SMatthew G Knepley if (errorSTREAMkernel) {
5919566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n"));
592403adfb6SMatthew G Knepley exit(-4000);
593403adfb6SMatthew G Knepley }
594403adfb6SMatthew G Knepley
595403adfb6SMatthew G Knepley /* Initialize memory on the device */
596403adfb6SMatthew G Knepley set_array<<<dimGrid, dimBlock>>>(d_a, 2.f, N);
597403adfb6SMatthew G Knepley set_array<<<dimGrid, dimBlock>>>(d_b, .5f, N);
598403adfb6SMatthew G Knepley set_array<<<dimGrid, dimBlock>>>(d_c, .5f, N);
599403adfb6SMatthew G Knepley
600403adfb6SMatthew G Knepley STREAM_Add<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N);
6019566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost));
6029566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost));
6039566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
604403adfb6SMatthew G Knepley errorSTREAMkernel = STREAM_Add_verify(h_a, h_b, h_c, N);
605403adfb6SMatthew G Knepley if (errorSTREAMkernel) {
6069566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n"));
607403adfb6SMatthew G Knepley exit(-5000);
608403adfb6SMatthew G Knepley }
609403adfb6SMatthew G Knepley
610403adfb6SMatthew G Knepley /* Initialize memory on the device */
611403adfb6SMatthew G Knepley set_array<<<dimGrid, dimBlock>>>(d_a, 2.f, N);
612403adfb6SMatthew G Knepley set_array<<<dimGrid, dimBlock>>>(d_b, .5f, N);
613403adfb6SMatthew G Knepley set_array<<<dimGrid, dimBlock>>>(d_c, .5f, N);
614403adfb6SMatthew G Knepley
615403adfb6SMatthew G Knepley STREAM_Triad<<<dimGrid, dimBlock>>>(d_b, d_c, d_a, scalar, N);
6169566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(float) * N, cudaMemcpyDeviceToHost));
6179566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(float) * N, cudaMemcpyDeviceToHost));
6189566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(float) * N, cudaMemcpyDeviceToHost));
619403adfb6SMatthew G Knepley errorSTREAMkernel = STREAM_Triad_verify(h_b, h_c, h_a, scalar, N);
620403adfb6SMatthew G Knepley if (errorSTREAMkernel) {
6219566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n"));
622403adfb6SMatthew G Knepley exit(-6000);
623403adfb6SMatthew G Knepley }
624403adfb6SMatthew G Knepley
62519816777SMark free(h_a);
62619816777SMark free(h_b);
62719816777SMark free(h_c);
62819816777SMark }
629403adfb6SMatthew G Knepley /* continue from here */
63019816777SMark printResultsReadable(times, sizeof(float));
631403adfb6SMatthew G Knepley
632403adfb6SMatthew G Knepley /* Free memory on device */
6339566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(d_a));
6349566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(d_b));
6359566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(d_c));
6363ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
637403adfb6SMatthew G Knepley }
638403adfb6SMatthew G Knepley
runStreamDouble(const PetscInt iNumThreadsPerBlock,PetscBool bDontUseGPUTiming)63967595998SJunchao Zhang PetscErrorCode runStreamDouble(const PetscInt iNumThreadsPerBlock, PetscBool bDontUseGPUTiming)
64067595998SJunchao Zhang {
641caccb7e3SMatthew G Knepley double *d_a, *d_b, *d_c;
642caccb7e3SMatthew G Knepley int k;
643caccb7e3SMatthew G Knepley float times[8][NTIMES];
644caccb7e3SMatthew G Knepley double scalar;
645caccb7e3SMatthew G Knepley
646caccb7e3SMatthew G Knepley PetscFunctionBegin;
647caccb7e3SMatthew G Knepley /* Allocate memory on device */
6489566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&d_a, sizeof(double) * N));
6499566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&d_b, sizeof(double) * N));
6509566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&d_c, sizeof(double) * N));
651caccb7e3SMatthew G Knepley
652caccb7e3SMatthew G Knepley /* Compute execution configuration */
653caccb7e3SMatthew G Knepley
654caccb7e3SMatthew G Knepley dim3 dimBlock(iNumThreadsPerBlock); /* (iNumThreadsPerBlock,1,1) */
655caccb7e3SMatthew G Knepley dim3 dimGrid(N / dimBlock.x); /* (N/dimBlock.x,1,1) */
656caccb7e3SMatthew G Knepley if (N % dimBlock.x != 0) dimGrid.x += 1;
657caccb7e3SMatthew G Knepley
658caccb7e3SMatthew G Knepley /* Initialize memory on the device */
659caccb7e3SMatthew G Knepley set_array_double<<<dimGrid, dimBlock>>>(d_a, 2., N);
660caccb7e3SMatthew G Knepley set_array_double<<<dimGrid, dimBlock>>>(d_b, .5, N);
661caccb7e3SMatthew G Knepley set_array_double<<<dimGrid, dimBlock>>>(d_c, .5, N);
662caccb7e3SMatthew G Knepley
663caccb7e3SMatthew G Knepley /* --- MAIN LOOP --- repeat test cases NTIMES times --- */
664caccb7e3SMatthew G Knepley PetscLogDouble cpuTimer = 0.0;
665caccb7e3SMatthew G Knepley
666caccb7e3SMatthew G Knepley scalar = 3.0;
667caccb7e3SMatthew G Knepley for (k = 0; k < NTIMES; ++k) {
6688563dfccSBarry Smith PetscTimeSubtract(&cpuTimer);
669caccb7e3SMatthew G Knepley STREAM_Copy_double<<<dimGrid, dimBlock>>>(d_a, d_c, N);
67019816777SMark cudaStreamSynchronize(NULL);
6719566063dSJacob Faibussowitsch PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
672df3898eeSBarry Smith //get the total elapsed time in ms
673caccb7e3SMatthew G Knepley if (bDontUseGPUTiming) {
6748563dfccSBarry Smith PetscTimeAdd(&cpuTimer);
67519816777SMark times[0][k] = cpuTimer * 1.e3;
676caccb7e3SMatthew G Knepley }
677caccb7e3SMatthew G Knepley
678caccb7e3SMatthew G Knepley cpuTimer = 0.0;
6798563dfccSBarry Smith PetscTimeSubtract(&cpuTimer);
680caccb7e3SMatthew G Knepley STREAM_Copy_Optimized_double<<<dimGrid, dimBlock>>>(d_a, d_c, N);
68119816777SMark cudaStreamSynchronize(NULL);
6829566063dSJacob Faibussowitsch PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
683df3898eeSBarry Smith //get the total elapsed time in ms
684caccb7e3SMatthew G Knepley if (bDontUseGPUTiming) {
6858563dfccSBarry Smith PetscTimeAdd(&cpuTimer);
68619816777SMark times[1][k] = cpuTimer * 1.e3;
687caccb7e3SMatthew G Knepley }
688caccb7e3SMatthew G Knepley
689caccb7e3SMatthew G Knepley cpuTimer = 0.0;
6908563dfccSBarry Smith PetscTimeSubtract(&cpuTimer);
691caccb7e3SMatthew G Knepley STREAM_Scale_double<<<dimGrid, dimBlock>>>(d_b, d_c, scalar, N);
69219816777SMark cudaStreamSynchronize(NULL);
6939566063dSJacob Faibussowitsch PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
694df3898eeSBarry Smith //get the total elapsed time in ms
6958563dfccSBarry Smith PetscTimeAdd(&cpuTimer);
69619816777SMark if (bDontUseGPUTiming) times[2][k] = cpuTimer * 1.e3;
697caccb7e3SMatthew G Knepley
698caccb7e3SMatthew G Knepley cpuTimer = 0.0;
6998563dfccSBarry Smith PetscTimeSubtract(&cpuTimer);
700caccb7e3SMatthew G Knepley STREAM_Scale_Optimized_double<<<dimGrid, dimBlock>>>(d_b, d_c, scalar, N);
70119816777SMark cudaStreamSynchronize(NULL);
7029566063dSJacob Faibussowitsch PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
703df3898eeSBarry Smith //get the total elapsed time in ms
7048563dfccSBarry Smith PetscTimeAdd(&cpuTimer);
70519816777SMark if (bDontUseGPUTiming) times[3][k] = cpuTimer * 1.e3;
706caccb7e3SMatthew G Knepley
707caccb7e3SMatthew G Knepley cpuTimer = 0.0;
7088563dfccSBarry Smith PetscTimeSubtract(&cpuTimer);
709caccb7e3SMatthew G Knepley STREAM_Add_double<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N);
71019816777SMark cudaStreamSynchronize(NULL);
7119566063dSJacob Faibussowitsch PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
712df3898eeSBarry Smith //get the total elapsed time in ms
7138563dfccSBarry Smith PetscTimeAdd(&cpuTimer);
71419816777SMark if (bDontUseGPUTiming) times[4][k] = cpuTimer * 1.e3;
715caccb7e3SMatthew G Knepley
716caccb7e3SMatthew G Knepley cpuTimer = 0.0;
7178563dfccSBarry Smith PetscTimeSubtract(&cpuTimer);
718caccb7e3SMatthew G Knepley STREAM_Add_Optimized_double<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N);
71919816777SMark cudaStreamSynchronize(NULL);
7209566063dSJacob Faibussowitsch PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
721df3898eeSBarry Smith //get the total elapsed time in ms
7228563dfccSBarry Smith PetscTimeAdd(&cpuTimer);
72319816777SMark if (bDontUseGPUTiming) times[5][k] = cpuTimer * 1.e3;
724caccb7e3SMatthew G Knepley
725caccb7e3SMatthew G Knepley cpuTimer = 0.0;
7268563dfccSBarry Smith PetscTimeSubtract(&cpuTimer);
727caccb7e3SMatthew G Knepley STREAM_Triad_double<<<dimGrid, dimBlock>>>(d_b, d_c, d_a, scalar, N);
72819816777SMark cudaStreamSynchronize(NULL);
7299566063dSJacob Faibussowitsch PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
730df3898eeSBarry Smith //get the total elapsed time in ms
7318563dfccSBarry Smith PetscTimeAdd(&cpuTimer);
73219816777SMark if (bDontUseGPUTiming) times[6][k] = cpuTimer * 1.e3;
733caccb7e3SMatthew G Knepley
734caccb7e3SMatthew G Knepley cpuTimer = 0.0;
7358563dfccSBarry Smith PetscTimeSubtract(&cpuTimer);
736caccb7e3SMatthew G Knepley STREAM_Triad_Optimized_double<<<dimGrid, dimBlock>>>(d_b, d_c, d_a, scalar, N);
73719816777SMark cudaStreamSynchronize(NULL);
7389566063dSJacob Faibussowitsch PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));
739df3898eeSBarry Smith //get the total elapsed time in ms
7408563dfccSBarry Smith PetscTimeAdd(&cpuTimer);
74119816777SMark if (bDontUseGPUTiming) times[7][k] = cpuTimer * 1.e3;
742caccb7e3SMatthew G Knepley }
743caccb7e3SMatthew G Knepley
74419816777SMark if (1) { /* verify kernels */
745caccb7e3SMatthew G Knepley double *h_a, *h_b, *h_c;
746caccb7e3SMatthew G Knepley bool errorSTREAMkernel = true;
747caccb7e3SMatthew G Knepley
748caccb7e3SMatthew G Knepley if ((h_a = (double *)calloc(N, sizeof(double))) == (double *)NULL) {
749caccb7e3SMatthew G Knepley printf("Unable to allocate array h_a, exiting ...\n");
750caccb7e3SMatthew G Knepley exit(1);
751caccb7e3SMatthew G Knepley }
752caccb7e3SMatthew G Knepley if ((h_b = (double *)calloc(N, sizeof(double))) == (double *)NULL) {
753caccb7e3SMatthew G Knepley printf("Unable to allocate array h_b, exiting ...\n");
754caccb7e3SMatthew G Knepley exit(1);
755caccb7e3SMatthew G Knepley }
756caccb7e3SMatthew G Knepley
757caccb7e3SMatthew G Knepley if ((h_c = (double *)calloc(N, sizeof(double))) == (double *)NULL) {
758caccb7e3SMatthew G Knepley printf("Unalbe to allocate array h_c, exiting ...\n");
759caccb7e3SMatthew G Knepley exit(1);
760caccb7e3SMatthew G Knepley }
761caccb7e3SMatthew G Knepley
762caccb7e3SMatthew G Knepley /*
763caccb7e3SMatthew G Knepley * perform kernel, copy device memory into host memory and verify each
764caccb7e3SMatthew G Knepley * device kernel output
765caccb7e3SMatthew G Knepley */
766caccb7e3SMatthew G Knepley
767caccb7e3SMatthew G Knepley /* Initialize memory on the device */
768caccb7e3SMatthew G Knepley set_array_double<<<dimGrid, dimBlock>>>(d_a, 2., N);
769caccb7e3SMatthew G Knepley set_array_double<<<dimGrid, dimBlock>>>(d_b, .5, N);
770caccb7e3SMatthew G Knepley set_array_double<<<dimGrid, dimBlock>>>(d_c, .5, N);
771caccb7e3SMatthew G Knepley
772caccb7e3SMatthew G Knepley STREAM_Copy_double<<<dimGrid, dimBlock>>>(d_a, d_c, N);
7739566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost));
7749566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
775caccb7e3SMatthew G Knepley errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N);
776caccb7e3SMatthew G Knepley if (errorSTREAMkernel) {
7779566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy:\t\tError detected in device STREAM_Copy, exiting\n"));
778caccb7e3SMatthew G Knepley exit(-2000);
779caccb7e3SMatthew G Knepley }
780caccb7e3SMatthew G Knepley
781caccb7e3SMatthew G Knepley /* Initialize memory on the device */
782caccb7e3SMatthew G Knepley set_array_double<<<dimGrid, dimBlock>>>(d_a, 2., N);
783caccb7e3SMatthew G Knepley set_array_double<<<dimGrid, dimBlock>>>(d_b, .5, N);
784caccb7e3SMatthew G Knepley set_array_double<<<dimGrid, dimBlock>>>(d_c, .5, N);
785caccb7e3SMatthew G Knepley
786caccb7e3SMatthew G Knepley STREAM_Copy_Optimized_double<<<dimGrid, dimBlock>>>(d_a, d_c, N);
7879566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost));
7889566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
789caccb7e3SMatthew G Knepley errorSTREAMkernel = STREAM_Copy_verify_double(h_a, h_c, N);
790caccb7e3SMatthew G Knepley if (errorSTREAMkernel) {
7919566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Copy_Optimized:\tError detected in device STREAM_Copy_Optimized, exiting\n"));
792caccb7e3SMatthew G Knepley exit(-3000);
793caccb7e3SMatthew G Knepley }
794caccb7e3SMatthew G Knepley
795caccb7e3SMatthew G Knepley /* Initialize memory on the device */
796caccb7e3SMatthew G Knepley set_array_double<<<dimGrid, dimBlock>>>(d_b, .5, N);
797caccb7e3SMatthew G Knepley set_array_double<<<dimGrid, dimBlock>>>(d_c, .5, N);
798caccb7e3SMatthew G Knepley
799caccb7e3SMatthew G Knepley STREAM_Scale_double<<<dimGrid, dimBlock>>>(d_b, d_c, scalar, N);
8009566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost));
8019566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
802caccb7e3SMatthew G Knepley errorSTREAMkernel = STREAM_Scale_verify_double(h_b, h_c, scalar, N);
803caccb7e3SMatthew G Knepley if (errorSTREAMkernel) {
8049566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Scale:\t\tError detected in device STREAM_Scale, exiting\n"));
805caccb7e3SMatthew G Knepley exit(-4000);
806caccb7e3SMatthew G Knepley }
807caccb7e3SMatthew G Knepley
808caccb7e3SMatthew G Knepley /* Initialize memory on the device */
809caccb7e3SMatthew G Knepley set_array_double<<<dimGrid, dimBlock>>>(d_a, 2., N);
810caccb7e3SMatthew G Knepley set_array_double<<<dimGrid, dimBlock>>>(d_b, .5, N);
811caccb7e3SMatthew G Knepley set_array_double<<<dimGrid, dimBlock>>>(d_c, .5, N);
812caccb7e3SMatthew G Knepley
813caccb7e3SMatthew G Knepley STREAM_Add_double<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N);
8149566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost));
8159566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost));
8169566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
817caccb7e3SMatthew G Knepley errorSTREAMkernel = STREAM_Add_verify_double(h_a, h_b, h_c, N);
818caccb7e3SMatthew G Knepley if (errorSTREAMkernel) {
8199566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Add:\t\tError detected in device STREAM_Add, exiting\n"));
820caccb7e3SMatthew G Knepley exit(-5000);
821caccb7e3SMatthew G Knepley }
822caccb7e3SMatthew G Knepley
823caccb7e3SMatthew G Knepley /* Initialize memory on the device */
824caccb7e3SMatthew G Knepley set_array_double<<<dimGrid, dimBlock>>>(d_a, 2., N);
825caccb7e3SMatthew G Knepley set_array_double<<<dimGrid, dimBlock>>>(d_b, .5, N);
826caccb7e3SMatthew G Knepley set_array_double<<<dimGrid, dimBlock>>>(d_c, .5, N);
827caccb7e3SMatthew G Knepley
828caccb7e3SMatthew G Knepley STREAM_Triad_double<<<dimGrid, dimBlock>>>(d_b, d_c, d_a, scalar, N);
8299566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_a, d_a, sizeof(double) * N, cudaMemcpyDeviceToHost));
8309566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_b, d_b, sizeof(double) * N, cudaMemcpyDeviceToHost));
8319566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_c, d_c, sizeof(double) * N, cudaMemcpyDeviceToHost));
832caccb7e3SMatthew G Knepley errorSTREAMkernel = STREAM_Triad_verify_double(h_b, h_c, h_a, scalar, N);
833caccb7e3SMatthew G Knepley if (errorSTREAMkernel) {
8349566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF, " device STREAM_Triad:\t\tError detected in device STREAM_Triad, exiting\n"));
835caccb7e3SMatthew G Knepley exit(-6000);
836caccb7e3SMatthew G Knepley }
837caccb7e3SMatthew G Knepley
83819816777SMark free(h_a);
83919816777SMark free(h_b);
84019816777SMark free(h_c);
84119816777SMark }
842caccb7e3SMatthew G Knepley /* continue from here */
84319816777SMark printResultsReadable(times, sizeof(double));
844caccb7e3SMatthew G Knepley
845caccb7e3SMatthew G Knepley /* Free memory on device */
8469566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(d_a));
8479566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(d_b));
8489566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(d_c));
8493ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
850caccb7e3SMatthew G Knepley }
851caccb7e3SMatthew G Knepley
852403adfb6SMatthew G Knepley ///////////////////////////////////////////////////////////////////////////
853403adfb6SMatthew G Knepley //Print Results to Screen and File
854403adfb6SMatthew G Knepley ///////////////////////////////////////////////////////////////////////////
printResultsReadable(float times[][NTIMES],const size_t bsize)85567595998SJunchao Zhang PetscErrorCode printResultsReadable(float times[][NTIMES], const size_t bsize)
85667595998SJunchao Zhang {
857403adfb6SMatthew G Knepley PetscErrorCode ierr;
858403adfb6SMatthew G Knepley PetscInt j, k;
859caccb7e3SMatthew G Knepley float avgtime[8] = {0., 0., 0., 0., 0., 0., 0., 0.};
860caccb7e3SMatthew G Knepley float maxtime[8] = {0., 0., 0., 0., 0., 0., 0., 0.};
861caccb7e3SMatthew G Knepley float mintime[8] = {1e30, 1e30, 1e30, 1e30, 1e30, 1e30, 1e30, 1e30};
86219816777SMark // char *label[8] = {"Copy: ", "Copy Opt.: ", "Scale: ", "Scale Opt: ", "Add: ", "Add Opt: ", "Triad: ", "Triad Opt: "};
8630e6b6b59SJacob Faibussowitsch const float bytes_per_kernel[8] = {2. * bsize * N, 2. * bsize * N, 2. * bsize * N, 2. * bsize * N, 3. * bsize * N, 3. * bsize * N, 3. * bsize * N, 3. * bsize * N};
86419816777SMark double rate, irate;
86519816777SMark int rank, size;
8664d86920dSPierre Jolivet
867403adfb6SMatthew G Knepley PetscFunctionBegin;
8689566063dSJacob Faibussowitsch PetscCallMPI(MPI_Comm_rank(MPI_COMM_WORLD, &rank));
8699566063dSJacob Faibussowitsch PetscCallMPI(MPI_Comm_size(MPI_COMM_WORLD, &size));
870403adfb6SMatthew G Knepley /* --- SUMMARY --- */
87119816777SMark for (k = 0; k < NTIMES; ++k) {
87267595998SJunchao Zhang for (j = 0; j < (PetscInt)PETSC_STATIC_ARRAY_LENGTH(avgtime); ++j) {
87319816777SMark avgtime[j] = avgtime[j] + (1.e-03f * times[j][k]); // millisec --> sec
874403adfb6SMatthew G Knepley mintime[j] = MIN(mintime[j], (1.e-03f * times[j][k]));
875403adfb6SMatthew G Knepley maxtime[j] = MAX(maxtime[j], (1.e-03f * times[j][k]));
876403adfb6SMatthew G Knepley }
87719816777SMark }
87867595998SJunchao Zhang for (j = 0; j < (PetscInt)PETSC_STATIC_ARRAY_LENGTH(avgtime); ++j) avgtime[j] = avgtime[j] / (float)(NTIMES - 1);
87919816777SMark j = 7;
88019816777SMark irate = 1.0E-06 * bytes_per_kernel[j] / mintime[j];
88119816777SMark ierr = MPI_Reduce(&irate, &rate, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
882dd400576SPatrick Sanan if (rank == 0) {
88319816777SMark FILE *fd;
88419816777SMark if (size == 1) {
88519816777SMark printf("%d %11.4f Rate (MB/s)\n", size, rate);
88619816777SMark fd = fopen("flops", "w");
88719816777SMark fprintf(fd, "%g\n", rate);
88819816777SMark fclose(fd);
88919816777SMark } else {
89019816777SMark double prate;
89119816777SMark fd = fopen("flops", "r");
89219816777SMark fscanf(fd, "%lg", &prate);
89319816777SMark fclose(fd);
89419816777SMark printf("%d %11.4f Rate (MB/s) %g\n", size, rate, rate / prate);
89519816777SMark }
89619816777SMark }
8973ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
898403adfb6SMatthew G Knepley }
899