xref: /libCEED/backends/cuda-ref/kernels/cuda-ref-vector.cu (revision 7ed3e4cde27d28430628eaa24b22da48dc51cc32)
1 // Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at
2 // the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights
3 // reserved. See files LICENSE and NOTICE for details.
4 //
5 // This file is part of CEED, a collection of benchmarks, miniapps, software
6 // libraries and APIs for efficient high-order finite element and spectral
7 // element discretizations for exascale applications. For more information and
8 // source code availability see http://github.com/ceed.
9 //
10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11 // a collaborative effort of two U.S. Department of Energy organizations (Office
12 // of Science and the National Nuclear Security Administration) responsible for
13 // the planning and preparation of a capable exascale ecosystem, including
14 // software, applications, hardware, advanced system engineering and early
15 // testbed platforms, in support of the nation's exascale computing imperative.
16 
17 #include <ceed/ceed.h>
18 #include <cuda.h>
19 
20 //------------------------------------------------------------------------------
21 // Kernel for set value on device
22 //------------------------------------------------------------------------------
23 __global__ static void setValueK(CeedScalar * __restrict__ vec, CeedInt size,
24                                  CeedScalar val) {
25   int idx = threadIdx.x + blockDim.x * blockIdx.x;
26   if (idx >= size)
27     return;
28   vec[idx] = val;
29 }
30 
31 //------------------------------------------------------------------------------
32 // Set value on device memory
33 //------------------------------------------------------------------------------
34 extern "C" int CeedDeviceSetValue_Cuda(CeedScalar* d_array, CeedInt length,
35                                        CeedScalar val) {
36   const int bsize = 512;
37   const int vecsize = length;
38   int gridsize = vecsize / bsize;
39 
40   if (bsize * gridsize < vecsize)
41     gridsize += 1;
42   setValueK<<<gridsize,bsize>>>(d_array, length, val);
43   return 0;
44 }
45 
46 //------------------------------------------------------------------------------
47 // Kernel for taking reciprocal
48 //------------------------------------------------------------------------------
49 __global__ static void rcpValueK(CeedScalar * __restrict__ vec, CeedInt size) {
50   int idx = threadIdx.x + blockDim.x * blockIdx.x;
51   if (idx >= size)
52     return;
53   if (fabs(vec[idx]) > 1E-16)
54     vec[idx] = 1./vec[idx];
55 }
56 
57 //------------------------------------------------------------------------------
58 // Take vector reciprocal in device memory
59 //------------------------------------------------------------------------------
60 extern "C" int CeedDeviceReciprocal_Cuda(CeedScalar* d_array, CeedInt length) {
61   const int bsize = 512;
62   const int vecsize = length;
63   int gridsize = vecsize / bsize;
64 
65   if (bsize * gridsize < vecsize)
66     gridsize += 1;
67   rcpValueK<<<gridsize,bsize>>>(d_array, length);
68   return 0;
69 }
70 
71 //------------------------------------------------------------------------------
72 // Kernel for scale
73 //------------------------------------------------------------------------------
74 __global__ static void scaleValueK(CeedScalar * __restrict__ x, CeedScalar alpha,
75     CeedInt size) {
76   int idx = threadIdx.x + blockDim.x * blockIdx.x;
77   if (idx >= size)
78     return;
79   x[idx] *= alpha;
80 }
81 
82 //------------------------------------------------------------------------------
83 // Compute x = alpha x on device
84 //------------------------------------------------------------------------------
85 extern "C" int CeedDeviceScale_Cuda(CeedScalar *x_array, CeedScalar alpha,
86     CeedInt length) {
87   const int bsize = 512;
88   const int vecsize = length;
89   int gridsize = vecsize / bsize;
90 
91   if (bsize * gridsize < vecsize)
92     gridsize += 1;
93   scaleValueK<<<gridsize,bsize>>>(x_array, alpha, length);
94   return 0;
95 }
96 
97 //------------------------------------------------------------------------------
98 // Kernel for axpy
99 //------------------------------------------------------------------------------
100 __global__ static void axpyValueK(CeedScalar * __restrict__ y, CeedScalar alpha,
101     CeedScalar * __restrict__ x, CeedInt size) {
102   int idx = threadIdx.x + blockDim.x * blockIdx.x;
103   if (idx >= size)
104     return;
105   y[idx] += alpha * x[idx];
106 }
107 
108 //------------------------------------------------------------------------------
109 // Compute y = alpha x + y on device
110 //------------------------------------------------------------------------------
111 extern "C" int CeedDeviceAXPY_Cuda(CeedScalar *y_array, CeedScalar alpha,
112     CeedScalar *x_array, CeedInt length) {
113   const int bsize = 512;
114   const int vecsize = length;
115   int gridsize = vecsize / bsize;
116 
117   if (bsize * gridsize < vecsize)
118     gridsize += 1;
119   axpyValueK<<<gridsize,bsize>>>(y_array, alpha, x_array, length);
120   return 0;
121 }
122 
123 //------------------------------------------------------------------------------
124 // Kernel for pointwise mult
125 //------------------------------------------------------------------------------
126 __global__ static void pointwiseMultValueK(CeedScalar * __restrict__ w,
127     CeedScalar * x, CeedScalar * __restrict__ y, CeedInt size) {
128   int idx = threadIdx.x + blockDim.x * blockIdx.x;
129   if (idx >= size)
130     return;
131   w[idx] = x[idx] * y[idx];
132 }
133 
134 //------------------------------------------------------------------------------
135 // Compute the pointwise multiplication w = x .* y on device
136 //------------------------------------------------------------------------------
137 extern "C" int CeedDevicePointwiseMult_Cuda(CeedScalar *w_array, CeedScalar *x_array,
138     CeedScalar *y_array, CeedInt length) {
139   const int bsize = 512;
140   const int vecsize = length;
141   int gridsize = vecsize / bsize;
142 
143   if (bsize * gridsize < vecsize)
144     gridsize += 1;
145   pointwiseMultValueK<<<gridsize,bsize>>>(w_array, x_array, y_array, length);
146   return 0;
147 }
148