xref: /libCEED/backends/cuda-ref/kernels/cuda-ref-vector.cu (revision 1411c2625e7443874400b7f5ba33523876332c8a)
1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #include <ceed.h>
9 #include <cuda.h>
10 
11 //------------------------------------------------------------------------------
12 // Kernel for set value on device
13 //------------------------------------------------------------------------------
// Fill every entry of vec with val.
// Expects a 1D grid of 1D blocks with one thread per entry; trailing threads
// past `size` do nothing.
__global__ static void setValueK(CeedScalar * __restrict__ vec, CeedSize size,
                                 CeedScalar val) {
  const CeedSize i = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;

  if (i < size) vec[i] = val;
}
21 
22 //------------------------------------------------------------------------------
23 // Set value on device memory
24 //------------------------------------------------------------------------------
// Set every entry of the device array d_array (of CeedSize length) to val.
// Returns 0 (CEED success); the kernel launch is asynchronous and unchecked,
// matching the other wrappers in this file.
extern "C" int CeedDeviceSetValue_Cuda(CeedScalar* d_array, CeedSize length,
                                       CeedScalar val) {
  // Empty vector: nothing to do, and a zero-block launch would be an
  // invalid-configuration error
  if (length <= 0) return 0;

  const CeedSize block_size = 512;
  // Ceiling division performed entirely in CeedSize; the previous int-based
  // computation truncated the grid size and overflowed block_size * grid_size
  // for lengths beyond ~2^31
  const CeedSize grid_size = (length + block_size - 1) / block_size;

  setValueK<<<grid_size, block_size>>>(d_array, length, val);
  return 0;
}
36 
37 //------------------------------------------------------------------------------
38 // Kernel for taking reciprocal
39 //------------------------------------------------------------------------------
// Replace each entry of vec by its reciprocal; entries with magnitude at or
// below 1E-16 are left untouched to avoid dividing by (near) zero.
// One thread per entry, 1D grid/block layout.
__global__ static void rcpValueK(CeedScalar * __restrict__ vec, CeedSize size) {
  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
  if (index >= size)
    return;
  if (fabs(vec[index]) > 1E-16)
    // Cast keeps the divide in CeedScalar precision; the bare literal `1.`
    // forced a double-precision divide when CeedScalar is single precision
    vec[index] = (CeedScalar)1.0 / vec[index];
}
47 
48 //------------------------------------------------------------------------------
49 // Take vector reciprocal in device memory
50 //------------------------------------------------------------------------------
// Take the pointwise reciprocal of the device array d_array in place,
// skipping near-zero entries (see rcpValueK). Returns 0 (CEED success);
// the launch is asynchronous and unchecked, matching the other wrappers.
extern "C" int CeedDeviceReciprocal_Cuda(CeedScalar* d_array, CeedSize length) {
  // Empty vector: nothing to do, and a zero-block launch would be an
  // invalid-configuration error
  if (length <= 0) return 0;

  const CeedSize block_size = 512;
  // Ceiling division in CeedSize avoids the int truncation/overflow of the
  // previous grid-size computation for lengths beyond ~2^31
  const CeedSize grid_size = (length + block_size - 1) / block_size;

  rcpValueK<<<grid_size, block_size>>>(d_array, length);
  return 0;
}
61 
62 //------------------------------------------------------------------------------
63 // Kernel for scale
64 //------------------------------------------------------------------------------
// Scale every entry of x by alpha in place.
// Expects a 1D grid of 1D blocks with one thread per entry; trailing threads
// past `size` do nothing.
__global__ static void scaleValueK(CeedScalar * __restrict__ x, CeedScalar alpha,
    CeedSize size) {
  const CeedSize i = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;

  if (i < size) x[i] *= alpha;
}
72 
73 //------------------------------------------------------------------------------
74 // Compute x = alpha x on device
75 //------------------------------------------------------------------------------
// Compute x = alpha x in place on the device array x_array.
// Returns 0 (CEED success); the launch is asynchronous and unchecked,
// matching the other wrappers in this file.
extern "C" int CeedDeviceScale_Cuda(CeedScalar *x_array, CeedScalar alpha,
    CeedSize length) {
  // Empty vector: nothing to do, and a zero-block launch would be an
  // invalid-configuration error
  if (length <= 0) return 0;

  const CeedSize block_size = 512;
  // Ceiling division in CeedSize avoids the int truncation/overflow of the
  // previous grid-size computation for lengths beyond ~2^31
  const CeedSize grid_size = (length + block_size - 1) / block_size;

  scaleValueK<<<grid_size, block_size>>>(x_array, alpha, length);
  return 0;
}
87 
88 //------------------------------------------------------------------------------
89 // Kernel for axpy
90 //------------------------------------------------------------------------------
// Compute y[i] += alpha * x[i] for each entry.
// x is never written, so it is const-qualified: combined with __restrict__
// this lets the compiler route its loads through the read-only data cache.
// One thread per entry, 1D grid/block layout.
__global__ static void axpyValueK(CeedScalar * __restrict__ y, CeedScalar alpha,
    const CeedScalar * __restrict__ x, CeedSize size) {
  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
  if (index >= size)
    return;
  y[index] += alpha * x[index];
}
98 
99 //------------------------------------------------------------------------------
100 // Compute y = alpha x + y on device
101 //------------------------------------------------------------------------------
// Compute y = alpha x + y on the device arrays y_array/x_array.
// Returns 0 (CEED success); the launch is asynchronous and unchecked,
// matching the other wrappers in this file.
extern "C" int CeedDeviceAXPY_Cuda(CeedScalar *y_array, CeedScalar alpha,
    CeedScalar *x_array, CeedSize length) {
  // Empty vector: nothing to do, and a zero-block launch would be an
  // invalid-configuration error
  if (length <= 0) return 0;

  const CeedSize block_size = 512;
  // Ceiling division in CeedSize avoids the int truncation/overflow of the
  // previous grid-size computation for lengths beyond ~2^31
  const CeedSize grid_size = (length + block_size - 1) / block_size;

  axpyValueK<<<grid_size, block_size>>>(y_array, alpha, x_array, length);
  return 0;
}
113 
114 //------------------------------------------------------------------------------
115 // Kernel for axpby
116 //------------------------------------------------------------------------------
// Compute y[i] = alpha * x[i] + beta * y[i] for each entry.
// x is never written, so it is const-qualified: combined with __restrict__
// this lets the compiler route its loads through the read-only data cache.
// One thread per entry, 1D grid/block layout.
__global__ static void axpbyValueK(CeedScalar * __restrict__ y, CeedScalar alpha, CeedScalar beta,
    const CeedScalar * __restrict__ x, CeedSize size) {
  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
  if (index >= size)
    return;
  // Scale y first, then accumulate, preserving the original evaluation order
  y[index] = beta * y[index];
  y[index] += alpha * x[index];
}
125 
126 //------------------------------------------------------------------------------
127 // Compute y = alpha x + beta y on device
128 //------------------------------------------------------------------------------
// Compute y = alpha x + beta y on the device arrays y_array/x_array.
// Returns 0 (CEED success); the launch is asynchronous and unchecked,
// matching the other wrappers in this file.
extern "C" int CeedDeviceAXPBY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar beta,
    CeedScalar *x_array, CeedSize length) {
  // Empty vector: nothing to do, and a zero-block launch would be an
  // invalid-configuration error
  if (length <= 0) return 0;

  const CeedSize block_size = 512;
  // Ceiling division in CeedSize avoids the int truncation/overflow of the
  // previous grid-size computation for lengths beyond ~2^31
  const CeedSize grid_size = (length + block_size - 1) / block_size;

  axpbyValueK<<<grid_size, block_size>>>(y_array, alpha, beta, x_array, length);
  return 0;
}
140 
141 //------------------------------------------------------------------------------
142 // Kernel for pointwise mult
143 //------------------------------------------------------------------------------
// Compute w[i] = x[i] * y[i] for each entry.
// NOTE(review): x deliberately carries no __restrict__ — presumably because
// callers may pass w aliasing x; confirm against the CeedVector API before
// adding the qualifier. One thread per entry, 1D grid/block layout.
__global__ static void pointwiseMultValueK(CeedScalar * __restrict__ w,
    CeedScalar * x, CeedScalar * __restrict__ y, CeedSize size) {
  const CeedSize i = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;

  if (i < size) w[i] = x[i] * y[i];
}
151 
152 //------------------------------------------------------------------------------
153 // Compute the pointwise multiplication w = x .* y on device
154 //------------------------------------------------------------------------------
// Compute the pointwise multiplication w = x .* y on the device arrays.
// Returns 0 (CEED success); the launch is asynchronous and unchecked,
// matching the other wrappers in this file.
extern "C" int CeedDevicePointwiseMult_Cuda(CeedScalar *w_array, CeedScalar *x_array,
    CeedScalar *y_array, CeedSize length) {
  // Empty vector: nothing to do, and a zero-block launch would be an
  // invalid-configuration error
  if (length <= 0) return 0;

  const CeedSize block_size = 512;
  // Ceiling division in CeedSize avoids the int truncation/overflow of the
  // previous grid-size computation for lengths beyond ~2^31
  const CeedSize grid_size = (length + block_size - 1) / block_size;

  pointwiseMultValueK<<<grid_size, block_size>>>(w_array, x_array, y_array, length);
  return 0;
}
166 
167 //------------------------------------------------------------------------------
168