xref: /libCEED/backends/cuda-ref/kernels/cuda-ref-vector.cu (revision d538d163358b73723887a2d3949507319f119601)
1 // Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #include <ceed.h>
9 #include <cuda.h>
10 
//------------------------------------------------------------------------------
// Kernel for copy strided on device
//
// Copies every step-th entry of vec in [start, stop) into the same positions of
// vec_copy. One thread per candidate index; threads whose offset is not a
// multiple of step do nothing. vec is read-only here, so it is const-qualified
// to let the compiler route loads through the read-only data cache.
//------------------------------------------------------------------------------
__global__ static void copyStridedK(const CeedScalar *__restrict__ vec, CeedSize start, CeedSize stop, CeedSize step, CeedScalar *__restrict__ vec_copy) {
  // Flat global index; cast before the multiply keeps the arithmetic 64-bit
  const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;

  // Guard the grid tail: the last block may extend past the copy range
  if (index < stop - start) {
    if (index % step == 0) vec_copy[start + index] = vec[start + index];
  }
}
21 
//------------------------------------------------------------------------------
// Copy strided on device memory
//
// Launches copyStridedK with one thread per index in [start, stop).
// Returns 0 unconditionally (no CUDA error checking, matching the other
// wrappers in this file).
//------------------------------------------------------------------------------
extern "C" int CeedDeviceCopyStrided_Cuda(CeedScalar *d_array, CeedSize start, CeedSize stop, CeedSize step, CeedScalar *d_copy_array) {
  const int      block_size = 512;
  const CeedSize copy_size  = stop - start;

  // Empty range: nothing to copy, and a zero-block launch would be invalid
  if (copy_size <= 0) return 0;
  // Ceiling division in 64-bit arithmetic; the previous int expression
  // block_size * grid_size could overflow 32-bit int for ranges beyond ~2^31
  const int grid_size = (int)((copy_size + block_size - 1) / block_size);

  copyStridedK<<<grid_size, block_size>>>(d_array, start, stop, step, d_copy_array);
  return 0;
}
34 
//------------------------------------------------------------------------------
// Kernel for set value on device
//
// Writes val into every entry of vec; one thread per entry.
//------------------------------------------------------------------------------
__global__ static void setValueK(CeedScalar *__restrict__ vec, CeedSize size, CeedScalar val) {
  // 64-bit flat global index (cast before the multiply avoids 32-bit overflow)
  const CeedSize i = (CeedSize)blockDim.x * blockIdx.x + threadIdx.x;

  // Grid tail guard: the last block may run past the end of the vector
  if (i >= size) return;
  vec[i] = val;
}
43 
//------------------------------------------------------------------------------
// Set value on device memory
//
// Launches setValueK with one thread per entry of d_array.
// Returns 0 unconditionally (no CUDA error checking, matching the other
// wrappers in this file).
//------------------------------------------------------------------------------
extern "C" int CeedDeviceSetValue_Cuda(CeedScalar *d_array, CeedSize length, CeedScalar val) {
  const int      block_size = 512;
  const CeedSize vec_size   = length;

  // Empty vector: nothing to set, and a zero-block launch would be invalid
  if (vec_size <= 0) return 0;
  // Ceiling division in 64-bit arithmetic; the previous int expression
  // block_size * grid_size could overflow 32-bit int for lengths beyond ~2^31
  const int grid_size = (int)((vec_size + block_size - 1) / block_size);

  setValueK<<<grid_size, block_size>>>(d_array, length, val);
  return 0;
}
56 
//------------------------------------------------------------------------------
// Kernel for set value strided on device
//
// Writes val into every step-th entry of vec in [start, stop).
// One thread per candidate offset; offsets not on the stride do nothing.
//------------------------------------------------------------------------------
__global__ static void setValueStridedK(CeedScalar *__restrict__ vec, CeedSize start, CeedSize stop, CeedSize step, CeedScalar val) {
  // 64-bit flat global offset into the [start, stop) range
  const CeedSize offset = (CeedSize)blockDim.x * blockIdx.x + threadIdx.x;

  // Grid tail guard, then stride filter
  if (offset >= stop - start) return;
  if (offset % step != 0) return;
  vec[start + offset] = val;
}
67 
//------------------------------------------------------------------------------
// Set value strided on device memory
//
// Launches setValueStridedK with one thread per index in [start, stop).
// Returns 0 unconditionally (no CUDA error checking, matching the other
// wrappers in this file).
//------------------------------------------------------------------------------
extern "C" int CeedDeviceSetValueStrided_Cuda(CeedScalar *d_array, CeedSize start, CeedSize stop, CeedSize step, CeedScalar val) {
  const int      block_size = 512;
  const CeedSize set_size   = stop - start;

  // Empty range: nothing to set, and a zero-block launch would be invalid
  if (set_size <= 0) return 0;
  // Ceiling division in 64-bit arithmetic; the previous int expression
  // block_size * grid_size could overflow 32-bit int for ranges beyond ~2^31
  const int grid_size = (int)((set_size + block_size - 1) / block_size);

  setValueStridedK<<<grid_size, block_size>>>(d_array, start, stop, step, val);
  return 0;
}
80 
//------------------------------------------------------------------------------
// Kernel for taking reciprocal
//
// Replaces each entry of vec by its reciprocal, in place; one thread per
// entry. Entries with magnitude at or below 1E-16 are left untouched so the
// kernel never divides by (near) zero.
//------------------------------------------------------------------------------
__global__ static void rcpValueK(CeedScalar *__restrict__ vec, CeedSize size) {
  // 64-bit flat global index (cast before the multiply avoids 32-bit overflow)
  const CeedSize i = (CeedSize)blockDim.x * blockIdx.x + threadIdx.x;

  // Single fused guard: in bounds and safely away from zero
  if (i < size && fabs(vec[i]) > 1E-16) vec[i] = 1. / vec[i];
}
91 
//------------------------------------------------------------------------------
// Take vector reciprocal in device memory
//
// Launches rcpValueK with one thread per entry of d_array.
// Returns 0 unconditionally (no CUDA error checking, matching the other
// wrappers in this file).
//------------------------------------------------------------------------------
extern "C" int CeedDeviceReciprocal_Cuda(CeedScalar *d_array, CeedSize length) {
  const int      block_size = 512;
  const CeedSize vec_size   = length;

  // Empty vector: nothing to invert, and a zero-block launch would be invalid
  if (vec_size <= 0) return 0;
  // Ceiling division in 64-bit arithmetic; the previous int expression
  // block_size * grid_size could overflow 32-bit int for lengths beyond ~2^31
  const int grid_size = (int)((vec_size + block_size - 1) / block_size);

  rcpValueK<<<grid_size, block_size>>>(d_array, length);
  return 0;
}
104 
//------------------------------------------------------------------------------
// Kernel for scale
//
// Scales every entry of x by alpha, in place; one thread per entry.
//------------------------------------------------------------------------------
__global__ static void scaleValueK(CeedScalar *__restrict__ x, CeedScalar alpha, CeedSize size) {
  // 64-bit flat global index (cast before the multiply avoids 32-bit overflow)
  const CeedSize i = (CeedSize)blockDim.x * blockIdx.x + threadIdx.x;

  // Grid tail guard: the last block may run past the end of the vector
  if (i >= size) return;
  x[i] *= alpha;
}
113 
//------------------------------------------------------------------------------
// Compute x = alpha x on device
//
// Launches scaleValueK with one thread per entry of x_array.
// Returns 0 unconditionally (no CUDA error checking, matching the other
// wrappers in this file).
//------------------------------------------------------------------------------
extern "C" int CeedDeviceScale_Cuda(CeedScalar *x_array, CeedScalar alpha, CeedSize length) {
  const int      block_size = 512;
  const CeedSize vec_size   = length;

  // Empty vector: nothing to scale, and a zero-block launch would be invalid
  if (vec_size <= 0) return 0;
  // Ceiling division in 64-bit arithmetic; the previous int expression
  // block_size * grid_size could overflow 32-bit int for lengths beyond ~2^31
  const int grid_size = (int)((vec_size + block_size - 1) / block_size);

  scaleValueK<<<grid_size, block_size>>>(x_array, alpha, length);
  return 0;
}
126 
//------------------------------------------------------------------------------
// Kernel for axpy
//
// Computes y[i] += alpha * x[i]; one thread per entry. x is read-only here,
// so it is const-qualified to let the compiler route loads through the
// read-only data cache.
//------------------------------------------------------------------------------
__global__ static void axpyValueK(CeedScalar *__restrict__ y, CeedScalar alpha, const CeedScalar *__restrict__ x, CeedSize size) {
  // Flat global index; cast before the multiply keeps the arithmetic 64-bit
  const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;

  // Guard the grid tail: the last block may extend past the vector end
  if (index < size) y[index] += alpha * x[index];
}
135 
//------------------------------------------------------------------------------
// Compute y = alpha x + y on device
//
// Launches axpyValueK with one thread per entry.
// Returns 0 unconditionally (no CUDA error checking, matching the other
// wrappers in this file).
//------------------------------------------------------------------------------
extern "C" int CeedDeviceAXPY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_array, CeedSize length) {
  const int      block_size = 512;
  const CeedSize vec_size   = length;

  // Empty vector: nothing to update, and a zero-block launch would be invalid
  if (vec_size <= 0) return 0;
  // Ceiling division in 64-bit arithmetic; the previous int expression
  // block_size * grid_size could overflow 32-bit int for lengths beyond ~2^31
  const int grid_size = (int)((vec_size + block_size - 1) / block_size);

  axpyValueK<<<grid_size, block_size>>>(y_array, alpha, x_array, length);
  return 0;
}
148 
//------------------------------------------------------------------------------
// Kernel for axpby
//
// Computes y[i] = beta * y[i] + alpha * x[i]; one thread per entry. The scale
// of y and the accumulation are kept as two statements to preserve the
// original rounding order. x is read-only here, so it is const-qualified to
// let the compiler route loads through the read-only data cache.
//------------------------------------------------------------------------------
__global__ static void axpbyValueK(CeedScalar *__restrict__ y, CeedScalar alpha, CeedScalar beta, const CeedScalar *__restrict__ x, CeedSize size) {
  // Flat global index; cast before the multiply keeps the arithmetic 64-bit
  const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;

  // Guard the grid tail: the last block may extend past the vector end
  if (index < size) {
    y[index] = beta * y[index];
    y[index] += alpha * x[index];
  }
}
160 
//------------------------------------------------------------------------------
// Compute y = alpha x + beta y on device
//
// Launches axpbyValueK with one thread per entry.
// Returns 0 unconditionally (no CUDA error checking, matching the other
// wrappers in this file).
//------------------------------------------------------------------------------
extern "C" int CeedDeviceAXPBY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar beta, CeedScalar *x_array, CeedSize length) {
  const int      block_size = 512;
  const CeedSize vec_size   = length;

  // Empty vector: nothing to update, and a zero-block launch would be invalid
  if (vec_size <= 0) return 0;
  // Ceiling division in 64-bit arithmetic; the previous int expression
  // block_size * grid_size could overflow 32-bit int for lengths beyond ~2^31
  const int grid_size = (int)((vec_size + block_size - 1) / block_size);

  axpbyValueK<<<grid_size, block_size>>>(y_array, alpha, beta, x_array, length);
  return 0;
}
173 
//------------------------------------------------------------------------------
// Kernel for pointwise mult
//
// Computes w[i] = x[i] * y[i]; one thread per entry.
// NOTE(review): unlike its siblings, x carries no __restrict__ — presumably a
// deliberate allowance for calls where x aliases w (or y); confirm against the
// caller's aliasing contract before qualifying it.
//------------------------------------------------------------------------------
__global__ static void pointwiseMultValueK(CeedScalar *__restrict__ w, CeedScalar *x, CeedScalar *__restrict__ y, CeedSize size) {
  // Flat global index; the cast keeps blockDim.x * blockIdx.x in 64 bits
  const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;

  // Guard the grid tail: the last block may extend past the vector end
  if (index < size) w[index] = x[index] * y[index];
}
182 
//------------------------------------------------------------------------------
// Compute the pointwise multiplication w = x .* y on device
//
// Launches pointwiseMultValueK with one thread per entry.
// Returns 0 unconditionally (no CUDA error checking, matching the other
// wrappers in this file).
//------------------------------------------------------------------------------
extern "C" int CeedDevicePointwiseMult_Cuda(CeedScalar *w_array, CeedScalar *x_array, CeedScalar *y_array, CeedSize length) {
  const int      block_size = 512;
  const CeedSize vec_size   = length;

  // Empty vector: nothing to compute, and a zero-block launch would be invalid
  if (vec_size <= 0) return 0;
  // Ceiling division in 64-bit arithmetic; the previous int expression
  // block_size * grid_size could overflow 32-bit int for lengths beyond ~2^31
  const int grid_size = (int)((vec_size + block_size - 1) / block_size);

  pointwiseMultValueK<<<grid_size, block_size>>>(w_array, x_array, y_array, length);
  return 0;
}
195 
196 //------------------------------------------------------------------------------
197