xref: /libCEED/backends/cuda-ref/kernels/cuda-ref-vector.cu (revision f1c2287b836e28c06f03661cee66832fa5f0f99d)
1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #include <ceed.h>
9 #include <cuda.h>
10 
11 //------------------------------------------------------------------------------
12 // Kernel for copy strided on device
13 //------------------------------------------------------------------------------
// Copy a strided subset of `vec` into `vec_copy`: for every index i with
// start <= i < size and (i - start) % step == 0, vec_copy[i] = vec[i].
// Expects a 1-D grid covering at least `size` threads (tail is guarded).
__global__ static void copyStridedK(const CeedScalar * __restrict__ vec, CeedSize start, CeedSize step,
                                    CeedSize size, CeedScalar * __restrict__ vec_copy) {
  // 64-bit index: blockDim.x * blockIdx.x can exceed 32 bits on large grids
  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
  if (index >= size) return;
  // Guard index < start: without it, an index below start whose offset is a
  // negative multiple of step (e.g. start - step when start >= step) also
  // satisfies the modulo test and would be copied erroneously
  if (index < start) return;
  if ((index - start) % step == 0) vec_copy[index] = vec[index];
}
20 
21 //------------------------------------------------------------------------------
22 // Copy strided on device memory
23 //------------------------------------------------------------------------------
// Launch the strided copy over d_array[0..length) into d_copy_array.
// Returns 0 (CEED_ERROR_SUCCESS); launch errors surface at the next sync.
extern "C" int CeedDeviceCopyStrided_Cuda(CeedScalar* d_array, CeedSize start, CeedSize step,
                                          CeedSize length, CeedScalar* d_copy_array) {
  const int block_size = 512;

  // Nothing to copy; also avoids an invalid 0-block kernel launch
  if (length < 1) return 0;
  // Ceiling division in 64-bit arithmetic: the previous `int` version
  // truncated length / block_size and could overflow in
  // block_size * grid_size for large vectors (signed-overflow UB).
  // NOTE(review): lengths beyond block_size * (2^31 - 1) would still need a
  // grid-stride kernel — confirm the supported maximum vector size.
  const CeedSize grid_size = (length + block_size - 1) / block_size;

  copyStridedK<<<(int)grid_size, block_size>>>(d_array, start, step, length, d_copy_array);
  return 0;
}
34 
35 //------------------------------------------------------------------------------
36 // Kernel for set value on device
37 //------------------------------------------------------------------------------
// Fill every element of `vec` below `size` with the constant `val`.
// One element per thread; expects a 1-D grid covering at least `size` threads.
__global__ static void setValueK(CeedScalar * __restrict__ vec, CeedSize size,
                                 CeedScalar val) {
  // 64-bit index so blockDim.x * blockIdx.x cannot wrap on large grids
  const CeedSize i = (CeedSize)blockDim.x * blockIdx.x + threadIdx.x;

  // Tail guard: the grid may supply more threads than elements
  if (i < size) vec[i] = val;
}
44 
45 //------------------------------------------------------------------------------
46 // Set value on device memory
47 //------------------------------------------------------------------------------
// Launch the fill kernel writing `val` to d_array[0..length).
// Returns 0 (CEED_ERROR_SUCCESS); launch errors surface at the next sync.
extern "C" int CeedDeviceSetValue_Cuda(CeedScalar* d_array, CeedSize length,
                                       CeedScalar val) {
  const int block_size = 512;

  // Nothing to set; also avoids an invalid 0-block kernel launch
  if (length < 1) return 0;
  // Ceiling division in 64-bit arithmetic: the previous `int` version
  // truncated length / block_size and could overflow in
  // block_size * grid_size for large vectors (signed-overflow UB)
  const CeedSize grid_size = (length + block_size - 1) / block_size;

  setValueK<<<(int)grid_size, block_size>>>(d_array, length, val);
  return 0;
}
58 
59 //------------------------------------------------------------------------------
60 // Kernel for set value strided on device
61 //------------------------------------------------------------------------------
// Set a strided subset of `vec` to `val`: every index i with
// start <= i < size and (i - start) % step == 0 is written.
// Expects a 1-D grid covering at least `size` threads (tail is guarded).
__global__ static void setValueStridedK(CeedScalar * __restrict__ vec, CeedSize start, CeedSize step,
                                        CeedSize size, CeedScalar val) {
  // 64-bit index: blockDim.x * blockIdx.x can exceed 32 bits on large grids
  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
  if (index >= size) return;
  // Guard index < start: without it, an index below start whose offset is a
  // negative multiple of step (e.g. start - step when start >= step) also
  // satisfies the modulo test and would be written erroneously
  if (index < start) return;
  if ((index - start) % step == 0) vec[index] = val;
}
68 
69 //------------------------------------------------------------------------------
70 // Set value strided on device memory
71 //------------------------------------------------------------------------------
// Launch the strided fill writing `val` into d_array at start, start+step, ...
// Returns 0 (CEED_ERROR_SUCCESS); launch errors surface at the next sync.
extern "C" int CeedDeviceSetValueStrided_Cuda(CeedScalar* d_array, CeedSize start, CeedSize step,
                                              CeedSize length, CeedScalar val) {
  const int block_size = 512;

  // Nothing to set; also avoids an invalid 0-block kernel launch
  if (length < 1) return 0;
  // Ceiling division in 64-bit arithmetic: the previous `int` version
  // truncated length / block_size and could overflow in
  // block_size * grid_size for large vectors (signed-overflow UB)
  const CeedSize grid_size = (length + block_size - 1) / block_size;

  setValueStridedK<<<(int)grid_size, block_size>>>(d_array, start, step, length, val);
  return 0;
}
82 
83 //------------------------------------------------------------------------------
84 // Kernel for taking reciprocal
85 //------------------------------------------------------------------------------
// Replace each element of `vec` by its reciprocal, leaving (near-)zero
// entries untouched to avoid division by zero.
// One element per thread; expects a 1-D grid covering at least `size` threads.
__global__ static void rcpValueK(CeedScalar * __restrict__ vec, CeedSize size) {
  const CeedSize i = (CeedSize)blockDim.x * blockIdx.x + threadIdx.x;

  if (i < size) {
    const CeedScalar v = vec[i];
    // NOTE(review): the 1E-16 cutoff presumes double-precision CeedScalar;
    // confirm it is still meaningful for single-precision builds
    if (fabs(v) > 1E-16) vec[i] = 1./v;
  }
}
91 
92 //------------------------------------------------------------------------------
93 // Take vector reciprocal in device memory
94 //------------------------------------------------------------------------------
// Launch the reciprocal kernel over d_array[0..length).
// Returns 0 (CEED_ERROR_SUCCESS); launch errors surface at the next sync.
extern "C" int CeedDeviceReciprocal_Cuda(CeedScalar* d_array, CeedSize length) {
  const int block_size = 512;

  // Nothing to do; also avoids an invalid 0-block kernel launch
  if (length < 1) return 0;
  // Ceiling division in 64-bit arithmetic: the previous `int` version
  // truncated length / block_size and could overflow in
  // block_size * grid_size for large vectors (signed-overflow UB)
  const CeedSize grid_size = (length + block_size - 1) / block_size;

  rcpValueK<<<(int)grid_size, block_size>>>(d_array, length);
  return 0;
}
104 
105 //------------------------------------------------------------------------------
106 // Kernel for scale
107 //------------------------------------------------------------------------------
// Scale each element of `x` by `alpha` in place.
// One element per thread; expects a 1-D grid covering at least `size` threads.
__global__ static void scaleValueK(CeedScalar * __restrict__ x, CeedScalar alpha,
    CeedSize size) {
  const CeedSize i = (CeedSize)blockDim.x * blockIdx.x + threadIdx.x;

  // Tail guard: the grid may supply more threads than elements
  if (i < size) x[i] *= alpha;
}
114 
115 //------------------------------------------------------------------------------
116 // Compute x = alpha x on device
117 //------------------------------------------------------------------------------
// Launch the scale kernel computing x = alpha * x over x_array[0..length).
// Returns 0 (CEED_ERROR_SUCCESS); launch errors surface at the next sync.
extern "C" int CeedDeviceScale_Cuda(CeedScalar *x_array, CeedScalar alpha,
    CeedSize length) {
  const int block_size = 512;

  // Nothing to scale; also avoids an invalid 0-block kernel launch
  if (length < 1) return 0;
  // Ceiling division in 64-bit arithmetic: the previous `int` version
  // truncated length / block_size and could overflow in
  // block_size * grid_size for large vectors (signed-overflow UB)
  const CeedSize grid_size = (length + block_size - 1) / block_size;

  scaleValueK<<<(int)grid_size, block_size>>>(x_array, alpha, length);
  return 0;
}
128 
129 //------------------------------------------------------------------------------
130 // Kernel for axpy
131 //------------------------------------------------------------------------------
// Compute y = alpha * x + y elementwise.
// One element per thread; expects a 1-D grid covering at least `size` threads.
__global__ static void axpyValueK(CeedScalar * __restrict__ y, CeedScalar alpha,
    CeedScalar * __restrict__ x, CeedSize size) {
  const CeedSize i = (CeedSize)blockDim.x * blockIdx.x + threadIdx.x;

  // Tail guard: the grid may supply more threads than elements
  if (i < size) y[i] += alpha * x[i];
}
138 
139 //------------------------------------------------------------------------------
140 // Compute y = alpha x + y on device
141 //------------------------------------------------------------------------------
// Launch the AXPY kernel computing y = alpha * x + y over [0..length).
// Returns 0 (CEED_ERROR_SUCCESS); launch errors surface at the next sync.
extern "C" int CeedDeviceAXPY_Cuda(CeedScalar *y_array, CeedScalar alpha,
    CeedScalar *x_array, CeedSize length) {
  const int block_size = 512;

  // Nothing to do; also avoids an invalid 0-block kernel launch
  if (length < 1) return 0;
  // Ceiling division in 64-bit arithmetic: the previous `int` version
  // truncated length / block_size and could overflow in
  // block_size * grid_size for large vectors (signed-overflow UB)
  const CeedSize grid_size = (length + block_size - 1) / block_size;

  axpyValueK<<<(int)grid_size, block_size>>>(y_array, alpha, x_array, length);
  return 0;
}
152 
153 //------------------------------------------------------------------------------
154 // Kernel for axpby
155 //------------------------------------------------------------------------------
// Compute y = alpha * x + beta * y elementwise.
// One element per thread; expects a 1-D grid covering at least `size` threads.
__global__ static void axpbyValueK(CeedScalar * __restrict__ y, CeedScalar alpha, CeedScalar beta,
    CeedScalar * __restrict__ x, CeedSize size) {
  const CeedSize i = (CeedSize)blockDim.x * blockIdx.x + threadIdx.x;

  if (i < size) {
    // Scale y first, then accumulate alpha * x, matching the original
    // two-step evaluation order
    const CeedScalar scaled_y = beta * y[i];
    y[i] = scaled_y + alpha * x[i];
  }
}
163 
164 //------------------------------------------------------------------------------
165 // Compute y = alpha x + beta y on device
166 //------------------------------------------------------------------------------
// Launch the AXPBY kernel computing y = alpha * x + beta * y over [0..length).
// Returns 0 (CEED_ERROR_SUCCESS); launch errors surface at the next sync.
extern "C" int CeedDeviceAXPBY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar beta,
    CeedScalar *x_array, CeedSize length) {
  const int block_size = 512;

  // Nothing to do; also avoids an invalid 0-block kernel launch
  if (length < 1) return 0;
  // Ceiling division in 64-bit arithmetic: the previous `int` version
  // truncated length / block_size and could overflow in
  // block_size * grid_size for large vectors (signed-overflow UB)
  const CeedSize grid_size = (length + block_size - 1) / block_size;

  axpbyValueK<<<(int)grid_size, block_size>>>(y_array, alpha, beta, x_array, length);
  return 0;
}
177 
178 //------------------------------------------------------------------------------
179 // Kernel for pointwise mult
180 //------------------------------------------------------------------------------
// Compute the pointwise product w = x .* y elementwise.
// One element per thread; expects a 1-D grid covering at least `size` threads.
// NOTE(review): `x` lacks __restrict__ — presumably so `w` may alias `x`
// for in-place products; confirm before adding the qualifier.
__global__ static void pointwiseMultValueK(CeedScalar * __restrict__ w,
    CeedScalar * x, CeedScalar * __restrict__ y, CeedSize size) {
  const CeedSize i = (CeedSize)blockDim.x * blockIdx.x + threadIdx.x;

  // Tail guard: the grid may supply more threads than elements
  if (i < size) w[i] = x[i] * y[i];
}
187 
188 //------------------------------------------------------------------------------
189 // Compute the pointwise multiplication w = x .* y on device
190 //------------------------------------------------------------------------------
// Launch the pointwise multiplication kernel w = x .* y over [0..length).
// Returns 0 (CEED_ERROR_SUCCESS); launch errors surface at the next sync.
extern "C" int CeedDevicePointwiseMult_Cuda(CeedScalar *w_array, CeedScalar *x_array,
    CeedScalar *y_array, CeedSize length) {
  const int block_size = 512;

  // Nothing to do; also avoids an invalid 0-block kernel launch
  if (length < 1) return 0;
  // Ceiling division in 64-bit arithmetic: the previous `int` version
  // truncated length / block_size and could overflow in
  // block_size * grid_size for large vectors (signed-overflow UB)
  const CeedSize grid_size = (length + block_size - 1) / block_size;

  pointwiseMultValueK<<<(int)grid_size, block_size>>>(w_array, x_array, y_array, length);
  return 0;
}
201 
202 //------------------------------------------------------------------------------
203