xref: /libCEED/backends/cuda-ref/kernels/cuda-ref-vector.cu (revision 7d5185d77fd3980a37189da5411f3a999b5628b9)
1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #include <ceed.h>
9 #include <cuda.h>
10 
//------------------------------------------------------------------------------
// Kernel for copy strided on device
//
// One thread per entry; an entry is copied from vec into vec_copy when its
// offset from start is an exact multiple of step.
//------------------------------------------------------------------------------
__global__ static void copyStridedK(CeedScalar *__restrict__ vec, CeedSize start, CeedSize step, CeedSize size, CeedScalar *__restrict__ vec_copy) {
  const CeedSize i = (CeedSize)blockDim.x * blockIdx.x + threadIdx.x;

  if (i < size && (i - start) % step == 0) vec_copy[i] = vec[i];
}
19 
//------------------------------------------------------------------------------
// Copy strided on device memory
//
// Launches copyStridedK over [0, length); entries at start, start + step, ...
// are copied from d_array into d_copy_array. Always returns 0 (launch errors
// surface at the next synchronizing CUDA call).
//------------------------------------------------------------------------------
extern "C" int CeedDeviceCopyStrided_Cuda(CeedScalar *d_array, CeedSize start, CeedSize step, CeedSize length, CeedScalar *d_copy_array) {
  const int block_size = 512;
  // Ceiling division in 64-bit CeedSize arithmetic. The previous
  // `block_size * grid_size < vec_size` round-up check multiplied two ints,
  // which is signed overflow (UB) for lengths >= 2^31; the quotient itself was
  // also truncated through int before the check.
  const int grid_size = (int)((length + block_size - 1) / block_size);

  copyStridedK<<<grid_size, block_size>>>(d_array, start, step, length, d_copy_array);
  return 0;
}
32 
//------------------------------------------------------------------------------
// Kernel for set value on device
//
// One thread per entry; every entry below size is overwritten with val.
//------------------------------------------------------------------------------
__global__ static void setValueK(CeedScalar *__restrict__ vec, CeedSize size, CeedScalar val) {
  const CeedSize i = (CeedSize)blockDim.x * blockIdx.x + threadIdx.x;

  if (i < size) vec[i] = val;
}
41 
//------------------------------------------------------------------------------
// Set value on device memory
//
// Launches setValueK to fill d_array[0..length) with val. Always returns 0
// (launch errors surface at the next synchronizing CUDA call).
//------------------------------------------------------------------------------
extern "C" int CeedDeviceSetValue_Cuda(CeedScalar *d_array, CeedSize length, CeedScalar val) {
  const int block_size = 512;
  // Ceiling division in 64-bit CeedSize arithmetic. The previous
  // `block_size * grid_size < vec_size` round-up check multiplied two ints,
  // which is signed overflow (UB) for lengths >= 2^31.
  const int grid_size = (int)((length + block_size - 1) / block_size);

  setValueK<<<grid_size, block_size>>>(d_array, length, val);
  return 0;
}
54 
//------------------------------------------------------------------------------
// Kernel for set value strided on device
//
// One thread per entry; an entry is set to val when its offset from start is
// an exact multiple of step.
//------------------------------------------------------------------------------
__global__ static void setValueStridedK(CeedScalar *__restrict__ vec, CeedSize start, CeedSize step, CeedSize size, CeedScalar val) {
  const CeedSize i = (CeedSize)blockDim.x * blockIdx.x + threadIdx.x;

  if (i < size && (i - start) % step == 0) vec[i] = val;
}
63 
//------------------------------------------------------------------------------
// Set value strided on device memory
//
// Launches setValueStridedK over [0, length); entries at start, start + step,
// ... are set to val. Always returns 0 (launch errors surface at the next
// synchronizing CUDA call).
//------------------------------------------------------------------------------
extern "C" int CeedDeviceSetValueStrided_Cuda(CeedScalar *d_array, CeedSize start, CeedSize step, CeedSize length, CeedScalar val) {
  const int block_size = 512;
  // Ceiling division in 64-bit CeedSize arithmetic. The previous
  // `block_size * grid_size < vec_size` round-up check multiplied two ints,
  // which is signed overflow (UB) for lengths >= 2^31.
  const int grid_size = (int)((length + block_size - 1) / block_size);

  setValueStridedK<<<grid_size, block_size>>>(d_array, start, step, length, val);
  return 0;
}
76 
//------------------------------------------------------------------------------
// Kernel for taking reciprocal
//
// One thread per entry. Entries with magnitude at or below 1E-16 are left
// unchanged to avoid dividing by (near) zero. NOTE(review): the threshold and
// the `1.` literal are double precision regardless of CeedScalar — preserved
// as-is.
//------------------------------------------------------------------------------
__global__ static void rcpValueK(CeedScalar *__restrict__ vec, CeedSize size) {
  const CeedSize i = (CeedSize)blockDim.x * blockIdx.x + threadIdx.x;

  if (i < size && fabs(vec[i]) > 1E-16) vec[i] = 1. / vec[i];
}
85 
//------------------------------------------------------------------------------
// Take vector reciprocal in device memory
//
// Launches rcpValueK to replace each sufficiently-large entry of d_array with
// its reciprocal. Always returns 0 (launch errors surface at the next
// synchronizing CUDA call).
//------------------------------------------------------------------------------
extern "C" int CeedDeviceReciprocal_Cuda(CeedScalar *d_array, CeedSize length) {
  const int block_size = 512;
  // Ceiling division in 64-bit CeedSize arithmetic. The previous
  // `block_size * grid_size < vec_size` round-up check multiplied two ints,
  // which is signed overflow (UB) for lengths >= 2^31.
  const int grid_size = (int)((length + block_size - 1) / block_size);

  rcpValueK<<<grid_size, block_size>>>(d_array, length);
  return 0;
}
98 
//------------------------------------------------------------------------------
// Kernel for scale
//
// One thread per entry; computes x[i] = alpha * x[i].
//------------------------------------------------------------------------------
__global__ static void scaleValueK(CeedScalar *__restrict__ x, CeedScalar alpha, CeedSize size) {
  const CeedSize i = (CeedSize)blockDim.x * blockIdx.x + threadIdx.x;

  if (i < size) x[i] *= alpha;
}
107 
//------------------------------------------------------------------------------
// Compute x = alpha x on device
//
// Launches scaleValueK to scale x_array in place. Always returns 0 (launch
// errors surface at the next synchronizing CUDA call).
//------------------------------------------------------------------------------
extern "C" int CeedDeviceScale_Cuda(CeedScalar *x_array, CeedScalar alpha, CeedSize length) {
  const int block_size = 512;
  // Ceiling division in 64-bit CeedSize arithmetic. The previous
  // `block_size * grid_size < vec_size` round-up check multiplied two ints,
  // which is signed overflow (UB) for lengths >= 2^31.
  const int grid_size = (int)((length + block_size - 1) / block_size);

  scaleValueK<<<grid_size, block_size>>>(x_array, alpha, length);
  return 0;
}
120 
//------------------------------------------------------------------------------
// Kernel for axpy
//
// One thread per entry; computes y[i] += alpha * x[i].
//------------------------------------------------------------------------------
__global__ static void axpyValueK(CeedScalar *__restrict__ y, CeedScalar alpha, CeedScalar *__restrict__ x, CeedSize size) {
  const CeedSize i = (CeedSize)blockDim.x * blockIdx.x + threadIdx.x;

  if (i < size) y[i] += alpha * x[i];
}
129 
//------------------------------------------------------------------------------
// Compute y = alpha x + y on device
//
// Launches axpyValueK; y_array is updated in place, x_array is read-only.
// Always returns 0 (launch errors surface at the next synchronizing CUDA
// call).
//------------------------------------------------------------------------------
extern "C" int CeedDeviceAXPY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_array, CeedSize length) {
  const int block_size = 512;
  // Ceiling division in 64-bit CeedSize arithmetic. The previous
  // `block_size * grid_size < vec_size` round-up check multiplied two ints,
  // which is signed overflow (UB) for lengths >= 2^31.
  const int grid_size = (int)((length + block_size - 1) / block_size);

  axpyValueK<<<grid_size, block_size>>>(y_array, alpha, x_array, length);
  return 0;
}
142 
//------------------------------------------------------------------------------
// Kernel for axpby
//
// One thread per entry; scales y[i] by beta first, then accumulates
// alpha * x[i] (same two-step arithmetic as the original).
//------------------------------------------------------------------------------
__global__ static void axpbyValueK(CeedScalar *__restrict__ y, CeedScalar alpha, CeedScalar beta, CeedScalar *__restrict__ x, CeedSize size) {
  const CeedSize i = (CeedSize)blockDim.x * blockIdx.x + threadIdx.x;

  if (i >= size) return;
  y[i] = beta * y[i];
  y[i] += alpha * x[i];
}
152 
//------------------------------------------------------------------------------
// Compute y = alpha x + beta y on device
//
// Launches axpbyValueK; y_array is updated in place, x_array is read-only.
// Always returns 0 (launch errors surface at the next synchronizing CUDA
// call).
//------------------------------------------------------------------------------
extern "C" int CeedDeviceAXPBY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar beta, CeedScalar *x_array, CeedSize length) {
  const int block_size = 512;
  // Ceiling division in 64-bit CeedSize arithmetic. The previous
  // `block_size * grid_size < vec_size` round-up check multiplied two ints,
  // which is signed overflow (UB) for lengths >= 2^31.
  const int grid_size = (int)((length + block_size - 1) / block_size);

  axpbyValueK<<<grid_size, block_size>>>(y_array, alpha, beta, x_array, length);
  return 0;
}
165 
//------------------------------------------------------------------------------
// Kernel for pointwise mult
//
// One thread per entry; computes w[i] = x[i] * y[i]. Note x is deliberately
// not __restrict__-qualified in the original signature (w or y may alias x);
// the signature is preserved exactly.
//------------------------------------------------------------------------------
__global__ static void pointwiseMultValueK(CeedScalar *__restrict__ w, CeedScalar *x, CeedScalar *__restrict__ y, CeedSize size) {
  const CeedSize i = (CeedSize)blockDim.x * blockIdx.x + threadIdx.x;

  if (i < size) w[i] = x[i] * y[i];
}
174 
//------------------------------------------------------------------------------
// Compute the pointwise multiplication w = x .* y on device
//
// Launches pointwiseMultValueK; w_array receives the entrywise product of
// x_array and y_array. Always returns 0 (launch errors surface at the next
// synchronizing CUDA call).
//------------------------------------------------------------------------------
extern "C" int CeedDevicePointwiseMult_Cuda(CeedScalar *w_array, CeedScalar *x_array, CeedScalar *y_array, CeedSize length) {
  const int block_size = 512;
  // Ceiling division in 64-bit CeedSize arithmetic. The previous
  // `block_size * grid_size < vec_size` round-up check multiplied two ints,
  // which is signed overflow (UB) for lengths >= 2^31.
  const int grid_size = (int)((length + block_size - 1) / block_size);

  pointwiseMultValueK<<<grid_size, block_size>>>(w_array, x_array, y_array, length);
  return 0;
}
187 
188 //------------------------------------------------------------------------------
189