xref: /libCEED/backends/hip-ref/kernels/hip-ref-vector.hip.cpp (revision 9f99de848d0840355ad9548f806c3b5e4a3b9722)
1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #include <ceed.h>
9 #include <hip/hip_runtime.h>
10 
11 //------------------------------------------------------------------------------
12 // Kernel for copy strided on device
13 //------------------------------------------------------------------------------
// Copies every step-th element of vec, starting at index start, into vec_copy.
// One thread per vector entry; grid must cover at least size threads.
__global__ static void copyStridedK(CeedScalar *__restrict__ vec, CeedSize start, CeedSize step, CeedSize size, CeedScalar *__restrict__ vec_copy) {
  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;

  // Guard against indices below start: C++ remainder truncates toward zero, so
  // without this check (index - start) % step == 0 can spuriously match
  // indices before start whenever start >= step (e.g. start=5, step=3, index=2)
  if (index >= size || index < start) return;
  if ((index - start) % step == 0) vec_copy[index] = vec[index];
}
20 
21 //------------------------------------------------------------------------------
22 // Copy strided on device memory
23 //------------------------------------------------------------------------------
// Launch helper: strided copy of d_array into d_copy_array on the device.
// Returns 0 (launch errors are not checked here, matching sibling wrappers).
extern "C" int CeedDeviceCopyStrided_Hip(CeedScalar *d_array, CeedSize start, CeedSize step, CeedSize length, CeedScalar *d_copy_array) {
  const int block_size = 512;

  // Empty vector: nothing to copy; also avoids launching a zero-sized grid,
  // which the runtime rejects as an invalid configuration
  if (length < 1) return 0;
  // Ceiling division in 64-bit CeedSize arithmetic; the previous int-typed
  // `vec_size / block_size` and `block_size * grid_size` could truncate or
  // overflow once length approaches 2^31
  const CeedSize grid_size = (length + block_size - 1) / block_size;

  // NOTE(review): grid_size is narrowed to dim3's 32-bit x dimension; assumes
  // length/512 fits within the device grid limit -- confirm for huge vectors
  hipLaunchKernelGGL(copyStridedK, dim3((unsigned int)grid_size), dim3(block_size), 0, 0, d_array, start, step, length, d_copy_array);
  return 0;
}
33 
34 //------------------------------------------------------------------------------
35 // Kernel for set value on device
36 //------------------------------------------------------------------------------
// Fills every entry of vec with val. One thread per entry; the grid is
// expected to cover at least size threads.
__global__ static void setValueK(CeedScalar *__restrict__ vec, CeedSize size, CeedScalar val) {
  // Flat 1-D global index, accumulated in 64 bits so large grids do not overflow
  const CeedSize i = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;

  // Tail guard: trailing threads past the end of the vector do nothing
  if (i < size) vec[i] = val;
}
43 
44 //------------------------------------------------------------------------------
45 // Set value on device memory
46 //------------------------------------------------------------------------------
// Launch helper: set every entry of d_array to val on the device.
// Returns 0 (launch errors are not checked here, matching sibling wrappers).
extern "C" int CeedDeviceSetValue_Hip(CeedScalar *d_array, CeedSize length, CeedScalar val) {
  const int block_size = 512;

  // Empty vector: nothing to set; also avoids launching a zero-sized grid,
  // which the runtime rejects as an invalid configuration
  if (length < 1) return 0;
  // Ceiling division in 64-bit CeedSize arithmetic; the previous int-typed
  // `vec_size / block_size` and `block_size * grid_size` could truncate or
  // overflow once length approaches 2^31
  const CeedSize grid_size = (length + block_size - 1) / block_size;

  // NOTE(review): grid_size is narrowed to dim3's 32-bit x dimension; assumes
  // length/512 fits within the device grid limit -- confirm for huge vectors
  hipLaunchKernelGGL(setValueK, dim3((unsigned int)grid_size), dim3(block_size), 0, 0, d_array, length, val);
  return 0;
}
56 
57 //------------------------------------------------------------------------------
58 // Kernel for set value strided on device
59 //------------------------------------------------------------------------------
// Sets every step-th element of vec, starting at index start, to val.
// One thread per vector entry; grid must cover at least size threads.
__global__ static void setValueStridedK(CeedScalar *__restrict__ vec, CeedSize start, CeedSize step, CeedSize size, CeedScalar val) {
  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;

  // Guard against indices below start: C++ remainder truncates toward zero, so
  // without this check (index - start) % step == 0 can spuriously match
  // indices before start whenever start >= step (e.g. start=5, step=3, index=2)
  if (index >= size || index < start) return;
  if ((index - start) % step == 0) vec[index] = val;
}
66 
67 //------------------------------------------------------------------------------
68 // Set value strided on device memory
69 //------------------------------------------------------------------------------
// Launch helper: strided set of d_array to val on the device.
// Returns 0 (launch errors are not checked here, matching sibling wrappers).
extern "C" int CeedDeviceSetValueStrided_Hip(CeedScalar *d_array, CeedSize start, CeedSize step, CeedSize length, CeedScalar val) {
  const int block_size = 512;

  // Empty vector: nothing to set; also avoids launching a zero-sized grid,
  // which the runtime rejects as an invalid configuration
  if (length < 1) return 0;
  // Ceiling division in 64-bit CeedSize arithmetic; the previous int-typed
  // `vec_size / block_size` and `block_size * grid_size` could truncate or
  // overflow once length approaches 2^31
  const CeedSize grid_size = (length + block_size - 1) / block_size;

  // NOTE(review): grid_size is narrowed to dim3's 32-bit x dimension; assumes
  // length/512 fits within the device grid limit -- confirm for huge vectors
  hipLaunchKernelGGL(setValueStridedK, dim3((unsigned int)grid_size), dim3(block_size), 0, 0, d_array, start, step, length, val);
  return 0;
}
79 
80 //------------------------------------------------------------------------------
81 // Kernel for taking reciprocal
82 //------------------------------------------------------------------------------
// Replaces each entry of vec by its reciprocal, leaving (near-)zero entries
// untouched to avoid producing Inf. One thread per entry.
__global__ static void rcpValueK(CeedScalar *__restrict__ vec, CeedSize size) {
  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;

  if (index >= size) return;
  // Cast the literal so the division stays in CeedScalar precision; a bare
  // `1.` is a double literal and forces a double round-trip when CeedScalar
  // is single precision
  if (fabs(vec[index]) > 1E-16) vec[index] = (CeedScalar)1.0 / vec[index];
}
89 
90 //------------------------------------------------------------------------------
91 // Take vector reciprocal in device memory
92 //------------------------------------------------------------------------------
// Launch helper: take the pointwise reciprocal of d_array on the device.
// Returns 0 (launch errors are not checked here, matching sibling wrappers).
extern "C" int CeedDeviceReciprocal_Hip(CeedScalar *d_array, CeedSize length) {
  const int block_size = 512;

  // Empty vector: nothing to do; also avoids launching a zero-sized grid,
  // which the runtime rejects as an invalid configuration
  if (length < 1) return 0;
  // Ceiling division in 64-bit CeedSize arithmetic; the previous int-typed
  // `vec_size / block_size` and `block_size * grid_size` could truncate or
  // overflow once length approaches 2^31
  const CeedSize grid_size = (length + block_size - 1) / block_size;

  // NOTE(review): grid_size is narrowed to dim3's 32-bit x dimension; assumes
  // length/512 fits within the device grid limit -- confirm for huge vectors
  hipLaunchKernelGGL(rcpValueK, dim3((unsigned int)grid_size), dim3(block_size), 0, 0, d_array, length);
  return 0;
}
102 
103 //------------------------------------------------------------------------------
104 // Kernel for scale
105 //------------------------------------------------------------------------------
// Scales every entry of x by alpha in place. One thread per entry; the grid
// is expected to cover at least size threads.
__global__ static void scaleValueK(CeedScalar *__restrict__ x, CeedScalar alpha, CeedSize size) {
  // Flat 1-D global index, accumulated in 64 bits so large grids do not overflow
  const CeedSize i = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;

  // Tail guard: trailing threads past the end of the vector do nothing
  if (i < size) x[i] *= alpha;
}
112 
113 //------------------------------------------------------------------------------
114 // Compute x = alpha x on device
115 //------------------------------------------------------------------------------
// Launch helper: compute x = alpha * x on the device.
// Returns 0 (launch errors are not checked here, matching sibling wrappers).
extern "C" int CeedDeviceScale_Hip(CeedScalar *x_array, CeedScalar alpha, CeedSize length) {
  const int block_size = 512;

  // Empty vector: nothing to scale; also avoids launching a zero-sized grid,
  // which the runtime rejects as an invalid configuration
  if (length < 1) return 0;
  // Ceiling division in 64-bit CeedSize arithmetic; the previous int-typed
  // `vec_size / block_size` and `block_size * grid_size` could truncate or
  // overflow once length approaches 2^31
  const CeedSize grid_size = (length + block_size - 1) / block_size;

  // NOTE(review): grid_size is narrowed to dim3's 32-bit x dimension; assumes
  // length/512 fits within the device grid limit -- confirm for huge vectors
  hipLaunchKernelGGL(scaleValueK, dim3((unsigned int)grid_size), dim3(block_size), 0, 0, x_array, alpha, length);
  return 0;
}
125 
126 //------------------------------------------------------------------------------
127 // Kernel for axpy
128 //------------------------------------------------------------------------------
// Accumulates y += alpha * x elementwise. One thread per entry; the grid is
// expected to cover at least size threads.
__global__ static void axpyValueK(CeedScalar *__restrict__ y, CeedScalar alpha, CeedScalar *__restrict__ x, CeedSize size) {
  // Flat 1-D global index, accumulated in 64 bits so large grids do not overflow
  const CeedSize i = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;

  // Tail guard: trailing threads past the end of the vector do nothing
  if (i < size) y[i] += alpha * x[i];
}
135 
136 //------------------------------------------------------------------------------
137 // Compute y = alpha x + y on device
138 //------------------------------------------------------------------------------
// Launch helper: compute y = alpha * x + y on the device.
// Returns 0 (launch errors are not checked here, matching sibling wrappers).
extern "C" int CeedDeviceAXPY_Hip(CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_array, CeedSize length) {
  const int block_size = 512;

  // Empty vector: nothing to accumulate; also avoids launching a zero-sized
  // grid, which the runtime rejects as an invalid configuration
  if (length < 1) return 0;
  // Ceiling division in 64-bit CeedSize arithmetic; the previous int-typed
  // `vec_size / block_size` and `block_size * grid_size` could truncate or
  // overflow once length approaches 2^31
  const CeedSize grid_size = (length + block_size - 1) / block_size;

  // NOTE(review): grid_size is narrowed to dim3's 32-bit x dimension; assumes
  // length/512 fits within the device grid limit -- confirm for huge vectors
  hipLaunchKernelGGL(axpyValueK, dim3((unsigned int)grid_size), dim3(block_size), 0, 0, y_array, alpha, x_array, length);
  return 0;
}
148 
149 //------------------------------------------------------------------------------
150 // Kernel for axpby
151 //------------------------------------------------------------------------------
// Computes y = alpha * x + beta * y elementwise. One thread per entry; the
// grid is expected to cover at least size threads.
__global__ static void axpbyValueK(CeedScalar *__restrict__ y, CeedScalar alpha, CeedScalar beta, CeedScalar *__restrict__ x, CeedSize size) {
  // Flat 1-D global index, accumulated in 64 bits so large grids do not overflow
  const CeedSize i = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;

  if (i >= size) return;
  // Scale y first, then accumulate alpha * x — same rounding order as the
  // original two-statement form
  CeedScalar scaled = beta * y[i];
  scaled += alpha * x[i];
  y[i] = scaled;
}
159 
160 //------------------------------------------------------------------------------
161 // Compute y = alpha x + beta y on device
162 //------------------------------------------------------------------------------
// Launch helper: compute y = alpha * x + beta * y on the device.
// Returns 0 (launch errors are not checked here, matching sibling wrappers).
extern "C" int CeedDeviceAXPBY_Hip(CeedScalar *y_array, CeedScalar alpha, CeedScalar beta, CeedScalar *x_array, CeedSize length) {
  const int block_size = 512;

  // Empty vector: nothing to compute; also avoids launching a zero-sized
  // grid, which the runtime rejects as an invalid configuration
  if (length < 1) return 0;
  // Ceiling division in 64-bit CeedSize arithmetic; the previous int-typed
  // `vec_size / block_size` and `block_size * grid_size` could truncate or
  // overflow once length approaches 2^31
  const CeedSize grid_size = (length + block_size - 1) / block_size;

  // NOTE(review): grid_size is narrowed to dim3's 32-bit x dimension; assumes
  // length/512 fits within the device grid limit -- confirm for huge vectors
  hipLaunchKernelGGL(axpbyValueK, dim3((unsigned int)grid_size), dim3(block_size), 0, 0, y_array, alpha, beta, x_array, length);
  return 0;
}
172 
173 //------------------------------------------------------------------------------
174 // Kernel for pointwise mult
175 //------------------------------------------------------------------------------
// Computes w = x .* y elementwise. One thread per entry; the grid is expected
// to cover at least size threads.
// NOTE(review): x is read-only here, so it is const-qualified; it is left
// without __restrict__ — presumably so callers may alias w == x — confirm
// against the libCEED PointwiseMult aliasing contract before tightening.
__global__ static void pointwiseMultValueK(CeedScalar *__restrict__ w, const CeedScalar *x, CeedScalar *__restrict__ y, CeedSize size) {
  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;

  if (index >= size) return;
  w[index] = x[index] * y[index];
}
182 
183 //------------------------------------------------------------------------------
184 // Compute the pointwise multiplication w = x .* y on device
185 //------------------------------------------------------------------------------
// Launch helper: compute the pointwise product w = x .* y on the device.
// Returns 0 (launch errors are not checked here, matching sibling wrappers).
extern "C" int CeedDevicePointwiseMult_Hip(CeedScalar *w_array, CeedScalar *x_array, CeedScalar *y_array, CeedSize length) {
  const int block_size = 512;

  // Empty vector: nothing to compute; also avoids launching a zero-sized
  // grid, which the runtime rejects as an invalid configuration
  if (length < 1) return 0;
  // Ceiling division in 64-bit CeedSize arithmetic; the previous int-typed
  // `vec_size / block_size` and `block_size * grid_size` could truncate or
  // overflow once length approaches 2^31
  const CeedSize grid_size = (length + block_size - 1) / block_size;

  // NOTE(review): grid_size is narrowed to dim3's 32-bit x dimension; assumes
  // length/512 fits within the device grid limit -- confirm for huge vectors
  hipLaunchKernelGGL(pointwiseMultValueK, dim3((unsigned int)grid_size), dim3(block_size), 0, 0, w_array, x_array, y_array, length);
  return 0;
}
195 
196 //------------------------------------------------------------------------------
197