xref: /libCEED/backends/hip-ref/kernels/hip-ref-vector.hip.cpp (revision 3196072fa7d47cffcf474f8b421f80aad90fa9c5)
15aed82e4SJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
23d8e8822SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
30d0321e0SJeremy L Thompson //
43d8e8822SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause
50d0321e0SJeremy L Thompson //
63d8e8822SJeremy L Thompson // This file is part of CEED:  http://github.com/ceed
70d0321e0SJeremy L Thompson 
849aac155SJeremy L Thompson #include <ceed.h>
90d0321e0SJeremy L Thompson #include <hip/hip_runtime.h>
100d0321e0SJeremy L Thompson 
110d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
12*3196072fSJeremy L Thompson // Kernel for copy strided on device
13*3196072fSJeremy L Thompson //------------------------------------------------------------------------------
14*3196072fSJeremy L Thompson __global__ static void copyStridedK(CeedScalar *__restrict__ vec, CeedSize start, CeedSize step, CeedSize size, CeedScalar *__restrict__ vec_copy) {
15*3196072fSJeremy L Thompson   CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
16*3196072fSJeremy L Thompson 
17*3196072fSJeremy L Thompson   if (index >= size) return;
18*3196072fSJeremy L Thompson   if ((index - start) % step == 0) vec_copy[index] = vec[index];
19*3196072fSJeremy L Thompson }
20*3196072fSJeremy L Thompson 
21*3196072fSJeremy L Thompson //------------------------------------------------------------------------------
22*3196072fSJeremy L Thompson // Copy strided on device memory
23*3196072fSJeremy L Thompson //------------------------------------------------------------------------------
24*3196072fSJeremy L Thompson extern "C" int CeedDeviceCopyStrided_Hip(CeedScalar *d_array, CeedSize start, CeedSize step, CeedSize length, CeedScalar *d_copy_array) {
25*3196072fSJeremy L Thompson   const int      block_size = 512;
26*3196072fSJeremy L Thompson   const CeedSize vec_size   = length;
27*3196072fSJeremy L Thompson   int            grid_size  = vec_size / block_size;
28*3196072fSJeremy L Thompson 
29*3196072fSJeremy L Thompson   if (block_size * grid_size < vec_size) grid_size += 1;
30*3196072fSJeremy L Thompson   hipLaunchKernelGGL(copyStridedK, dim3(grid_size), dim3(block_size), 0, 0, d_array, start, step, length, d_copy_array);
31*3196072fSJeremy L Thompson   return 0;
32*3196072fSJeremy L Thompson }
33*3196072fSJeremy L Thompson 
34*3196072fSJeremy L Thompson //------------------------------------------------------------------------------
350d0321e0SJeremy L Thompson // Kernel for set value on device
360d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
379330daecSnbeams __global__ static void setValueK(CeedScalar *__restrict__ vec, CeedSize size, CeedScalar val) {
38b7453713SJeremy L Thompson   CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
39b7453713SJeremy L Thompson 
40b7453713SJeremy L Thompson   if (index >= size) return;
41b7453713SJeremy L Thompson   vec[index] = val;
420d0321e0SJeremy L Thompson }
430d0321e0SJeremy L Thompson 
440d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
450d0321e0SJeremy L Thompson // Set value on device memory
460d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
479330daecSnbeams extern "C" int CeedDeviceSetValue_Hip(CeedScalar *d_array, CeedSize length, CeedScalar val) {
48b7453713SJeremy L Thompson   const int      block_size = 512;
49b7453713SJeremy L Thompson   const CeedSize vec_size   = length;
50b7453713SJeremy L Thompson   int            grid_size  = vec_size / block_size;
510d0321e0SJeremy L Thompson 
52b7453713SJeremy L Thompson   if (block_size * grid_size < vec_size) grid_size += 1;
53b7453713SJeremy L Thompson   hipLaunchKernelGGL(setValueK, dim3(grid_size), dim3(block_size), 0, 0, d_array, length, val);
540d0321e0SJeremy L Thompson   return 0;
550d0321e0SJeremy L Thompson }
560d0321e0SJeremy L Thompson 
570d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
58*3196072fSJeremy L Thompson // Kernel for set value strided on device
59*3196072fSJeremy L Thompson //------------------------------------------------------------------------------
60*3196072fSJeremy L Thompson __global__ static void setValueStridedK(CeedScalar *__restrict__ vec, CeedSize start, CeedSize step, CeedSize size, CeedScalar val) {
61*3196072fSJeremy L Thompson   CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
62*3196072fSJeremy L Thompson 
63*3196072fSJeremy L Thompson   if (index >= size) return;
64*3196072fSJeremy L Thompson   if ((index - start) % step == 0) vec[index] = val;
65*3196072fSJeremy L Thompson }
66*3196072fSJeremy L Thompson 
67*3196072fSJeremy L Thompson //------------------------------------------------------------------------------
68*3196072fSJeremy L Thompson // Set value strided on device memory
69*3196072fSJeremy L Thompson //------------------------------------------------------------------------------
70*3196072fSJeremy L Thompson extern "C" int CeedDeviceSetValueStrided_Hip(CeedScalar *d_array, CeedSize start, CeedSize step, CeedSize length, CeedScalar val) {
71*3196072fSJeremy L Thompson   const int      block_size = 512;
72*3196072fSJeremy L Thompson   const CeedSize vec_size   = length;
73*3196072fSJeremy L Thompson   int            grid_size  = vec_size / block_size;
74*3196072fSJeremy L Thompson 
75*3196072fSJeremy L Thompson   if (block_size * grid_size < vec_size) grid_size += 1;
76*3196072fSJeremy L Thompson   hipLaunchKernelGGL(setValueStridedK, dim3(grid_size), dim3(block_size), 0, 0, d_array, start, step, length, val);
77*3196072fSJeremy L Thompson   return 0;
78*3196072fSJeremy L Thompson }
79*3196072fSJeremy L Thompson 
80*3196072fSJeremy L Thompson //------------------------------------------------------------------------------
810d0321e0SJeremy L Thompson // Kernel for taking reciprocal
820d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
839330daecSnbeams __global__ static void rcpValueK(CeedScalar *__restrict__ vec, CeedSize size) {
84b7453713SJeremy L Thompson   CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
85b7453713SJeremy L Thompson 
86b7453713SJeremy L Thompson   if (index >= size) return;
87b7453713SJeremy L Thompson   if (fabs(vec[index]) > 1E-16) vec[index] = 1. / vec[index];
880d0321e0SJeremy L Thompson }
890d0321e0SJeremy L Thompson 
900d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
910d0321e0SJeremy L Thompson // Take vector reciprocal in device memory
920d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
939330daecSnbeams extern "C" int CeedDeviceReciprocal_Hip(CeedScalar *d_array, CeedSize length) {
94b7453713SJeremy L Thompson   const int      block_size = 512;
95b7453713SJeremy L Thompson   const CeedSize vec_size   = length;
96b7453713SJeremy L Thompson   int            grid_size  = vec_size / block_size;
970d0321e0SJeremy L Thompson 
98b7453713SJeremy L Thompson   if (block_size * grid_size < vec_size) grid_size += 1;
99b7453713SJeremy L Thompson   hipLaunchKernelGGL(rcpValueK, dim3(grid_size), dim3(block_size), 0, 0, d_array, length);
1000d0321e0SJeremy L Thompson   return 0;
1010d0321e0SJeremy L Thompson }
1020d0321e0SJeremy L Thompson 
1030d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1040d0321e0SJeremy L Thompson // Kernel for scale
1050d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1069330daecSnbeams __global__ static void scaleValueK(CeedScalar *__restrict__ x, CeedScalar alpha, CeedSize size) {
107b7453713SJeremy L Thompson   CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
108b7453713SJeremy L Thompson 
109b7453713SJeremy L Thompson   if (index >= size) return;
110b7453713SJeremy L Thompson   x[index] *= alpha;
1110d0321e0SJeremy L Thompson }
1120d0321e0SJeremy L Thompson 
1130d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1140d0321e0SJeremy L Thompson // Compute x = alpha x on device
1150d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1169330daecSnbeams extern "C" int CeedDeviceScale_Hip(CeedScalar *x_array, CeedScalar alpha, CeedSize length) {
117b7453713SJeremy L Thompson   const int      block_size = 512;
118b7453713SJeremy L Thompson   const CeedSize vec_size   = length;
119b7453713SJeremy L Thompson   int            grid_size  = vec_size / block_size;
1200d0321e0SJeremy L Thompson 
121b7453713SJeremy L Thompson   if (block_size * grid_size < vec_size) grid_size += 1;
122b7453713SJeremy L Thompson   hipLaunchKernelGGL(scaleValueK, dim3(grid_size), dim3(block_size), 0, 0, x_array, alpha, length);
1230d0321e0SJeremy L Thompson   return 0;
1240d0321e0SJeremy L Thompson }
1250d0321e0SJeremy L Thompson 
1260d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1270d0321e0SJeremy L Thompson // Kernel for axpy
1280d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1299330daecSnbeams __global__ static void axpyValueK(CeedScalar *__restrict__ y, CeedScalar alpha, CeedScalar *__restrict__ x, CeedSize size) {
130b7453713SJeremy L Thompson   CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
131b7453713SJeremy L Thompson 
132b7453713SJeremy L Thompson   if (index >= size) return;
133b7453713SJeremy L Thompson   y[index] += alpha * x[index];
1340d0321e0SJeremy L Thompson }
1350d0321e0SJeremy L Thompson 
1360d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1370d0321e0SJeremy L Thompson // Compute y = alpha x + y on device
1380d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1399330daecSnbeams extern "C" int CeedDeviceAXPY_Hip(CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_array, CeedSize length) {
140b7453713SJeremy L Thompson   const int      block_size = 512;
141b7453713SJeremy L Thompson   const CeedSize vec_size   = length;
142b7453713SJeremy L Thompson   int            grid_size  = vec_size / block_size;
1430d0321e0SJeremy L Thompson 
144b7453713SJeremy L Thompson   if (block_size * grid_size < vec_size) grid_size += 1;
145b7453713SJeremy L Thompson   hipLaunchKernelGGL(axpyValueK, dim3(grid_size), dim3(block_size), 0, 0, y_array, alpha, x_array, length);
1460d0321e0SJeremy L Thompson   return 0;
1470d0321e0SJeremy L Thompson }
1480d0321e0SJeremy L Thompson 
1490d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1505fb68f37SKaren (Ren) Stengel // Kernel for axpby
1515fb68f37SKaren (Ren) Stengel //------------------------------------------------------------------------------
1529330daecSnbeams __global__ static void axpbyValueK(CeedScalar *__restrict__ y, CeedScalar alpha, CeedScalar beta, CeedScalar *__restrict__ x, CeedSize size) {
153b7453713SJeremy L Thompson   CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
154b7453713SJeremy L Thompson 
155b7453713SJeremy L Thompson   if (index >= size) return;
156b7453713SJeremy L Thompson   y[index] = beta * y[index];
157b7453713SJeremy L Thompson   y[index] += alpha * x[index];
1585fb68f37SKaren (Ren) Stengel }
1595fb68f37SKaren (Ren) Stengel 
1605fb68f37SKaren (Ren) Stengel //------------------------------------------------------------------------------
1615fb68f37SKaren (Ren) Stengel // Compute y = alpha x + beta y on device
1625fb68f37SKaren (Ren) Stengel //------------------------------------------------------------------------------
1639330daecSnbeams extern "C" int CeedDeviceAXPBY_Hip(CeedScalar *y_array, CeedScalar alpha, CeedScalar beta, CeedScalar *x_array, CeedSize length) {
164b7453713SJeremy L Thompson   const int      block_size = 512;
165b7453713SJeremy L Thompson   const CeedSize vec_size   = length;
166b7453713SJeremy L Thompson   int            grid_size  = vec_size / block_size;
1675fb68f37SKaren (Ren) Stengel 
168b7453713SJeremy L Thompson   if (block_size * grid_size < vec_size) grid_size += 1;
169b7453713SJeremy L Thompson   hipLaunchKernelGGL(axpbyValueK, dim3(grid_size), dim3(block_size), 0, 0, y_array, alpha, beta, x_array, length);
1705fb68f37SKaren (Ren) Stengel   return 0;
1715fb68f37SKaren (Ren) Stengel }
1725fb68f37SKaren (Ren) Stengel 
1735fb68f37SKaren (Ren) Stengel //------------------------------------------------------------------------------
1740d0321e0SJeremy L Thompson // Kernel for pointwise mult
1750d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1769330daecSnbeams __global__ static void pointwiseMultValueK(CeedScalar *__restrict__ w, CeedScalar *x, CeedScalar *__restrict__ y, CeedSize size) {
177b7453713SJeremy L Thompson   CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
178b7453713SJeremy L Thompson 
179b7453713SJeremy L Thompson   if (index >= size) return;
180b7453713SJeremy L Thompson   w[index] = x[index] * y[index];
1810d0321e0SJeremy L Thompson }
1820d0321e0SJeremy L Thompson 
1830d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1840d0321e0SJeremy L Thompson // Compute the pointwise multiplication w = x .* y on device
1850d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1869330daecSnbeams extern "C" int CeedDevicePointwiseMult_Hip(CeedScalar *w_array, CeedScalar *x_array, CeedScalar *y_array, CeedSize length) {
187b7453713SJeremy L Thompson   const int      block_size = 512;
188b7453713SJeremy L Thompson   const CeedSize vec_size   = length;
189b7453713SJeremy L Thompson   int            grid_size  = vec_size / block_size;
1900d0321e0SJeremy L Thompson 
191b7453713SJeremy L Thompson   if (block_size * grid_size < vec_size) grid_size += 1;
192b7453713SJeremy L Thompson   hipLaunchKernelGGL(pointwiseMultValueK, dim3(grid_size), dim3(block_size), 0, 0, w_array, x_array, y_array, length);
1930d0321e0SJeremy L Thompson   return 0;
1940d0321e0SJeremy L Thompson }
1952a86cc9dSSebastian Grimberg 
1962a86cc9dSSebastian Grimberg //------------------------------------------------------------------------------
197