xref: /libCEED/backends/cuda-ref/kernels/cuda-ref-vector.cu (revision f1c2287b836e28c06f03661cee66832fa5f0f99d)
15aed82e4SJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
23d8e8822SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
30d0321e0SJeremy L Thompson //
43d8e8822SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause
50d0321e0SJeremy L Thompson //
63d8e8822SJeremy L Thompson // This file is part of CEED:  http://github.com/ceed
70d0321e0SJeremy L Thompson 
849aac155SJeremy L Thompson #include <ceed.h>
90d0321e0SJeremy L Thompson #include <cuda.h>
100d0321e0SJeremy L Thompson 
110d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
12*f1c2287bSJeremy L Thompson // Kernel for copy strided on device
13*f1c2287bSJeremy L Thompson //------------------------------------------------------------------------------
14*f1c2287bSJeremy L Thompson __global__ static void copyStridedK(CeedScalar * __restrict__ vec, CeedSize start, CeedSize step,
15*f1c2287bSJeremy L Thompson                                     CeedSize size, CeedScalar * __restrict__ vec_copy) {
16*f1c2287bSJeremy L Thompson   CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
17*f1c2287bSJeremy L Thompson   if (index >= size) return;
18*f1c2287bSJeremy L Thompson   if ((index - start) % step == 0) vec_copy[index] = vec[index];
19*f1c2287bSJeremy L Thompson }
20*f1c2287bSJeremy L Thompson 
21*f1c2287bSJeremy L Thompson //------------------------------------------------------------------------------
22*f1c2287bSJeremy L Thompson // Copy strided on device memory
23*f1c2287bSJeremy L Thompson //------------------------------------------------------------------------------
24*f1c2287bSJeremy L Thompson extern "C" int CeedDeviceCopyStrided_Cuda(CeedScalar* d_array, CeedSize start, CeedSize step,
25*f1c2287bSJeremy L Thompson                                           CeedSize length, CeedScalar* d_copy_array) {
26*f1c2287bSJeremy L Thompson   const int block_size = 512;
27*f1c2287bSJeremy L Thompson   const CeedSize vec_size = length;
28*f1c2287bSJeremy L Thompson   int grid_size = vec_size / block_size;
29*f1c2287bSJeremy L Thompson 
30*f1c2287bSJeremy L Thompson   if (block_size * grid_size < vec_size) grid_size += 1;
31*f1c2287bSJeremy L Thompson   copyStridedK<<<grid_size,block_size>>>(d_array, start, step, length, d_copy_array);
32*f1c2287bSJeremy L Thompson   return 0;
33*f1c2287bSJeremy L Thompson }
34*f1c2287bSJeremy L Thompson 
35*f1c2287bSJeremy L Thompson //------------------------------------------------------------------------------
360d0321e0SJeremy L Thompson // Kernel for set value on device
370d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
38f7c1b517Snbeams __global__ static void setValueK(CeedScalar * __restrict__ vec, CeedSize size,
390d0321e0SJeremy L Thompson                                  CeedScalar val) {
40ca735530SJeremy L Thompson   CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
419ef22048SJeremy L Thompson   if (index >= size) return;
42ca735530SJeremy L Thompson   vec[index] = val;
430d0321e0SJeremy L Thompson }
440d0321e0SJeremy L Thompson 
450d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
460d0321e0SJeremy L Thompson // Set value on device memory
470d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
48f7c1b517Snbeams extern "C" int CeedDeviceSetValue_Cuda(CeedScalar* d_array, CeedSize length,
490d0321e0SJeremy L Thompson                                        CeedScalar val) {
50ca735530SJeremy L Thompson   const int block_size = 512;
51ca735530SJeremy L Thompson   const CeedSize vec_size = length;
52ca735530SJeremy L Thompson   int grid_size = vec_size / block_size;
530d0321e0SJeremy L Thompson 
549ef22048SJeremy L Thompson   if (block_size * grid_size < vec_size) grid_size += 1;
55ca735530SJeremy L Thompson   setValueK<<<grid_size,block_size>>>(d_array, length, val);
560d0321e0SJeremy L Thompson   return 0;
570d0321e0SJeremy L Thompson }
580d0321e0SJeremy L Thompson 
590d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
60*f1c2287bSJeremy L Thompson // Kernel for set value strided on device
61*f1c2287bSJeremy L Thompson //------------------------------------------------------------------------------
62*f1c2287bSJeremy L Thompson __global__ static void setValueStridedK(CeedScalar * __restrict__ vec, CeedSize start, CeedSize step,
63*f1c2287bSJeremy L Thompson                                         CeedSize size, CeedScalar val) {
64*f1c2287bSJeremy L Thompson   CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
65*f1c2287bSJeremy L Thompson   if (index >= size) return;
66*f1c2287bSJeremy L Thompson   if ((index - start) % step == 0) vec[index] = val;
67*f1c2287bSJeremy L Thompson }
68*f1c2287bSJeremy L Thompson 
69*f1c2287bSJeremy L Thompson //------------------------------------------------------------------------------
70*f1c2287bSJeremy L Thompson // Set value strided on device memory
71*f1c2287bSJeremy L Thompson //------------------------------------------------------------------------------
72*f1c2287bSJeremy L Thompson extern "C" int CeedDeviceSetValueStrided_Cuda(CeedScalar* d_array, CeedSize start, CeedSize step,
73*f1c2287bSJeremy L Thompson                                               CeedSize length, CeedScalar val) {
74*f1c2287bSJeremy L Thompson   const int block_size = 512;
75*f1c2287bSJeremy L Thompson   const CeedSize vec_size = length;
76*f1c2287bSJeremy L Thompson   int grid_size = vec_size / block_size;
77*f1c2287bSJeremy L Thompson 
78*f1c2287bSJeremy L Thompson   if (block_size * grid_size < vec_size) grid_size += 1;
79*f1c2287bSJeremy L Thompson   setValueStridedK<<<grid_size,block_size>>>(d_array, start, step, length, val);
80*f1c2287bSJeremy L Thompson   return 0;
81*f1c2287bSJeremy L Thompson }
82*f1c2287bSJeremy L Thompson 
83*f1c2287bSJeremy L Thompson //------------------------------------------------------------------------------
840d0321e0SJeremy L Thompson // Kernel for taking reciprocal
850d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
86f7c1b517Snbeams __global__ static void rcpValueK(CeedScalar * __restrict__ vec, CeedSize size) {
87ca735530SJeremy L Thompson   CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
889ef22048SJeremy L Thompson   if (index >= size) return;
899ef22048SJeremy L Thompson   if (fabs(vec[index]) > 1E-16) vec[index] = 1./vec[index];
900d0321e0SJeremy L Thompson }
910d0321e0SJeremy L Thompson 
920d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
930d0321e0SJeremy L Thompson // Take vector reciprocal in device memory
940d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
95f7c1b517Snbeams extern "C" int CeedDeviceReciprocal_Cuda(CeedScalar* d_array, CeedSize length) {
96ca735530SJeremy L Thompson   const int block_size = 512;
97ca735530SJeremy L Thompson   const CeedSize vec_size = length;
98ca735530SJeremy L Thompson   int grid_size = vec_size / block_size;
990d0321e0SJeremy L Thompson 
1009ef22048SJeremy L Thompson   if (block_size * grid_size < vec_size) grid_size += 1;
101ca735530SJeremy L Thompson   rcpValueK<<<grid_size,block_size>>>(d_array, length);
1020d0321e0SJeremy L Thompson   return 0;
1030d0321e0SJeremy L Thompson }
1040d0321e0SJeremy L Thompson 
1050d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1060d0321e0SJeremy L Thompson // Kernel for scale
1070d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1080d0321e0SJeremy L Thompson __global__ static void scaleValueK(CeedScalar * __restrict__ x, CeedScalar alpha,
109f7c1b517Snbeams     CeedSize size) {
110ca735530SJeremy L Thompson   CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
1119ef22048SJeremy L Thompson   if (index >= size) return;
112ca735530SJeremy L Thompson   x[index] *= alpha;
1130d0321e0SJeremy L Thompson }
1140d0321e0SJeremy L Thompson 
1150d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1160d0321e0SJeremy L Thompson // Compute x = alpha x on device
1170d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1180d0321e0SJeremy L Thompson extern "C" int CeedDeviceScale_Cuda(CeedScalar *x_array, CeedScalar alpha,
119f7c1b517Snbeams     CeedSize length) {
120ca735530SJeremy L Thompson   const int block_size = 512;
121ca735530SJeremy L Thompson   const CeedSize vec_size = length;
122ca735530SJeremy L Thompson   int grid_size = vec_size / block_size;
1230d0321e0SJeremy L Thompson 
1249ef22048SJeremy L Thompson   if (block_size * grid_size < vec_size) grid_size += 1;
125ca735530SJeremy L Thompson   scaleValueK<<<grid_size,block_size>>>(x_array, alpha, length);
1260d0321e0SJeremy L Thompson   return 0;
1270d0321e0SJeremy L Thompson }
1280d0321e0SJeremy L Thompson 
1290d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1300d0321e0SJeremy L Thompson // Kernel for axpy
1310d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1320d0321e0SJeremy L Thompson __global__ static void axpyValueK(CeedScalar * __restrict__ y, CeedScalar alpha,
133f7c1b517Snbeams     CeedScalar * __restrict__ x, CeedSize size) {
134ca735530SJeremy L Thompson   CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
1359ef22048SJeremy L Thompson   if (index >= size) return;
136ca735530SJeremy L Thompson   y[index] += alpha * x[index];
1370d0321e0SJeremy L Thompson }
1380d0321e0SJeremy L Thompson 
1390d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1400d0321e0SJeremy L Thompson // Compute y = alpha x + y on device
1410d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1420d0321e0SJeremy L Thompson extern "C" int CeedDeviceAXPY_Cuda(CeedScalar *y_array, CeedScalar alpha,
143f7c1b517Snbeams     CeedScalar *x_array, CeedSize length) {
144ca735530SJeremy L Thompson   const int block_size = 512;
145ca735530SJeremy L Thompson   const CeedSize vec_size = length;
146ca735530SJeremy L Thompson   int grid_size = vec_size / block_size;
1470d0321e0SJeremy L Thompson 
1489ef22048SJeremy L Thompson   if (block_size * grid_size < vec_size) grid_size += 1;
149ca735530SJeremy L Thompson   axpyValueK<<<grid_size,block_size>>>(y_array, alpha, x_array, length);
1500d0321e0SJeremy L Thompson   return 0;
1510d0321e0SJeremy L Thompson }
1520d0321e0SJeremy L Thompson 
1530d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1545fb68f37SKaren (Ren) Stengel // Kernel for axpby
1555fb68f37SKaren (Ren) Stengel //------------------------------------------------------------------------------
1565fb68f37SKaren (Ren) Stengel __global__ static void axpbyValueK(CeedScalar * __restrict__ y, CeedScalar alpha, CeedScalar beta,
157f7c1b517Snbeams     CeedScalar * __restrict__ x, CeedSize size) {
158ca735530SJeremy L Thompson   CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
1599ef22048SJeremy L Thompson   if (index >= size) return;
160ca735530SJeremy L Thompson   y[index] = beta * y[index];
161ca735530SJeremy L Thompson   y[index] += alpha * x[index];
1625fb68f37SKaren (Ren) Stengel }
1635fb68f37SKaren (Ren) Stengel 
1645fb68f37SKaren (Ren) Stengel //------------------------------------------------------------------------------
1655fb68f37SKaren (Ren) Stengel // Compute y = alpha x + beta y on device
1665fb68f37SKaren (Ren) Stengel //------------------------------------------------------------------------------
1675fb68f37SKaren (Ren) Stengel extern "C" int CeedDeviceAXPBY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar beta,
168f7c1b517Snbeams     CeedScalar *x_array, CeedSize length) {
169ca735530SJeremy L Thompson   const int block_size = 512;
170ca735530SJeremy L Thompson   const CeedSize vec_size = length;
171ca735530SJeremy L Thompson   int grid_size = vec_size / block_size;
1725fb68f37SKaren (Ren) Stengel 
1739ef22048SJeremy L Thompson   if (block_size * grid_size < vec_size) grid_size += 1;
174ca735530SJeremy L Thompson   axpbyValueK<<<grid_size,block_size>>>(y_array, alpha, beta, x_array, length);
1755fb68f37SKaren (Ren) Stengel   return 0;
1765fb68f37SKaren (Ren) Stengel }
1775fb68f37SKaren (Ren) Stengel 
1785fb68f37SKaren (Ren) Stengel //------------------------------------------------------------------------------
1790d0321e0SJeremy L Thompson // Kernel for pointwise mult
1800d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1810d0321e0SJeremy L Thompson __global__ static void pointwiseMultValueK(CeedScalar * __restrict__ w,
182f7c1b517Snbeams     CeedScalar * x, CeedScalar * __restrict__ y, CeedSize size) {
183ca735530SJeremy L Thompson   CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
1849ef22048SJeremy L Thompson   if (index >= size) return;
185ca735530SJeremy L Thompson   w[index] = x[index] * y[index];
1860d0321e0SJeremy L Thompson }
1870d0321e0SJeremy L Thompson 
1880d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1890d0321e0SJeremy L Thompson // Compute the pointwise multiplication w = x .* y on device
1900d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1910d0321e0SJeremy L Thompson extern "C" int CeedDevicePointwiseMult_Cuda(CeedScalar *w_array, CeedScalar *x_array,
192f7c1b517Snbeams     CeedScalar *y_array, CeedSize length) {
193ca735530SJeremy L Thompson   const int block_size = 512;
194ca735530SJeremy L Thompson   const CeedSize vec_size = length;
195ca735530SJeremy L Thompson   int grid_size = vec_size / block_size;
1960d0321e0SJeremy L Thompson 
1979ef22048SJeremy L Thompson   if (block_size * grid_size < vec_size) grid_size += 1;
198ca735530SJeremy L Thompson   pointwiseMultValueK<<<grid_size,block_size>>>(w_array, x_array, y_array, length);
1990d0321e0SJeremy L Thompson   return 0;
2000d0321e0SJeremy L Thompson }
2012a86cc9dSSebastian Grimberg 
2022a86cc9dSSebastian Grimberg //------------------------------------------------------------------------------
203