xref: /libCEED/rust/libceed-sys/c-src/backends/cuda-ref/kernels/cuda-ref-vector.cu (revision d4cc18453651bd0f94c1a2e078b2646a92dafdcc)
1*9ba83ac0SJeremy L Thompson // Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
23d8e8822SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
30d0321e0SJeremy L Thompson //
43d8e8822SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause
50d0321e0SJeremy L Thompson //
63d8e8822SJeremy L Thompson // This file is part of CEED:  http://github.com/ceed
70d0321e0SJeremy L Thompson 
849aac155SJeremy L Thompson #include <ceed.h>
90d0321e0SJeremy L Thompson #include <cuda.h>
100d0321e0SJeremy L Thompson 
110d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
12f1c2287bSJeremy L Thompson // Kernel for copy strided on device
13f1c2287bSJeremy L Thompson //------------------------------------------------------------------------------
copyStridedK(CeedScalar * __restrict__ vec,CeedSize start,CeedSize stop,CeedSize step,CeedScalar * __restrict__ vec_copy)14832a6d73SJeremy L Thompson __global__ static void copyStridedK(CeedScalar *__restrict__ vec, CeedSize start, CeedSize stop, CeedSize step, CeedScalar *__restrict__ vec_copy) {
15c9d5affaSJeremy L Thompson   const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
16c9d5affaSJeremy L Thompson 
17832a6d73SJeremy L Thompson   if (index < stop - start) {
18832a6d73SJeremy L Thompson     if (index % step == 0) vec_copy[start + index] = vec[start + index];
19f1c2287bSJeremy L Thompson   }
20c9d5affaSJeremy L Thompson }
21f1c2287bSJeremy L Thompson 
22f1c2287bSJeremy L Thompson //------------------------------------------------------------------------------
23f1c2287bSJeremy L Thompson // Copy strided on device memory
24f1c2287bSJeremy L Thompson //------------------------------------------------------------------------------
CeedDeviceCopyStrided_Cuda(CeedScalar * d_array,CeedSize start,CeedSize stop,CeedSize step,CeedScalar * d_copy_array)25832a6d73SJeremy L Thompson extern "C" int CeedDeviceCopyStrided_Cuda(CeedScalar *d_array, CeedSize start, CeedSize stop, CeedSize step, CeedScalar *d_copy_array) {
26f1c2287bSJeremy L Thompson   const int      block_size = 512;
27832a6d73SJeremy L Thompson   const CeedSize copy_size  = stop - start;
28832a6d73SJeremy L Thompson   int            grid_size  = copy_size / block_size;
29f1c2287bSJeremy L Thompson 
30832a6d73SJeremy L Thompson   if (block_size * grid_size < copy_size) grid_size += 1;
31832a6d73SJeremy L Thompson   copyStridedK<<<grid_size, block_size>>>(d_array, start, stop, step, d_copy_array);
32f1c2287bSJeremy L Thompson   return 0;
33f1c2287bSJeremy L Thompson }
34f1c2287bSJeremy L Thompson 
35f1c2287bSJeremy L Thompson //------------------------------------------------------------------------------
360d0321e0SJeremy L Thompson // Kernel for set value on device
370d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
setValueK(CeedScalar * __restrict__ vec,CeedSize size,CeedScalar val)38b73fa92cSJeremy L Thompson __global__ static void setValueK(CeedScalar *__restrict__ vec, CeedSize size, CeedScalar val) {
39c9d5affaSJeremy L Thompson   const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
40c9d5affaSJeremy L Thompson 
41c9d5affaSJeremy L Thompson   if (index < size) vec[index] = val;
420d0321e0SJeremy L Thompson }
430d0321e0SJeremy L Thompson 
440d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
450d0321e0SJeremy L Thompson // Set value on device memory
460d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
CeedDeviceSetValue_Cuda(CeedScalar * d_array,CeedSize length,CeedScalar val)47b73fa92cSJeremy L Thompson extern "C" int CeedDeviceSetValue_Cuda(CeedScalar *d_array, CeedSize length, CeedScalar val) {
48ca735530SJeremy L Thompson   const int      block_size = 512;
49ca735530SJeremy L Thompson   const CeedSize vec_size   = length;
50ca735530SJeremy L Thompson   int            grid_size  = vec_size / block_size;
510d0321e0SJeremy L Thompson 
529ef22048SJeremy L Thompson   if (block_size * grid_size < vec_size) grid_size += 1;
53ca735530SJeremy L Thompson   setValueK<<<grid_size, block_size>>>(d_array, length, val);
540d0321e0SJeremy L Thompson   return 0;
550d0321e0SJeremy L Thompson }
560d0321e0SJeremy L Thompson 
570d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
58f1c2287bSJeremy L Thompson // Kernel for set value strided on device
59f1c2287bSJeremy L Thompson //------------------------------------------------------------------------------
setValueStridedK(CeedScalar * __restrict__ vec,CeedSize start,CeedSize stop,CeedSize step,CeedScalar val)602d73a370SJeremy L Thompson __global__ static void setValueStridedK(CeedScalar *__restrict__ vec, CeedSize start, CeedSize stop, CeedSize step, CeedScalar val) {
61c9d5affaSJeremy L Thompson   const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
62c9d5affaSJeremy L Thompson 
6314c82621SJeremy L Thompson   if (index < stop - start) {
6414c82621SJeremy L Thompson     if (index % step == 0) vec[start + index] = val;
65f1c2287bSJeremy L Thompson   }
66c9d5affaSJeremy L Thompson }
67f1c2287bSJeremy L Thompson 
68f1c2287bSJeremy L Thompson //------------------------------------------------------------------------------
69f1c2287bSJeremy L Thompson // Set value strided on device memory
70f1c2287bSJeremy L Thompson //------------------------------------------------------------------------------
CeedDeviceSetValueStrided_Cuda(CeedScalar * d_array,CeedSize start,CeedSize stop,CeedSize step,CeedScalar val)7114c82621SJeremy L Thompson extern "C" int CeedDeviceSetValueStrided_Cuda(CeedScalar *d_array, CeedSize start, CeedSize stop, CeedSize step, CeedScalar val) {
72f1c2287bSJeremy L Thompson   const int      block_size = 512;
7314c82621SJeremy L Thompson   const CeedSize set_size   = stop - start;
7414c82621SJeremy L Thompson   int            grid_size  = set_size / block_size;
75f1c2287bSJeremy L Thompson 
7614c82621SJeremy L Thompson   if (block_size * grid_size < set_size) grid_size += 1;
772d73a370SJeremy L Thompson   setValueStridedK<<<grid_size, block_size>>>(d_array, start, stop, step, val);
78f1c2287bSJeremy L Thompson   return 0;
79f1c2287bSJeremy L Thompson }
80f1c2287bSJeremy L Thompson 
81f1c2287bSJeremy L Thompson //------------------------------------------------------------------------------
820d0321e0SJeremy L Thompson // Kernel for taking reciprocal
830d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
rcpValueK(CeedScalar * __restrict__ vec,CeedSize size)84f7c1b517Snbeams __global__ static void rcpValueK(CeedScalar *__restrict__ vec, CeedSize size) {
85c9d5affaSJeremy L Thompson   const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
86c9d5affaSJeremy L Thompson 
87c9d5affaSJeremy L Thompson   if (index < size) {
889ef22048SJeremy L Thompson     if (fabs(vec[index]) > 1E-16) vec[index] = 1. / vec[index];
890d0321e0SJeremy L Thompson   }
90c9d5affaSJeremy L Thompson }
910d0321e0SJeremy L Thompson 
920d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
930d0321e0SJeremy L Thompson // Take vector reciprocal in device memory
940d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
CeedDeviceReciprocal_Cuda(CeedScalar * d_array,CeedSize length)95f7c1b517Snbeams extern "C" int CeedDeviceReciprocal_Cuda(CeedScalar *d_array, CeedSize length) {
96ca735530SJeremy L Thompson   const int      block_size = 512;
97ca735530SJeremy L Thompson   const CeedSize vec_size   = length;
98ca735530SJeremy L Thompson   int            grid_size  = vec_size / block_size;
990d0321e0SJeremy L Thompson 
1009ef22048SJeremy L Thompson   if (block_size * grid_size < vec_size) grid_size += 1;
101ca735530SJeremy L Thompson   rcpValueK<<<grid_size, block_size>>>(d_array, length);
1020d0321e0SJeremy L Thompson   return 0;
1030d0321e0SJeremy L Thompson }
1040d0321e0SJeremy L Thompson 
1050d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1060d0321e0SJeremy L Thompson // Kernel for scale
1070d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
scaleValueK(CeedScalar * __restrict__ x,CeedScalar alpha,CeedSize size)108b73fa92cSJeremy L Thompson __global__ static void scaleValueK(CeedScalar *__restrict__ x, CeedScalar alpha, CeedSize size) {
109c9d5affaSJeremy L Thompson   const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
110c9d5affaSJeremy L Thompson 
111c9d5affaSJeremy L Thompson   if (index < size) x[index] *= alpha;
1120d0321e0SJeremy L Thompson }
1130d0321e0SJeremy L Thompson 
1140d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1150d0321e0SJeremy L Thompson // Compute x = alpha x on device
1160d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
CeedDeviceScale_Cuda(CeedScalar * x_array,CeedScalar alpha,CeedSize length)117b73fa92cSJeremy L Thompson extern "C" int CeedDeviceScale_Cuda(CeedScalar *x_array, CeedScalar alpha, CeedSize length) {
118ca735530SJeremy L Thompson   const int      block_size = 512;
119ca735530SJeremy L Thompson   const CeedSize vec_size   = length;
120ca735530SJeremy L Thompson   int            grid_size  = vec_size / block_size;
1210d0321e0SJeremy L Thompson 
1229ef22048SJeremy L Thompson   if (block_size * grid_size < vec_size) grid_size += 1;
123ca735530SJeremy L Thompson   scaleValueK<<<grid_size, block_size>>>(x_array, alpha, length);
1240d0321e0SJeremy L Thompson   return 0;
1250d0321e0SJeremy L Thompson }
1260d0321e0SJeremy L Thompson 
1270d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1280d0321e0SJeremy L Thompson // Kernel for axpy
1290d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
axpyValueK(CeedScalar * __restrict__ y,CeedScalar alpha,CeedScalar * __restrict__ x,CeedSize size)130b73fa92cSJeremy L Thompson __global__ static void axpyValueK(CeedScalar *__restrict__ y, CeedScalar alpha, CeedScalar *__restrict__ x, CeedSize size) {
131c9d5affaSJeremy L Thompson   const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
132c9d5affaSJeremy L Thompson 
133c9d5affaSJeremy L Thompson   if (index < size) y[index] += alpha * x[index];
1340d0321e0SJeremy L Thompson }
1350d0321e0SJeremy L Thompson 
1360d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1370d0321e0SJeremy L Thompson // Compute y = alpha x + y on device
1380d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
CeedDeviceAXPY_Cuda(CeedScalar * y_array,CeedScalar alpha,CeedScalar * x_array,CeedSize length)139b73fa92cSJeremy L Thompson extern "C" int CeedDeviceAXPY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_array, CeedSize length) {
140ca735530SJeremy L Thompson   const int      block_size = 512;
141ca735530SJeremy L Thompson   const CeedSize vec_size   = length;
142ca735530SJeremy L Thompson   int            grid_size  = vec_size / block_size;
1430d0321e0SJeremy L Thompson 
1449ef22048SJeremy L Thompson   if (block_size * grid_size < vec_size) grid_size += 1;
145ca735530SJeremy L Thompson   axpyValueK<<<grid_size, block_size>>>(y_array, alpha, x_array, length);
1460d0321e0SJeremy L Thompson   return 0;
1470d0321e0SJeremy L Thompson }
1480d0321e0SJeremy L Thompson 
1490d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1505fb68f37SKaren (Ren) Stengel // Kernel for axpby
1515fb68f37SKaren (Ren) Stengel //------------------------------------------------------------------------------
axpbyValueK(CeedScalar * __restrict__ y,CeedScalar alpha,CeedScalar beta,CeedScalar * __restrict__ x,CeedSize size)152b73fa92cSJeremy L Thompson __global__ static void axpbyValueK(CeedScalar *__restrict__ y, CeedScalar alpha, CeedScalar beta, CeedScalar *__restrict__ x, CeedSize size) {
153c9d5affaSJeremy L Thompson   const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
154c9d5affaSJeremy L Thompson 
155c9d5affaSJeremy L Thompson   if (index < size) {
156ca735530SJeremy L Thompson     y[index] = beta * y[index];
157ca735530SJeremy L Thompson     y[index] += alpha * x[index];
1585fb68f37SKaren (Ren) Stengel   }
159c9d5affaSJeremy L Thompson }
1605fb68f37SKaren (Ren) Stengel 
1615fb68f37SKaren (Ren) Stengel //------------------------------------------------------------------------------
1625fb68f37SKaren (Ren) Stengel // Compute y = alpha x + beta y on device
1635fb68f37SKaren (Ren) Stengel //------------------------------------------------------------------------------
CeedDeviceAXPBY_Cuda(CeedScalar * y_array,CeedScalar alpha,CeedScalar beta,CeedScalar * x_array,CeedSize length)164b73fa92cSJeremy L Thompson extern "C" int CeedDeviceAXPBY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar beta, CeedScalar *x_array, CeedSize length) {
165ca735530SJeremy L Thompson   const int      block_size = 512;
166ca735530SJeremy L Thompson   const CeedSize vec_size   = length;
167ca735530SJeremy L Thompson   int            grid_size  = vec_size / block_size;
1685fb68f37SKaren (Ren) Stengel 
1699ef22048SJeremy L Thompson   if (block_size * grid_size < vec_size) grid_size += 1;
170ca735530SJeremy L Thompson   axpbyValueK<<<grid_size, block_size>>>(y_array, alpha, beta, x_array, length);
1715fb68f37SKaren (Ren) Stengel   return 0;
1725fb68f37SKaren (Ren) Stengel }
1735fb68f37SKaren (Ren) Stengel 
1745fb68f37SKaren (Ren) Stengel //------------------------------------------------------------------------------
1750d0321e0SJeremy L Thompson // Kernel for pointwise mult
1760d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
pointwiseMultValueK(CeedScalar * __restrict__ w,CeedScalar * x,CeedScalar * __restrict__ y,CeedSize size)177b73fa92cSJeremy L Thompson __global__ static void pointwiseMultValueK(CeedScalar *__restrict__ w, CeedScalar *x, CeedScalar *__restrict__ y, CeedSize size) {
178c9d5affaSJeremy L Thompson   const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
179c9d5affaSJeremy L Thompson 
180c9d5affaSJeremy L Thompson   if (index < size) w[index] = x[index] * y[index];
1810d0321e0SJeremy L Thompson }
1820d0321e0SJeremy L Thompson 
1830d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1840d0321e0SJeremy L Thompson // Compute the pointwise multiplication w = x .* y on device
1850d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
CeedDevicePointwiseMult_Cuda(CeedScalar * w_array,CeedScalar * x_array,CeedScalar * y_array,CeedSize length)186b73fa92cSJeremy L Thompson extern "C" int CeedDevicePointwiseMult_Cuda(CeedScalar *w_array, CeedScalar *x_array, CeedScalar *y_array, CeedSize length) {
187ca735530SJeremy L Thompson   const int      block_size = 512;
188ca735530SJeremy L Thompson   const CeedSize vec_size   = length;
189ca735530SJeremy L Thompson   int            grid_size  = vec_size / block_size;
1900d0321e0SJeremy L Thompson 
1919ef22048SJeremy L Thompson   if (block_size * grid_size < vec_size) grid_size += 1;
192ca735530SJeremy L Thompson   pointwiseMultValueK<<<grid_size, block_size>>>(w_array, x_array, y_array, length);
1930d0321e0SJeremy L Thompson   return 0;
1940d0321e0SJeremy L Thompson }
1952a86cc9dSSebastian Grimberg 
1962a86cc9dSSebastian Grimberg //------------------------------------------------------------------------------
197