xref: /libCEED/backends/cuda-ref/kernels/cuda-ref-vector.cu (revision 5aed82e4fa97acf4ba24a7f10a35f5303a6798e0)
1*5aed82e4SJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
23d8e8822SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
30d0321e0SJeremy L Thompson //
43d8e8822SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause
50d0321e0SJeremy L Thompson //
63d8e8822SJeremy L Thompson // This file is part of CEED:  http://github.com/ceed
70d0321e0SJeremy L Thompson 
849aac155SJeremy L Thompson #include <ceed.h>
90d0321e0SJeremy L Thompson #include <cuda.h>
100d0321e0SJeremy L Thompson 
110d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
120d0321e0SJeremy L Thompson // Kernel for set value on device
130d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
14f7c1b517Snbeams __global__ static void setValueK(CeedScalar * __restrict__ vec, CeedSize size,
150d0321e0SJeremy L Thompson                                  CeedScalar val) {
16ca735530SJeremy L Thompson   CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
17ca735530SJeremy L Thompson   if (index >= size)
180d0321e0SJeremy L Thompson     return;
19ca735530SJeremy L Thompson   vec[index] = val;
200d0321e0SJeremy L Thompson }
210d0321e0SJeremy L Thompson 
220d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
230d0321e0SJeremy L Thompson // Set value on device memory
240d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
25f7c1b517Snbeams extern "C" int CeedDeviceSetValue_Cuda(CeedScalar* d_array, CeedSize length,
260d0321e0SJeremy L Thompson                                        CeedScalar val) {
27ca735530SJeremy L Thompson   const int block_size = 512;
28ca735530SJeremy L Thompson   const CeedSize vec_size = length;
29ca735530SJeremy L Thompson   int grid_size = vec_size / block_size;
300d0321e0SJeremy L Thompson 
31ca735530SJeremy L Thompson   if (block_size * grid_size < vec_size)
32ca735530SJeremy L Thompson     grid_size += 1;
33ca735530SJeremy L Thompson   setValueK<<<grid_size,block_size>>>(d_array, length, val);
340d0321e0SJeremy L Thompson   return 0;
350d0321e0SJeremy L Thompson }
360d0321e0SJeremy L Thompson 
370d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
380d0321e0SJeremy L Thompson // Kernel for taking reciprocal
390d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
40f7c1b517Snbeams __global__ static void rcpValueK(CeedScalar * __restrict__ vec, CeedSize size) {
41ca735530SJeremy L Thompson   CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
42ca735530SJeremy L Thompson   if (index >= size)
430d0321e0SJeremy L Thompson     return;
44ca735530SJeremy L Thompson   if (fabs(vec[index]) > 1E-16)
45ca735530SJeremy L Thompson     vec[index] = 1./vec[index];
460d0321e0SJeremy L Thompson }
470d0321e0SJeremy L Thompson 
480d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
490d0321e0SJeremy L Thompson // Take vector reciprocal in device memory
500d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
51f7c1b517Snbeams extern "C" int CeedDeviceReciprocal_Cuda(CeedScalar* d_array, CeedSize length) {
52ca735530SJeremy L Thompson   const int block_size = 512;
53ca735530SJeremy L Thompson   const CeedSize vec_size = length;
54ca735530SJeremy L Thompson   int grid_size = vec_size / block_size;
550d0321e0SJeremy L Thompson 
56ca735530SJeremy L Thompson   if (block_size * grid_size < vec_size)
57ca735530SJeremy L Thompson     grid_size += 1;
58ca735530SJeremy L Thompson   rcpValueK<<<grid_size,block_size>>>(d_array, length);
590d0321e0SJeremy L Thompson   return 0;
600d0321e0SJeremy L Thompson }
610d0321e0SJeremy L Thompson 
620d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
630d0321e0SJeremy L Thompson // Kernel for scale
640d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
650d0321e0SJeremy L Thompson __global__ static void scaleValueK(CeedScalar * __restrict__ x, CeedScalar alpha,
66f7c1b517Snbeams     CeedSize size) {
67ca735530SJeremy L Thompson   CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
68ca735530SJeremy L Thompson   if (index >= size)
690d0321e0SJeremy L Thompson     return;
70ca735530SJeremy L Thompson   x[index] *= alpha;
710d0321e0SJeremy L Thompson }
720d0321e0SJeremy L Thompson 
730d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
740d0321e0SJeremy L Thompson // Compute x = alpha x on device
750d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
760d0321e0SJeremy L Thompson extern "C" int CeedDeviceScale_Cuda(CeedScalar *x_array, CeedScalar alpha,
77f7c1b517Snbeams     CeedSize length) {
78ca735530SJeremy L Thompson   const int block_size = 512;
79ca735530SJeremy L Thompson   const CeedSize vec_size = length;
80ca735530SJeremy L Thompson   int grid_size = vec_size / block_size;
810d0321e0SJeremy L Thompson 
82ca735530SJeremy L Thompson   if (block_size * grid_size < vec_size)
83ca735530SJeremy L Thompson     grid_size += 1;
84ca735530SJeremy L Thompson   scaleValueK<<<grid_size,block_size>>>(x_array, alpha, length);
850d0321e0SJeremy L Thompson   return 0;
860d0321e0SJeremy L Thompson }
870d0321e0SJeremy L Thompson 
880d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
890d0321e0SJeremy L Thompson // Kernel for axpy
900d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
910d0321e0SJeremy L Thompson __global__ static void axpyValueK(CeedScalar * __restrict__ y, CeedScalar alpha,
92f7c1b517Snbeams     CeedScalar * __restrict__ x, CeedSize size) {
93ca735530SJeremy L Thompson   CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
94ca735530SJeremy L Thompson   if (index >= size)
950d0321e0SJeremy L Thompson     return;
96ca735530SJeremy L Thompson   y[index] += alpha * x[index];
970d0321e0SJeremy L Thompson }
980d0321e0SJeremy L Thompson 
990d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1000d0321e0SJeremy L Thompson // Compute y = alpha x + y on device
1010d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1020d0321e0SJeremy L Thompson extern "C" int CeedDeviceAXPY_Cuda(CeedScalar *y_array, CeedScalar alpha,
103f7c1b517Snbeams     CeedScalar *x_array, CeedSize length) {
104ca735530SJeremy L Thompson   const int block_size = 512;
105ca735530SJeremy L Thompson   const CeedSize vec_size = length;
106ca735530SJeremy L Thompson   int grid_size = vec_size / block_size;
1070d0321e0SJeremy L Thompson 
108ca735530SJeremy L Thompson   if (block_size * grid_size < vec_size)
109ca735530SJeremy L Thompson     grid_size += 1;
110ca735530SJeremy L Thompson   axpyValueK<<<grid_size,block_size>>>(y_array, alpha, x_array, length);
1110d0321e0SJeremy L Thompson   return 0;
1120d0321e0SJeremy L Thompson }
1130d0321e0SJeremy L Thompson 
1140d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1155fb68f37SKaren (Ren) Stengel // Kernel for axpby
1165fb68f37SKaren (Ren) Stengel //------------------------------------------------------------------------------
1175fb68f37SKaren (Ren) Stengel __global__ static void axpbyValueK(CeedScalar * __restrict__ y, CeedScalar alpha, CeedScalar beta,
118f7c1b517Snbeams     CeedScalar * __restrict__ x, CeedSize size) {
119ca735530SJeremy L Thompson   CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
120ca735530SJeremy L Thompson   if (index >= size)
1215fb68f37SKaren (Ren) Stengel     return;
122ca735530SJeremy L Thompson   y[index] = beta * y[index];
123ca735530SJeremy L Thompson   y[index] += alpha * x[index];
1245fb68f37SKaren (Ren) Stengel }
1255fb68f37SKaren (Ren) Stengel 
1265fb68f37SKaren (Ren) Stengel //------------------------------------------------------------------------------
1275fb68f37SKaren (Ren) Stengel // Compute y = alpha x + beta y on device
1285fb68f37SKaren (Ren) Stengel //------------------------------------------------------------------------------
1295fb68f37SKaren (Ren) Stengel extern "C" int CeedDeviceAXPBY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar beta,
130f7c1b517Snbeams     CeedScalar *x_array, CeedSize length) {
131ca735530SJeremy L Thompson   const int block_size = 512;
132ca735530SJeremy L Thompson   const CeedSize vec_size = length;
133ca735530SJeremy L Thompson   int grid_size = vec_size / block_size;
1345fb68f37SKaren (Ren) Stengel 
135ca735530SJeremy L Thompson   if (block_size * grid_size < vec_size)
136ca735530SJeremy L Thompson     grid_size += 1;
137ca735530SJeremy L Thompson   axpbyValueK<<<grid_size,block_size>>>(y_array, alpha, beta, x_array, length);
1385fb68f37SKaren (Ren) Stengel   return 0;
1395fb68f37SKaren (Ren) Stengel }
1405fb68f37SKaren (Ren) Stengel 
1415fb68f37SKaren (Ren) Stengel //------------------------------------------------------------------------------
1420d0321e0SJeremy L Thompson // Kernel for pointwise mult
1430d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1440d0321e0SJeremy L Thompson __global__ static void pointwiseMultValueK(CeedScalar * __restrict__ w,
145f7c1b517Snbeams     CeedScalar * x, CeedScalar * __restrict__ y, CeedSize size) {
146ca735530SJeremy L Thompson   CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
147ca735530SJeremy L Thompson   if (index >= size)
1480d0321e0SJeremy L Thompson     return;
149ca735530SJeremy L Thompson   w[index] = x[index] * y[index];
1500d0321e0SJeremy L Thompson }
1510d0321e0SJeremy L Thompson 
1520d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1530d0321e0SJeremy L Thompson // Compute the pointwise multiplication w = x .* y on device
1540d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1550d0321e0SJeremy L Thompson extern "C" int CeedDevicePointwiseMult_Cuda(CeedScalar *w_array, CeedScalar *x_array,
156f7c1b517Snbeams     CeedScalar *y_array, CeedSize length) {
157ca735530SJeremy L Thompson   const int block_size = 512;
158ca735530SJeremy L Thompson   const CeedSize vec_size = length;
159ca735530SJeremy L Thompson   int grid_size = vec_size / block_size;
1600d0321e0SJeremy L Thompson 
161ca735530SJeremy L Thompson   if (block_size * grid_size < vec_size)
162ca735530SJeremy L Thompson     grid_size += 1;
163ca735530SJeremy L Thompson   pointwiseMultValueK<<<grid_size,block_size>>>(w_array, x_array, y_array, length);
1640d0321e0SJeremy L Thompson   return 0;
1650d0321e0SJeremy L Thompson }
1662a86cc9dSSebastian Grimberg 
1672a86cc9dSSebastian Grimberg //------------------------------------------------------------------------------
168