xref: /libCEED/backends/cuda-ref/kernels/cuda-ref-vector.cu (revision 9ef220489445f31398d6e42662820ac48a00a591)
15aed82e4SJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
23d8e8822SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
30d0321e0SJeremy L Thompson //
43d8e8822SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause
50d0321e0SJeremy L Thompson //
63d8e8822SJeremy L Thompson // This file is part of CEED:  http://github.com/ceed
70d0321e0SJeremy L Thompson 
849aac155SJeremy L Thompson #include <ceed.h>
90d0321e0SJeremy L Thompson #include <cuda.h>
100d0321e0SJeremy L Thompson 
110d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
120d0321e0SJeremy L Thompson // Kernel for set value on device
130d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
14f7c1b517Snbeams __global__ static void setValueK(CeedScalar * __restrict__ vec, CeedSize size,
150d0321e0SJeremy L Thompson                                  CeedScalar val) {
16ca735530SJeremy L Thompson   CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
17*9ef22048SJeremy L Thompson   if (index >= size) return;
18ca735530SJeremy L Thompson   vec[index] = val;
190d0321e0SJeremy L Thompson }
200d0321e0SJeremy L Thompson 
210d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
220d0321e0SJeremy L Thompson // Set value on device memory
230d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
24f7c1b517Snbeams extern "C" int CeedDeviceSetValue_Cuda(CeedScalar* d_array, CeedSize length,
250d0321e0SJeremy L Thompson                                        CeedScalar val) {
26ca735530SJeremy L Thompson   const int block_size = 512;
27ca735530SJeremy L Thompson   const CeedSize vec_size = length;
28ca735530SJeremy L Thompson   int grid_size = vec_size / block_size;
290d0321e0SJeremy L Thompson 
30*9ef22048SJeremy L Thompson   if (block_size * grid_size < vec_size) grid_size += 1;
31ca735530SJeremy L Thompson   setValueK<<<grid_size,block_size>>>(d_array, length, val);
320d0321e0SJeremy L Thompson   return 0;
330d0321e0SJeremy L Thompson }
340d0321e0SJeremy L Thompson 
350d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
360d0321e0SJeremy L Thompson // Kernel for taking reciprocal
370d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
38f7c1b517Snbeams __global__ static void rcpValueK(CeedScalar * __restrict__ vec, CeedSize size) {
39ca735530SJeremy L Thompson   CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
40*9ef22048SJeremy L Thompson   if (index >= size) return;
41*9ef22048SJeremy L Thompson   if (fabs(vec[index]) > 1E-16) vec[index] = 1./vec[index];
420d0321e0SJeremy L Thompson }
430d0321e0SJeremy L Thompson 
440d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
450d0321e0SJeremy L Thompson // Take vector reciprocal in device memory
460d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
47f7c1b517Snbeams extern "C" int CeedDeviceReciprocal_Cuda(CeedScalar* d_array, CeedSize length) {
48ca735530SJeremy L Thompson   const int block_size = 512;
49ca735530SJeremy L Thompson   const CeedSize vec_size = length;
50ca735530SJeremy L Thompson   int grid_size = vec_size / block_size;
510d0321e0SJeremy L Thompson 
52*9ef22048SJeremy L Thompson   if (block_size * grid_size < vec_size) grid_size += 1;
53ca735530SJeremy L Thompson   rcpValueK<<<grid_size,block_size>>>(d_array, length);
540d0321e0SJeremy L Thompson   return 0;
550d0321e0SJeremy L Thompson }
560d0321e0SJeremy L Thompson 
570d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
580d0321e0SJeremy L Thompson // Kernel for scale
590d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
600d0321e0SJeremy L Thompson __global__ static void scaleValueK(CeedScalar * __restrict__ x, CeedScalar alpha,
61f7c1b517Snbeams     CeedSize size) {
62ca735530SJeremy L Thompson   CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
63*9ef22048SJeremy L Thompson   if (index >= size) return;
64ca735530SJeremy L Thompson   x[index] *= alpha;
650d0321e0SJeremy L Thompson }
660d0321e0SJeremy L Thompson 
670d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
680d0321e0SJeremy L Thompson // Compute x = alpha x on device
690d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
700d0321e0SJeremy L Thompson extern "C" int CeedDeviceScale_Cuda(CeedScalar *x_array, CeedScalar alpha,
71f7c1b517Snbeams     CeedSize length) {
72ca735530SJeremy L Thompson   const int block_size = 512;
73ca735530SJeremy L Thompson   const CeedSize vec_size = length;
74ca735530SJeremy L Thompson   int grid_size = vec_size / block_size;
750d0321e0SJeremy L Thompson 
76*9ef22048SJeremy L Thompson   if (block_size * grid_size < vec_size) grid_size += 1;
77ca735530SJeremy L Thompson   scaleValueK<<<grid_size,block_size>>>(x_array, alpha, length);
780d0321e0SJeremy L Thompson   return 0;
790d0321e0SJeremy L Thompson }
800d0321e0SJeremy L Thompson 
810d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
820d0321e0SJeremy L Thompson // Kernel for axpy
830d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
840d0321e0SJeremy L Thompson __global__ static void axpyValueK(CeedScalar * __restrict__ y, CeedScalar alpha,
85f7c1b517Snbeams     CeedScalar * __restrict__ x, CeedSize size) {
86ca735530SJeremy L Thompson   CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
87*9ef22048SJeremy L Thompson   if (index >= size) return;
88ca735530SJeremy L Thompson   y[index] += alpha * x[index];
890d0321e0SJeremy L Thompson }
900d0321e0SJeremy L Thompson 
910d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
920d0321e0SJeremy L Thompson // Compute y = alpha x + y on device
930d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
940d0321e0SJeremy L Thompson extern "C" int CeedDeviceAXPY_Cuda(CeedScalar *y_array, CeedScalar alpha,
95f7c1b517Snbeams     CeedScalar *x_array, CeedSize length) {
96ca735530SJeremy L Thompson   const int block_size = 512;
97ca735530SJeremy L Thompson   const CeedSize vec_size = length;
98ca735530SJeremy L Thompson   int grid_size = vec_size / block_size;
990d0321e0SJeremy L Thompson 
100*9ef22048SJeremy L Thompson   if (block_size * grid_size < vec_size) grid_size += 1;
101ca735530SJeremy L Thompson   axpyValueK<<<grid_size,block_size>>>(y_array, alpha, x_array, length);
1020d0321e0SJeremy L Thompson   return 0;
1030d0321e0SJeremy L Thompson }
1040d0321e0SJeremy L Thompson 
1050d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1065fb68f37SKaren (Ren) Stengel // Kernel for axpby
1075fb68f37SKaren (Ren) Stengel //------------------------------------------------------------------------------
1085fb68f37SKaren (Ren) Stengel __global__ static void axpbyValueK(CeedScalar * __restrict__ y, CeedScalar alpha, CeedScalar beta,
109f7c1b517Snbeams     CeedScalar * __restrict__ x, CeedSize size) {
110ca735530SJeremy L Thompson   CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
111*9ef22048SJeremy L Thompson   if (index >= size) return;
112ca735530SJeremy L Thompson   y[index] = beta * y[index];
113ca735530SJeremy L Thompson   y[index] += alpha * x[index];
1145fb68f37SKaren (Ren) Stengel }
1155fb68f37SKaren (Ren) Stengel 
1165fb68f37SKaren (Ren) Stengel //------------------------------------------------------------------------------
1175fb68f37SKaren (Ren) Stengel // Compute y = alpha x + beta y on device
1185fb68f37SKaren (Ren) Stengel //------------------------------------------------------------------------------
1195fb68f37SKaren (Ren) Stengel extern "C" int CeedDeviceAXPBY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar beta,
120f7c1b517Snbeams     CeedScalar *x_array, CeedSize length) {
121ca735530SJeremy L Thompson   const int block_size = 512;
122ca735530SJeremy L Thompson   const CeedSize vec_size = length;
123ca735530SJeremy L Thompson   int grid_size = vec_size / block_size;
1245fb68f37SKaren (Ren) Stengel 
125*9ef22048SJeremy L Thompson   if (block_size * grid_size < vec_size) grid_size += 1;
126ca735530SJeremy L Thompson   axpbyValueK<<<grid_size,block_size>>>(y_array, alpha, beta, x_array, length);
1275fb68f37SKaren (Ren) Stengel   return 0;
1285fb68f37SKaren (Ren) Stengel }
1295fb68f37SKaren (Ren) Stengel 
1305fb68f37SKaren (Ren) Stengel //------------------------------------------------------------------------------
1310d0321e0SJeremy L Thompson // Kernel for pointwise mult
1320d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1330d0321e0SJeremy L Thompson __global__ static void pointwiseMultValueK(CeedScalar * __restrict__ w,
134f7c1b517Snbeams     CeedScalar * x, CeedScalar * __restrict__ y, CeedSize size) {
135ca735530SJeremy L Thompson   CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
136*9ef22048SJeremy L Thompson   if (index >= size) return;
137ca735530SJeremy L Thompson   w[index] = x[index] * y[index];
1380d0321e0SJeremy L Thompson }
1390d0321e0SJeremy L Thompson 
1400d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1410d0321e0SJeremy L Thompson // Compute the pointwise multiplication w = x .* y on device
1420d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1430d0321e0SJeremy L Thompson extern "C" int CeedDevicePointwiseMult_Cuda(CeedScalar *w_array, CeedScalar *x_array,
144f7c1b517Snbeams     CeedScalar *y_array, CeedSize length) {
145ca735530SJeremy L Thompson   const int block_size = 512;
146ca735530SJeremy L Thompson   const CeedSize vec_size = length;
147ca735530SJeremy L Thompson   int grid_size = vec_size / block_size;
1480d0321e0SJeremy L Thompson 
149*9ef22048SJeremy L Thompson   if (block_size * grid_size < vec_size) grid_size += 1;
150ca735530SJeremy L Thompson   pointwiseMultValueK<<<grid_size,block_size>>>(w_array, x_array, y_array, length);
1510d0321e0SJeremy L Thompson   return 0;
1520d0321e0SJeremy L Thompson }
1532a86cc9dSSebastian Grimberg 
1542a86cc9dSSebastian Grimberg //------------------------------------------------------------------------------
155