xref: /libCEED/backends/cuda-ref/kernels/cuda-ref-vector.cu (revision 9ef220489445f31398d6e42662820ac48a00a591)
1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #include <ceed.h>
9 #include <cuda.h>
10 
//------------------------------------------------------------------------------
// Kernel for set value on device
// Expects a 1D launch; each thread writes at most one entry of vec
//------------------------------------------------------------------------------
__global__ static void setValueK(CeedScalar * __restrict__ vec, CeedSize size,
                                 CeedScalar val) {
  // 64-bit global index; the cast keeps blockDim.x * blockIdx.x from
  // overflowing 32-bit arithmetic on very long vectors
  const CeedSize i = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;

  if (i < size) vec[i] = val;
}
20 
//------------------------------------------------------------------------------
// Set value on device memory
//
// d_array : device pointer to the vector data
// length  : number of entries to set
// val     : value assigned to every entry
// Returns 0 (launch errors are surfaced by the caller's CUDA error checks)
//------------------------------------------------------------------------------
extern "C" int CeedDeviceSetValue_Cuda(CeedScalar* d_array, CeedSize length,
                                       CeedScalar val) {
  const int block_size = 512;

  // A grid dimension of 0 is an invalid launch configuration; skip the launch
  // entirely for an empty vector
  if (length <= 0) return 0;
  // Ceiling division in 64-bit CeedSize arithmetic; the previous
  // `length / block_size` truncated into int and the follow-up
  // `block_size * grid_size` check could overflow int for very long vectors.
  // The CUDA grid x-dimension is int-sized, so narrowing after the 64-bit
  // division is safe for any launchable size.
  const int grid_size = (int)((length + block_size - 1) / block_size);

  setValueK<<<grid_size, block_size>>>(d_array, length, val);
  return 0;
}
34 
//------------------------------------------------------------------------------
// Kernel for taking reciprocal
// Expects a 1D launch; entries with |value| <= 1E-16 are left unchanged to
// avoid division by (near-)zero
//------------------------------------------------------------------------------
__global__ static void rcpValueK(CeedScalar * __restrict__ vec, CeedSize size) {
  // 64-bit global index to stay safe on very long vectors
  const CeedSize i = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;

  if (i >= size) return;
  if (fabs(vec[i]) > 1E-16) vec[i] = 1./vec[i];
}
43 
//------------------------------------------------------------------------------
// Take vector reciprocal in device memory
//
// d_array : device pointer to the vector data
// length  : number of entries
// Returns 0 (launch errors are surfaced by the caller's CUDA error checks)
//------------------------------------------------------------------------------
extern "C" int CeedDeviceReciprocal_Cuda(CeedScalar* d_array, CeedSize length) {
  const int block_size = 512;

  // A grid dimension of 0 is an invalid launch configuration; skip the launch
  // entirely for an empty vector
  if (length <= 0) return 0;
  // Ceiling division in 64-bit CeedSize arithmetic; avoids the int truncation
  // and potential int overflow of the old grid-size round-up check
  const int grid_size = (int)((length + block_size - 1) / block_size);

  rcpValueK<<<grid_size, block_size>>>(d_array, length);
  return 0;
}
56 
//------------------------------------------------------------------------------
// Kernel for scale
// Expects a 1D launch; each thread scales at most one entry of x in place
//------------------------------------------------------------------------------
__global__ static void scaleValueK(CeedScalar * __restrict__ x, CeedScalar alpha,
    CeedSize size) {
  // 64-bit global index to stay safe on very long vectors
  const CeedSize i = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;

  if (i < size) x[i] *= alpha;
}
66 
//------------------------------------------------------------------------------
// Compute x = alpha x on device
//
// x_array : device pointer to the vector data, scaled in place
// alpha   : scaling factor
// length  : number of entries
// Returns 0 (launch errors are surfaced by the caller's CUDA error checks)
//------------------------------------------------------------------------------
extern "C" int CeedDeviceScale_Cuda(CeedScalar *x_array, CeedScalar alpha,
    CeedSize length) {
  const int block_size = 512;

  // A grid dimension of 0 is an invalid launch configuration; skip the launch
  // entirely for an empty vector
  if (length <= 0) return 0;
  // Ceiling division in 64-bit CeedSize arithmetic; avoids the int truncation
  // and potential int overflow of the old grid-size round-up check
  const int grid_size = (int)((length + block_size - 1) / block_size);

  scaleValueK<<<grid_size, block_size>>>(x_array, alpha, length);
  return 0;
}
80 
//------------------------------------------------------------------------------
// Kernel for axpy
// Expects a 1D launch; each thread updates at most one entry of y
//------------------------------------------------------------------------------
__global__ static void axpyValueK(CeedScalar * __restrict__ y, CeedScalar alpha,
    CeedScalar * __restrict__ x, CeedSize size) {
  // 64-bit global index to stay safe on very long vectors
  const CeedSize i = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;

  if (i < size) y[i] += alpha * x[i];
}
90 
//------------------------------------------------------------------------------
// Compute y = alpha x + y on device
//
// y_array : device pointer to the output/accumulator vector
// alpha   : scaling factor applied to x
// x_array : device pointer to the input vector
// length  : number of entries
// Returns 0 (launch errors are surfaced by the caller's CUDA error checks)
//------------------------------------------------------------------------------
extern "C" int CeedDeviceAXPY_Cuda(CeedScalar *y_array, CeedScalar alpha,
    CeedScalar *x_array, CeedSize length) {
  const int block_size = 512;

  // A grid dimension of 0 is an invalid launch configuration; skip the launch
  // entirely for an empty vector
  if (length <= 0) return 0;
  // Ceiling division in 64-bit CeedSize arithmetic; avoids the int truncation
  // and potential int overflow of the old grid-size round-up check
  const int grid_size = (int)((length + block_size - 1) / block_size);

  axpyValueK<<<grid_size, block_size>>>(y_array, alpha, x_array, length);
  return 0;
}
104 
//------------------------------------------------------------------------------
// Kernel for axpby
// Expects a 1D launch; each thread updates at most one entry of y
//------------------------------------------------------------------------------
__global__ static void axpbyValueK(CeedScalar * __restrict__ y, CeedScalar alpha, CeedScalar beta,
    CeedScalar * __restrict__ x, CeedSize size) {
  // 64-bit global index to stay safe on very long vectors
  const CeedSize i = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;

  if (i >= size) return;
  // Same operation order as y = beta y followed by y += alpha x, but
  // accumulated in a register so y is stored once
  CeedScalar value = beta * y[i];
  value += alpha * x[i];
  y[i] = value;
}
115 
//------------------------------------------------------------------------------
// Compute y = alpha x + beta y on device
//
// y_array : device pointer to the output/accumulator vector
// alpha   : scaling factor applied to x
// beta    : scaling factor applied to y
// x_array : device pointer to the input vector
// length  : number of entries
// Returns 0 (launch errors are surfaced by the caller's CUDA error checks)
//------------------------------------------------------------------------------
extern "C" int CeedDeviceAXPBY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar beta,
    CeedScalar *x_array, CeedSize length) {
  const int block_size = 512;

  // A grid dimension of 0 is an invalid launch configuration; skip the launch
  // entirely for an empty vector
  if (length <= 0) return 0;
  // Ceiling division in 64-bit CeedSize arithmetic; avoids the int truncation
  // and potential int overflow of the old grid-size round-up check
  const int grid_size = (int)((length + block_size - 1) / block_size);

  axpbyValueK<<<grid_size, block_size>>>(y_array, alpha, beta, x_array, length);
  return 0;
}
129 
//------------------------------------------------------------------------------
// Kernel for pointwise mult
// Expects a 1D launch; each thread writes at most one entry of w.
// NOTE(review): x is deliberately not __restrict__ — presumably because the
// caller may pass w == x; confirm against the caller before tightening
//------------------------------------------------------------------------------
__global__ static void pointwiseMultValueK(CeedScalar * __restrict__ w,
    CeedScalar * x, CeedScalar * __restrict__ y, CeedSize size) {
  // 64-bit global index to stay safe on very long vectors
  const CeedSize i = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;

  if (i < size) w[i] = x[i] * y[i];
}
139 
//------------------------------------------------------------------------------
// Compute the pointwise multiplication w = x .* y on device
//
// w_array : device pointer to the output vector
// x_array : device pointer to the first input vector
// y_array : device pointer to the second input vector
// length  : number of entries
// Returns 0 (launch errors are surfaced by the caller's CUDA error checks)
//------------------------------------------------------------------------------
extern "C" int CeedDevicePointwiseMult_Cuda(CeedScalar *w_array, CeedScalar *x_array,
    CeedScalar *y_array, CeedSize length) {
  const int block_size = 512;

  // A grid dimension of 0 is an invalid launch configuration; skip the launch
  // entirely for an empty vector
  if (length <= 0) return 0;
  // Ceiling division in 64-bit CeedSize arithmetic; avoids the int truncation
  // and potential int overflow of the old grid-size round-up check
  const int grid_size = (int)((length + block_size - 1) / block_size);

  pointwiseMultValueK<<<grid_size, block_size>>>(w_array, x_array, y_array, length);
  return 0;
}
153 
154 //------------------------------------------------------------------------------
155