xref: /libCEED/backends/cuda-ref/kernels/cuda-ref-vector.cu (revision b73fa92ca232cd7a1379d6ffb54e013a90896973)
15aed82e4SJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
23d8e8822SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
30d0321e0SJeremy L Thompson //
43d8e8822SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause
50d0321e0SJeremy L Thompson //
63d8e8822SJeremy L Thompson // This file is part of CEED:  http://github.com/ceed
70d0321e0SJeremy L Thompson 
849aac155SJeremy L Thompson #include <ceed.h>
90d0321e0SJeremy L Thompson #include <cuda.h>
100d0321e0SJeremy L Thompson 
110d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
12f1c2287bSJeremy L Thompson // Kernel for copy strided on device
13f1c2287bSJeremy L Thompson //------------------------------------------------------------------------------
14*b73fa92cSJeremy L Thompson __global__ static void copyStridedK(CeedScalar *__restrict__ vec, CeedSize start, CeedSize step, CeedSize size, CeedScalar *__restrict__ vec_copy) {
15f1c2287bSJeremy L Thompson   CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
16f1c2287bSJeremy L Thompson   if (index >= size) return;
17f1c2287bSJeremy L Thompson   if ((index - start) % step == 0) vec_copy[index] = vec[index];
18f1c2287bSJeremy L Thompson }
19f1c2287bSJeremy L Thompson 
20f1c2287bSJeremy L Thompson //------------------------------------------------------------------------------
21f1c2287bSJeremy L Thompson // Copy strided on device memory
22f1c2287bSJeremy L Thompson //------------------------------------------------------------------------------
23*b73fa92cSJeremy L Thompson extern "C" int CeedDeviceCopyStrided_Cuda(CeedScalar *d_array, CeedSize start, CeedSize step, CeedSize length, CeedScalar *d_copy_array) {
24f1c2287bSJeremy L Thompson   const int      block_size = 512;
25f1c2287bSJeremy L Thompson   const CeedSize vec_size   = length;
26f1c2287bSJeremy L Thompson   int            grid_size  = vec_size / block_size;
27f1c2287bSJeremy L Thompson 
28f1c2287bSJeremy L Thompson   if (block_size * grid_size < vec_size) grid_size += 1;
29f1c2287bSJeremy L Thompson   copyStridedK<<<grid_size, block_size>>>(d_array, start, step, length, d_copy_array);
30f1c2287bSJeremy L Thompson   return 0;
31f1c2287bSJeremy L Thompson }
32f1c2287bSJeremy L Thompson 
33f1c2287bSJeremy L Thompson //------------------------------------------------------------------------------
340d0321e0SJeremy L Thompson // Kernel for set value on device
350d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
36*b73fa92cSJeremy L Thompson __global__ static void setValueK(CeedScalar *__restrict__ vec, CeedSize size, CeedScalar val) {
37ca735530SJeremy L Thompson   CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
389ef22048SJeremy L Thompson   if (index >= size) return;
39ca735530SJeremy L Thompson   vec[index] = val;
400d0321e0SJeremy L Thompson }
410d0321e0SJeremy L Thompson 
420d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
430d0321e0SJeremy L Thompson // Set value on device memory
440d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
45*b73fa92cSJeremy L Thompson extern "C" int CeedDeviceSetValue_Cuda(CeedScalar *d_array, CeedSize length, CeedScalar val) {
46ca735530SJeremy L Thompson   const int      block_size = 512;
47ca735530SJeremy L Thompson   const CeedSize vec_size   = length;
48ca735530SJeremy L Thompson   int            grid_size  = vec_size / block_size;
490d0321e0SJeremy L Thompson 
509ef22048SJeremy L Thompson   if (block_size * grid_size < vec_size) grid_size += 1;
51ca735530SJeremy L Thompson   setValueK<<<grid_size, block_size>>>(d_array, length, val);
520d0321e0SJeremy L Thompson   return 0;
530d0321e0SJeremy L Thompson }
540d0321e0SJeremy L Thompson 
550d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
56f1c2287bSJeremy L Thompson // Kernel for set value strided on device
57f1c2287bSJeremy L Thompson //------------------------------------------------------------------------------
58*b73fa92cSJeremy L Thompson __global__ static void setValueStridedK(CeedScalar *__restrict__ vec, CeedSize start, CeedSize step, CeedSize size, CeedScalar val) {
59f1c2287bSJeremy L Thompson   CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
60f1c2287bSJeremy L Thompson   if (index >= size) return;
61f1c2287bSJeremy L Thompson   if ((index - start) % step == 0) vec[index] = val;
62f1c2287bSJeremy L Thompson }
63f1c2287bSJeremy L Thompson 
64f1c2287bSJeremy L Thompson //------------------------------------------------------------------------------
65f1c2287bSJeremy L Thompson // Set value strided on device memory
66f1c2287bSJeremy L Thompson //------------------------------------------------------------------------------
67*b73fa92cSJeremy L Thompson extern "C" int CeedDeviceSetValueStrided_Cuda(CeedScalar *d_array, CeedSize start, CeedSize step, CeedSize length, CeedScalar val) {
68f1c2287bSJeremy L Thompson   const int      block_size = 512;
69f1c2287bSJeremy L Thompson   const CeedSize vec_size   = length;
70f1c2287bSJeremy L Thompson   int            grid_size  = vec_size / block_size;
71f1c2287bSJeremy L Thompson 
72f1c2287bSJeremy L Thompson   if (block_size * grid_size < vec_size) grid_size += 1;
73f1c2287bSJeremy L Thompson   setValueStridedK<<<grid_size, block_size>>>(d_array, start, step, length, val);
74f1c2287bSJeremy L Thompson   return 0;
75f1c2287bSJeremy L Thompson }
76f1c2287bSJeremy L Thompson 
77f1c2287bSJeremy L Thompson //------------------------------------------------------------------------------
780d0321e0SJeremy L Thompson // Kernel for taking reciprocal
790d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
80f7c1b517Snbeams __global__ static void rcpValueK(CeedScalar *__restrict__ vec, CeedSize size) {
81ca735530SJeremy L Thompson   CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
829ef22048SJeremy L Thompson   if (index >= size) return;
839ef22048SJeremy L Thompson   if (fabs(vec[index]) > 1E-16) vec[index] = 1. / vec[index];
840d0321e0SJeremy L Thompson }
850d0321e0SJeremy L Thompson 
860d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
870d0321e0SJeremy L Thompson // Take vector reciprocal in device memory
880d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
89f7c1b517Snbeams extern "C" int CeedDeviceReciprocal_Cuda(CeedScalar *d_array, CeedSize length) {
90ca735530SJeremy L Thompson   const int      block_size = 512;
91ca735530SJeremy L Thompson   const CeedSize vec_size   = length;
92ca735530SJeremy L Thompson   int            grid_size  = vec_size / block_size;
930d0321e0SJeremy L Thompson 
949ef22048SJeremy L Thompson   if (block_size * grid_size < vec_size) grid_size += 1;
95ca735530SJeremy L Thompson   rcpValueK<<<grid_size, block_size>>>(d_array, length);
960d0321e0SJeremy L Thompson   return 0;
970d0321e0SJeremy L Thompson }
980d0321e0SJeremy L Thompson 
990d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1000d0321e0SJeremy L Thompson // Kernel for scale
1010d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
102*b73fa92cSJeremy L Thompson __global__ static void scaleValueK(CeedScalar *__restrict__ x, CeedScalar alpha, CeedSize size) {
103ca735530SJeremy L Thompson   CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
1049ef22048SJeremy L Thompson   if (index >= size) return;
105ca735530SJeremy L Thompson   x[index] *= alpha;
1060d0321e0SJeremy L Thompson }
1070d0321e0SJeremy L Thompson 
1080d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1090d0321e0SJeremy L Thompson // Compute x = alpha x on device
1100d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
111*b73fa92cSJeremy L Thompson extern "C" int CeedDeviceScale_Cuda(CeedScalar *x_array, CeedScalar alpha, CeedSize length) {
112ca735530SJeremy L Thompson   const int      block_size = 512;
113ca735530SJeremy L Thompson   const CeedSize vec_size   = length;
114ca735530SJeremy L Thompson   int            grid_size  = vec_size / block_size;
1150d0321e0SJeremy L Thompson 
1169ef22048SJeremy L Thompson   if (block_size * grid_size < vec_size) grid_size += 1;
117ca735530SJeremy L Thompson   scaleValueK<<<grid_size, block_size>>>(x_array, alpha, length);
1180d0321e0SJeremy L Thompson   return 0;
1190d0321e0SJeremy L Thompson }
1200d0321e0SJeremy L Thompson 
1210d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1220d0321e0SJeremy L Thompson // Kernel for axpy
1230d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
124*b73fa92cSJeremy L Thompson __global__ static void axpyValueK(CeedScalar *__restrict__ y, CeedScalar alpha, CeedScalar *__restrict__ x, CeedSize size) {
125ca735530SJeremy L Thompson   CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
1269ef22048SJeremy L Thompson   if (index >= size) return;
127ca735530SJeremy L Thompson   y[index] += alpha * x[index];
1280d0321e0SJeremy L Thompson }
1290d0321e0SJeremy L Thompson 
1300d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1310d0321e0SJeremy L Thompson // Compute y = alpha x + y on device
1320d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
133*b73fa92cSJeremy L Thompson extern "C" int CeedDeviceAXPY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_array, CeedSize length) {
134ca735530SJeremy L Thompson   const int      block_size = 512;
135ca735530SJeremy L Thompson   const CeedSize vec_size   = length;
136ca735530SJeremy L Thompson   int            grid_size  = vec_size / block_size;
1370d0321e0SJeremy L Thompson 
1389ef22048SJeremy L Thompson   if (block_size * grid_size < vec_size) grid_size += 1;
139ca735530SJeremy L Thompson   axpyValueK<<<grid_size, block_size>>>(y_array, alpha, x_array, length);
1400d0321e0SJeremy L Thompson   return 0;
1410d0321e0SJeremy L Thompson }
1420d0321e0SJeremy L Thompson 
1430d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1445fb68f37SKaren (Ren) Stengel // Kernel for axpby
1455fb68f37SKaren (Ren) Stengel //------------------------------------------------------------------------------
146*b73fa92cSJeremy L Thompson __global__ static void axpbyValueK(CeedScalar *__restrict__ y, CeedScalar alpha, CeedScalar beta, CeedScalar *__restrict__ x, CeedSize size) {
147ca735530SJeremy L Thompson   CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
1489ef22048SJeremy L Thompson   if (index >= size) return;
149ca735530SJeremy L Thompson   y[index] = beta * y[index];
150ca735530SJeremy L Thompson   y[index] += alpha * x[index];
1515fb68f37SKaren (Ren) Stengel }
1525fb68f37SKaren (Ren) Stengel 
1535fb68f37SKaren (Ren) Stengel //------------------------------------------------------------------------------
1545fb68f37SKaren (Ren) Stengel // Compute y = alpha x + beta y on device
1555fb68f37SKaren (Ren) Stengel //------------------------------------------------------------------------------
156*b73fa92cSJeremy L Thompson extern "C" int CeedDeviceAXPBY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar beta, CeedScalar *x_array, CeedSize length) {
157ca735530SJeremy L Thompson   const int      block_size = 512;
158ca735530SJeremy L Thompson   const CeedSize vec_size   = length;
159ca735530SJeremy L Thompson   int            grid_size  = vec_size / block_size;
1605fb68f37SKaren (Ren) Stengel 
1619ef22048SJeremy L Thompson   if (block_size * grid_size < vec_size) grid_size += 1;
162ca735530SJeremy L Thompson   axpbyValueK<<<grid_size, block_size>>>(y_array, alpha, beta, x_array, length);
1635fb68f37SKaren (Ren) Stengel   return 0;
1645fb68f37SKaren (Ren) Stengel }
1655fb68f37SKaren (Ren) Stengel 
1665fb68f37SKaren (Ren) Stengel //------------------------------------------------------------------------------
1670d0321e0SJeremy L Thompson // Kernel for pointwise mult
1680d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
169*b73fa92cSJeremy L Thompson __global__ static void pointwiseMultValueK(CeedScalar *__restrict__ w, CeedScalar *x, CeedScalar *__restrict__ y, CeedSize size) {
170ca735530SJeremy L Thompson   CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
1719ef22048SJeremy L Thompson   if (index >= size) return;
172ca735530SJeremy L Thompson   w[index] = x[index] * y[index];
1730d0321e0SJeremy L Thompson }
1740d0321e0SJeremy L Thompson 
1750d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1760d0321e0SJeremy L Thompson // Compute the pointwise multiplication w = x .* y on device
1770d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
178*b73fa92cSJeremy L Thompson extern "C" int CeedDevicePointwiseMult_Cuda(CeedScalar *w_array, CeedScalar *x_array, CeedScalar *y_array, CeedSize length) {
179ca735530SJeremy L Thompson   const int      block_size = 512;
180ca735530SJeremy L Thompson   const CeedSize vec_size   = length;
181ca735530SJeremy L Thompson   int            grid_size  = vec_size / block_size;
1820d0321e0SJeremy L Thompson 
1839ef22048SJeremy L Thompson   if (block_size * grid_size < vec_size) grid_size += 1;
184ca735530SJeremy L Thompson   pointwiseMultValueK<<<grid_size, block_size>>>(w_array, x_array, y_array, length);
1850d0321e0SJeremy L Thompson   return 0;
1860d0321e0SJeremy L Thompson }
1872a86cc9dSSebastian Grimberg 
1882a86cc9dSSebastian Grimberg //------------------------------------------------------------------------------
189