xref: /libCEED/backends/cuda-ref/kernels/cuda-ref-vector.cu (revision 3d8e882215d238700cdceb37404f76ca7fa24eaa) !
1*3d8e8822SJeremy L Thompson // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2*3d8e8822SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
30d0321e0SJeremy L Thompson //
4*3d8e8822SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause
50d0321e0SJeremy L Thompson //
6*3d8e8822SJeremy L Thompson // This file is part of CEED:  http://github.com/ceed
70d0321e0SJeremy L Thompson 
80d0321e0SJeremy L Thompson #include <ceed/ceed.h>
90d0321e0SJeremy L Thompson #include <cuda.h>
100d0321e0SJeremy L Thompson 
110d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
120d0321e0SJeremy L Thompson // Kernel for set value on device
130d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
140d0321e0SJeremy L Thompson __global__ static void setValueK(CeedScalar * __restrict__ vec, CeedInt size,
150d0321e0SJeremy L Thompson                                  CeedScalar val) {
160d0321e0SJeremy L Thompson   int idx = threadIdx.x + blockDim.x * blockIdx.x;
170d0321e0SJeremy L Thompson   if (idx >= size)
180d0321e0SJeremy L Thompson     return;
190d0321e0SJeremy L Thompson   vec[idx] = val;
200d0321e0SJeremy L Thompson }
210d0321e0SJeremy L Thompson 
220d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
230d0321e0SJeremy L Thompson // Set value on device memory
240d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
250d0321e0SJeremy L Thompson extern "C" int CeedDeviceSetValue_Cuda(CeedScalar* d_array, CeedInt length,
260d0321e0SJeremy L Thompson                                        CeedScalar val) {
270d0321e0SJeremy L Thompson   const int bsize = 512;
280d0321e0SJeremy L Thompson   const int vecsize = length;
290d0321e0SJeremy L Thompson   int gridsize = vecsize / bsize;
300d0321e0SJeremy L Thompson 
310d0321e0SJeremy L Thompson   if (bsize * gridsize < vecsize)
320d0321e0SJeremy L Thompson     gridsize += 1;
330d0321e0SJeremy L Thompson   setValueK<<<gridsize,bsize>>>(d_array, length, val);
340d0321e0SJeremy L Thompson   return 0;
350d0321e0SJeremy L Thompson }
360d0321e0SJeremy L Thompson 
370d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
380d0321e0SJeremy L Thompson // Kernel for taking reciprocal
390d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
400d0321e0SJeremy L Thompson __global__ static void rcpValueK(CeedScalar * __restrict__ vec, CeedInt size) {
410d0321e0SJeremy L Thompson   int idx = threadIdx.x + blockDim.x * blockIdx.x;
420d0321e0SJeremy L Thompson   if (idx >= size)
430d0321e0SJeremy L Thompson     return;
440d0321e0SJeremy L Thompson   if (fabs(vec[idx]) > 1E-16)
450d0321e0SJeremy L Thompson     vec[idx] = 1./vec[idx];
460d0321e0SJeremy L Thompson }
470d0321e0SJeremy L Thompson 
480d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
490d0321e0SJeremy L Thompson // Take vector reciprocal in device memory
500d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
510d0321e0SJeremy L Thompson extern "C" int CeedDeviceReciprocal_Cuda(CeedScalar* d_array, CeedInt length) {
520d0321e0SJeremy L Thompson   const int bsize = 512;
530d0321e0SJeremy L Thompson   const int vecsize = length;
540d0321e0SJeremy L Thompson   int gridsize = vecsize / bsize;
550d0321e0SJeremy L Thompson 
560d0321e0SJeremy L Thompson   if (bsize * gridsize < vecsize)
570d0321e0SJeremy L Thompson     gridsize += 1;
580d0321e0SJeremy L Thompson   rcpValueK<<<gridsize,bsize>>>(d_array, length);
590d0321e0SJeremy L Thompson   return 0;
600d0321e0SJeremy L Thompson }
610d0321e0SJeremy L Thompson 
620d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
630d0321e0SJeremy L Thompson // Kernel for scale
640d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
650d0321e0SJeremy L Thompson __global__ static void scaleValueK(CeedScalar * __restrict__ x, CeedScalar alpha,
660d0321e0SJeremy L Thompson     CeedInt size) {
670d0321e0SJeremy L Thompson   int idx = threadIdx.x + blockDim.x * blockIdx.x;
680d0321e0SJeremy L Thompson   if (idx >= size)
690d0321e0SJeremy L Thompson     return;
700d0321e0SJeremy L Thompson   x[idx] *= alpha;
710d0321e0SJeremy L Thompson }
720d0321e0SJeremy L Thompson 
730d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
740d0321e0SJeremy L Thompson // Compute x = alpha x on device
750d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
760d0321e0SJeremy L Thompson extern "C" int CeedDeviceScale_Cuda(CeedScalar *x_array, CeedScalar alpha,
770d0321e0SJeremy L Thompson     CeedInt length) {
780d0321e0SJeremy L Thompson   const int bsize = 512;
790d0321e0SJeremy L Thompson   const int vecsize = length;
800d0321e0SJeremy L Thompson   int gridsize = vecsize / bsize;
810d0321e0SJeremy L Thompson 
820d0321e0SJeremy L Thompson   if (bsize * gridsize < vecsize)
830d0321e0SJeremy L Thompson     gridsize += 1;
840d0321e0SJeremy L Thompson   scaleValueK<<<gridsize,bsize>>>(x_array, alpha, length);
850d0321e0SJeremy L Thompson   return 0;
860d0321e0SJeremy L Thompson }
870d0321e0SJeremy L Thompson 
880d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
890d0321e0SJeremy L Thompson // Kernel for axpy
900d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
910d0321e0SJeremy L Thompson __global__ static void axpyValueK(CeedScalar * __restrict__ y, CeedScalar alpha,
920d0321e0SJeremy L Thompson     CeedScalar * __restrict__ x, CeedInt size) {
930d0321e0SJeremy L Thompson   int idx = threadIdx.x + blockDim.x * blockIdx.x;
940d0321e0SJeremy L Thompson   if (idx >= size)
950d0321e0SJeremy L Thompson     return;
960d0321e0SJeremy L Thompson   y[idx] += alpha * x[idx];
970d0321e0SJeremy L Thompson }
980d0321e0SJeremy L Thompson 
990d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1000d0321e0SJeremy L Thompson // Compute y = alpha x + y on device
1010d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1020d0321e0SJeremy L Thompson extern "C" int CeedDeviceAXPY_Cuda(CeedScalar *y_array, CeedScalar alpha,
1030d0321e0SJeremy L Thompson     CeedScalar *x_array, CeedInt length) {
1040d0321e0SJeremy L Thompson   const int bsize = 512;
1050d0321e0SJeremy L Thompson   const int vecsize = length;
1060d0321e0SJeremy L Thompson   int gridsize = vecsize / bsize;
1070d0321e0SJeremy L Thompson 
1080d0321e0SJeremy L Thompson   if (bsize * gridsize < vecsize)
1090d0321e0SJeremy L Thompson     gridsize += 1;
1100d0321e0SJeremy L Thompson   axpyValueK<<<gridsize,bsize>>>(y_array, alpha, x_array, length);
1110d0321e0SJeremy L Thompson   return 0;
1120d0321e0SJeremy L Thompson }
1130d0321e0SJeremy L Thompson 
1140d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1150d0321e0SJeremy L Thompson // Kernel for pointwise mult
1160d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1170d0321e0SJeremy L Thompson __global__ static void pointwiseMultValueK(CeedScalar * __restrict__ w,
1180d0321e0SJeremy L Thompson     CeedScalar * x, CeedScalar * __restrict__ y, CeedInt size) {
1190d0321e0SJeremy L Thompson   int idx = threadIdx.x + blockDim.x * blockIdx.x;
1200d0321e0SJeremy L Thompson   if (idx >= size)
1210d0321e0SJeremy L Thompson     return;
1220d0321e0SJeremy L Thompson   w[idx] = x[idx] * y[idx];
1230d0321e0SJeremy L Thompson }
1240d0321e0SJeremy L Thompson 
1250d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1260d0321e0SJeremy L Thompson // Compute the pointwise multiplication w = x .* y on device
1270d0321e0SJeremy L Thompson //------------------------------------------------------------------------------
1280d0321e0SJeremy L Thompson extern "C" int CeedDevicePointwiseMult_Cuda(CeedScalar *w_array, CeedScalar *x_array,
1290d0321e0SJeremy L Thompson     CeedScalar *y_array, CeedInt length) {
1300d0321e0SJeremy L Thompson   const int bsize = 512;
1310d0321e0SJeremy L Thompson   const int vecsize = length;
1320d0321e0SJeremy L Thompson   int gridsize = vecsize / bsize;
1330d0321e0SJeremy L Thompson 
1340d0321e0SJeremy L Thompson   if (bsize * gridsize < vecsize)
1350d0321e0SJeremy L Thompson     gridsize += 1;
1360d0321e0SJeremy L Thompson   pointwiseMultValueK<<<gridsize,bsize>>>(w_array, x_array, y_array, length);
1370d0321e0SJeremy L Thompson   return 0;
1380d0321e0SJeremy L Thompson }
139