15aed82e4SJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 23d8e8822SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 30d0321e0SJeremy L Thompson // 43d8e8822SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 50d0321e0SJeremy L Thompson // 63d8e8822SJeremy L Thompson // This file is part of CEED: http://github.com/ceed 70d0321e0SJeremy L Thompson 849aac155SJeremy L Thompson #include <ceed.h> 90d0321e0SJeremy L Thompson #include <cuda.h> 100d0321e0SJeremy L Thompson 110d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 120d0321e0SJeremy L Thompson // Kernel for set value on device 130d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 14f7c1b517Snbeams __global__ static void setValueK(CeedScalar * __restrict__ vec, CeedSize size, 150d0321e0SJeremy L Thompson CeedScalar val) { 16ca735530SJeremy L Thompson CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; 17*9ef22048SJeremy L Thompson if (index >= size) return; 18ca735530SJeremy L Thompson vec[index] = val; 190d0321e0SJeremy L Thompson } 200d0321e0SJeremy L Thompson 210d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 220d0321e0SJeremy L Thompson // Set value on device memory 230d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 24f7c1b517Snbeams extern "C" int CeedDeviceSetValue_Cuda(CeedScalar* d_array, CeedSize length, 250d0321e0SJeremy L Thompson CeedScalar val) { 26ca735530SJeremy L Thompson const int block_size = 512; 27ca735530SJeremy L Thompson const CeedSize vec_size = length; 28ca735530SJeremy L Thompson int grid_size = vec_size / block_size; 290d0321e0SJeremy L Thompson 30*9ef22048SJeremy L Thompson if (block_size * grid_size < vec_size) grid_size += 1; 31ca735530SJeremy L Thompson setValueK<<<grid_size,block_size>>>(d_array, length, val); 320d0321e0SJeremy L Thompson return 0; 330d0321e0SJeremy L Thompson } 340d0321e0SJeremy L Thompson 350d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 360d0321e0SJeremy L Thompson // Kernel for taking reciprocal 370d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 38f7c1b517Snbeams __global__ static void rcpValueK(CeedScalar * __restrict__ vec, CeedSize size) { 39ca735530SJeremy L Thompson CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; 40*9ef22048SJeremy L Thompson if (index >= size) return; 41*9ef22048SJeremy L Thompson if (fabs(vec[index]) > 1E-16) vec[index] = 1./vec[index]; 420d0321e0SJeremy L Thompson } 430d0321e0SJeremy L Thompson 440d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 450d0321e0SJeremy L Thompson // Take vector reciprocal in device memory 460d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 47f7c1b517Snbeams extern "C" int CeedDeviceReciprocal_Cuda(CeedScalar* d_array, CeedSize length) { 48ca735530SJeremy L Thompson const int block_size = 512; 49ca735530SJeremy L Thompson const CeedSize vec_size = length; 50ca735530SJeremy L Thompson int grid_size = vec_size / block_size; 510d0321e0SJeremy L Thompson 52*9ef22048SJeremy L Thompson if (block_size * grid_size < vec_size) grid_size += 1; 53ca735530SJeremy L Thompson rcpValueK<<<grid_size,block_size>>>(d_array, length); 540d0321e0SJeremy L Thompson return 0; 550d0321e0SJeremy L Thompson } 560d0321e0SJeremy L Thompson 570d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 580d0321e0SJeremy L Thompson // Kernel for scale 590d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 600d0321e0SJeremy L Thompson __global__ static void scaleValueK(CeedScalar * __restrict__ x, CeedScalar alpha, 61f7c1b517Snbeams CeedSize size) { 62ca735530SJeremy L Thompson CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; 63*9ef22048SJeremy L Thompson if (index >= size) return; 64ca735530SJeremy L Thompson x[index] *= alpha; 650d0321e0SJeremy L Thompson } 660d0321e0SJeremy L Thompson 670d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 680d0321e0SJeremy L Thompson // Compute x = alpha x on device 690d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 700d0321e0SJeremy L Thompson extern "C" int CeedDeviceScale_Cuda(CeedScalar *x_array, CeedScalar alpha, 71f7c1b517Snbeams CeedSize length) { 72ca735530SJeremy L Thompson const int block_size = 512; 73ca735530SJeremy L Thompson const CeedSize vec_size = length; 74ca735530SJeremy L Thompson int grid_size = vec_size / block_size; 750d0321e0SJeremy L Thompson 76*9ef22048SJeremy L Thompson if (block_size * grid_size < vec_size) grid_size += 1; 77ca735530SJeremy L Thompson scaleValueK<<<grid_size,block_size>>>(x_array, alpha, length); 780d0321e0SJeremy L Thompson return 0; 790d0321e0SJeremy L Thompson } 800d0321e0SJeremy L Thompson 810d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 820d0321e0SJeremy L Thompson // Kernel for axpy 830d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 840d0321e0SJeremy L Thompson __global__ static void axpyValueK(CeedScalar * __restrict__ y, CeedScalar alpha, 85f7c1b517Snbeams CeedScalar * __restrict__ x, CeedSize size) { 86ca735530SJeremy L Thompson CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; 87*9ef22048SJeremy L Thompson if (index >= size) return; 88ca735530SJeremy L Thompson y[index] += alpha * x[index]; 890d0321e0SJeremy L Thompson } 900d0321e0SJeremy L Thompson 910d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 920d0321e0SJeremy L Thompson // Compute y = alpha x + y on device 930d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 940d0321e0SJeremy L Thompson extern "C" int CeedDeviceAXPY_Cuda(CeedScalar *y_array, CeedScalar alpha, 95f7c1b517Snbeams CeedScalar *x_array, CeedSize length) { 96ca735530SJeremy L Thompson const int block_size = 512; 97ca735530SJeremy L Thompson const CeedSize vec_size = length; 98ca735530SJeremy L Thompson int grid_size = vec_size / block_size; 990d0321e0SJeremy L Thompson 100*9ef22048SJeremy L Thompson if (block_size * grid_size < vec_size) grid_size += 1; 101ca735530SJeremy L Thompson axpyValueK<<<grid_size,block_size>>>(y_array, alpha, x_array, length); 1020d0321e0SJeremy L Thompson return 0; 1030d0321e0SJeremy L Thompson } 1040d0321e0SJeremy L Thompson 1050d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1065fb68f37SKaren (Ren) Stengel // Kernel for axpby 1075fb68f37SKaren (Ren) Stengel //------------------------------------------------------------------------------ 1085fb68f37SKaren (Ren) Stengel __global__ static void axpbyValueK(CeedScalar * __restrict__ y, CeedScalar alpha, CeedScalar beta, 109f7c1b517Snbeams CeedScalar * __restrict__ x, CeedSize size) { 110ca735530SJeremy L Thompson CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; 111*9ef22048SJeremy L Thompson if (index >= size) return; 112ca735530SJeremy L Thompson y[index] = beta * y[index]; 113ca735530SJeremy L Thompson y[index] += alpha * x[index]; 1145fb68f37SKaren (Ren) Stengel } 1155fb68f37SKaren (Ren) Stengel 1165fb68f37SKaren (Ren) Stengel //------------------------------------------------------------------------------ 1175fb68f37SKaren (Ren) Stengel // Compute y = alpha x + beta y on device 1185fb68f37SKaren (Ren) Stengel //------------------------------------------------------------------------------ 1195fb68f37SKaren (Ren) Stengel extern "C" int CeedDeviceAXPBY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar beta, 120f7c1b517Snbeams CeedScalar *x_array, CeedSize length) { 121ca735530SJeremy L Thompson const int block_size = 512; 122ca735530SJeremy L Thompson const CeedSize vec_size = length; 123ca735530SJeremy L Thompson int grid_size = vec_size / block_size; 1245fb68f37SKaren (Ren) Stengel 125*9ef22048SJeremy L Thompson if (block_size * grid_size < vec_size) grid_size += 1; 126ca735530SJeremy L Thompson axpbyValueK<<<grid_size,block_size>>>(y_array, alpha, beta, x_array, length); 1275fb68f37SKaren (Ren) Stengel return 0; 1285fb68f37SKaren (Ren) Stengel } 1295fb68f37SKaren (Ren) Stengel 1305fb68f37SKaren (Ren) Stengel //------------------------------------------------------------------------------ 1310d0321e0SJeremy L Thompson // Kernel for pointwise mult 1320d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1330d0321e0SJeremy L Thompson __global__ static void pointwiseMultValueK(CeedScalar * __restrict__ w, 134f7c1b517Snbeams CeedScalar * x, CeedScalar * __restrict__ y, CeedSize size) { 135ca735530SJeremy L Thompson CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; 136*9ef22048SJeremy L Thompson if (index >= size) return; 137ca735530SJeremy L Thompson w[index] = x[index] * y[index]; 1380d0321e0SJeremy L Thompson } 1390d0321e0SJeremy L Thompson 1400d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1410d0321e0SJeremy L Thompson // Compute the pointwise multiplication w = x .* y on device 1420d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1430d0321e0SJeremy L Thompson extern "C" int CeedDevicePointwiseMult_Cuda(CeedScalar *w_array, CeedScalar *x_array, 144f7c1b517Snbeams CeedScalar *y_array, CeedSize length) { 145ca735530SJeremy L Thompson const int block_size = 512; 146ca735530SJeremy L Thompson const CeedSize vec_size = length; 147ca735530SJeremy L Thompson int grid_size = vec_size / block_size; 1480d0321e0SJeremy L Thompson 149*9ef22048SJeremy L Thompson if (block_size * grid_size < vec_size) grid_size += 1; 150ca735530SJeremy L Thompson pointwiseMultValueK<<<grid_size,block_size>>>(w_array, x_array, y_array, length); 1510d0321e0SJeremy L Thompson return 0; 1520d0321e0SJeremy L Thompson } 1532a86cc9dSSebastian Grimberg 1542a86cc9dSSebastian Grimberg //------------------------------------------------------------------------------ 155