15aed82e4SJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 23d8e8822SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 30d0321e0SJeremy L Thompson // 43d8e8822SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 50d0321e0SJeremy L Thompson // 63d8e8822SJeremy L Thompson // This file is part of CEED: http://github.com/ceed 70d0321e0SJeremy L Thompson 849aac155SJeremy L Thompson #include <ceed.h> 90d0321e0SJeremy L Thompson #include <cuda.h> 100d0321e0SJeremy L Thompson 110d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 12*f1c2287bSJeremy L Thompson // Kernel for copy strided on device 13*f1c2287bSJeremy L Thompson //------------------------------------------------------------------------------ 14*f1c2287bSJeremy L Thompson __global__ static void copyStridedK(CeedScalar * __restrict__ vec, CeedSize start, CeedSize step, 15*f1c2287bSJeremy L Thompson CeedSize size, CeedScalar * __restrict__ vec_copy) { 16*f1c2287bSJeremy L Thompson CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; 17*f1c2287bSJeremy L Thompson if (index >= size) return; 18*f1c2287bSJeremy L Thompson if ((index - start) % step == 0) vec_copy[index] = vec[index]; 19*f1c2287bSJeremy L Thompson } 20*f1c2287bSJeremy L Thompson 21*f1c2287bSJeremy L Thompson //------------------------------------------------------------------------------ 22*f1c2287bSJeremy L Thompson // Copy strided on device memory 23*f1c2287bSJeremy L Thompson //------------------------------------------------------------------------------ 24*f1c2287bSJeremy L Thompson extern "C" int CeedDeviceCopyStrided_Cuda(CeedScalar* d_array, CeedSize start, CeedSize step, 25*f1c2287bSJeremy L Thompson CeedSize length, CeedScalar* d_copy_array) { 26*f1c2287bSJeremy L Thompson const int block_size = 512; 27*f1c2287bSJeremy L Thompson const CeedSize vec_size = length; 28*f1c2287bSJeremy L Thompson int grid_size = vec_size / block_size; 29*f1c2287bSJeremy L Thompson 30*f1c2287bSJeremy L Thompson if (block_size * grid_size < vec_size) grid_size += 1; 31*f1c2287bSJeremy L Thompson copyStridedK<<<grid_size,block_size>>>(d_array, start, step, length, d_copy_array); 32*f1c2287bSJeremy L Thompson return 0; 33*f1c2287bSJeremy L Thompson } 34*f1c2287bSJeremy L Thompson 35*f1c2287bSJeremy L Thompson //------------------------------------------------------------------------------ 360d0321e0SJeremy L Thompson // Kernel for set value on device 370d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 38f7c1b517Snbeams __global__ static void setValueK(CeedScalar * __restrict__ vec, CeedSize size, 390d0321e0SJeremy L Thompson CeedScalar val) { 40ca735530SJeremy L Thompson CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; 419ef22048SJeremy L Thompson if (index >= size) return; 42ca735530SJeremy L Thompson vec[index] = val; 430d0321e0SJeremy L Thompson } 440d0321e0SJeremy L Thompson 450d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 460d0321e0SJeremy L Thompson // Set value on device memory 470d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 48f7c1b517Snbeams extern "C" int CeedDeviceSetValue_Cuda(CeedScalar* d_array, CeedSize length, 490d0321e0SJeremy L Thompson CeedScalar val) { 50ca735530SJeremy L Thompson const int block_size = 512; 51ca735530SJeremy L Thompson const CeedSize vec_size = length; 52ca735530SJeremy L Thompson int grid_size = vec_size / block_size; 530d0321e0SJeremy L Thompson 549ef22048SJeremy L Thompson if (block_size * grid_size < vec_size) grid_size += 1; 55ca735530SJeremy L Thompson setValueK<<<grid_size,block_size>>>(d_array, length, val); 560d0321e0SJeremy L Thompson return 0; 570d0321e0SJeremy L Thompson } 580d0321e0SJeremy L Thompson 590d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 60*f1c2287bSJeremy L Thompson // Kernel for set value strided on device 61*f1c2287bSJeremy L Thompson //------------------------------------------------------------------------------ 62*f1c2287bSJeremy L Thompson __global__ static void setValueStridedK(CeedScalar * __restrict__ vec, CeedSize start, CeedSize step, 63*f1c2287bSJeremy L Thompson CeedSize size, CeedScalar val) { 64*f1c2287bSJeremy L Thompson CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; 65*f1c2287bSJeremy L Thompson if (index >= size) return; 66*f1c2287bSJeremy L Thompson if ((index - start) % step == 0) vec[index] = val; 67*f1c2287bSJeremy L Thompson } 68*f1c2287bSJeremy L Thompson 69*f1c2287bSJeremy L Thompson //------------------------------------------------------------------------------ 70*f1c2287bSJeremy L Thompson // Set value strided on device memory 71*f1c2287bSJeremy L Thompson //------------------------------------------------------------------------------ 72*f1c2287bSJeremy L Thompson extern "C" int CeedDeviceSetValueStrided_Cuda(CeedScalar* d_array, CeedSize start, CeedSize step, 73*f1c2287bSJeremy L Thompson CeedSize length, CeedScalar val) { 74*f1c2287bSJeremy L Thompson const int block_size = 512; 75*f1c2287bSJeremy L Thompson const CeedSize vec_size = length; 76*f1c2287bSJeremy L Thompson int grid_size = vec_size / block_size; 77*f1c2287bSJeremy L Thompson 78*f1c2287bSJeremy L Thompson if (block_size * grid_size < vec_size) grid_size += 1; 79*f1c2287bSJeremy L Thompson setValueStridedK<<<grid_size,block_size>>>(d_array, start, step, length, val); 80*f1c2287bSJeremy L Thompson return 0; 81*f1c2287bSJeremy L Thompson } 82*f1c2287bSJeremy L Thompson 83*f1c2287bSJeremy L Thompson //------------------------------------------------------------------------------ 840d0321e0SJeremy L Thompson // Kernel for taking reciprocal 850d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 86f7c1b517Snbeams __global__ static void rcpValueK(CeedScalar * __restrict__ vec, CeedSize size) { 87ca735530SJeremy L Thompson CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; 889ef22048SJeremy L Thompson if (index >= size) return; 899ef22048SJeremy L Thompson if (fabs(vec[index]) > 1E-16) vec[index] = 1./vec[index]; 900d0321e0SJeremy L Thompson } 910d0321e0SJeremy L Thompson 920d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 930d0321e0SJeremy L Thompson // Take vector reciprocal in device memory 940d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 95f7c1b517Snbeams extern "C" int CeedDeviceReciprocal_Cuda(CeedScalar* d_array, CeedSize length) { 96ca735530SJeremy L Thompson const int block_size = 512; 97ca735530SJeremy L Thompson const CeedSize vec_size = length; 98ca735530SJeremy L Thompson int grid_size = vec_size / block_size; 990d0321e0SJeremy L Thompson 1009ef22048SJeremy L Thompson if (block_size * grid_size < vec_size) grid_size += 1; 101ca735530SJeremy L Thompson rcpValueK<<<grid_size,block_size>>>(d_array, length); 1020d0321e0SJeremy L Thompson return 0; 1030d0321e0SJeremy L Thompson } 1040d0321e0SJeremy L Thompson 1050d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1060d0321e0SJeremy L Thompson // Kernel for scale 1070d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1080d0321e0SJeremy L Thompson __global__ static void scaleValueK(CeedScalar * __restrict__ x, CeedScalar alpha, 109f7c1b517Snbeams CeedSize size) { 110ca735530SJeremy L Thompson CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; 1119ef22048SJeremy L Thompson if (index >= size) return; 112ca735530SJeremy L Thompson x[index] *= alpha; 1130d0321e0SJeremy L Thompson } 1140d0321e0SJeremy L Thompson 1150d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1160d0321e0SJeremy L Thompson // Compute x = alpha x on device 1170d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1180d0321e0SJeremy L Thompson extern "C" int CeedDeviceScale_Cuda(CeedScalar *x_array, CeedScalar alpha, 119f7c1b517Snbeams CeedSize length) { 120ca735530SJeremy L Thompson const int block_size = 512; 121ca735530SJeremy L Thompson const CeedSize vec_size = length; 122ca735530SJeremy L Thompson int grid_size = vec_size / block_size; 1230d0321e0SJeremy L Thompson 1249ef22048SJeremy L Thompson if (block_size * grid_size < vec_size) grid_size += 1; 125ca735530SJeremy L Thompson scaleValueK<<<grid_size,block_size>>>(x_array, alpha, length); 1260d0321e0SJeremy L Thompson return 0; 1270d0321e0SJeremy L Thompson } 1280d0321e0SJeremy L Thompson 1290d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1300d0321e0SJeremy L Thompson // Kernel for axpy 1310d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1320d0321e0SJeremy L Thompson __global__ static void axpyValueK(CeedScalar * __restrict__ y, CeedScalar alpha, 133f7c1b517Snbeams CeedScalar * __restrict__ x, CeedSize size) { 134ca735530SJeremy L Thompson CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; 1359ef22048SJeremy L Thompson if (index >= size) return; 136ca735530SJeremy L Thompson y[index] += alpha * x[index]; 1370d0321e0SJeremy L Thompson } 1380d0321e0SJeremy L Thompson 1390d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1400d0321e0SJeremy L Thompson // Compute y = alpha x + y on device 1410d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1420d0321e0SJeremy L Thompson extern "C" int CeedDeviceAXPY_Cuda(CeedScalar *y_array, CeedScalar alpha, 143f7c1b517Snbeams CeedScalar *x_array, CeedSize length) { 144ca735530SJeremy L Thompson const int block_size = 512; 145ca735530SJeremy L Thompson const CeedSize vec_size = length; 146ca735530SJeremy L Thompson int grid_size = vec_size / block_size; 1470d0321e0SJeremy L Thompson 1489ef22048SJeremy L Thompson if (block_size * grid_size < vec_size) grid_size += 1; 149ca735530SJeremy L Thompson axpyValueK<<<grid_size,block_size>>>(y_array, alpha, x_array, length); 1500d0321e0SJeremy L Thompson return 0; 1510d0321e0SJeremy L Thompson } 1520d0321e0SJeremy L Thompson 1530d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1545fb68f37SKaren (Ren) Stengel // Kernel for axpby 1555fb68f37SKaren (Ren) Stengel //------------------------------------------------------------------------------ 1565fb68f37SKaren (Ren) Stengel __global__ static void axpbyValueK(CeedScalar * __restrict__ y, CeedScalar alpha, CeedScalar beta, 157f7c1b517Snbeams CeedScalar * __restrict__ x, CeedSize size) { 158ca735530SJeremy L Thompson CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; 1599ef22048SJeremy L Thompson if (index >= size) return; 160ca735530SJeremy L Thompson y[index] = beta * y[index]; 161ca735530SJeremy L Thompson y[index] += alpha * x[index]; 1625fb68f37SKaren (Ren) Stengel } 1635fb68f37SKaren (Ren) Stengel 1645fb68f37SKaren (Ren) Stengel //------------------------------------------------------------------------------ 1655fb68f37SKaren (Ren) Stengel // Compute y = alpha x + beta y on device 1665fb68f37SKaren (Ren) Stengel //------------------------------------------------------------------------------ 1675fb68f37SKaren (Ren) Stengel extern "C" int CeedDeviceAXPBY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar beta, 168f7c1b517Snbeams CeedScalar *x_array, CeedSize length) { 169ca735530SJeremy L Thompson const int block_size = 512; 170ca735530SJeremy L Thompson const CeedSize vec_size = length; 171ca735530SJeremy L Thompson int grid_size = vec_size / block_size; 1725fb68f37SKaren (Ren) Stengel 1739ef22048SJeremy L Thompson if (block_size * grid_size < vec_size) grid_size += 1; 174ca735530SJeremy L Thompson axpbyValueK<<<grid_size,block_size>>>(y_array, alpha, beta, x_array, length); 1755fb68f37SKaren (Ren) Stengel return 0; 1765fb68f37SKaren (Ren) Stengel } 1775fb68f37SKaren (Ren) Stengel 1785fb68f37SKaren (Ren) Stengel //------------------------------------------------------------------------------ 1790d0321e0SJeremy L Thompson // Kernel for pointwise mult 1800d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1810d0321e0SJeremy L Thompson __global__ static void pointwiseMultValueK(CeedScalar * __restrict__ w, 182f7c1b517Snbeams CeedScalar * x, CeedScalar * __restrict__ y, CeedSize size) { 183ca735530SJeremy L Thompson CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; 1849ef22048SJeremy L Thompson if (index >= size) return; 185ca735530SJeremy L Thompson w[index] = x[index] * y[index]; 1860d0321e0SJeremy L Thompson } 1870d0321e0SJeremy L Thompson 1880d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1890d0321e0SJeremy L Thompson // Compute the pointwise multiplication w = x .* y on device 1900d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1910d0321e0SJeremy L Thompson extern "C" int CeedDevicePointwiseMult_Cuda(CeedScalar *w_array, CeedScalar *x_array, 192f7c1b517Snbeams CeedScalar *y_array, CeedSize length) { 193ca735530SJeremy L Thompson const int block_size = 512; 194ca735530SJeremy L Thompson const CeedSize vec_size = length; 195ca735530SJeremy L Thompson int grid_size = vec_size / block_size; 1960d0321e0SJeremy L Thompson 1979ef22048SJeremy L Thompson if (block_size * grid_size < vec_size) grid_size += 1; 198ca735530SJeremy L Thompson pointwiseMultValueK<<<grid_size,block_size>>>(w_array, x_array, y_array, length); 1990d0321e0SJeremy L Thompson return 0; 2000d0321e0SJeremy L Thompson } 2012a86cc9dSSebastian Grimberg 2022a86cc9dSSebastian Grimberg //------------------------------------------------------------------------------ 203