15aed82e4SJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 23d8e8822SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 30d0321e0SJeremy L Thompson // 43d8e8822SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 50d0321e0SJeremy L Thompson // 63d8e8822SJeremy L Thompson // This file is part of CEED: http://github.com/ceed 70d0321e0SJeremy L Thompson 849aac155SJeremy L Thompson #include <ceed.h> 90d0321e0SJeremy L Thompson #include <cuda.h> 100d0321e0SJeremy L Thompson 110d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 12f1c2287bSJeremy L Thompson // Kernel for copy strided on device 13f1c2287bSJeremy L Thompson //------------------------------------------------------------------------------ 14*b73fa92cSJeremy L Thompson __global__ static void copyStridedK(CeedScalar *__restrict__ vec, CeedSize start, CeedSize step, CeedSize size, CeedScalar *__restrict__ vec_copy) { 15f1c2287bSJeremy L Thompson CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; 16f1c2287bSJeremy L Thompson if (index >= size) return; 17f1c2287bSJeremy L Thompson if ((index - start) % step == 0) vec_copy[index] = vec[index]; 18f1c2287bSJeremy L Thompson } 19f1c2287bSJeremy L Thompson 20f1c2287bSJeremy L Thompson //------------------------------------------------------------------------------ 21f1c2287bSJeremy L Thompson // Copy strided on device memory 22f1c2287bSJeremy L Thompson //------------------------------------------------------------------------------ 23*b73fa92cSJeremy L Thompson extern "C" int CeedDeviceCopyStrided_Cuda(CeedScalar *d_array, CeedSize start, CeedSize step, CeedSize length, CeedScalar *d_copy_array) { 24f1c2287bSJeremy L Thompson const int block_size = 512; 25f1c2287bSJeremy L Thompson const CeedSize vec_size = length; 26f1c2287bSJeremy L Thompson int grid_size = vec_size / block_size; 27f1c2287bSJeremy L Thompson 28f1c2287bSJeremy L Thompson if (block_size * grid_size < vec_size) grid_size += 1; 29f1c2287bSJeremy L Thompson copyStridedK<<<grid_size, block_size>>>(d_array, start, step, length, d_copy_array); 30f1c2287bSJeremy L Thompson return 0; 31f1c2287bSJeremy L Thompson } 32f1c2287bSJeremy L Thompson 33f1c2287bSJeremy L Thompson //------------------------------------------------------------------------------ 340d0321e0SJeremy L Thompson // Kernel for set value on device 350d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 36*b73fa92cSJeremy L Thompson __global__ static void setValueK(CeedScalar *__restrict__ vec, CeedSize size, CeedScalar val) { 37ca735530SJeremy L Thompson CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; 389ef22048SJeremy L Thompson if (index >= size) return; 39ca735530SJeremy L Thompson vec[index] = val; 400d0321e0SJeremy L Thompson } 410d0321e0SJeremy L Thompson 420d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 430d0321e0SJeremy L Thompson // Set value on device memory 440d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 45*b73fa92cSJeremy L Thompson extern "C" int CeedDeviceSetValue_Cuda(CeedScalar *d_array, CeedSize length, CeedScalar val) { 46ca735530SJeremy L Thompson const int block_size = 512; 47ca735530SJeremy L Thompson const CeedSize vec_size = length; 48ca735530SJeremy L Thompson int grid_size = vec_size / block_size; 490d0321e0SJeremy L Thompson 509ef22048SJeremy L Thompson if (block_size * grid_size < vec_size) grid_size += 1; 51ca735530SJeremy L Thompson setValueK<<<grid_size, block_size>>>(d_array, length, val); 520d0321e0SJeremy L Thompson return 0; 530d0321e0SJeremy L Thompson } 540d0321e0SJeremy L Thompson 550d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 56f1c2287bSJeremy L Thompson // Kernel for set value strided on device 57f1c2287bSJeremy L Thompson //------------------------------------------------------------------------------ 58*b73fa92cSJeremy L Thompson __global__ static void setValueStridedK(CeedScalar *__restrict__ vec, CeedSize start, CeedSize step, CeedSize size, CeedScalar val) { 59f1c2287bSJeremy L Thompson CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; 60f1c2287bSJeremy L Thompson if (index >= size) return; 61f1c2287bSJeremy L Thompson if ((index - start) % step == 0) vec[index] = val; 62f1c2287bSJeremy L Thompson } 63f1c2287bSJeremy L Thompson 64f1c2287bSJeremy L Thompson //------------------------------------------------------------------------------ 65f1c2287bSJeremy L Thompson // Set value strided on device memory 66f1c2287bSJeremy L Thompson //------------------------------------------------------------------------------ 67*b73fa92cSJeremy L Thompson extern "C" int CeedDeviceSetValueStrided_Cuda(CeedScalar *d_array, CeedSize start, CeedSize step, CeedSize length, CeedScalar val) { 68f1c2287bSJeremy L Thompson const int block_size = 512; 69f1c2287bSJeremy L Thompson const CeedSize vec_size = length; 70f1c2287bSJeremy L Thompson int grid_size = vec_size / block_size; 71f1c2287bSJeremy L Thompson 72f1c2287bSJeremy L Thompson if (block_size * grid_size < vec_size) grid_size += 1; 73f1c2287bSJeremy L Thompson setValueStridedK<<<grid_size, block_size>>>(d_array, start, step, length, val); 74f1c2287bSJeremy L Thompson return 0; 75f1c2287bSJeremy L Thompson } 76f1c2287bSJeremy L Thompson 77f1c2287bSJeremy L Thompson //------------------------------------------------------------------------------ 780d0321e0SJeremy L Thompson // Kernel for taking reciprocal 790d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 80f7c1b517Snbeams __global__ static void rcpValueK(CeedScalar *__restrict__ vec, CeedSize size) { 81ca735530SJeremy L Thompson CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; 829ef22048SJeremy L Thompson if (index >= size) return; 839ef22048SJeremy L Thompson if (fabs(vec[index]) > 1E-16) vec[index] = 1. / vec[index]; 840d0321e0SJeremy L Thompson } 850d0321e0SJeremy L Thompson 860d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 870d0321e0SJeremy L Thompson // Take vector reciprocal in device memory 880d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 89f7c1b517Snbeams extern "C" int CeedDeviceReciprocal_Cuda(CeedScalar *d_array, CeedSize length) { 90ca735530SJeremy L Thompson const int block_size = 512; 91ca735530SJeremy L Thompson const CeedSize vec_size = length; 92ca735530SJeremy L Thompson int grid_size = vec_size / block_size; 930d0321e0SJeremy L Thompson 949ef22048SJeremy L Thompson if (block_size * grid_size < vec_size) grid_size += 1; 95ca735530SJeremy L Thompson rcpValueK<<<grid_size, block_size>>>(d_array, length); 960d0321e0SJeremy L Thompson return 0; 970d0321e0SJeremy L Thompson } 980d0321e0SJeremy L Thompson 990d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1000d0321e0SJeremy L Thompson // Kernel for scale 1010d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 102*b73fa92cSJeremy L Thompson __global__ static void scaleValueK(CeedScalar *__restrict__ x, CeedScalar alpha, CeedSize size) { 103ca735530SJeremy L Thompson CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; 1049ef22048SJeremy L Thompson if (index >= size) return; 105ca735530SJeremy L Thompson x[index] *= alpha; 1060d0321e0SJeremy L Thompson } 1070d0321e0SJeremy L Thompson 1080d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1090d0321e0SJeremy L Thompson // Compute x = alpha x on device 1100d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 111*b73fa92cSJeremy L Thompson extern "C" int CeedDeviceScale_Cuda(CeedScalar *x_array, CeedScalar alpha, CeedSize length) { 112ca735530SJeremy L Thompson const int block_size = 512; 113ca735530SJeremy L Thompson const CeedSize vec_size = length; 114ca735530SJeremy L Thompson int grid_size = vec_size / block_size; 1150d0321e0SJeremy L Thompson 1169ef22048SJeremy L Thompson if (block_size * grid_size < vec_size) grid_size += 1; 117ca735530SJeremy L Thompson scaleValueK<<<grid_size, block_size>>>(x_array, alpha, length); 1180d0321e0SJeremy L Thompson return 0; 1190d0321e0SJeremy L Thompson } 1200d0321e0SJeremy L Thompson 1210d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1220d0321e0SJeremy L Thompson // Kernel for axpy 1230d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 124*b73fa92cSJeremy L Thompson __global__ static void axpyValueK(CeedScalar *__restrict__ y, CeedScalar alpha, CeedScalar *__restrict__ x, CeedSize size) { 125ca735530SJeremy L Thompson CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; 1269ef22048SJeremy L Thompson if (index >= size) return; 127ca735530SJeremy L Thompson y[index] += alpha * x[index]; 1280d0321e0SJeremy L Thompson } 1290d0321e0SJeremy L Thompson 1300d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1310d0321e0SJeremy L Thompson // Compute y = alpha x + y on device 1320d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 133*b73fa92cSJeremy L Thompson extern "C" int CeedDeviceAXPY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_array, CeedSize length) { 134ca735530SJeremy L Thompson const int block_size = 512; 135ca735530SJeremy L Thompson const CeedSize vec_size = length; 136ca735530SJeremy L Thompson int grid_size = vec_size / block_size; 1370d0321e0SJeremy L Thompson 1389ef22048SJeremy L Thompson if (block_size * grid_size < vec_size) grid_size += 1; 139ca735530SJeremy L Thompson axpyValueK<<<grid_size, block_size>>>(y_array, alpha, x_array, length); 1400d0321e0SJeremy L Thompson return 0; 1410d0321e0SJeremy L Thompson } 1420d0321e0SJeremy L Thompson 1430d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1445fb68f37SKaren (Ren) Stengel // Kernel for axpby 1455fb68f37SKaren (Ren) Stengel //------------------------------------------------------------------------------ 146*b73fa92cSJeremy L Thompson __global__ static void axpbyValueK(CeedScalar *__restrict__ y, CeedScalar alpha, CeedScalar beta, CeedScalar *__restrict__ x, CeedSize size) { 147ca735530SJeremy L Thompson CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; 1489ef22048SJeremy L Thompson if (index >= size) return; 149ca735530SJeremy L Thompson y[index] = beta * y[index]; 150ca735530SJeremy L Thompson y[index] += alpha * x[index]; 1515fb68f37SKaren (Ren) Stengel } 1525fb68f37SKaren (Ren) Stengel 1535fb68f37SKaren (Ren) Stengel //------------------------------------------------------------------------------ 1545fb68f37SKaren (Ren) Stengel // Compute y = alpha x + beta y on device 1555fb68f37SKaren (Ren) Stengel //------------------------------------------------------------------------------ 156*b73fa92cSJeremy L Thompson extern "C" int CeedDeviceAXPBY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar beta, CeedScalar *x_array, CeedSize length) { 157ca735530SJeremy L Thompson const int block_size = 512; 158ca735530SJeremy L Thompson const CeedSize vec_size = length; 159ca735530SJeremy L Thompson int grid_size = vec_size / block_size; 1605fb68f37SKaren (Ren) Stengel 1619ef22048SJeremy L Thompson if (block_size * grid_size < vec_size) grid_size += 1; 162ca735530SJeremy L Thompson axpbyValueK<<<grid_size, block_size>>>(y_array, alpha, beta, x_array, length); 1635fb68f37SKaren (Ren) Stengel return 0; 1645fb68f37SKaren (Ren) Stengel } 1655fb68f37SKaren (Ren) Stengel 1665fb68f37SKaren (Ren) Stengel //------------------------------------------------------------------------------ 1670d0321e0SJeremy L Thompson // Kernel for pointwise mult 1680d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 169*b73fa92cSJeremy L Thompson __global__ static void pointwiseMultValueK(CeedScalar *__restrict__ w, CeedScalar *x, CeedScalar *__restrict__ y, CeedSize size) { 170ca735530SJeremy L Thompson CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; 1719ef22048SJeremy L Thompson if (index >= size) return; 172ca735530SJeremy L Thompson w[index] = x[index] * y[index]; 1730d0321e0SJeremy L Thompson } 1740d0321e0SJeremy L Thompson 1750d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1760d0321e0SJeremy L Thompson // Compute the pointwise multiplication w = x .* y on device 1770d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 178*b73fa92cSJeremy L Thompson extern "C" int CeedDevicePointwiseMult_Cuda(CeedScalar *w_array, CeedScalar *x_array, CeedScalar *y_array, CeedSize length) { 179ca735530SJeremy L Thompson const int block_size = 512; 180ca735530SJeremy L Thompson const CeedSize vec_size = length; 181ca735530SJeremy L Thompson int grid_size = vec_size / block_size; 1820d0321e0SJeremy L Thompson 1839ef22048SJeremy L Thompson if (block_size * grid_size < vec_size) grid_size += 1; 184ca735530SJeremy L Thompson pointwiseMultValueK<<<grid_size, block_size>>>(w_array, x_array, y_array, length); 1850d0321e0SJeremy L Thompson return 0; 1860d0321e0SJeremy L Thompson } 1872a86cc9dSSebastian Grimberg 1882a86cc9dSSebastian Grimberg //------------------------------------------------------------------------------ 189