13d8e8822SJeremy L Thompson // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. 23d8e8822SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 30d0321e0SJeremy L Thompson // 43d8e8822SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 50d0321e0SJeremy L Thompson // 63d8e8822SJeremy L Thompson // This file is part of CEED: http://github.com/ceed 70d0321e0SJeremy L Thompson 849aac155SJeremy L Thompson #include <ceed.h> 90d0321e0SJeremy L Thompson #include <cuda.h> 100d0321e0SJeremy L Thompson 110d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 120d0321e0SJeremy L Thompson // Kernel for set value on device 130d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 14f7c1b517Snbeams __global__ static void setValueK(CeedScalar * __restrict__ vec, CeedSize size, 150d0321e0SJeremy L Thompson CeedScalar val) { 16*ca735530SJeremy L Thompson CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; 17*ca735530SJeremy L Thompson if (index >= size) 180d0321e0SJeremy L Thompson return; 19*ca735530SJeremy L Thompson vec[index] = val; 200d0321e0SJeremy L Thompson } 210d0321e0SJeremy L Thompson 220d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 230d0321e0SJeremy L Thompson // Set value on device memory 240d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 25f7c1b517Snbeams extern "C" int CeedDeviceSetValue_Cuda(CeedScalar* d_array, CeedSize length, 260d0321e0SJeremy L Thompson CeedScalar val) { 27*ca735530SJeremy L Thompson const int block_size = 512; 28*ca735530SJeremy L Thompson const CeedSize vec_size = length; 29*ca735530SJeremy L Thompson int grid_size = vec_size / block_size; 300d0321e0SJeremy L Thompson 31*ca735530SJeremy L Thompson if (block_size * grid_size < vec_size) 32*ca735530SJeremy L Thompson grid_size += 1; 33*ca735530SJeremy L Thompson setValueK<<<grid_size,block_size>>>(d_array, length, val); 340d0321e0SJeremy L Thompson return 0; 350d0321e0SJeremy L Thompson } 360d0321e0SJeremy L Thompson 370d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 380d0321e0SJeremy L Thompson // Kernel for taking reciprocal 390d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 40f7c1b517Snbeams __global__ static void rcpValueK(CeedScalar * __restrict__ vec, CeedSize size) { 41*ca735530SJeremy L Thompson CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; 42*ca735530SJeremy L Thompson if (index >= size) 430d0321e0SJeremy L Thompson return; 44*ca735530SJeremy L Thompson if (fabs(vec[index]) > 1E-16) 45*ca735530SJeremy L Thompson vec[index] = 1./vec[index]; 460d0321e0SJeremy L Thompson } 470d0321e0SJeremy L Thompson 480d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 490d0321e0SJeremy L Thompson // Take vector reciprocal in device memory 500d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 51f7c1b517Snbeams extern "C" int CeedDeviceReciprocal_Cuda(CeedScalar* d_array, CeedSize length) { 52*ca735530SJeremy L Thompson const int block_size = 512; 53*ca735530SJeremy L Thompson const CeedSize vec_size = length; 54*ca735530SJeremy L Thompson int grid_size = vec_size / block_size; 550d0321e0SJeremy L Thompson 56*ca735530SJeremy L Thompson if (block_size * grid_size < vec_size) 57*ca735530SJeremy L Thompson grid_size += 1; 58*ca735530SJeremy L Thompson rcpValueK<<<grid_size,block_size>>>(d_array, length); 590d0321e0SJeremy L Thompson return 0; 600d0321e0SJeremy L Thompson } 610d0321e0SJeremy L Thompson 620d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 630d0321e0SJeremy L Thompson // Kernel for scale 640d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 650d0321e0SJeremy L Thompson __global__ static void scaleValueK(CeedScalar * __restrict__ x, CeedScalar alpha, 66f7c1b517Snbeams CeedSize size) { 67*ca735530SJeremy L Thompson CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; 68*ca735530SJeremy L Thompson if (index >= size) 690d0321e0SJeremy L Thompson return; 70*ca735530SJeremy L Thompson x[index] *= alpha; 710d0321e0SJeremy L Thompson } 720d0321e0SJeremy L Thompson 730d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 740d0321e0SJeremy L Thompson // Compute x = alpha x on device 750d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 760d0321e0SJeremy L Thompson extern "C" int CeedDeviceScale_Cuda(CeedScalar *x_array, CeedScalar alpha, 77f7c1b517Snbeams CeedSize length) { 78*ca735530SJeremy L Thompson const int block_size = 512; 79*ca735530SJeremy L Thompson const CeedSize vec_size = length; 80*ca735530SJeremy L Thompson int grid_size = vec_size / block_size; 810d0321e0SJeremy L Thompson 82*ca735530SJeremy L Thompson if (block_size * grid_size < vec_size) 83*ca735530SJeremy L Thompson grid_size += 1; 84*ca735530SJeremy L Thompson scaleValueK<<<grid_size,block_size>>>(x_array, alpha, length); 850d0321e0SJeremy L Thompson return 0; 860d0321e0SJeremy L Thompson } 870d0321e0SJeremy L Thompson 880d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 890d0321e0SJeremy L Thompson // Kernel for axpy 900d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 910d0321e0SJeremy L Thompson __global__ static void axpyValueK(CeedScalar * __restrict__ y, CeedScalar alpha, 92f7c1b517Snbeams CeedScalar * __restrict__ x, CeedSize size) { 93*ca735530SJeremy L Thompson CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; 94*ca735530SJeremy L Thompson if (index >= size) 950d0321e0SJeremy L Thompson return; 96*ca735530SJeremy L Thompson y[index] += alpha * x[index]; 970d0321e0SJeremy L Thompson } 980d0321e0SJeremy L Thompson 990d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1000d0321e0SJeremy L Thompson // Compute y = alpha x + y on device 1010d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1020d0321e0SJeremy L Thompson extern "C" int CeedDeviceAXPY_Cuda(CeedScalar *y_array, CeedScalar alpha, 103f7c1b517Snbeams CeedScalar *x_array, CeedSize length) { 104*ca735530SJeremy L Thompson const int block_size = 512; 105*ca735530SJeremy L Thompson const CeedSize vec_size = length; 106*ca735530SJeremy L Thompson int grid_size = vec_size / block_size; 1070d0321e0SJeremy L Thompson 108*ca735530SJeremy L Thompson if (block_size * grid_size < vec_size) 109*ca735530SJeremy L Thompson grid_size += 1; 110*ca735530SJeremy L Thompson axpyValueK<<<grid_size,block_size>>>(y_array, alpha, x_array, length); 1110d0321e0SJeremy L Thompson return 0; 1120d0321e0SJeremy L Thompson } 1130d0321e0SJeremy L Thompson 1140d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1155fb68f37SKaren (Ren) Stengel // Kernel for axpby 1165fb68f37SKaren (Ren) Stengel //------------------------------------------------------------------------------ 1175fb68f37SKaren (Ren) Stengel __global__ static void axpbyValueK(CeedScalar * __restrict__ y, CeedScalar alpha, CeedScalar beta, 118f7c1b517Snbeams CeedScalar * __restrict__ x, CeedSize size) { 119*ca735530SJeremy L Thompson CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; 120*ca735530SJeremy L Thompson if (index >= size) 1215fb68f37SKaren (Ren) Stengel return; 122*ca735530SJeremy L Thompson y[index] = beta * y[index]; 123*ca735530SJeremy L Thompson y[index] += alpha * x[index]; 1245fb68f37SKaren (Ren) Stengel } 1255fb68f37SKaren (Ren) Stengel 1265fb68f37SKaren (Ren) Stengel //------------------------------------------------------------------------------ 1275fb68f37SKaren (Ren) Stengel // Compute y = alpha x + beta y on device 1285fb68f37SKaren (Ren) Stengel //------------------------------------------------------------------------------ 1295fb68f37SKaren (Ren) Stengel extern "C" int CeedDeviceAXPBY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar beta, 130f7c1b517Snbeams CeedScalar *x_array, CeedSize length) { 131*ca735530SJeremy L Thompson const int block_size = 512; 132*ca735530SJeremy L Thompson const CeedSize vec_size = length; 133*ca735530SJeremy L Thompson int grid_size = vec_size / block_size; 1345fb68f37SKaren (Ren) Stengel 135*ca735530SJeremy L Thompson if (block_size * grid_size < vec_size) 136*ca735530SJeremy L Thompson grid_size += 1; 137*ca735530SJeremy L Thompson axpbyValueK<<<grid_size,block_size>>>(y_array, alpha, beta, x_array, length); 1385fb68f37SKaren (Ren) Stengel return 0; 1395fb68f37SKaren (Ren) Stengel } 1405fb68f37SKaren (Ren) Stengel 1415fb68f37SKaren (Ren) Stengel //------------------------------------------------------------------------------ 1420d0321e0SJeremy L Thompson // Kernel for pointwise mult 1430d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1440d0321e0SJeremy L Thompson __global__ static void pointwiseMultValueK(CeedScalar * __restrict__ w, 145f7c1b517Snbeams CeedScalar * x, CeedScalar * __restrict__ y, CeedSize size) { 146*ca735530SJeremy L Thompson CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; 147*ca735530SJeremy L Thompson if (index >= size) 1480d0321e0SJeremy L Thompson return; 149*ca735530SJeremy L Thompson w[index] = x[index] * y[index]; 1500d0321e0SJeremy L Thompson } 1510d0321e0SJeremy L Thompson 1520d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1530d0321e0SJeremy L Thompson // Compute the pointwise multiplication w = x .* y on device 1540d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1550d0321e0SJeremy L Thompson extern "C" int CeedDevicePointwiseMult_Cuda(CeedScalar *w_array, CeedScalar *x_array, 156f7c1b517Snbeams CeedScalar *y_array, CeedSize length) { 157*ca735530SJeremy L Thompson const int block_size = 512; 158*ca735530SJeremy L Thompson const CeedSize vec_size = length; 159*ca735530SJeremy L Thompson int grid_size = vec_size / block_size; 1600d0321e0SJeremy L Thompson 161*ca735530SJeremy L Thompson if (block_size * grid_size < vec_size) 162*ca735530SJeremy L Thompson grid_size += 1; 163*ca735530SJeremy L Thompson pointwiseMultValueK<<<grid_size,block_size>>>(w_array, x_array, y_array, length); 1640d0321e0SJeremy L Thompson return 0; 1650d0321e0SJeremy L Thompson } 1662a86cc9dSSebastian Grimberg 1672a86cc9dSSebastian Grimberg //------------------------------------------------------------------------------ 168