15aed82e4SJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 23d8e8822SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 30d0321e0SJeremy L Thompson // 43d8e8822SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 50d0321e0SJeremy L Thompson // 63d8e8822SJeremy L Thompson // This file is part of CEED: http://github.com/ceed 70d0321e0SJeremy L Thompson 849aac155SJeremy L Thompson #include <ceed.h> 90d0321e0SJeremy L Thompson #include <hip/hip_runtime.h> 100d0321e0SJeremy L Thompson 110d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 12*3196072fSJeremy L Thompson // Kernel for copy strided on device 13*3196072fSJeremy L Thompson //------------------------------------------------------------------------------ 14*3196072fSJeremy L Thompson __global__ static void copyStridedK(CeedScalar *__restrict__ vec, CeedSize start, CeedSize step, CeedSize size, CeedScalar *__restrict__ vec_copy) { 15*3196072fSJeremy L Thompson CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; 16*3196072fSJeremy L Thompson 17*3196072fSJeremy L Thompson if (index >= size) return; 18*3196072fSJeremy L Thompson if ((index - start) % step == 0) vec_copy[index] = vec[index]; 19*3196072fSJeremy L Thompson } 20*3196072fSJeremy L Thompson 21*3196072fSJeremy L Thompson //------------------------------------------------------------------------------ 22*3196072fSJeremy L Thompson // Copy strided on device memory 23*3196072fSJeremy L Thompson //------------------------------------------------------------------------------ 24*3196072fSJeremy L Thompson extern "C" int CeedDeviceCopyStrided_Hip(CeedScalar *d_array, CeedSize start, CeedSize step, CeedSize length, CeedScalar *d_copy_array) { 25*3196072fSJeremy L Thompson const int block_size = 512; 26*3196072fSJeremy L Thompson const CeedSize vec_size = length; 27*3196072fSJeremy L Thompson int grid_size = vec_size / block_size; 28*3196072fSJeremy L Thompson 29*3196072fSJeremy L Thompson if (block_size * grid_size < vec_size) grid_size += 1; 30*3196072fSJeremy L Thompson hipLaunchKernelGGL(copyStridedK, dim3(grid_size), dim3(block_size), 0, 0, d_array, start, step, length, d_copy_array); 31*3196072fSJeremy L Thompson return 0; 32*3196072fSJeremy L Thompson } 33*3196072fSJeremy L Thompson 34*3196072fSJeremy L Thompson //------------------------------------------------------------------------------ 350d0321e0SJeremy L Thompson // Kernel for set value on device 360d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 379330daecSnbeams __global__ static void setValueK(CeedScalar *__restrict__ vec, CeedSize size, CeedScalar val) { 38b7453713SJeremy L Thompson CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; 39b7453713SJeremy L Thompson 40b7453713SJeremy L Thompson if (index >= size) return; 41b7453713SJeremy L Thompson vec[index] = val; 420d0321e0SJeremy L Thompson } 430d0321e0SJeremy L Thompson 440d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 450d0321e0SJeremy L Thompson // Set value on device memory 460d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 479330daecSnbeams extern "C" int CeedDeviceSetValue_Hip(CeedScalar *d_array, CeedSize length, CeedScalar val) { 48b7453713SJeremy L Thompson const int block_size = 512; 49b7453713SJeremy L Thompson const CeedSize vec_size = length; 50b7453713SJeremy L Thompson int grid_size = vec_size / block_size; 510d0321e0SJeremy L Thompson 52b7453713SJeremy L Thompson if (block_size * grid_size < vec_size) grid_size += 1; 53b7453713SJeremy L Thompson hipLaunchKernelGGL(setValueK, dim3(grid_size), dim3(block_size), 0, 0, d_array, length, val); 540d0321e0SJeremy L Thompson return 0; 550d0321e0SJeremy L Thompson } 560d0321e0SJeremy L Thompson 570d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 58*3196072fSJeremy L Thompson // Kernel for set value strided on device 59*3196072fSJeremy L Thompson //------------------------------------------------------------------------------ 60*3196072fSJeremy L Thompson __global__ static void setValueStridedK(CeedScalar *__restrict__ vec, CeedSize start, CeedSize step, CeedSize size, CeedScalar val) { 61*3196072fSJeremy L Thompson CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; 62*3196072fSJeremy L Thompson 63*3196072fSJeremy L Thompson if (index >= size) return; 64*3196072fSJeremy L Thompson if ((index - start) % step == 0) vec[index] = val; 65*3196072fSJeremy L Thompson } 66*3196072fSJeremy L Thompson 67*3196072fSJeremy L Thompson //------------------------------------------------------------------------------ 68*3196072fSJeremy L Thompson // Set value strided on device memory 69*3196072fSJeremy L Thompson //------------------------------------------------------------------------------ 70*3196072fSJeremy L Thompson extern "C" int CeedDeviceSetValueStrided_Hip(CeedScalar *d_array, CeedSize start, CeedSize step, CeedSize length, CeedScalar val) { 71*3196072fSJeremy L Thompson const int block_size = 512; 72*3196072fSJeremy L Thompson const CeedSize vec_size = length; 73*3196072fSJeremy L Thompson int grid_size = vec_size / block_size; 74*3196072fSJeremy L Thompson 75*3196072fSJeremy L Thompson if (block_size * grid_size < vec_size) grid_size += 1; 76*3196072fSJeremy L Thompson hipLaunchKernelGGL(setValueStridedK, dim3(grid_size), dim3(block_size), 0, 0, d_array, start, step, length, val); 77*3196072fSJeremy L Thompson return 0; 78*3196072fSJeremy L Thompson } 79*3196072fSJeremy L Thompson 80*3196072fSJeremy L Thompson //------------------------------------------------------------------------------ 810d0321e0SJeremy L Thompson // Kernel for taking reciprocal 820d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 839330daecSnbeams __global__ static void rcpValueK(CeedScalar *__restrict__ vec, CeedSize size) { 84b7453713SJeremy L Thompson CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; 85b7453713SJeremy L Thompson 86b7453713SJeremy L Thompson if (index >= size) return; 87b7453713SJeremy L Thompson if (fabs(vec[index]) > 1E-16) vec[index] = 1. / vec[index]; 880d0321e0SJeremy L Thompson } 890d0321e0SJeremy L Thompson 900d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 910d0321e0SJeremy L Thompson // Take vector reciprocal in device memory 920d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 939330daecSnbeams extern "C" int CeedDeviceReciprocal_Hip(CeedScalar *d_array, CeedSize length) { 94b7453713SJeremy L Thompson const int block_size = 512; 95b7453713SJeremy L Thompson const CeedSize vec_size = length; 96b7453713SJeremy L Thompson int grid_size = vec_size / block_size; 970d0321e0SJeremy L Thompson 98b7453713SJeremy L Thompson if (block_size * grid_size < vec_size) grid_size += 1; 99b7453713SJeremy L Thompson hipLaunchKernelGGL(rcpValueK, dim3(grid_size), dim3(block_size), 0, 0, d_array, length); 1000d0321e0SJeremy L Thompson return 0; 1010d0321e0SJeremy L Thompson } 1020d0321e0SJeremy L Thompson 1030d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1040d0321e0SJeremy L Thompson // Kernel for scale 1050d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1069330daecSnbeams __global__ static void scaleValueK(CeedScalar *__restrict__ x, CeedScalar alpha, CeedSize size) { 107b7453713SJeremy L Thompson CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; 108b7453713SJeremy L Thompson 109b7453713SJeremy L Thompson if (index >= size) return; 110b7453713SJeremy L Thompson x[index] *= alpha; 1110d0321e0SJeremy L Thompson } 1120d0321e0SJeremy L Thompson 1130d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1140d0321e0SJeremy L Thompson // Compute x = alpha x on device 1150d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1169330daecSnbeams extern "C" int CeedDeviceScale_Hip(CeedScalar *x_array, CeedScalar alpha, CeedSize length) { 117b7453713SJeremy L Thompson const int block_size = 512; 118b7453713SJeremy L Thompson const CeedSize vec_size = length; 119b7453713SJeremy L Thompson int grid_size = vec_size / block_size; 1200d0321e0SJeremy L Thompson 121b7453713SJeremy L Thompson if (block_size * grid_size < vec_size) grid_size += 1; 122b7453713SJeremy L Thompson hipLaunchKernelGGL(scaleValueK, dim3(grid_size), dim3(block_size), 0, 0, x_array, alpha, length); 1230d0321e0SJeremy L Thompson return 0; 1240d0321e0SJeremy L Thompson } 1250d0321e0SJeremy L Thompson 1260d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1270d0321e0SJeremy L Thompson // Kernel for axpy 1280d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1299330daecSnbeams __global__ static void axpyValueK(CeedScalar *__restrict__ y, CeedScalar alpha, CeedScalar *__restrict__ x, CeedSize size) { 130b7453713SJeremy L Thompson CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; 131b7453713SJeremy L Thompson 132b7453713SJeremy L Thompson if (index >= size) return; 133b7453713SJeremy L Thompson y[index] += alpha * x[index]; 1340d0321e0SJeremy L Thompson } 1350d0321e0SJeremy L Thompson 1360d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1370d0321e0SJeremy L Thompson // Compute y = alpha x + y on device 1380d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1399330daecSnbeams extern "C" int CeedDeviceAXPY_Hip(CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_array, CeedSize length) { 140b7453713SJeremy L Thompson const int block_size = 512; 141b7453713SJeremy L Thompson const CeedSize vec_size = length; 142b7453713SJeremy L Thompson int grid_size = vec_size / block_size; 1430d0321e0SJeremy L Thompson 144b7453713SJeremy L Thompson if (block_size * grid_size < vec_size) grid_size += 1; 145b7453713SJeremy L Thompson hipLaunchKernelGGL(axpyValueK, dim3(grid_size), dim3(block_size), 0, 0, y_array, alpha, x_array, length); 1460d0321e0SJeremy L Thompson return 0; 1470d0321e0SJeremy L Thompson } 1480d0321e0SJeremy L Thompson 1490d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1505fb68f37SKaren (Ren) Stengel // Kernel for axpby 1515fb68f37SKaren (Ren) Stengel //------------------------------------------------------------------------------ 1529330daecSnbeams __global__ static void axpbyValueK(CeedScalar *__restrict__ y, CeedScalar alpha, CeedScalar beta, CeedScalar *__restrict__ x, CeedSize size) { 153b7453713SJeremy L Thompson CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; 154b7453713SJeremy L Thompson 155b7453713SJeremy L Thompson if (index >= size) return; 156b7453713SJeremy L Thompson y[index] = beta * y[index]; 157b7453713SJeremy L Thompson y[index] += alpha * x[index]; 1585fb68f37SKaren (Ren) Stengel } 1595fb68f37SKaren (Ren) Stengel 1605fb68f37SKaren (Ren) Stengel //------------------------------------------------------------------------------ 1615fb68f37SKaren (Ren) Stengel // Compute y = alpha x + beta y on device 1625fb68f37SKaren (Ren) Stengel //------------------------------------------------------------------------------ 1639330daecSnbeams extern "C" int CeedDeviceAXPBY_Hip(CeedScalar *y_array, CeedScalar alpha, CeedScalar beta, CeedScalar *x_array, CeedSize length) { 164b7453713SJeremy L Thompson const int block_size = 512; 165b7453713SJeremy L Thompson const CeedSize vec_size = length; 166b7453713SJeremy L Thompson int grid_size = vec_size / block_size; 1675fb68f37SKaren (Ren) Stengel 168b7453713SJeremy L Thompson if (block_size * grid_size < vec_size) grid_size += 1; 169b7453713SJeremy L Thompson hipLaunchKernelGGL(axpbyValueK, dim3(grid_size), dim3(block_size), 0, 0, y_array, alpha, beta, x_array, length); 1705fb68f37SKaren (Ren) Stengel return 0; 1715fb68f37SKaren (Ren) Stengel } 1725fb68f37SKaren (Ren) Stengel 1735fb68f37SKaren (Ren) Stengel //------------------------------------------------------------------------------ 1740d0321e0SJeremy L Thompson // Kernel for pointwise mult 1750d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1769330daecSnbeams __global__ static void pointwiseMultValueK(CeedScalar *__restrict__ w, CeedScalar *x, CeedScalar *__restrict__ y, CeedSize size) { 177b7453713SJeremy L Thompson CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; 178b7453713SJeremy L Thompson 179b7453713SJeremy L Thompson if (index >= size) return; 180b7453713SJeremy L Thompson w[index] = x[index] * y[index]; 1810d0321e0SJeremy L Thompson } 1820d0321e0SJeremy L Thompson 1830d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1840d0321e0SJeremy L Thompson // Compute the pointwise multiplication w = x .* y on device 1850d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1869330daecSnbeams extern "C" int CeedDevicePointwiseMult_Hip(CeedScalar *w_array, CeedScalar *x_array, CeedScalar *y_array, CeedSize length) { 187b7453713SJeremy L Thompson const int block_size = 512; 188b7453713SJeremy L Thompson const CeedSize vec_size = length; 189b7453713SJeremy L Thompson int grid_size = vec_size / block_size; 1900d0321e0SJeremy L Thompson 191b7453713SJeremy L Thompson if (block_size * grid_size < vec_size) grid_size += 1; 192b7453713SJeremy L Thompson hipLaunchKernelGGL(pointwiseMultValueK, dim3(grid_size), dim3(block_size), 0, 0, w_array, x_array, y_array, length); 1930d0321e0SJeremy L Thompson return 0; 1940d0321e0SJeremy L Thompson } 1952a86cc9dSSebastian Grimberg 1962a86cc9dSSebastian Grimberg //------------------------------------------------------------------------------ 197