1*9ba83ac0SJeremy L Thompson // Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. 23d8e8822SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 30d0321e0SJeremy L Thompson // 43d8e8822SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 50d0321e0SJeremy L Thompson // 63d8e8822SJeremy L Thompson // This file is part of CEED: http://github.com/ceed 70d0321e0SJeremy L Thompson 849aac155SJeremy L Thompson #include <ceed.h> 90d0321e0SJeremy L Thompson #include <cuda.h> 100d0321e0SJeremy L Thompson 110d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 12f1c2287bSJeremy L Thompson // Kernel for copy strided on device 13f1c2287bSJeremy L Thompson //------------------------------------------------------------------------------ 14832a6d73SJeremy L Thompson __global__ static void copyStridedK(CeedScalar *__restrict__ vec, CeedSize start, CeedSize stop, CeedSize step, CeedScalar *__restrict__ vec_copy) { 15c9d5affaSJeremy L Thompson const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; 16c9d5affaSJeremy L Thompson 17832a6d73SJeremy L Thompson if (index < stop - start) { 18832a6d73SJeremy L Thompson if (index % step == 0) vec_copy[start + index] = vec[start + index]; 19f1c2287bSJeremy L Thompson } 20c9d5affaSJeremy L Thompson } 21f1c2287bSJeremy L Thompson 22f1c2287bSJeremy L Thompson //------------------------------------------------------------------------------ 23f1c2287bSJeremy L Thompson // Copy strided on device memory 24f1c2287bSJeremy L Thompson //------------------------------------------------------------------------------ 25832a6d73SJeremy L Thompson extern "C" int CeedDeviceCopyStrided_Cuda(CeedScalar *d_array, CeedSize start, CeedSize stop, CeedSize step, CeedScalar *d_copy_array) { 26f1c2287bSJeremy L Thompson const int block_size = 512; 27832a6d73SJeremy L Thompson const CeedSize copy_size = stop - start; 28832a6d73SJeremy L Thompson int grid_size = copy_size / block_size; 29f1c2287bSJeremy L Thompson 30832a6d73SJeremy L Thompson if (block_size * grid_size < copy_size) grid_size += 1; 31832a6d73SJeremy L Thompson copyStridedK<<<grid_size, block_size>>>(d_array, start, stop, step, d_copy_array); 32f1c2287bSJeremy L Thompson return 0; 33f1c2287bSJeremy L Thompson } 34f1c2287bSJeremy L Thompson 35f1c2287bSJeremy L Thompson //------------------------------------------------------------------------------ 360d0321e0SJeremy L Thompson // Kernel for set value on device 370d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 38b73fa92cSJeremy L Thompson __global__ static void setValueK(CeedScalar *__restrict__ vec, CeedSize size, CeedScalar val) { 39c9d5affaSJeremy L Thompson const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; 40c9d5affaSJeremy L Thompson 41c9d5affaSJeremy L Thompson if (index < size) vec[index] = val; 420d0321e0SJeremy L Thompson } 430d0321e0SJeremy L Thompson 440d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 450d0321e0SJeremy L Thompson // Set value on device memory 460d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 47b73fa92cSJeremy L Thompson extern "C" int CeedDeviceSetValue_Cuda(CeedScalar *d_array, CeedSize length, CeedScalar val) { 48ca735530SJeremy L Thompson const int block_size = 512; 49ca735530SJeremy L Thompson const CeedSize vec_size = length; 50ca735530SJeremy L Thompson int grid_size = vec_size / block_size; 510d0321e0SJeremy L Thompson 529ef22048SJeremy L Thompson if (block_size * grid_size < vec_size) grid_size += 1; 53ca735530SJeremy L Thompson setValueK<<<grid_size, block_size>>>(d_array, length, val); 540d0321e0SJeremy L Thompson return 0; 550d0321e0SJeremy L Thompson } 560d0321e0SJeremy L Thompson 570d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 58f1c2287bSJeremy L Thompson // Kernel for set value strided on device 59f1c2287bSJeremy L Thompson //------------------------------------------------------------------------------ 602d73a370SJeremy L Thompson __global__ static void setValueStridedK(CeedScalar *__restrict__ vec, CeedSize start, CeedSize stop, CeedSize step, CeedScalar val) { 61c9d5affaSJeremy L Thompson const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; 62c9d5affaSJeremy L Thompson 6314c82621SJeremy L Thompson if (index < stop - start) { 6414c82621SJeremy L Thompson if (index % step == 0) vec[start + index] = val; 65f1c2287bSJeremy L Thompson } 66c9d5affaSJeremy L Thompson } 67f1c2287bSJeremy L Thompson 68f1c2287bSJeremy L Thompson //------------------------------------------------------------------------------ 69f1c2287bSJeremy L Thompson // Set value strided on device memory 70f1c2287bSJeremy L Thompson //------------------------------------------------------------------------------ 7114c82621SJeremy L Thompson extern "C" int CeedDeviceSetValueStrided_Cuda(CeedScalar *d_array, CeedSize start, CeedSize stop, CeedSize step, CeedScalar val) { 72f1c2287bSJeremy L Thompson const int block_size = 512; 7314c82621SJeremy L Thompson const CeedSize set_size = stop - start; 7414c82621SJeremy L Thompson int grid_size = set_size / block_size; 75f1c2287bSJeremy L Thompson 7614c82621SJeremy L Thompson if (block_size * grid_size < set_size) grid_size += 1; 772d73a370SJeremy L Thompson setValueStridedK<<<grid_size, block_size>>>(d_array, start, stop, step, val); 78f1c2287bSJeremy L Thompson return 0; 79f1c2287bSJeremy L Thompson } 80f1c2287bSJeremy L Thompson 81f1c2287bSJeremy L Thompson //------------------------------------------------------------------------------ 820d0321e0SJeremy L Thompson // Kernel for taking reciprocal 830d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 84f7c1b517Snbeams __global__ static void rcpValueK(CeedScalar *__restrict__ vec, CeedSize size) { 85c9d5affaSJeremy L Thompson const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; 86c9d5affaSJeremy L Thompson 87c9d5affaSJeremy L Thompson if (index < size) { 889ef22048SJeremy L Thompson if (fabs(vec[index]) > 1E-16) vec[index] = 1. / vec[index]; 890d0321e0SJeremy L Thompson } 90c9d5affaSJeremy L Thompson } 910d0321e0SJeremy L Thompson 920d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 930d0321e0SJeremy L Thompson // Take vector reciprocal in device memory 940d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 95f7c1b517Snbeams extern "C" int CeedDeviceReciprocal_Cuda(CeedScalar *d_array, CeedSize length) { 96ca735530SJeremy L Thompson const int block_size = 512; 97ca735530SJeremy L Thompson const CeedSize vec_size = length; 98ca735530SJeremy L Thompson int grid_size = vec_size / block_size; 990d0321e0SJeremy L Thompson 1009ef22048SJeremy L Thompson if (block_size * grid_size < vec_size) grid_size += 1; 101ca735530SJeremy L Thompson rcpValueK<<<grid_size, block_size>>>(d_array, length); 1020d0321e0SJeremy L Thompson return 0; 1030d0321e0SJeremy L Thompson } 1040d0321e0SJeremy L Thompson 1050d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1060d0321e0SJeremy L Thompson // Kernel for scale 1070d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 108b73fa92cSJeremy L Thompson __global__ static void scaleValueK(CeedScalar *__restrict__ x, CeedScalar alpha, CeedSize size) { 109c9d5affaSJeremy L Thompson const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; 110c9d5affaSJeremy L Thompson 111c9d5affaSJeremy L Thompson if (index < size) x[index] *= alpha; 1120d0321e0SJeremy L Thompson } 1130d0321e0SJeremy L Thompson 1140d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1150d0321e0SJeremy L Thompson // Compute x = alpha x on device 1160d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 117b73fa92cSJeremy L Thompson extern "C" int CeedDeviceScale_Cuda(CeedScalar *x_array, CeedScalar alpha, CeedSize length) { 118ca735530SJeremy L Thompson const int block_size = 512; 119ca735530SJeremy L Thompson const CeedSize vec_size = length; 120ca735530SJeremy L Thompson int grid_size = vec_size / block_size; 1210d0321e0SJeremy L Thompson 1229ef22048SJeremy L Thompson if (block_size * grid_size < vec_size) grid_size += 1; 123ca735530SJeremy L Thompson scaleValueK<<<grid_size, block_size>>>(x_array, alpha, length); 1240d0321e0SJeremy L Thompson return 0; 1250d0321e0SJeremy L Thompson } 1260d0321e0SJeremy L Thompson 1270d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1280d0321e0SJeremy L Thompson // Kernel for axpy 1290d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 130b73fa92cSJeremy L Thompson __global__ static void axpyValueK(CeedScalar *__restrict__ y, CeedScalar alpha, CeedScalar *__restrict__ x, CeedSize size) { 131c9d5affaSJeremy L Thompson const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; 132c9d5affaSJeremy L Thompson 133c9d5affaSJeremy L Thompson if (index < size) y[index] += alpha * x[index]; 1340d0321e0SJeremy L Thompson } 1350d0321e0SJeremy L Thompson 1360d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1370d0321e0SJeremy L Thompson // Compute y = alpha x + y on device 1380d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 139b73fa92cSJeremy L Thompson extern "C" int CeedDeviceAXPY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_array, CeedSize length) { 140ca735530SJeremy L Thompson const int block_size = 512; 141ca735530SJeremy L Thompson const CeedSize vec_size = length; 142ca735530SJeremy L Thompson int grid_size = vec_size / block_size; 1430d0321e0SJeremy L Thompson 1449ef22048SJeremy L Thompson if (block_size * grid_size < vec_size) grid_size += 1; 145ca735530SJeremy L Thompson axpyValueK<<<grid_size, block_size>>>(y_array, alpha, x_array, length); 1460d0321e0SJeremy L Thompson return 0; 1470d0321e0SJeremy L Thompson } 1480d0321e0SJeremy L Thompson 1490d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1505fb68f37SKaren (Ren) Stengel // Kernel for axpby 1515fb68f37SKaren (Ren) Stengel //------------------------------------------------------------------------------ 152b73fa92cSJeremy L Thompson __global__ static void axpbyValueK(CeedScalar *__restrict__ y, CeedScalar alpha, CeedScalar beta, CeedScalar *__restrict__ x, CeedSize size) { 153c9d5affaSJeremy L Thompson const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; 154c9d5affaSJeremy L Thompson 155c9d5affaSJeremy L Thompson if (index < size) { 156ca735530SJeremy L Thompson y[index] = beta * y[index]; 157ca735530SJeremy L Thompson y[index] += alpha * x[index]; 1585fb68f37SKaren (Ren) Stengel } 159c9d5affaSJeremy L Thompson } 1605fb68f37SKaren (Ren) Stengel 1615fb68f37SKaren (Ren) Stengel //------------------------------------------------------------------------------ 1625fb68f37SKaren (Ren) Stengel // Compute y = alpha x + beta y on device 1635fb68f37SKaren (Ren) Stengel //------------------------------------------------------------------------------ 164b73fa92cSJeremy L Thompson extern "C" int CeedDeviceAXPBY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar beta, CeedScalar *x_array, CeedSize length) { 165ca735530SJeremy L Thompson const int block_size = 512; 166ca735530SJeremy L Thompson const CeedSize vec_size = length; 167ca735530SJeremy L Thompson int grid_size = vec_size / block_size; 1685fb68f37SKaren (Ren) Stengel 1699ef22048SJeremy L Thompson if (block_size * grid_size < vec_size) grid_size += 1; 170ca735530SJeremy L Thompson axpbyValueK<<<grid_size, block_size>>>(y_array, alpha, beta, x_array, length); 1715fb68f37SKaren (Ren) Stengel return 0; 1725fb68f37SKaren (Ren) Stengel } 1735fb68f37SKaren (Ren) Stengel 1745fb68f37SKaren (Ren) Stengel //------------------------------------------------------------------------------ 1750d0321e0SJeremy L Thompson // Kernel for pointwise mult 1760d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 177b73fa92cSJeremy L Thompson __global__ static void pointwiseMultValueK(CeedScalar *__restrict__ w, CeedScalar *x, CeedScalar *__restrict__ y, CeedSize size) { 178c9d5affaSJeremy L Thompson const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; 179c9d5affaSJeremy L Thompson 180c9d5affaSJeremy L Thompson if (index < size) w[index] = x[index] * y[index]; 1810d0321e0SJeremy L Thompson } 1820d0321e0SJeremy L Thompson 1830d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1840d0321e0SJeremy L Thompson // Compute the pointwise multiplication w = x .* y on device 1850d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 186b73fa92cSJeremy L Thompson extern "C" int CeedDevicePointwiseMult_Cuda(CeedScalar *w_array, CeedScalar *x_array, CeedScalar *y_array, CeedSize length) { 187ca735530SJeremy L Thompson const int block_size = 512; 188ca735530SJeremy L Thompson const CeedSize vec_size = length; 189ca735530SJeremy L Thompson int grid_size = vec_size / block_size; 1900d0321e0SJeremy L Thompson 1919ef22048SJeremy L Thompson if (block_size * grid_size < vec_size) grid_size += 1; 192ca735530SJeremy L Thompson pointwiseMultValueK<<<grid_size, block_size>>>(w_array, x_array, y_array, length); 1930d0321e0SJeremy L Thompson return 0; 1940d0321e0SJeremy L Thompson } 1952a86cc9dSSebastian Grimberg 1962a86cc9dSSebastian Grimberg //------------------------------------------------------------------------------ 197