// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

#include <ceed.h>
#include <cuda.h>

//------------------------------------------------------------------------------
// Kernel for set value on device
//
// Launch: 1-D grid/block, at most one entry per thread. Threads whose flat
// index is >= size exit without touching memory, so the grid may be rounded
// up past size.
//------------------------------------------------------------------------------
__global__ static void setValueK(CeedScalar *__restrict__ vec, CeedSize size,
                                 CeedScalar val) {
  // Compute the flat index in CeedSize (64-bit) so it cannot overflow when
  // size exceeds 2^31
  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;

  if (index >= size) return;
  vec[index] = val;
}

//------------------------------------------------------------------------------
// Set value on device memory
//
// d_array : device pointer to the vector to fill
// length  : number of entries; lengths < 1 are a no-op
// val     : value assigned to every entry
//
// Returns 0. The launch is asynchronous; launch/execution errors surface at
// the caller's next synchronizing CUDA call.
//------------------------------------------------------------------------------
extern "C" int CeedDeviceSetValue_Cuda(CeedScalar *d_array, CeedSize length,
                                       CeedScalar val) {
  const int block_size = 512;

  // Guard the empty case: a zero-block grid is an invalid launch configuration
  if (length < 1) return 0;
  // Ceil-divide in 64-bit CeedSize arithmetic, then narrow. The previous
  // pattern (int grid; block_size * grid < length) did the rounding check
  // with an int multiply, which overflowed for length > 2^31.
  const int grid_size = (int)((length + block_size - 1) / block_size);

  setValueK<<<grid_size, block_size>>>(d_array, length, val);
  return 0;
}

//------------------------------------------------------------------------------
// Kernel for taking reciprocal
//
// Entries whose magnitude is at or below 1E-16 are left unchanged to avoid
// dividing by (near) zero.
// NOTE(review): 1E-16 is on the order of double epsilon; confirm it is still
// the intended cutoff when CeedScalar is single precision.
//------------------------------------------------------------------------------
__global__ static void rcpValueK(CeedScalar *__restrict__ vec, CeedSize size) {
  // 64-bit flat index; out-of-range threads exit
  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;

  if (index >= size) return;
  if (fabs(vec[index]) > 1E-16) vec[index] = 1./vec[index];
}

//------------------------------------------------------------------------------
// Take vector reciprocal in device memory
//------------------------------------------------------------------------------
extern "C" int CeedDeviceReciprocal_Cuda(CeedScalar *d_array, CeedSize length) {
  const int block_size = 512;

  // Empty vector: skip the launch; a zero-block grid is an invalid configuration
  if (length < 1) return 0;
  // Ceil-divide in 64-bit CeedSize arithmetic, then narrow; the previous
  // "block_size * grid_size" rounding check was an int multiply that
  // overflowed for length > 2^31
  const int grid_size = (int)((length + block_size - 1) / block_size);

  rcpValueK<<<grid_size, block_size>>>(d_array, length);
  return 0;
}

//------------------------------------------------------------------------------
// Kernel for scale
//
// Launch: 1-D grid/block, one entry per thread; out-of-range threads exit
//------------------------------------------------------------------------------
__global__ static void scaleValueK(CeedScalar *__restrict__ x, CeedScalar alpha,
                                   CeedSize size) {
  // 64-bit flat index so sizes beyond 2^31 do not overflow
  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;

  if (index >= size) return;
  x[index] *= alpha;
}

//------------------------------------------------------------------------------
// Compute x = alpha x on device
//
// x_array : device pointer to the vector, scaled in place
// alpha   : scale factor
// length  : number of entries; lengths < 1 are a no-op
//
// Returns 0; the launch is asynchronous.
//------------------------------------------------------------------------------
extern "C" int CeedDeviceScale_Cuda(CeedScalar *x_array, CeedScalar alpha,
                                    CeedSize length) {
  const int block_size = 512;

  // Empty vector: nothing to scale, and a zero-block launch would fail
  if (length < 1) return 0;
  // 64-bit ceil-divide avoids the int-multiply overflow of the old check
  const int grid_size = (int)((length + block_size - 1) / block_size);

  scaleValueK<<<grid_size, block_size>>>(x_array, alpha, length);
  return 0;
}

//------------------------------------------------------------------------------
// Kernel for axpy
//
// Launch: 1-D grid/block, one entry per thread; out-of-range threads exit.
// x is read-only; y is updated in place.
//------------------------------------------------------------------------------
__global__ static void axpyValueK(CeedScalar *__restrict__ y, CeedScalar alpha,
                                  const CeedScalar *__restrict__ x, CeedSize size) {
  // 64-bit flat index so sizes beyond 2^31 do not overflow
  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;

  if (index >= size) return;
  y[index] += alpha * x[index];
}

//------------------------------------------------------------------------------
// Compute y = alpha x + y on device
//------------------------------------------------------------------------------
extern
"C" int CeedDeviceAXPY_Cuda(CeedScalar *y_array, CeedScalar alpha,
                            CeedScalar *x_array, CeedSize length) {
  const int block_size = 512;

  // Empty vector: skip the launch; a zero-block grid is an invalid configuration
  if (length < 1) return 0;
  // Ceil-divide in 64-bit CeedSize arithmetic, then narrow; the previous
  // "block_size * grid_size" rounding check was an int multiply that
  // overflowed for length > 2^31
  const int grid_size = (int)((length + block_size - 1) / block_size);

  axpyValueK<<<grid_size, block_size>>>(y_array, alpha, x_array, length);
  return 0;
}

//------------------------------------------------------------------------------
// Kernel for axpby
//
// Launch: 1-D grid/block, one entry per thread; out-of-range threads exit.
// x is read-only; y is updated in place.
//------------------------------------------------------------------------------
__global__ static void axpbyValueK(CeedScalar *__restrict__ y, CeedScalar alpha, CeedScalar beta,
                                   const CeedScalar *__restrict__ x, CeedSize size) {
  // 64-bit flat index so sizes beyond 2^31 do not overflow
  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;

  if (index >= size) return;
  // Scale y first, then accumulate: y = beta y + alpha x
  y[index] = beta * y[index];
  y[index] += alpha * x[index];
}

//------------------------------------------------------------------------------
// Compute y = alpha x + beta y on device
//
// y_array : device pointer, updated in place
// alpha   : coefficient on x
// beta    : coefficient on y
// x_array : device pointer, read-only
// length  : number of entries; lengths < 1 are a no-op
//
// Returns 0; the launch is asynchronous.
//------------------------------------------------------------------------------
extern "C" int CeedDeviceAXPBY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar beta,
                                    CeedScalar *x_array, CeedSize length) {
  const int block_size = 512;

  if (length < 1) return 0;
  // 64-bit ceil-divide avoids the int-multiply overflow of the old check
  const int grid_size = (int)((length + block_size - 1) / block_size);

  axpbyValueK<<<grid_size, block_size>>>(y_array, alpha, beta, x_array, length);
  return 0;
}

//------------------------------------------------------------------------------
// Kernel for pointwise mult
//
// Launch: 1-D grid/block, one entry per thread; out-of-range threads exit.
// x is deliberately NOT __restrict__-qualified, matching the original code.
// NOTE(review): that suggests callers may pass w == x, yet w keeps
// __restrict__ — confirm the intended aliasing contract.
//------------------------------------------------------------------------------
__global__ static void pointwiseMultValueK(CeedScalar *__restrict__ w,
                                           const CeedScalar *x, const CeedScalar *__restrict__ y, CeedSize size) {
  // 64-bit flat index so sizes beyond 2^31 do not overflow
  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;

  if (index >= size) return;
  w[index] = x[index] * y[index];
}

//------------------------------------------------------------------------------
// Compute the pointwise multiplication w = x .* y on device
//
// w_array : device pointer receiving the product
// x_array : device pointer, first factor (read-only)
// y_array : device pointer, second factor (read-only)
// length  : number of entries; lengths < 1 are a no-op
//
// Returns 0; the launch is asynchronous.
//------------------------------------------------------------------------------
extern "C" int CeedDevicePointwiseMult_Cuda(CeedScalar *w_array, CeedScalar *x_array,
                                            CeedScalar *y_array, CeedSize length) {
  const int block_size = 512;

  // Empty vector: skip the launch; a zero-block grid is an invalid configuration
  if (length < 1) return 0;
  // 64-bit ceil-divide avoids the int-multiply overflow of the old check
  const int grid_size = (int)((length + block_size - 1) / block_size);

  pointwiseMultValueK<<<grid_size, block_size>>>(w_array, x_array, y_array, length);
  return 0;
}

//------------------------------------------------------------------------------