// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

#include <ceed.h>
#include <cuda.h>

//------------------------------------------------------------------------------
// Kernel for copy strided on device
//
// One thread per entry of a length-`size` vector (1-D grid, 1-D blocks).
// Entries matching the stride pattern (index - start) % step == 0 are copied
// from vec into vec_copy; all other entries of vec_copy are left untouched.
// NOTE(review): assumes 0 <= start < step so indices below `start` can never
// satisfy the modulo test -- confirm against callers.
//------------------------------------------------------------------------------
__global__ static void copyStridedK(CeedScalar *__restrict__ vec, CeedSize start, CeedSize step, CeedSize size,
                                    CeedScalar *__restrict__ vec_copy) {
  // 64-bit global index: blockDim.x * blockIdx.x is widened to CeedSize
  // before the multiply so vectors longer than 2^31 entries index correctly
  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;

  if (index >= size) return;
  if ((index - start) % step == 0) vec_copy[index] = vec[index];
}

//------------------------------------------------------------------------------
// Copy strided data on device memory
//
// Launches copyStridedK over `length` entries with 512-thread blocks.
// Returns 0 (launch errors surface at the next synchronizing CUDA call).
//------------------------------------------------------------------------------
extern "C" int CeedDeviceCopyStrided_Cuda(CeedScalar *d_array, CeedSize start, CeedSize step, CeedSize length,
                                          CeedScalar *d_copy_array) {
  const int block_size = 512;
  // Ceil-divide in CeedSize (64-bit) arithmetic; the previous int-based
  // `block_size * grid_size < length` check overflowed for length >~ 2^31
  const int grid_size = (int)((length + block_size - 1) / block_size);

  copyStridedK<<<grid_size, block_size>>>(d_array, start, step, length, d_copy_array);
  return 0;
}

//------------------------------------------------------------------------------
// Kernel for set value on device
//
// One thread per entry; sets every entry of vec to val.
//------------------------------------------------------------------------------
__global__ static void setValueK(CeedScalar *__restrict__ vec, CeedSize size, CeedScalar val) {
  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;

  if (index >= size) return;
  vec[index] = val;
}

//------------------------------------------------------------------------------
// Set value on device memory
//
// Launches setValueK over `length` entries with 512-thread blocks.
// Returns 0 (launch errors surface at the next synchronizing CUDA call).
//------------------------------------------------------------------------------
extern "C" int CeedDeviceSetValue_Cuda(CeedScalar *d_array, CeedSize length, CeedScalar val) {
  const int block_size = 512;
  // Ceil-divide in CeedSize (64-bit) arithmetic; the previous int-based
  // `block_size * grid_size < length` check overflowed for length >~ 2^31
  const int grid_size = (int)((length + block_size - 1) / block_size);

  setValueK<<<grid_size, block_size>>>(d_array, length, val);
  return 0;
}

//------------------------------------------------------------------------------
// Kernel for set value strided on device
//
// One thread per entry; entries matching the stride pattern
// (index - start) % step == 0 are set to val, others are left untouched.
// NOTE(review): assumes 0 <= start < step so indices below `start` can never
// satisfy the modulo test -- confirm against callers.
//------------------------------------------------------------------------------
__global__ static void setValueStridedK(CeedScalar *__restrict__ vec, CeedSize start, CeedSize step, CeedSize size,
                                        CeedScalar val) {
  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;

  if (index >= size) return;
  if ((index - start) % step == 0) vec[index] = val;
}

//------------------------------------------------------------------------------
// Set value strided on device memory
//
// Launches setValueStridedK over `length` entries with 512-thread blocks.
//------------------------------------------------------------------------------
extern "C" int CeedDeviceSetValueStrided_Cuda(CeedScalar *d_array, CeedSize start, CeedSize step, CeedSize length,
                                              CeedScalar val) {
  const int block_size = 512;
  // 64-bit ceil-division to avoid int overflow for very long vectors
  const int grid_size = (int)((length + block_size - 1) / block_size);

  setValueStridedK<<<grid_size, block_size>>>(d_array, start, step, length, val);
  return 0;
}

//------------------------------------------------------------------------------
// Kernel for taking reciprocal
//
// One thread per entry; replaces vec[i] with 1/vec[i]. Entries with magnitude
// <= 1E-16 are left unchanged to avoid division by (near) zero.
//------------------------------------------------------------------------------
__global__ static void rcpValueK(CeedScalar *__restrict__ vec, CeedSize size) {
  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;

  if (index >= size) return;
  if (fabs(vec[index]) > 1E-16) vec[index] = 1. / vec[index];
}

//------------------------------------------------------------------------------
// Take vector reciprocal in device memory
//
// Launches rcpValueK over `length` entries with 512-thread blocks.
//------------------------------------------------------------------------------
extern "C" int CeedDeviceReciprocal_Cuda(CeedScalar *d_array, CeedSize length) {
  const int block_size = 512;
  // 64-bit ceil-division to avoid int overflow for very long vectors
  const int grid_size = (int)((length + block_size - 1) / block_size);

  rcpValueK<<<grid_size, block_size>>>(d_array, length);
  return 0;
}

//------------------------------------------------------------------------------
// Kernel for scale
//
// One thread per entry; computes x[i] *= alpha in place.
//------------------------------------------------------------------------------
__global__ static void scaleValueK(CeedScalar *__restrict__ x, CeedScalar alpha, CeedSize size) {
  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;

  if (index >= size) return;
  x[index] *= alpha;
}

//------------------------------------------------------------------------------
// Compute x = alpha x on device
//
// Launches scaleValueK over `length` entries with 512-thread blocks.
//------------------------------------------------------------------------------
extern "C" int CeedDeviceScale_Cuda(CeedScalar *x_array, CeedScalar alpha, CeedSize length) {
  const int block_size = 512;
  // 64-bit ceil-division to avoid int overflow for very long vectors
  const int grid_size = (int)((length + block_size - 1) / block_size);

  scaleValueK<<<grid_size, block_size>>>(x_array, alpha, length);
  return 0;
}

//------------------------------------------------------------------------------
// Kernel for axpy
//
// One thread per entry; computes y[i] += alpha * x[i].
// x and y must not alias (both are __restrict__).
//------------------------------------------------------------------------------
__global__ static void axpyValueK(CeedScalar *__restrict__ y, CeedScalar alpha, CeedScalar *__restrict__ x,
                                  CeedSize size) {
  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;

  if (index >= size) return;
  y[index] += alpha * x[index];
}

//------------------------------------------------------------------------------
// Compute y = alpha x + y on device
//
// Launches axpyValueK over `length` entries with 512-thread blocks.
//------------------------------------------------------------------------------
extern "C" int CeedDeviceAXPY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_array, CeedSize length) {
  const int block_size = 512;
  // 64-bit ceil-division to avoid int overflow for very long vectors
  const int grid_size = (int)((length + block_size - 1) / block_size);

  axpyValueK<<<grid_size, block_size>>>(y_array, alpha, x_array, length);
  return 0;
}

//------------------------------------------------------------------------------
// Kernel for axpby
//
// One thread per entry; scales y[i] by beta then adds alpha * x[i]
// (two statements kept to preserve the original rounding behavior).
// x and y must not alias (both are __restrict__).
//------------------------------------------------------------------------------
__global__ static void axpbyValueK(CeedScalar *__restrict__ y, CeedScalar alpha, CeedScalar beta,
                                   CeedScalar *__restrict__ x, CeedSize size) {
  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;

  if (index >= size) return;
  y[index] = beta * y[index];
  y[index] += alpha * x[index];
}

//------------------------------------------------------------------------------
// Compute y = alpha x + beta y on device
//
// Launches axpbyValueK over `length` entries with 512-thread blocks.
//------------------------------------------------------------------------------
extern "C" int CeedDeviceAXPBY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar beta, CeedScalar *x_array,
                                    CeedSize length) {
  const int block_size = 512;
  // 64-bit ceil-division to avoid int overflow for very long vectors
  const int grid_size = (int)((length + block_size - 1) / block_size);

  axpbyValueK<<<grid_size, block_size>>>(y_array, alpha, beta, x_array, length);
  return 0;
}

//------------------------------------------------------------------------------
// Kernel for pointwise mult
//
// One thread per entry; computes w[i] = x[i] * y[i].
// NOTE(review): x intentionally lacks __restrict__ (unlike w and y) --
// presumably callers may pass w == x; confirm before adding the qualifier.
//------------------------------------------------------------------------------
__global__ static void pointwiseMultValueK(CeedScalar *__restrict__ w, CeedScalar *x, CeedScalar *__restrict__ y,
                                           CeedSize size) {
  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;

  if (index >= size) return;
  w[index] = x[index] * y[index];
}

//------------------------------------------------------------------------------
// Compute the pointwise multiplication w = x .* y on device
//
// Launches pointwiseMultValueK over `length` entries with 512-thread blocks.
// Returns 0 (launch errors surface at the next synchronizing CUDA call).
//------------------------------------------------------------------------------
extern "C" int CeedDevicePointwiseMult_Cuda(CeedScalar *w_array, CeedScalar *x_array, CeedScalar *y_array,
                                            CeedSize length) {
  const int block_size = 512;
  // Ceil-divide in CeedSize (64-bit) arithmetic; the previous int-based
  // `block_size * grid_size < length` check overflowed for length >~ 2^31
  const int grid_size = (int)((length + block_size - 1) / block_size);

  pointwiseMultValueK<<<grid_size, block_size>>>(w_array, x_array, y_array, length);
  return 0;
}

//------------------------------------------------------------------------------