// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

#include <ceed.h>
#include <cuda.h>

//------------------------------------------------------------------------------
// Kernel for copy strided on device
//
// Copies vec[i] into vec_copy[i] for every i = start + k * step (k >= 0) with
// i < size. Expects a 1D launch with at least `size` total threads.
//------------------------------------------------------------------------------
__global__ static void copyStridedK(CeedScalar *__restrict__ vec, CeedSize start, CeedSize step, CeedSize size, CeedScalar *__restrict__ vec_copy) {
  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;

  if (index >= size) return;
  // Require index >= start: for index < start the C remainder of the negative
  // difference (index - start) can still be zero (e.g. index 0, start 512,
  // step 256), which would incorrectly touch elements before `start`
  if (index >= start && (index - start) % step == 0) vec_copy[index] = vec[index];
}

//------------------------------------------------------------------------------
// Copy strided on device memory
//
// Launches copyStridedK over `length` entries; always returns 0 (CUDA launch
// errors are not checked here, matching the other wrappers in this file).
//------------------------------------------------------------------------------
extern "C" int CeedDeviceCopyStrided_Cuda(CeedScalar *d_array, CeedSize start, CeedSize step, CeedSize length, CeedScalar *d_copy_array) {
  const int block_size = 512;
  // Ceiling division in CeedSize arithmetic; the previous int expression
  // `block_size * grid_size` overflows for length > 2^31 and under-sizes the grid
  const int grid_size  = (int)((length + block_size - 1) / block_size);

  copyStridedK<<<grid_size, block_size>>>(d_array, start, step, length, d_copy_array);
  return 0;
}

//------------------------------------------------------------------------------
// Kernel for set value on device
//
// Sets vec[i] = val for all i < size; one element per thread.
//------------------------------------------------------------------------------
__global__ static void setValueK(CeedScalar *__restrict__ vec, CeedSize size, CeedScalar val) {
  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;

  if (index >= size) return;
  vec[index] = val;
}
//------------------------------------------------------------------------------
// Set value on device memory
//
// Fills d_array[0..length) with val; always returns 0.
//------------------------------------------------------------------------------
extern "C" int CeedDeviceSetValue_Cuda(CeedScalar *d_array, CeedSize length, CeedScalar val) {
  const int block_size = 512;
  // Ceiling division in CeedSize arithmetic; avoids int overflow of
  // `block_size * grid_size` for length > 2^31
  const int grid_size  = (int)((length + block_size - 1) / block_size);

  setValueK<<<grid_size, block_size>>>(d_array, length, val);
  return 0;
}

//------------------------------------------------------------------------------
// Kernel for set value strided on device
//
// Sets vec[i] = val for every i = start + k * step (k >= 0) with i < size.
//------------------------------------------------------------------------------
__global__ static void setValueStridedK(CeedScalar *__restrict__ vec, CeedSize start, CeedSize step, CeedSize size, CeedScalar val) {
  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;

  if (index >= size) return;
  // Require index >= start: the C remainder of the negative difference
  // (index - start) can be zero for index < start, which would incorrectly
  // write elements before `start`
  if (index >= start && (index - start) % step == 0) vec[index] = val;
}

//------------------------------------------------------------------------------
// Set value strided on device memory
//------------------------------------------------------------------------------
extern "C" int CeedDeviceSetValueStrided_Cuda(CeedScalar *d_array, CeedSize start, CeedSize step, CeedSize length, CeedScalar val) {
  const int block_size = 512;
  // Overflow-safe ceiling division (see CeedDeviceSetValue_Cuda)
  const int grid_size  = (int)((length + block_size - 1) / block_size);

  setValueStridedK<<<grid_size, block_size>>>(d_array, start, step, length, val);
  return 0;
}

//------------------------------------------------------------------------------
// Kernel for taking reciprocal
//
// Replaces vec[i] with 1 / vec[i] for all i < size; entries with magnitude at
// or below 1E-16 are left unchanged to avoid division by (near) zero.
// NOTE(review): the threshold and the `1.` literal are double precision even
// when CeedScalar is float — presumably intentional upstream; confirm.
//------------------------------------------------------------------------------
__global__ static void rcpValueK(CeedScalar *__restrict__ vec, CeedSize size) {
  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;

  if (index >= size) return;
  if (fabs(vec[index]) > 1E-16) vec[index] = 1. / vec[index];
}

//------------------------------------------------------------------------------
// Take vector reciprocal in device memory
//------------------------------------------------------------------------------
extern "C" int CeedDeviceReciprocal_Cuda(CeedScalar *d_array, CeedSize length) {
  const int block_size = 512;
  // Overflow-safe ceiling division (see CeedDeviceSetValue_Cuda)
  const int grid_size  = (int)((length + block_size - 1) / block_size);

  rcpValueK<<<grid_size, block_size>>>(d_array, length);
  return 0;
}

//------------------------------------------------------------------------------
// Kernel for scale
//
// Computes x[i] *= alpha for all i < size.
//------------------------------------------------------------------------------
__global__ static void scaleValueK(CeedScalar *__restrict__ x, CeedScalar alpha, CeedSize size) {
  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;

  if (index >= size) return;
  x[index] *= alpha;
}

//------------------------------------------------------------------------------
// Compute x = alpha x on device
//------------------------------------------------------------------------------
extern "C" int CeedDeviceScale_Cuda(CeedScalar *x_array, CeedScalar alpha, CeedSize length) {
  const int block_size = 512;
  // Overflow-safe ceiling division (see CeedDeviceSetValue_Cuda)
  const int grid_size  = (int)((length + block_size - 1) / block_size);

  scaleValueK<<<grid_size, block_size>>>(x_array, alpha, length);
  return 0;
}

//------------------------------------------------------------------------------
// Kernel for axpy
//
// Computes y[i] += alpha * x[i] for all i < size; x and y must not alias
// (both are declared __restrict__).
//------------------------------------------------------------------------------
__global__ static void axpyValueK(CeedScalar *__restrict__ y, CeedScalar alpha, CeedScalar *__restrict__ x, CeedSize size) {
  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;

  if (index >= size) return;
  y[index] += alpha * x[index];
}

//------------------------------------------------------------------------------
// Compute y = alpha x + y on device
//------------------------------------------------------------------------------
extern "C" int CeedDeviceAXPY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_array, CeedSize length) {
  const int block_size = 512;
  // Overflow-safe ceiling division (see CeedDeviceSetValue_Cuda)
  const int grid_size  = (int)((length + block_size - 1) / block_size);

  axpyValueK<<<grid_size, block_size>>>(y_array, alpha, x_array, length);
  return 0;
}

//------------------------------------------------------------------------------
// Kernel for axpby
//
// Computes y[i] = alpha * x[i] + beta * y[i] for all i < size; x and y must
// not alias (both are declared __restrict__).
//------------------------------------------------------------------------------
__global__ static void axpbyValueK(CeedScalar *__restrict__ y, CeedScalar alpha, CeedScalar beta, CeedScalar *__restrict__ x, CeedSize size) {
  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;

  if (index >= size) return;
  y[index] = beta * y[index];
  y[index] += alpha * x[index];
}

//------------------------------------------------------------------------------
// Compute y = alpha x + beta y on device
//------------------------------------------------------------------------------
extern "C" int CeedDeviceAXPBY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar beta, CeedScalar *x_array, CeedSize length) {
  const int block_size = 512;
  // Overflow-safe ceiling division (see CeedDeviceSetValue_Cuda)
  const int grid_size  = (int)((length + block_size - 1) / block_size);

  axpbyValueK<<<grid_size, block_size>>>(y_array, alpha, beta, x_array, length);
  return 0;
}
//------------------------------------------------------------------------------
// Kernel for pointwise mult
//
// Computes w[i] = x[i] * y[i] for all i < size; the arrays must not alias
// (all three are declared __restrict__ — x gains the qualifier here for
// consistency with every other kernel in this file).
//------------------------------------------------------------------------------
__global__ static void pointwiseMultValueK(CeedScalar *__restrict__ w, CeedScalar *__restrict__ x, CeedScalar *__restrict__ y, CeedSize size) {
  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;

  if (index >= size) return;
  w[index] = x[index] * y[index];
}

//------------------------------------------------------------------------------
// Compute the pointwise multiplication w = x .* y on device
//------------------------------------------------------------------------------
extern "C" int CeedDevicePointwiseMult_Cuda(CeedScalar *w_array, CeedScalar *x_array, CeedScalar *y_array, CeedSize length) {
  const int block_size = 512;
  // Ceiling division in CeedSize arithmetic; the previous int expression
  // `block_size * grid_size` overflows for length > 2^31 and under-sizes the grid
  const int grid_size  = (int)((length + block_size - 1) / block_size);

  pointwiseMultValueK<<<grid_size, block_size>>>(w_array, x_array, y_array, length);
  return 0;
}

//------------------------------------------------------------------------------