// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

#include <ceed.h>
#include <cuda.h>

//------------------------------------------------------------------------------
// Kernel for set value on device
//
// 1D launch, one thread per entry; the bounds guard handles the grid tail when
// size is not a multiple of blockDim.x.
//------------------------------------------------------------------------------
__global__ static void setValueK(CeedScalar * __restrict__ vec, CeedSize size,
                                 CeedScalar val) {
  // Widen blockDim.x * blockIdx.x to CeedSize so the global index cannot
  // overflow 32-bit int for long vectors
  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
  if (index >= size)
    return;
  vec[index] = val;
}

//------------------------------------------------------------------------------
// Set value on device memory
//------------------------------------------------------------------------------
extern "C" int CeedDeviceSetValue_Cuda(CeedScalar* d_array, CeedSize length,
                                       CeedScalar val) {
  const int block_size = 512;
  // Ceiling division computed in CeedSize. The previous form checked
  // "block_size * grid_size < vec_size" with an int product, which can
  // overflow for long vectors; this form has no int intermediate.
  const int grid_size = (int)((length + block_size - 1) / block_size);

  setValueK<<<grid_size,block_size>>>(d_array, length, val);
  return 0;
}

//------------------------------------------------------------------------------
// Kernel for taking reciprocal
//
// 1D launch, one thread per entry.
//------------------------------------------------------------------------------
__global__ static void rcpValueK(CeedScalar * __restrict__ vec, CeedSize size) {
  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
  if (index >= size)
    return;
  // Entries with magnitude <= 1E-16 are left unchanged to avoid producing Inf
  if (fabs(vec[index]) > 1E-16)
    vec[index] = 1./vec[index];
}

//------------------------------------------------------------------------------
// Take vector reciprocal in device memory
//------------------------------------------------------------------------------
extern "C" int CeedDeviceReciprocal_Cuda(CeedScalar* d_array, CeedSize length) {
  const int block_size = 512;
  // Ceiling division in CeedSize avoids the int overflow of the previous
  // "block_size * grid_size" check for long vectors
  const int grid_size = (int)((length + block_size - 1) / block_size);

  rcpValueK<<<grid_size,block_size>>>(d_array, length);
  return 0;
}

//------------------------------------------------------------------------------
// Kernel for scale
//
// 1D launch, one thread per entry.
//------------------------------------------------------------------------------
__global__ static void scaleValueK(CeedScalar * __restrict__ x, CeedScalar alpha,
                                   CeedSize size) {
  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
  if (index >= size)
    return;
  x[index] *= alpha;
}

//------------------------------------------------------------------------------
// Compute x = alpha x on device
//------------------------------------------------------------------------------
extern "C" int CeedDeviceScale_Cuda(CeedScalar *x_array, CeedScalar alpha,
                                    CeedSize length) {
  const int block_size = 512;
  // Ceiling division in CeedSize avoids the int overflow of the previous
  // "block_size * grid_size" check for long vectors
  const int grid_size = (int)((length + block_size - 1) / block_size);

  scaleValueK<<<grid_size,block_size>>>(x_array, alpha, length);
  return 0;
}

//------------------------------------------------------------------------------
// Kernel for axpy
//
// 1D launch, one thread per entry; x is read-only, y is updated in place.
//------------------------------------------------------------------------------
__global__ static void axpyValueK(CeedScalar * __restrict__ y, CeedScalar alpha,
                                  CeedScalar * __restrict__ x, CeedSize size) {
  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
  if (index >= size)
    return;
  y[index] += alpha * x[index];
}
//------------------------------------------------------------------------------
// Compute y = alpha x + y on device
//------------------------------------------------------------------------------
extern "C" int CeedDeviceAXPY_Cuda(CeedScalar *y_array, CeedScalar alpha,
                                   CeedScalar *x_array, CeedSize length) {
  const int block_size = 512;
  // Ceiling division computed in CeedSize. The previous form checked
  // "block_size * grid_size < vec_size" with an int product, which can
  // overflow for long vectors; this form has no int intermediate.
  const int grid_size = (int)((length + block_size - 1) / block_size);

  axpyValueK<<<grid_size,block_size>>>(y_array, alpha, x_array, length);
  return 0;
}

//------------------------------------------------------------------------------
// Kernel for axpby
//
// 1D launch, one thread per entry; x is read-only, y is updated in place.
//------------------------------------------------------------------------------
__global__ static void axpbyValueK(CeedScalar * __restrict__ y, CeedScalar alpha, CeedScalar beta,
                                   CeedScalar * __restrict__ x, CeedSize size) {
  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
  if (index >= size)
    return;
  // Scale y first, then accumulate alpha * x, matching the y = alpha x + beta y
  // contract one term at a time
  y[index] = beta * y[index];
  y[index] += alpha * x[index];
}

//------------------------------------------------------------------------------
// Compute y = alpha x + beta y on device
//------------------------------------------------------------------------------
extern "C" int CeedDeviceAXPBY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar beta,
                                    CeedScalar *x_array, CeedSize length) {
  const int block_size = 512;
  // Ceiling division in CeedSize avoids the int overflow of the previous
  // "block_size * grid_size" check for long vectors
  const int grid_size = (int)((length + block_size - 1) / block_size);

  axpbyValueK<<<grid_size,block_size>>>(y_array, alpha, beta, x_array, length);
  return 0;
}

//------------------------------------------------------------------------------
// Kernel for pointwise mult
//
// 1D launch, one thread per entry.
// NOTE(review): unlike the other kernels, x is not marked __restrict__ —
// presumably because callers may pass w aliasing x; confirm before adding it.
//------------------------------------------------------------------------------
__global__ static void pointwiseMultValueK(CeedScalar * __restrict__ w,
                                           CeedScalar * x, CeedScalar * __restrict__ y, CeedSize size) {
  CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;
  if (index >= size)
    return;
  w[index] = x[index] * y[index];
}

//------------------------------------------------------------------------------
// Compute the pointwise multiplication w = x .* y on device
//------------------------------------------------------------------------------
extern "C" int CeedDevicePointwiseMult_Cuda(CeedScalar *w_array, CeedScalar *x_array,
                                            CeedScalar *y_array, CeedSize length) {
  const int block_size = 512;
  // Ceiling division in CeedSize avoids the int overflow of the previous
  // "block_size * grid_size" check for long vectors
  const int grid_size = (int)((length + block_size - 1) / block_size);

  pointwiseMultValueK<<<grid_size,block_size>>>(w_array, x_array, y_array, length);
  return 0;
}

//------------------------------------------------------------------------------