1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 #include <ceed.h> 9 #include <cuda.h> 10 11 //------------------------------------------------------------------------------ 12 // Kernel for set value on device 13 //------------------------------------------------------------------------------ 14 __global__ static void setValueK(CeedScalar * __restrict__ vec, CeedInt size, 15 CeedScalar val) { 16 int idx = threadIdx.x + blockDim.x * blockIdx.x; 17 if (idx >= size) 18 return; 19 vec[idx] = val; 20 } 21 22 //------------------------------------------------------------------------------ 23 // Set value on device memory 24 //------------------------------------------------------------------------------ 25 extern "C" int CeedDeviceSetValue_Cuda(CeedScalar* d_array, CeedInt length, 26 CeedScalar val) { 27 const int bsize = 512; 28 const int vecsize = length; 29 int gridsize = vecsize / bsize; 30 31 if (bsize * gridsize < vecsize) 32 gridsize += 1; 33 setValueK<<<gridsize,bsize>>>(d_array, length, val); 34 return 0; 35 } 36 37 //------------------------------------------------------------------------------ 38 // Kernel for taking reciprocal 39 //------------------------------------------------------------------------------ 40 __global__ static void rcpValueK(CeedScalar * __restrict__ vec, CeedInt size) { 41 int idx = threadIdx.x + blockDim.x * blockIdx.x; 42 if (idx >= size) 43 return; 44 if (fabs(vec[idx]) > 1E-16) 45 vec[idx] = 1./vec[idx]; 46 } 47 48 //------------------------------------------------------------------------------ 49 // Take vector reciprocal in device memory 50 //------------------------------------------------------------------------------ 51 extern "C" int CeedDeviceReciprocal_Cuda(CeedScalar* d_array, CeedInt length) { 52 const int bsize = 512; 53 const int vecsize = length; 54 int gridsize = vecsize / bsize; 55 56 if (bsize * gridsize < vecsize) 57 gridsize += 1; 58 rcpValueK<<<gridsize,bsize>>>(d_array, length); 59 return 0; 60 } 61 62 //------------------------------------------------------------------------------ 63 // Kernel for scale 64 //------------------------------------------------------------------------------ 65 __global__ static void scaleValueK(CeedScalar * __restrict__ x, CeedScalar alpha, 66 CeedInt size) { 67 int idx = threadIdx.x + blockDim.x * blockIdx.x; 68 if (idx >= size) 69 return; 70 x[idx] *= alpha; 71 } 72 73 //------------------------------------------------------------------------------ 74 // Compute x = alpha x on device 75 //------------------------------------------------------------------------------ 76 extern "C" int CeedDeviceScale_Cuda(CeedScalar *x_array, CeedScalar alpha, 77 CeedInt length) { 78 const int bsize = 512; 79 const int vecsize = length; 80 int gridsize = vecsize / bsize; 81 82 if (bsize * gridsize < vecsize) 83 gridsize += 1; 84 scaleValueK<<<gridsize,bsize>>>(x_array, alpha, length); 85 return 0; 86 } 87 88 //------------------------------------------------------------------------------ 89 // Kernel for axpy 90 //------------------------------------------------------------------------------ 91 __global__ static void axpyValueK(CeedScalar * __restrict__ y, CeedScalar alpha, 92 CeedScalar * __restrict__ x, CeedInt size) { 93 int idx = threadIdx.x + blockDim.x * blockIdx.x; 94 if (idx >= size) 95 return; 96 y[idx] += alpha * x[idx]; 97 } 98 99 //------------------------------------------------------------------------------ 100 // Compute y = alpha x + y on device 101 //------------------------------------------------------------------------------ 102 extern "C" int CeedDeviceAXPY_Cuda(CeedScalar *y_array, CeedScalar alpha, 103 CeedScalar *x_array, CeedInt length) { 104 const int bsize = 512; 105 const int vecsize = length; 106 int gridsize = vecsize / bsize; 107 108 if (bsize * gridsize < vecsize) 109 gridsize += 1; 110 axpyValueK<<<gridsize,bsize>>>(y_array, alpha, x_array, length); 111 return 0; 112 } 113 114 //------------------------------------------------------------------------------ 115 // Kernel for axpby 116 //------------------------------------------------------------------------------ 117 __global__ static void axpbyValueK(CeedScalar * __restrict__ y, CeedScalar alpha, CeedScalar beta, 118 CeedScalar * __restrict__ x, CeedInt size) { 119 int idx = threadIdx.x + blockDim.x * blockIdx.x; 120 if (idx >= size) 121 return; 122 y[idx] = beta * y[idx]; 123 y[idx] += alpha * x[idx]; 124 } 125 126 //------------------------------------------------------------------------------ 127 // Compute y = alpha x + beta y on device 128 //------------------------------------------------------------------------------ 129 extern "C" int CeedDeviceAXPBY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar beta, 130 CeedScalar *x_array, CeedInt length) { 131 const int bsize = 512; 132 const int vecsize = length; 133 int gridsize = vecsize / bsize; 134 135 if (bsize * gridsize < vecsize) 136 gridsize += 1; 137 axpbyValueK<<<gridsize,bsize>>>(y_array, alpha, beta, x_array, length); 138 return 0; 139 } 140 141 //------------------------------------------------------------------------------ 142 // Kernel for pointwise mult 143 //------------------------------------------------------------------------------ 144 __global__ static void pointwiseMultValueK(CeedScalar * __restrict__ w, 145 CeedScalar * x, CeedScalar * __restrict__ y, CeedInt size) { 146 int idx = threadIdx.x + blockDim.x * blockIdx.x; 147 if (idx >= size) 148 return; 149 w[idx] = x[idx] * y[idx]; 150 } 151 152 //------------------------------------------------------------------------------ 153 // Compute the pointwise multiplication w = x .* y on device 154 //------------------------------------------------------------------------------ 155 extern "C" int CeedDevicePointwiseMult_Cuda(CeedScalar *w_array, CeedScalar *x_array, 156 CeedScalar *y_array, CeedInt length) { 157 const int bsize = 512; 158 const int vecsize = length; 159 int gridsize = vecsize / bsize; 160 161 if (bsize * gridsize < vecsize) 162 gridsize += 1; 163 pointwiseMultValueK<<<gridsize,bsize>>>(w_array, x_array, y_array, length); 164 return 0; 165 } 166