// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

#include <ceed/ceed.h>
#include <cuda.h>

//------------------------------------------------------------------------------
// Number of 1-D blocks of `bsize` threads needed to cover `length` work items
// (ceiling division). Callers must ensure length >= 1 so the grid is nonempty.
//------------------------------------------------------------------------------
static inline int gridSizeFor(CeedInt length, int bsize) {
  return ((int)length + bsize - 1) / bsize;
}

//------------------------------------------------------------------------------
// Kernel for set value on device
// Expects a 1-D grid of 1-D blocks, one thread per entry; threads past the
// end of the vector exit immediately.
//------------------------------------------------------------------------------
__global__ static void setValueK(CeedScalar * __restrict__ vec, CeedInt size,
                                 CeedScalar val) {
  const int idx = threadIdx.x + blockDim.x * blockIdx.x;
  if (idx >= size)
    return;
  vec[idx] = val;
}

//------------------------------------------------------------------------------
// Set value on device memory
//   d_array - device pointer to the vector to fill
//   length  - number of entries
//   val     - value written to every entry
// Launches on the default stream; always returns 0 (launch errors, if any,
// surface at the caller's next synchronizing CUDA call).
//------------------------------------------------------------------------------
extern "C" int CeedDeviceSetValue_Cuda(CeedScalar* d_array, CeedInt length,
                                       CeedScalar val) {
  if (length < 1)
    return 0; // nothing to do; avoid an invalid zero-block launch
  const int bsize = 512;
  const int gridsize = gridSizeFor(length, bsize);
  setValueK<<<gridsize, bsize>>>(d_array, length, val);
  return 0;
}

//------------------------------------------------------------------------------
// Kernel for taking reciprocal
// Expects a 1-D grid of 1-D blocks, one thread per entry. Entries whose
// magnitude is at or below 1E-16 are left unchanged (guard against division
// by ~zero). NOTE(review): the 1E-16 threshold is below single-precision
// epsilon — confirm it is still meaningful when CeedScalar is float.
//------------------------------------------------------------------------------
__global__ static void rcpValueK(CeedScalar * __restrict__ vec, CeedInt size) {
  const int idx = threadIdx.x + blockDim.x * blockIdx.x;
  if (idx >= size)
    return;
  if (fabs(vec[idx]) > 1E-16)
    vec[idx] = 1./vec[idx];
}

//------------------------------------------------------------------------------
// Take vector reciprocal in device memory
//   d_array - device pointer to the vector, updated in place
//   length  - number of entries
// Launches on the default stream; always returns 0.
//------------------------------------------------------------------------------
extern "C" int CeedDeviceReciprocal_Cuda(CeedScalar* d_array,
                                         CeedInt length) {
  if (length < 1)
    return 0; // nothing to do; avoid an invalid zero-block launch
  const int bsize = 512;
  const int gridsize = gridSizeFor(length, bsize);
  rcpValueK<<<gridsize, bsize>>>(d_array, length);
  return 0;
}

//------------------------------------------------------------------------------
// Kernel for scale
// Expects a 1-D grid of 1-D blocks, one thread per entry.
//------------------------------------------------------------------------------
__global__ static void scaleValueK(CeedScalar * __restrict__ x, CeedScalar alpha,
                                   CeedInt size) {
  const int idx = threadIdx.x + blockDim.x * blockIdx.x;
  if (idx >= size)
    return;
  x[idx] *= alpha;
}

//------------------------------------------------------------------------------
// Compute x = alpha x on device
//   x_array - device pointer to x, updated in place
//   alpha   - scaling factor
//   length  - number of entries
// Launches on the default stream; always returns 0.
//------------------------------------------------------------------------------
extern "C" int CeedDeviceScale_Cuda(CeedScalar *x_array, CeedScalar alpha,
                                    CeedInt length) {
  if (length < 1)
    return 0; // nothing to do; avoid an invalid zero-block launch
  const int bsize = 512;
  const int gridsize = gridSizeFor(length, bsize);
  scaleValueK<<<gridsize, bsize>>>(x_array, alpha, length);
  return 0;
}

//------------------------------------------------------------------------------
// Kernel for axpy
// Expects a 1-D grid of 1-D blocks, one thread per entry.
//------------------------------------------------------------------------------
__global__ static void axpyValueK(CeedScalar * __restrict__ y, CeedScalar alpha,
                                  CeedScalar * __restrict__ x, CeedInt size) {
  const int idx = threadIdx.x + blockDim.x * blockIdx.x;
  if (idx >= size)
    return;
  y[idx] += alpha * x[idx];
}

//------------------------------------------------------------------------------
// Compute y = alpha x + y on device
//   y_array - device pointer to y, updated in place
//   alpha   - scaling factor applied to x
//   x_array - device pointer to x (read only; must not alias y, both
//             kernel arguments are declared __restrict__)
//   length  - number of entries
// Launches on the default stream; always returns 0.
//------------------------------------------------------------------------------
extern "C" int CeedDeviceAXPY_Cuda(CeedScalar *y_array, CeedScalar alpha,
                                   CeedScalar *x_array, CeedInt length) {
  if (length < 1)
    return 0; // nothing to do; avoid an invalid zero-block launch
  const int bsize = 512;
  const int gridsize = gridSizeFor(length, bsize);
  axpyValueK<<<gridsize, bsize>>>(y_array, alpha, x_array, length);
  return 0;
}

//------------------------------------------------------------------------------
// Kernel for pointwise mult
// Expects a 1-D grid of 1-D blocks, one thread per entry.
// NOTE(review): `x` deliberately lacks __restrict__ — presumably because the
// output `w` is allowed to alias `x`; confirm against callers before adding it.
//------------------------------------------------------------------------------
__global__ static void pointwiseMultValueK(CeedScalar * __restrict__ w,
                                           CeedScalar * x, CeedScalar * __restrict__ y, CeedInt size) {
  const int idx = threadIdx.x + blockDim.x * blockIdx.x;
  if (idx >= size)
    return;
  w[idx] = x[idx] * y[idx];
}

//------------------------------------------------------------------------------
// Compute the pointwise multiplication w = x .* y on device
//   w_array - device pointer to the output vector
//   x_array - device pointer to the first operand
//   y_array - device pointer to the second operand
//   length  - number of entries
// Launches on the default stream; always returns 0.
//------------------------------------------------------------------------------
extern "C" int CeedDevicePointwiseMult_Cuda(CeedScalar *w_array, CeedScalar *x_array,
                                            CeedScalar *y_array, CeedInt length) {
  if (length < 1)
    return 0; // nothing to do; avoid an invalid zero-block launch
  const int bsize = 512;
  const int gridsize = gridSizeFor(length, bsize);
  pointwiseMultValueK<<<gridsize, bsize>>>(w_array, x_array, y_array, length);
  return 0;
}