1 // Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at 2 // the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights 3 // reserved. See files LICENSE and NOTICE for details. 4 // 5 // This file is part of CEED, a collection of benchmarks, miniapps, software 6 // libraries and APIs for efficient high-order finite element and spectral 7 // element discretizations for exascale applications. For more information and 8 // source code availability see http://github.com/ceed. 9 // 10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11 // a collaborative effort of two U.S. Department of Energy organizations (Office 12 // of Science and the National Nuclear Security Administration) responsible for 13 // the planning and preparation of a capable exascale ecosystem, including 14 // software, applications, hardware, advanced system engineering and early 15 // testbed platforms, in support of the nation's exascale computing imperative. 
#include <ceed/ceed.h>
#include <cuda.h>

//------------------------------------------------------------------------------
// Kernel for set value on device
//
// Expects a 1-D launch. Each thread writes exactly one element; the tail guard
// allows the grid to over-cover the vector.
//------------------------------------------------------------------------------
__global__ static void setValueK(CeedScalar * __restrict__ vec, CeedInt size,
                                 CeedScalar val) {
  const int idx = threadIdx.x + blockDim.x * blockIdx.x;
  if (idx >= size)
    return;
  vec[idx] = val;
}

//------------------------------------------------------------------------------
// Set value on device memory
//
// d_array: device pointer with at least `length` elements.
// Launches asynchronously on the default stream; returns 0 (success).
//------------------------------------------------------------------------------
extern "C" int CeedDeviceSetValue_Cuda(CeedScalar *d_array, CeedInt length,
                                       CeedScalar val) {
  // Guard the empty vector: a zero-block launch is rejected by CUDA with
  // cudaErrorInvalidConfiguration, which would stick and poison later calls.
  if (length <= 0)
    return 0;
  const int bsize = 512;
  // Ceiling division so the grid covers the whole vector.
  const int gridsize = (length + bsize - 1) / bsize;
  setValueK<<<gridsize, bsize>>>(d_array, length, val);
  return 0;
}

//------------------------------------------------------------------------------
// Kernel for taking reciprocal
//
// Entries with |vec[idx]| <= 1E-16 are left unchanged to avoid division by
// (near) zero. NOTE(review): this threshold is suited to double-precision
// CeedScalar; for single precision it is far below machine epsilon — confirm
// the intended precision if CeedScalar can be float.
//------------------------------------------------------------------------------
__global__ static void rcpValueK(CeedScalar * __restrict__ vec, CeedInt size) {
  const int idx = threadIdx.x + blockDim.x * blockIdx.x;
  if (idx >= size)
    return;
  if (fabs(vec[idx]) > 1E-16)
    vec[idx] = 1./vec[idx];
}

//------------------------------------------------------------------------------
// Take vector reciprocal in device memory
//
// d_array: device pointer with at least `length` elements.
// Launches asynchronously on the default stream; returns 0 (success).
//------------------------------------------------------------------------------
extern "C" int CeedDeviceReciprocal_Cuda(CeedScalar *d_array, CeedInt length) {
  // Guard the empty vector: avoid an invalid zero-block launch.
  if (length <= 0)
    return 0;
  const int bsize = 512;
  // Ceiling division so the grid covers the whole vector.
  const int gridsize = (length + bsize - 1) / bsize;
  rcpValueK<<<gridsize, bsize>>>(d_array, length);
  return 0;
}
//------------------------------------------------------------------------------
// Kernel for scale
//
// Expects a 1-D launch; each thread scales one element in place.
//------------------------------------------------------------------------------
__global__ static void scaleValueK(CeedScalar * __restrict__ x, CeedScalar alpha,
                                   CeedInt size) {
  const int idx = threadIdx.x + blockDim.x * blockIdx.x;
  if (idx >= size)
    return;
  x[idx] *= alpha;
}

//------------------------------------------------------------------------------
// Compute x = alpha x on device
//
// x_array: device pointer with at least `length` elements.
// Launches asynchronously on the default stream; returns 0 (success).
//------------------------------------------------------------------------------
extern "C" int CeedDeviceScale_Cuda(CeedScalar *x_array, CeedScalar alpha,
                                    CeedInt length) {
  // Guard the empty vector: a zero-block launch is rejected by CUDA with
  // cudaErrorInvalidConfiguration, which would stick and poison later calls.
  if (length <= 0)
    return 0;
  const int bsize = 512;
  // Ceiling division so the grid covers the whole vector.
  const int gridsize = (length + bsize - 1) / bsize;
  scaleValueK<<<gridsize, bsize>>>(x_array, alpha, length);
  return 0;
}

//------------------------------------------------------------------------------
// Kernel for axpy
//
// Expects a 1-D launch; each thread updates one element of y.
// x is read-only, so it is marked const to enable the read-only data cache.
//------------------------------------------------------------------------------
__global__ static void axpyValueK(CeedScalar * __restrict__ y, CeedScalar alpha,
                                  const CeedScalar * __restrict__ x,
                                  CeedInt size) {
  const int idx = threadIdx.x + blockDim.x * blockIdx.x;
  if (idx >= size)
    return;
  y[idx] += alpha * x[idx];
}

//------------------------------------------------------------------------------
// Compute y = alpha x + y on device
//
// y_array, x_array: device pointers with at least `length` elements; they must
// not alias (the kernel declares both __restrict__).
// Launches asynchronously on the default stream; returns 0 (success).
//------------------------------------------------------------------------------
extern "C" int CeedDeviceAXPY_Cuda(CeedScalar *y_array, CeedScalar alpha,
                                   CeedScalar *x_array, CeedInt length) {
  // Guard the empty vector: avoid an invalid zero-block launch.
  if (length <= 0)
    return 0;
  const int bsize = 512;
  // Ceiling division so the grid covers the whole vector.
  const int gridsize = (length + bsize - 1) / bsize;
  axpyValueK<<<gridsize, bsize>>>(y_array, alpha, x_array, length);
  return 0;
}
//------------------------------------------------------------------------------
// Kernel for pointwise mult
//
// Expects a 1-D launch; each thread writes one element of w.
// x is intentionally NOT __restrict__: it may alias w (in-place w = w .* y),
// so only read-only constness is added.
//------------------------------------------------------------------------------
__global__ static void pointwiseMultValueK(CeedScalar * __restrict__ w,
                                           const CeedScalar *x,
                                           const CeedScalar * __restrict__ y,
                                           CeedInt size) {
  const int idx = threadIdx.x + blockDim.x * blockIdx.x;
  if (idx >= size)
    return;
  w[idx] = x[idx] * y[idx];
}

//------------------------------------------------------------------------------
// Compute the pointwise multiplication w = x .* y on device
//
// w_array, x_array, y_array: device pointers with at least `length` elements.
// x_array may alias w_array; y_array must not alias w_array (__restrict__).
// Launches asynchronously on the default stream; returns 0 (success).
//------------------------------------------------------------------------------
extern "C" int CeedDevicePointwiseMult_Cuda(CeedScalar *w_array, CeedScalar *x_array,
                                            CeedScalar *y_array, CeedInt length) {
  // Guard the empty vector: a zero-block launch is rejected by CUDA with
  // cudaErrorInvalidConfiguration, which would stick and poison later calls.
  if (length <= 0)
    return 0;
  const int bsize = 512;
  // Ceiling division so the grid covers the whole vector.
  const int gridsize = (length + bsize - 1) / bsize;
  pointwiseMultValueK<<<gridsize, bsize>>>(w_array, x_array, y_array, length);
  return 0;
}