1*0d0321e0SJeremy L Thompson // Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at 2*0d0321e0SJeremy L Thompson // the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights 3*0d0321e0SJeremy L Thompson // reserved. See files LICENSE and NOTICE for details. 4*0d0321e0SJeremy L Thompson // 5*0d0321e0SJeremy L Thompson // This file is part of CEED, a collection of benchmarks, miniapps, software 6*0d0321e0SJeremy L Thompson // libraries and APIs for efficient high-order finite element and spectral 7*0d0321e0SJeremy L Thompson // element discretizations for exascale applications. For more information and 8*0d0321e0SJeremy L Thompson // source code availability see http://github.com/ceed. 9*0d0321e0SJeremy L Thompson // 10*0d0321e0SJeremy L Thompson // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11*0d0321e0SJeremy L Thompson // a collaborative effort of two U.S. Department of Energy organizations (Office 12*0d0321e0SJeremy L Thompson // of Science and the National Nuclear Security Administration) responsible for 13*0d0321e0SJeremy L Thompson // the planning and preparation of a capable exascale ecosystem, including 14*0d0321e0SJeremy L Thompson // software, applications, hardware, advanced system engineering and early 15*0d0321e0SJeremy L Thompson // testbed platforms, in support of the nation's exascale computing imperative. 16*0d0321e0SJeremy L Thompson 17*0d0321e0SJeremy L Thompson #include <ceed/ceed.h> 18*0d0321e0SJeremy L Thompson #include <hip/hip_runtime.h> 19*0d0321e0SJeremy L Thompson 20*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 21*0d0321e0SJeremy L Thompson // Kernel for set value on device 22*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 23*0d0321e0SJeremy L Thompson __global__ static void setValueK(CeedScalar * __restrict__ vec, CeedInt size, 24*0d0321e0SJeremy L Thompson CeedScalar val) { 25*0d0321e0SJeremy L Thompson int idx = threadIdx.x + blockDim.x * blockIdx.x; 26*0d0321e0SJeremy L Thompson if (idx >= size) 27*0d0321e0SJeremy L Thompson return; 28*0d0321e0SJeremy L Thompson vec[idx] = val; 29*0d0321e0SJeremy L Thompson } 30*0d0321e0SJeremy L Thompson 31*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 32*0d0321e0SJeremy L Thompson // Set value on device memory 33*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 34*0d0321e0SJeremy L Thompson extern "C" int CeedDeviceSetValue_Hip(CeedScalar* d_array, CeedInt length, 35*0d0321e0SJeremy L Thompson CeedScalar val) { 36*0d0321e0SJeremy L Thompson const int bsize = 512; 37*0d0321e0SJeremy L Thompson const int vecsize = length; 38*0d0321e0SJeremy L Thompson int gridsize = vecsize / bsize; 39*0d0321e0SJeremy L Thompson 40*0d0321e0SJeremy L Thompson if (bsize * gridsize < vecsize) 41*0d0321e0SJeremy L Thompson gridsize += 1; 42*0d0321e0SJeremy L Thompson hipLaunchKernelGGL(setValueK, dim3(gridsize), dim3(bsize), 0, 0, d_array, length, val); 43*0d0321e0SJeremy L Thompson return 0; 44*0d0321e0SJeremy L Thompson } 45*0d0321e0SJeremy L Thompson 46*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 47*0d0321e0SJeremy L Thompson // Kernel for taking reciprocal 48*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 49*0d0321e0SJeremy L Thompson __global__ static void rcpValueK(CeedScalar * __restrict__ vec, CeedInt size) { 50*0d0321e0SJeremy L Thompson int idx = threadIdx.x + blockDim.x * blockIdx.x; 51*0d0321e0SJeremy L Thompson if (idx >= size) 52*0d0321e0SJeremy L Thompson return; 53*0d0321e0SJeremy L Thompson if (fabs(vec[idx]) > 1E-16) 54*0d0321e0SJeremy L Thompson vec[idx] = 1./vec[idx]; 55*0d0321e0SJeremy L Thompson } 56*0d0321e0SJeremy L Thompson 57*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 58*0d0321e0SJeremy L Thompson // Take vector reciprocal in device memory 59*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 60*0d0321e0SJeremy L Thompson extern "C" int CeedDeviceReciprocal_Hip(CeedScalar* d_array, CeedInt length) { 61*0d0321e0SJeremy L Thompson const int bsize = 512; 62*0d0321e0SJeremy L Thompson const int vecsize = length; 63*0d0321e0SJeremy L Thompson int gridsize = vecsize / bsize; 64*0d0321e0SJeremy L Thompson 65*0d0321e0SJeremy L Thompson if (bsize * gridsize < vecsize) 66*0d0321e0SJeremy L Thompson gridsize += 1; 67*0d0321e0SJeremy L Thompson hipLaunchKernelGGL(rcpValueK, dim3(gridsize), dim3(bsize), 0, 0, d_array, length); 68*0d0321e0SJeremy L Thompson return 0; 69*0d0321e0SJeremy L Thompson } 70*0d0321e0SJeremy L Thompson 71*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 72*0d0321e0SJeremy L Thompson // Kernel for scale 73*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 74*0d0321e0SJeremy L Thompson __global__ static void scaleValueK(CeedScalar * __restrict__ x, CeedScalar alpha, 75*0d0321e0SJeremy L Thompson CeedInt size) { 76*0d0321e0SJeremy L Thompson int idx = threadIdx.x + blockDim.x * blockIdx.x; 77*0d0321e0SJeremy L Thompson if (idx >= size) 78*0d0321e0SJeremy L Thompson return; 79*0d0321e0SJeremy L Thompson x[idx] *= alpha; 80*0d0321e0SJeremy L Thompson } 81*0d0321e0SJeremy L Thompson 82*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 83*0d0321e0SJeremy L Thompson // Compute x = alpha x on device 84*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 85*0d0321e0SJeremy L Thompson extern "C" int CeedDeviceScale_Hip(CeedScalar *x_array, CeedScalar alpha, 86*0d0321e0SJeremy L Thompson CeedInt length) { 87*0d0321e0SJeremy L Thompson const int bsize = 512; 88*0d0321e0SJeremy L Thompson const int vecsize = length; 89*0d0321e0SJeremy L Thompson int gridsize = vecsize / bsize; 90*0d0321e0SJeremy L Thompson 91*0d0321e0SJeremy L Thompson if (bsize * gridsize < vecsize) 92*0d0321e0SJeremy L Thompson gridsize += 1; 93*0d0321e0SJeremy L Thompson hipLaunchKernelGGL(scaleValueK, dim3(gridsize), dim3(bsize), 0, 0, x_array, alpha, 94*0d0321e0SJeremy L Thompson length); 95*0d0321e0SJeremy L Thompson return 0; 96*0d0321e0SJeremy L Thompson } 97*0d0321e0SJeremy L Thompson 98*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 99*0d0321e0SJeremy L Thompson // Kernel for axpy 100*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 101*0d0321e0SJeremy L Thompson __global__ static void axpyValueK(CeedScalar * __restrict__ y, CeedScalar alpha, 102*0d0321e0SJeremy L Thompson CeedScalar * __restrict__ x, CeedInt size) { 103*0d0321e0SJeremy L Thompson int idx = threadIdx.x + blockDim.x * blockIdx.x; 104*0d0321e0SJeremy L Thompson if (idx >= size) 105*0d0321e0SJeremy L Thompson return; 106*0d0321e0SJeremy L Thompson y[idx] += alpha * x[idx]; 107*0d0321e0SJeremy L Thompson } 108*0d0321e0SJeremy L Thompson 109*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 110*0d0321e0SJeremy L Thompson // Compute y = alpha x + y on device 111*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 112*0d0321e0SJeremy L Thompson extern "C" int CeedDeviceAXPY_Hip(CeedScalar *y_array, CeedScalar alpha, 113*0d0321e0SJeremy L Thompson CeedScalar *x_array, CeedInt length) { 114*0d0321e0SJeremy L Thompson const int bsize = 512; 115*0d0321e0SJeremy L Thompson const int vecsize = length; 116*0d0321e0SJeremy L Thompson int gridsize = vecsize / bsize; 117*0d0321e0SJeremy L Thompson 118*0d0321e0SJeremy L Thompson if (bsize * gridsize < vecsize) 119*0d0321e0SJeremy L Thompson gridsize += 1; 120*0d0321e0SJeremy L Thompson hipLaunchKernelGGL(axpyValueK, dim3(gridsize), dim3(bsize), 0, 0, y_array, alpha, 121*0d0321e0SJeremy L Thompson x_array, length); 122*0d0321e0SJeremy L Thompson return 0; 123*0d0321e0SJeremy L Thompson } 124*0d0321e0SJeremy L Thompson 125*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 126*0d0321e0SJeremy L Thompson // Kernel for pointwise mult 127*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 128*0d0321e0SJeremy L Thompson __global__ static void pointwiseMultValueK(CeedScalar * __restrict__ w, 129*0d0321e0SJeremy L Thompson CeedScalar * x, CeedScalar * __restrict__ y, CeedInt size) { 130*0d0321e0SJeremy L Thompson int idx = threadIdx.x + blockDim.x * blockIdx.x; 131*0d0321e0SJeremy L Thompson if (idx >= size) 132*0d0321e0SJeremy L Thompson return; 133*0d0321e0SJeremy L Thompson w[idx] = x[idx] * y[idx]; 134*0d0321e0SJeremy L Thompson } 135*0d0321e0SJeremy L Thompson 136*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 137*0d0321e0SJeremy L Thompson // Compute the pointwise multiplication w = x .* y on device 138*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 139*0d0321e0SJeremy L Thompson extern "C" int CeedDevicePointwiseMult_Hip(CeedScalar *w_array, CeedScalar *x_array, 140*0d0321e0SJeremy L Thompson CeedScalar *y_array, CeedInt length) { 141*0d0321e0SJeremy L Thompson const int bsize = 512; 142*0d0321e0SJeremy L Thompson const int vecsize = length; 143*0d0321e0SJeremy L Thompson int gridsize = vecsize / bsize; 144*0d0321e0SJeremy L Thompson 145*0d0321e0SJeremy L Thompson if (bsize * gridsize < vecsize) 146*0d0321e0SJeremy L Thompson gridsize += 1; 147*0d0321e0SJeremy L Thompson hipLaunchKernelGGL(pointwiseMultValueK, dim3(gridsize), dim3(bsize), 0, 0, w_array, 148*0d0321e0SJeremy L Thompson x_array, y_array, length); 149*0d0321e0SJeremy L Thompson return 0; 150*0d0321e0SJeremy L Thompson } 151