// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

#include <ceed.h>
#include <hip/hip_runtime.h>

//------------------------------------------------------------------------------
// Kernel for copy strided on device
//
// Copies vec[i] into vec_copy[i] for i = start, start + step, start + 2*step,
// ... while i < size. Launched with a 1D grid, one thread per candidate index.
//------------------------------------------------------------------------------
__global__ static void copyStridedK(CeedScalar *__restrict__ vec, CeedSize start, CeedSize step, CeedSize size, CeedScalar *__restrict__ vec_copy) {
  const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;

  // The index >= start guard is required: for index < start the dividend
  // (index - start) is negative, and in C++ a negative remainder can still be
  // 0 (e.g. start = 2, step = 2, index = 0 gives (-2) % 2 == 0), which would
  // incorrectly copy entries before the starting index.
  if (index >= start && index < size) {
    if ((index - start) % step == 0) vec_copy[index] = vec[index];
  }
}

//------------------------------------------------------------------------------
// Copy strided on device memory
//
// d_array, d_copy_array: device pointers with at least `length` scalars
// start, step:           strided index pattern (see copyStridedK)
//
// Always returns 0 (CEED success).
//------------------------------------------------------------------------------
extern "C" int CeedDeviceCopyStrided_Hip(CeedScalar *d_array, CeedSize start, CeedSize step, CeedSize length, CeedScalar *d_copy_array) {
  const int block_size = 512;
  // Ceiling division in 64-bit CeedSize arithmetic. The previous int-typed
  // `length / block_size` (plus an int `block_size * grid_size` round-up test)
  // truncated the quotient and could overflow 32 bits for very long vectors.
  const CeedSize grid_size = (length + block_size - 1) / block_size;

  hipLaunchKernelGGL(copyStridedK, dim3(grid_size), dim3(block_size), 0, 0, d_array, start, step, length, d_copy_array);
  return 0;
}

//------------------------------------------------------------------------------
// Kernel for set value on device
//
// Sets every entry vec[0 .. size-1] to val. One thread per entry.
//------------------------------------------------------------------------------
__global__ static void setValueK(CeedScalar *__restrict__ vec, CeedSize size, CeedScalar val) {
  const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;

  if (index < size) vec[index] = val;
}

//------------------------------------------------------------------------------
// Set value on device memory
//
// Sets all `length` entries of the device array d_array to val.
// Always returns 0 (CEED success).
//------------------------------------------------------------------------------
extern "C" int CeedDeviceSetValue_Hip(CeedScalar *d_array, CeedSize length, CeedScalar val) {
  const int block_size = 512;
  // Ceiling division in 64-bit CeedSize arithmetic; the previous int-typed
  // computation truncated and could overflow for vectors over ~2^31 entries.
  const CeedSize grid_size = (length + block_size - 1) / block_size;

  hipLaunchKernelGGL(setValueK, dim3(grid_size), dim3(block_size), 0, 0, d_array, length, val);
  return 0;
}

//------------------------------------------------------------------------------
// Kernel for set value strided on device
//
// Sets vec[i] = val for i = start, start + step, start + 2*step, ...
// while i < size. One thread per candidate index.
//------------------------------------------------------------------------------
__global__ static void setValueStridedK(CeedScalar *__restrict__ vec, CeedSize start, CeedSize step, CeedSize size, CeedScalar val) {
  const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;

  // The index >= start guard is required: for index < start the dividend
  // (index - start) is negative, and in C++ a negative remainder can still be
  // 0 (e.g. start = 2, step = 2, index = 0 gives (-2) % 2 == 0), which would
  // incorrectly write entries before the starting index.
  if (index >= start && index < size) {
    if ((index - start) % step == 0) vec[index] = val;
  }
}

//------------------------------------------------------------------------------
// Set value strided on device memory
//
// Sets d_array[i] = val for i = start, start + step, ... while i < length.
// Always returns 0 (CEED success).
//------------------------------------------------------------------------------
extern "C" int CeedDeviceSetValueStrided_Hip(CeedScalar *d_array, CeedSize start, CeedSize step, CeedSize length, CeedScalar val) {
  const int block_size = 512;
  // 64-bit ceiling division avoids the int truncation/overflow of the
  // previous `length / block_size` + round-up idiom.
  const CeedSize grid_size = (length + block_size - 1) / block_size;

  hipLaunchKernelGGL(setValueStridedK, dim3(grid_size), dim3(block_size), 0, 0, d_array, start, step, length, val);
  return 0;
}

//------------------------------------------------------------------------------
// Kernel for taking reciprocal
//
// Replaces each entry with its reciprocal; entries with magnitude <= 1E-16
// are left unchanged to avoid division by (near) zero.
//------------------------------------------------------------------------------
__global__ static void rcpValueK(CeedScalar *__restrict__ vec, CeedSize size) {
  const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;

  if (index < size) {
    if (fabs(vec[index]) > 1E-16) vec[index] = 1. / vec[index];
  }
}

//------------------------------------------------------------------------------
// Take vector reciprocal in device memory
//
// In-place: d_array[i] = 1 / d_array[i] for all nonzero-magnitude entries.
// Always returns 0 (CEED success).
//------------------------------------------------------------------------------
extern "C" int CeedDeviceReciprocal_Hip(CeedScalar *d_array, CeedSize length) {
  const int block_size = 512;
  const CeedSize grid_size = (length + block_size - 1) / block_size;  // 64-bit ceil-div

  hipLaunchKernelGGL(rcpValueK, dim3(grid_size), dim3(block_size), 0, 0, d_array, length);
  return 0;
}

//------------------------------------------------------------------------------
// Kernel for scale
//
// In-place scaling: x[i] *= alpha. One thread per entry.
//------------------------------------------------------------------------------
__global__ static void scaleValueK(CeedScalar *__restrict__ x, CeedScalar alpha, CeedSize size) {
  const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;

  if (index < size) x[index] *= alpha;
}

//------------------------------------------------------------------------------
// Compute x = alpha x on device
//
// Always returns 0 (CEED success).
//------------------------------------------------------------------------------
extern "C" int CeedDeviceScale_Hip(CeedScalar *x_array, CeedScalar alpha, CeedSize length) {
  const int block_size = 512;
  const CeedSize grid_size = (length + block_size - 1) / block_size;  // 64-bit ceil-div

  hipLaunchKernelGGL(scaleValueK, dim3(grid_size), dim3(block_size), 0, 0, x_array, alpha, length);
  return 0;
}

//------------------------------------------------------------------------------
// Kernel for axpy
//------------------------------------------------------------------------------
// y[i] += alpha * x[i]. One thread per entry; x is read-only.
//------------------------------------------------------------------------------
__global__ static void axpyValueK(CeedScalar *__restrict__ y, CeedScalar alpha, const CeedScalar *__restrict__ x, CeedSize size) {
  const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;

  if (index < size) y[index] += alpha * x[index];
}

//------------------------------------------------------------------------------
// Compute y = alpha x + y on device
//
// Always returns 0 (CEED success).
//------------------------------------------------------------------------------
extern "C" int CeedDeviceAXPY_Hip(CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_array, CeedSize length) {
  const int block_size = 512;
  // Ceiling division in 64-bit CeedSize arithmetic; the previous int-typed
  // computation truncated and could overflow for vectors over ~2^31 entries.
  const CeedSize grid_size = (length + block_size - 1) / block_size;

  hipLaunchKernelGGL(axpyValueK, dim3(grid_size), dim3(block_size), 0, 0, y_array, alpha, x_array, length);
  return 0;
}

//------------------------------------------------------------------------------
// Kernel for axpby
//
// y[i] = alpha * x[i] + beta * y[i]. One thread per entry; x is read-only.
//------------------------------------------------------------------------------
__global__ static void axpbyValueK(CeedScalar *__restrict__ y, CeedScalar alpha, CeedScalar beta, const CeedScalar *__restrict__ x, CeedSize size) {
  const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;

  if (index < size) {
    // Scale y first, then accumulate, to match the original evaluation order.
    y[index] = beta * y[index];
    y[index] += alpha * x[index];
  }
}

//------------------------------------------------------------------------------
// Compute y = alpha x + beta y on device
//
// Always returns 0 (CEED success).
//------------------------------------------------------------------------------
extern "C" int CeedDeviceAXPBY_Hip(CeedScalar *y_array, CeedScalar alpha, CeedScalar beta, CeedScalar *x_array, CeedSize length) {
  const int block_size = 512;
  const CeedSize grid_size = (length + block_size - 1) / block_size;  // 64-bit ceil-div

  hipLaunchKernelGGL(axpbyValueK, dim3(grid_size), dim3(block_size), 0, 0, y_array, alpha, beta, x_array, length);
  return 0;
}

//------------------------------------------------------------------------------
// Kernel for pointwise mult
//
// w[i] = x[i] * y[i]. One thread per entry; x and y are read-only (the
// original x parameter also lacked __restrict__, now consistent with y).
//------------------------------------------------------------------------------
__global__ static void pointwiseMultValueK(CeedScalar *__restrict__ w, const CeedScalar *x, const CeedScalar *__restrict__ y, CeedSize size) {
  const CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x;

  if (index < size) w[index] = x[index] * y[index];
}

//------------------------------------------------------------------------------
// Compute the pointwise multiplication w = x .* y on device
//
// Always returns 0 (CEED success).
//------------------------------------------------------------------------------
extern "C" int CeedDevicePointwiseMult_Hip(CeedScalar *w_array, CeedScalar *x_array, CeedScalar *y_array, CeedSize length) {
  const int block_size = 512;
  const CeedSize grid_size = (length + block_size - 1) / block_size;  // 64-bit ceil-div

  hipLaunchKernelGGL(pointwiseMultValueK, dim3(grid_size), dim3(block_size), 0, 0, w_array, x_array, y_array, length);
  return 0;
}

//------------------------------------------------------------------------------