1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 #include <ceed.h> 9 #include <hip/hip_runtime.h> 10 11 //------------------------------------------------------------------------------ 12 // Kernel for copy strided on device 13 //------------------------------------------------------------------------------ 14 __global__ static void copyStridedK(CeedScalar *__restrict__ vec, CeedSize start, CeedSize step, CeedSize size, CeedScalar *__restrict__ vec_copy) { 15 CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; 16 17 if (index >= size) return; 18 if ((index - start) % step == 0) vec_copy[index] = vec[index]; 19 } 20 21 //------------------------------------------------------------------------------ 22 // Copy strided on device memory 23 //------------------------------------------------------------------------------ 24 extern "C" int CeedDeviceCopyStrided_Hip(CeedScalar *d_array, CeedSize start, CeedSize step, CeedSize length, CeedScalar *d_copy_array) { 25 const int block_size = 512; 26 const CeedSize vec_size = length; 27 int grid_size = vec_size / block_size; 28 29 if (block_size * grid_size < vec_size) grid_size += 1; 30 hipLaunchKernelGGL(copyStridedK, dim3(grid_size), dim3(block_size), 0, 0, d_array, start, step, length, d_copy_array); 31 return 0; 32 } 33 34 //------------------------------------------------------------------------------ 35 // Kernel for set value on device 36 //------------------------------------------------------------------------------ 37 __global__ static void setValueK(CeedScalar *__restrict__ vec, CeedSize size, CeedScalar val) { 38 CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; 39 40 if (index >= size) return; 41 vec[index] = val; 42 } 43 44 //------------------------------------------------------------------------------ 45 // Set value on device memory 46 //------------------------------------------------------------------------------ 47 extern "C" int CeedDeviceSetValue_Hip(CeedScalar *d_array, CeedSize length, CeedScalar val) { 48 const int block_size = 512; 49 const CeedSize vec_size = length; 50 int grid_size = vec_size / block_size; 51 52 if (block_size * grid_size < vec_size) grid_size += 1; 53 hipLaunchKernelGGL(setValueK, dim3(grid_size), dim3(block_size), 0, 0, d_array, length, val); 54 return 0; 55 } 56 57 //------------------------------------------------------------------------------ 58 // Kernel for set value strided on device 59 //------------------------------------------------------------------------------ 60 __global__ static void setValueStridedK(CeedScalar *__restrict__ vec, CeedSize start, CeedSize step, CeedSize size, CeedScalar val) { 61 CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; 62 63 if (index >= size) return; 64 if ((index - start) % step == 0) vec[index] = val; 65 } 66 67 //------------------------------------------------------------------------------ 68 // Set value strided on device memory 69 //------------------------------------------------------------------------------ 70 extern "C" int CeedDeviceSetValueStrided_Hip(CeedScalar *d_array, CeedSize start, CeedSize step, CeedSize length, CeedScalar val) { 71 const int block_size = 512; 72 const CeedSize vec_size = length; 73 int grid_size = vec_size / block_size; 74 75 if (block_size * grid_size < vec_size) grid_size += 1; 76 hipLaunchKernelGGL(setValueStridedK, dim3(grid_size), dim3(block_size), 0, 0, d_array, start, step, length, val); 77 return 0; 78 } 79 80 //------------------------------------------------------------------------------ 81 // Kernel for taking reciprocal 82 //------------------------------------------------------------------------------ 83 __global__ static void rcpValueK(CeedScalar *__restrict__ vec, CeedSize size) { 84 CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; 85 86 if (index >= size) return; 87 if (fabs(vec[index]) > 1E-16) vec[index] = 1. / vec[index]; 88 } 89 90 //------------------------------------------------------------------------------ 91 // Take vector reciprocal in device memory 92 //------------------------------------------------------------------------------ 93 extern "C" int CeedDeviceReciprocal_Hip(CeedScalar *d_array, CeedSize length) { 94 const int block_size = 512; 95 const CeedSize vec_size = length; 96 int grid_size = vec_size / block_size; 97 98 if (block_size * grid_size < vec_size) grid_size += 1; 99 hipLaunchKernelGGL(rcpValueK, dim3(grid_size), dim3(block_size), 0, 0, d_array, length); 100 return 0; 101 } 102 103 //------------------------------------------------------------------------------ 104 // Kernel for scale 105 //------------------------------------------------------------------------------ 106 __global__ static void scaleValueK(CeedScalar *__restrict__ x, CeedScalar alpha, CeedSize size) { 107 CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; 108 109 if (index >= size) return; 110 x[index] *= alpha; 111 } 112 113 //------------------------------------------------------------------------------ 114 // Compute x = alpha x on device 115 //------------------------------------------------------------------------------ 116 extern "C" int CeedDeviceScale_Hip(CeedScalar *x_array, CeedScalar alpha, CeedSize length) { 117 const int block_size = 512; 118 const CeedSize vec_size = length; 119 int grid_size = vec_size / block_size; 120 121 if (block_size * grid_size < vec_size) grid_size += 1; 122 hipLaunchKernelGGL(scaleValueK, dim3(grid_size), dim3(block_size), 0, 0, x_array, alpha, length); 123 return 0; 124 } 125 126 //------------------------------------------------------------------------------ 127 // Kernel for axpy 128 //------------------------------------------------------------------------------ 129 __global__ static void axpyValueK(CeedScalar *__restrict__ y, CeedScalar alpha, CeedScalar *__restrict__ x, CeedSize size) { 130 CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; 131 132 if (index >= size) return; 133 y[index] += alpha * x[index]; 134 } 135 136 //------------------------------------------------------------------------------ 137 // Compute y = alpha x + y on device 138 //------------------------------------------------------------------------------ 139 extern "C" int CeedDeviceAXPY_Hip(CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_array, CeedSize length) { 140 const int block_size = 512; 141 const CeedSize vec_size = length; 142 int grid_size = vec_size / block_size; 143 144 if (block_size * grid_size < vec_size) grid_size += 1; 145 hipLaunchKernelGGL(axpyValueK, dim3(grid_size), dim3(block_size), 0, 0, y_array, alpha, x_array, length); 146 return 0; 147 } 148 149 //------------------------------------------------------------------------------ 150 // Kernel for axpby 151 //------------------------------------------------------------------------------ 152 __global__ static void axpbyValueK(CeedScalar *__restrict__ y, CeedScalar alpha, CeedScalar beta, CeedScalar *__restrict__ x, CeedSize size) { 153 CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; 154 155 if (index >= size) return; 156 y[index] = beta * y[index]; 157 y[index] += alpha * x[index]; 158 } 159 160 //------------------------------------------------------------------------------ 161 // Compute y = alpha x + beta y on device 162 //------------------------------------------------------------------------------ 163 extern "C" int CeedDeviceAXPBY_Hip(CeedScalar *y_array, CeedScalar alpha, CeedScalar beta, CeedScalar *x_array, CeedSize length) { 164 const int block_size = 512; 165 const CeedSize vec_size = length; 166 int grid_size = vec_size / block_size; 167 168 if (block_size * grid_size < vec_size) grid_size += 1; 169 hipLaunchKernelGGL(axpbyValueK, dim3(grid_size), dim3(block_size), 0, 0, y_array, alpha, beta, x_array, length); 170 return 0; 171 } 172 173 //------------------------------------------------------------------------------ 174 // Kernel for pointwise mult 175 //------------------------------------------------------------------------------ 176 __global__ static void pointwiseMultValueK(CeedScalar *__restrict__ w, CeedScalar *x, CeedScalar *__restrict__ y, CeedSize size) { 177 CeedSize index = threadIdx.x + (CeedSize)blockDim.x * blockIdx.x; 178 179 if (index >= size) return; 180 w[index] = x[index] * y[index]; 181 } 182 183 //------------------------------------------------------------------------------ 184 // Compute the pointwise multiplication w = x .* y on device 185 //------------------------------------------------------------------------------ 186 extern "C" int CeedDevicePointwiseMult_Hip(CeedScalar *w_array, CeedScalar *x_array, CeedScalar *y_array, CeedSize length) { 187 const int block_size = 512; 188 const CeedSize vec_size = length; 189 int grid_size = vec_size / block_size; 190 191 if (block_size * grid_size < vec_size) grid_size += 1; 192 hipLaunchKernelGGL(pointwiseMultValueK, dim3(grid_size), dim3(block_size), 0, 0, w_array, x_array, y_array, length); 193 return 0; 194 } 195 196 //------------------------------------------------------------------------------ 197