1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 #include <ceed.h> 9 #include <ceed/backend.h> 10 #include <ceed/jit-tools.h> 11 #include <hip/hip_runtime.h> 12 #include <stdbool.h> 13 #include <stddef.h> 14 #include <string.h> 15 16 #include "../hip/ceed-hip-common.h" 17 #include "../hip/ceed-hip-compile.h" 18 #include "ceed-hip-ref.h" 19 20 //------------------------------------------------------------------------------ 21 // Apply restriction 22 //------------------------------------------------------------------------------ 23 static int CeedElemRestrictionApply_Hip(CeedElemRestriction r, CeedTransposeMode t_mode, CeedVector u, CeedVector v, CeedRequest *request) { 24 CeedElemRestriction_Hip *impl; 25 CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); 26 Ceed ceed; 27 CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); 28 Ceed_Hip *data; 29 CeedCallBackend(CeedGetData(ceed, &data)); 30 const CeedInt block_size = 64; 31 const CeedInt num_nodes = impl->num_nodes; 32 CeedInt num_elem, elem_size; 33 CeedElemRestrictionGetNumElements(r, &num_elem); 34 CeedCallBackend(CeedElemRestrictionGetElementSize(r, &elem_size)); 35 hipFunction_t kernel; 36 37 // Get vectors 38 const CeedScalar *d_u; 39 CeedScalar *d_v; 40 CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 41 if (t_mode == CEED_TRANSPOSE) { 42 // Sum into for transpose mode, e-vec to l-vec 43 CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 44 } else { 45 // Overwrite for notranspose mode, l-vec to e-vec 46 CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 47 } 48 49 // Restrict 50 if (t_mode == CEED_NOTRANSPOSE) { 51 // L-vector -> E-vector 52 if (impl->d_ind) { 53 // -- Offsets provided 54 kernel = impl->OffsetNoTranspose; 55 void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; 56 CeedInt block_size = elem_size < 256 ? (elem_size > 64 ? elem_size : 64) : 256; 57 CeedCallBackend(CeedRunKernelHip(ceed, kernel, CeedDivUpInt(num_nodes, block_size), block_size, args)); 58 } else { 59 // -- Strided restriction 60 kernel = impl->StridedNoTranspose; 61 void *args[] = {&num_elem, &d_u, &d_v}; 62 CeedInt block_size = elem_size < 256 ? (elem_size > 64 ? elem_size : 64) : 256; 63 CeedCallBackend(CeedRunKernelHip(ceed, kernel, CeedDivUpInt(num_nodes, block_size), block_size, args)); 64 } 65 } else { 66 // E-vector -> L-vector 67 if (impl->d_ind) { 68 // -- Offsets provided 69 kernel = impl->OffsetTranspose; 70 void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &d_u, &d_v}; 71 CeedCallBackend(CeedRunKernelHip(ceed, kernel, CeedDivUpInt(num_nodes, block_size), block_size, args)); 72 } else { 73 // -- Strided restriction 74 kernel = impl->StridedTranspose; 75 void *args[] = {&num_elem, &d_u, &d_v}; 76 CeedCallBackend(CeedRunKernelHip(ceed, kernel, CeedDivUpInt(num_nodes, block_size), block_size, args)); 77 } 78 } 79 80 if (request != CEED_REQUEST_IMMEDIATE && request != CEED_REQUEST_ORDERED) *request = NULL; 81 82 // Restore arrays 83 CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 84 CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 85 return CEED_ERROR_SUCCESS; 86 } 87 88 //------------------------------------------------------------------------------ 89 // Blocked not supported 90 //------------------------------------------------------------------------------ 91 int CeedElemRestrictionApplyBlock_Hip(CeedElemRestriction r, CeedInt block, CeedTransposeMode t_mode, CeedVector u, CeedVector v, 92 CeedRequest *request) { 93 // LCOV_EXCL_START 94 Ceed ceed; 95 CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); 96 return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement blocked restrictions"); 97 // LCOV_EXCL_STOP 98 } 99 100 //------------------------------------------------------------------------------ 101 // Get offsets 102 //------------------------------------------------------------------------------ 103 static int CeedElemRestrictionGetOffsets_Hip(CeedElemRestriction rstr, CeedMemType mtype, const CeedInt **offsets) { 104 CeedElemRestriction_Hip *impl; 105 CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 106 107 switch (mtype) { 108 case CEED_MEM_HOST: 109 *offsets = impl->h_ind; 110 break; 111 case CEED_MEM_DEVICE: 112 *offsets = impl->d_ind; 113 break; 114 } 115 return CEED_ERROR_SUCCESS; 116 } 117 118 //------------------------------------------------------------------------------ 119 // Destroy restriction 120 //------------------------------------------------------------------------------ 121 static int CeedElemRestrictionDestroy_Hip(CeedElemRestriction r) { 122 CeedElemRestriction_Hip *impl; 123 CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); 124 125 Ceed ceed; 126 CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); 127 CeedCallHip(ceed, hipModuleUnload(impl->module)); 128 CeedCallBackend(CeedFree(&impl->h_ind_allocated)); 129 CeedCallHip(ceed, hipFree(impl->d_ind_allocated)); 130 CeedCallHip(ceed, hipFree(impl->d_t_offsets)); 131 CeedCallHip(ceed, hipFree(impl->d_t_indices)); 132 CeedCallHip(ceed, hipFree(impl->d_l_vec_indices)); 133 CeedCallBackend(CeedFree(&impl)); 134 135 return CEED_ERROR_SUCCESS; 136 } 137 138 //------------------------------------------------------------------------------ 139 // Create transpose offsets and indices 140 //------------------------------------------------------------------------------ 141 static int CeedElemRestrictionOffset_Hip(const CeedElemRestriction r, const CeedInt *indices) { 142 Ceed ceed; 143 CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); 144 CeedElemRestriction_Hip *impl; 145 CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); 146 CeedSize l_size; 147 CeedInt num_elem, elem_size, num_comp; 148 CeedCallBackend(CeedElemRestrictionGetNumElements(r, &num_elem)); 149 CeedCallBackend(CeedElemRestrictionGetElementSize(r, &elem_size)); 150 CeedCallBackend(CeedElemRestrictionGetLVectorSize(r, &l_size)); 151 CeedCallBackend(CeedElemRestrictionGetNumComponents(r, &num_comp)); 152 153 // Count num_nodes 154 bool *is_node; 155 CeedCallBackend(CeedCalloc(l_size, &is_node)); 156 const CeedInt size_indices = num_elem * elem_size; 157 for (CeedInt i = 0; i < size_indices; i++) is_node[indices[i]] = 1; 158 CeedInt num_nodes = 0; 159 for (CeedInt i = 0; i < l_size; i++) num_nodes += is_node[i]; 160 impl->num_nodes = num_nodes; 161 162 // L-vector offsets array 163 CeedInt *ind_to_offset, *l_vec_indices; 164 CeedCallBackend(CeedCalloc(l_size, &ind_to_offset)); 165 CeedCallBackend(CeedCalloc(num_nodes, &l_vec_indices)); 166 CeedInt j = 0; 167 for (CeedInt i = 0; i < l_size; i++) { 168 if (is_node[i]) { 169 l_vec_indices[j] = i; 170 ind_to_offset[i] = j++; 171 } 172 } 173 CeedCallBackend(CeedFree(&is_node)); 174 175 // Compute transpose offsets and indices 176 const CeedInt size_offsets = num_nodes + 1; 177 CeedInt *t_offsets; 178 CeedCallBackend(CeedCalloc(size_offsets, &t_offsets)); 179 CeedInt *t_indices; 180 CeedCallBackend(CeedMalloc(size_indices, &t_indices)); 181 // Count node multiplicity 182 for (CeedInt e = 0; e < num_elem; ++e) { 183 for (CeedInt i = 0; i < elem_size; ++i) ++t_offsets[ind_to_offset[indices[elem_size * e + i]] + 1]; 184 } 185 // Convert to running sum 186 for (CeedInt i = 1; i < size_offsets; ++i) t_offsets[i] += t_offsets[i - 1]; 187 // List all E-vec indices associated with L-vec node 188 for (CeedInt e = 0; e < num_elem; ++e) { 189 for (CeedInt i = 0; i < elem_size; ++i) { 190 const CeedInt lid = elem_size * e + i; 191 const CeedInt gid = indices[lid]; 192 t_indices[t_offsets[ind_to_offset[gid]]++] = lid; 193 } 194 } 195 // Reset running sum 196 for (int i = size_offsets - 1; i > 0; --i) t_offsets[i] = t_offsets[i - 1]; 197 t_offsets[0] = 0; 198 199 // Copy data to device 200 // -- L-vector indices 201 CeedCallHip(ceed, hipMalloc((void **)&impl->d_l_vec_indices, num_nodes * sizeof(CeedInt))); 202 CeedCallHip(ceed, hipMemcpy(impl->d_l_vec_indices, l_vec_indices, num_nodes * sizeof(CeedInt), hipMemcpyHostToDevice)); 203 // -- Transpose offsets 204 CeedCallHip(ceed, hipMalloc((void **)&impl->d_t_offsets, size_offsets * sizeof(CeedInt))); 205 CeedCallHip(ceed, hipMemcpy(impl->d_t_offsets, t_offsets, size_offsets * sizeof(CeedInt), hipMemcpyHostToDevice)); 206 // -- Transpose indices 207 CeedCallHip(ceed, hipMalloc((void **)&impl->d_t_indices, size_indices * sizeof(CeedInt))); 208 CeedCallHip(ceed, hipMemcpy(impl->d_t_indices, t_indices, size_indices * sizeof(CeedInt), hipMemcpyHostToDevice)); 209 210 // Cleanup 211 CeedCallBackend(CeedFree(&ind_to_offset)); 212 CeedCallBackend(CeedFree(&l_vec_indices)); 213 CeedCallBackend(CeedFree(&t_offsets)); 214 CeedCallBackend(CeedFree(&t_indices)); 215 216 return CEED_ERROR_SUCCESS; 217 } 218 219 //------------------------------------------------------------------------------ 220 // Create restriction 221 //------------------------------------------------------------------------------ 222 int CeedElemRestrictionCreate_Hip(CeedMemType mtype, CeedCopyMode cmode, const CeedInt *indices, CeedElemRestriction r) { 223 Ceed ceed; 224 CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); 225 CeedElemRestriction_Hip *impl; 226 CeedCallBackend(CeedCalloc(1, &impl)); 227 CeedInt num_elem, num_comp, elem_size; 228 CeedCallBackend(CeedElemRestrictionGetNumElements(r, &num_elem)); 229 CeedCallBackend(CeedElemRestrictionGetNumComponents(r, &num_comp)); 230 CeedCallBackend(CeedElemRestrictionGetElementSize(r, &elem_size)); 231 CeedInt size = num_elem * elem_size; 232 CeedInt strides[3] = {1, size, elem_size}; 233 CeedInt comp_stride = 1; 234 235 // Stride data 236 bool is_strided; 237 CeedCallBackend(CeedElemRestrictionIsStrided(r, &is_strided)); 238 if (is_strided) { 239 bool has_backend_strides; 240 CeedCallBackend(CeedElemRestrictionHasBackendStrides(r, &has_backend_strides)); 241 if (!has_backend_strides) { 242 CeedCallBackend(CeedElemRestrictionGetStrides(r, &strides)); 243 } 244 } else { 245 CeedCallBackend(CeedElemRestrictionGetCompStride(r, &comp_stride)); 246 } 247 248 impl->h_ind = NULL; 249 impl->h_ind_allocated = NULL; 250 impl->d_ind = NULL; 251 impl->d_ind_allocated = NULL; 252 impl->d_t_indices = NULL; 253 impl->d_t_offsets = NULL; 254 impl->num_nodes = size; 255 CeedCallBackend(CeedElemRestrictionSetData(r, impl)); 256 CeedInt layout[3] = {1, elem_size * num_elem, elem_size}; 257 CeedCallBackend(CeedElemRestrictionSetELayout(r, layout)); 258 259 // Set up device indices/offset arrays 260 switch (mtype) { 261 case CEED_MEM_HOST: { 262 switch (cmode) { 263 case CEED_OWN_POINTER: 264 impl->h_ind_allocated = (CeedInt *)indices; 265 impl->h_ind = (CeedInt *)indices; 266 break; 267 case CEED_USE_POINTER: 268 impl->h_ind = (CeedInt *)indices; 269 break; 270 case CEED_COPY_VALUES: 271 if (indices != NULL) { 272 CeedCallBackend(CeedMalloc(elem_size * num_elem, &impl->h_ind_allocated)); 273 memcpy(impl->h_ind_allocated, indices, elem_size * num_elem * sizeof(CeedInt)); 274 impl->h_ind = impl->h_ind_allocated; 275 } 276 break; 277 } 278 if (indices != NULL) { 279 CeedCallHip(ceed, hipMalloc((void **)&impl->d_ind, size * sizeof(CeedInt))); 280 impl->d_ind_allocated = impl->d_ind; // We own the device memory 281 CeedCallHip(ceed, hipMemcpy(impl->d_ind, indices, size * sizeof(CeedInt), hipMemcpyHostToDevice)); 282 CeedCallBackend(CeedElemRestrictionOffset_Hip(r, indices)); 283 } 284 break; 285 } 286 case CEED_MEM_DEVICE: { 287 switch (cmode) { 288 case CEED_COPY_VALUES: 289 if (indices != NULL) { 290 CeedCallHip(ceed, hipMalloc((void **)&impl->d_ind, size * sizeof(CeedInt))); 291 impl->d_ind_allocated = impl->d_ind; // We own the device memory 292 CeedCallHip(ceed, hipMemcpy(impl->d_ind, indices, size * sizeof(CeedInt), hipMemcpyDeviceToDevice)); 293 } 294 break; 295 case CEED_OWN_POINTER: 296 impl->d_ind = (CeedInt *)indices; 297 impl->d_ind_allocated = impl->d_ind; 298 break; 299 case CEED_USE_POINTER: 300 impl->d_ind = (CeedInt *)indices; 301 } 302 if (indices != NULL) { 303 CeedCallBackend(CeedMalloc(elem_size * num_elem, &impl->h_ind_allocated)); 304 CeedCallHip(ceed, hipMemcpy(impl->h_ind_allocated, impl->d_ind, elem_size * num_elem * sizeof(CeedInt), hipMemcpyDeviceToHost)); 305 impl->h_ind = impl->h_ind_allocated; 306 CeedCallBackend(CeedElemRestrictionOffset_Hip(r, indices)); 307 } 308 break; 309 } 310 // LCOV_EXCL_START 311 default: 312 return CeedError(ceed, CEED_ERROR_BACKEND, "Only MemType = HOST or DEVICE supported"); 313 // LCOV_EXCL_STOP 314 } 315 316 // Compile HIP kernels 317 CeedInt num_nodes = impl->num_nodes; 318 char *restriction_kernel_path, *restriction_kernel_source; 319 CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-restriction.h", &restriction_kernel_path)); 320 CeedDebug256(ceed, 2, "----- Loading Restriction Kernel Source -----\n"); 321 CeedCallBackend(CeedLoadSourceToBuffer(ceed, restriction_kernel_path, &restriction_kernel_source)); 322 CeedDebug256(ceed, 2, "----- Loading Restriction Kernel Source Complete! -----\n"); 323 CeedCallBackend(CeedCompileHip(ceed, restriction_kernel_source, &impl->module, 8, "RESTR_ELEM_SIZE", elem_size, "RESTR_NUM_ELEM", num_elem, 324 "RESTR_NUM_COMP", num_comp, "RESTR_NUM_NODES", num_nodes, "RESTR_COMP_STRIDE", comp_stride, "RESTR_STRIDE_NODES", 325 strides[0], "RESTR_STRIDE_COMP", strides[1], "RESTR_STRIDE_ELEM", strides[2])); 326 CeedCallBackend(CeedGetKernelHip(ceed, impl->module, "StridedNoTranspose", &impl->StridedNoTranspose)); 327 CeedCallBackend(CeedGetKernelHip(ceed, impl->module, "OffsetNoTranspose", &impl->OffsetNoTranspose)); 328 CeedCallBackend(CeedGetKernelHip(ceed, impl->module, "StridedTranspose", &impl->StridedTranspose)); 329 CeedCallBackend(CeedGetKernelHip(ceed, impl->module, "OffsetTranspose", &impl->OffsetTranspose)); 330 CeedCallBackend(CeedFree(&restriction_kernel_path)); 331 CeedCallBackend(CeedFree(&restriction_kernel_source)); 332 333 // Register backend functions 334 CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "Apply", CeedElemRestrictionApply_Hip)); 335 CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "ApplyBlock", CeedElemRestrictionApplyBlock_Hip)); 336 CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "GetOffsets", CeedElemRestrictionGetOffsets_Hip)); 337 CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "Destroy", CeedElemRestrictionDestroy_Hip)); 338 return CEED_ERROR_SUCCESS; 339 } 340 341 //------------------------------------------------------------------------------ 342 // Blocked not supported 343 //------------------------------------------------------------------------------ 344 int CeedElemRestrictionCreateBlocked_Hip(const CeedMemType mtype, const CeedCopyMode cmode, const CeedInt *indices, CeedElemRestriction r) { 345 Ceed ceed; 346 CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); 347 return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement blocked restrictions"); 348 } 349 //------------------------------------------------------------------------------ 350