1 // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 2 // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 3 // All Rights reserved. See files LICENSE and NOTICE for details. 4 // 5 // This file is part of CEED, a collection of benchmarks, miniapps, software 6 // libraries and APIs for efficient high-order finite element and spectral 7 // element discretizations for exascale applications. For more information and 8 // source code availability see http://github.com/ceed. 9 // 10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11 // a collaborative effort of two U.S. Department of Energy organizations (Office 12 // of Science and the National Nuclear Security Administration) responsible for 13 // the planning and preparation of a capable exascale ecosystem, including 14 // software, applications, hardware, advanced system engineering and early 15 // testbed platforms, in support of the nation's exascale computing imperative. 16 17 #include <ceed/ceed.h> 18 #include <ceed/backend.h> 19 #include <ceed/jit-tools.h> 20 #include <hip/hip_runtime.h> 21 #include <stdbool.h> 22 #include <stddef.h> 23 #include "ceed-hip-ref.h" 24 #include "../hip/ceed-hip-compile.h" 25 26 //------------------------------------------------------------------------------ 27 // Apply restriction 28 //------------------------------------------------------------------------------ 29 static int CeedElemRestrictionApply_Hip(CeedElemRestriction r, 30 CeedTransposeMode t_mode, CeedVector u, 31 CeedVector v, CeedRequest *request) { 32 int ierr; 33 CeedElemRestriction_Hip *impl; 34 ierr = CeedElemRestrictionGetData(r, &impl); CeedChkBackend(ierr); 35 Ceed ceed; 36 ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr); 37 Ceed_Hip *data; 38 ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr); 39 const CeedInt block_size = 64; 40 const CeedInt num_nodes = impl->num_nodes; 41 CeedInt num_elem, elem_size; 42 CeedElemRestrictionGetNumElements(r, &num_elem); 43 ierr = CeedElemRestrictionGetElementSize(r, &elem_size); CeedChkBackend(ierr); 44 hipFunction_t kernel; 45 46 // Get vectors 47 const CeedScalar *d_u; 48 CeedScalar *d_v; 49 ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChkBackend(ierr); 50 if (t_mode == CEED_TRANSPOSE) { 51 // Sum into for transpose mode, e-vec to l-vec 52 ierr = CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr); 53 } else { 54 // Overwrite for notranspose mode, l-vec to e-vec 55 ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr); 56 } 57 58 // Restrict 59 if (t_mode == CEED_NOTRANSPOSE) { 60 // L-vector -> E-vector 61 if (impl->d_ind) { 62 // -- Offsets provided 63 kernel = impl->OffsetNoTranspose; 64 void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; 65 CeedInt block_size = elem_size < 256 ? (elem_size > 64 ? elem_size : 64) : 256; 66 ierr = CeedRunKernelHip(ceed, kernel, CeedDivUpInt(num_nodes, block_size), 67 block_size, args); CeedChkBackend(ierr); 68 } else { 69 // -- Strided restriction 70 kernel = impl->StridedNoTranspose; 71 void *args[] = {&num_elem, &d_u, &d_v}; 72 CeedInt block_size = elem_size < 256 ? (elem_size > 64 ? elem_size : 64) : 256; 73 ierr = CeedRunKernelHip(ceed, kernel, CeedDivUpInt(num_nodes, block_size), 74 block_size, args); CeedChkBackend(ierr); 75 } 76 } else { 77 // E-vector -> L-vector 78 if (impl->d_ind) { 79 // -- Offsets provided 80 kernel = impl->OffsetTranspose; 81 void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, 82 &impl->d_t_offsets, &d_u, &d_v 83 }; 84 ierr = CeedRunKernelHip(ceed, kernel, CeedDivUpInt(num_nodes, block_size), 85 block_size, args); CeedChkBackend(ierr); 86 } else { 87 // -- Strided restriction 88 kernel = impl->StridedTranspose; 89 void *args[] = {&num_elem, &d_u, &d_v}; 90 ierr = CeedRunKernelHip(ceed, kernel, CeedDivUpInt(num_nodes, block_size), 91 block_size, args); CeedChkBackend(ierr); 92 } 93 } 94 95 if (request != CEED_REQUEST_IMMEDIATE && request != CEED_REQUEST_ORDERED) 96 *request = NULL; 97 98 // Restore arrays 99 ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr); 100 ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr); 101 return CEED_ERROR_SUCCESS; 102 } 103 104 //------------------------------------------------------------------------------ 105 // Blocked not supported 106 //------------------------------------------------------------------------------ 107 int CeedElemRestrictionApplyBlock_Hip(CeedElemRestriction r, CeedInt block, 108 CeedTransposeMode t_mode, CeedVector u, 109 CeedVector v, CeedRequest *request) { 110 // LCOV_EXCL_START 111 int ierr; 112 Ceed ceed; 113 ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr); 114 return CeedError(ceed, CEED_ERROR_BACKEND, 115 "Backend does not implement blocked restrictions"); 116 // LCOV_EXCL_STOP 117 } 118 119 //------------------------------------------------------------------------------ 120 // Get offsets 121 //------------------------------------------------------------------------------ 122 static int CeedElemRestrictionGetOffsets_Hip(CeedElemRestriction rstr, 123 CeedMemType mtype, const CeedInt **offsets) { 124 int ierr; 125 CeedElemRestriction_Hip *impl; 126 ierr = CeedElemRestrictionGetData(rstr, &impl); CeedChkBackend(ierr); 127 128 switch (mtype) { 129 case CEED_MEM_HOST: 130 *offsets = impl->h_ind; 131 break; 132 case CEED_MEM_DEVICE: 133 *offsets = impl->d_ind; 134 break; 135 } 136 return CEED_ERROR_SUCCESS; 137 } 138 139 //------------------------------------------------------------------------------ 140 // Destroy restriction 141 //------------------------------------------------------------------------------ 142 static int CeedElemRestrictionDestroy_Hip(CeedElemRestriction r) { 143 int ierr; 144 CeedElemRestriction_Hip *impl; 145 ierr = CeedElemRestrictionGetData(r, &impl); CeedChkBackend(ierr); 146 147 Ceed ceed; 148 ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr); 149 ierr = hipModuleUnload(impl->module); CeedChk_Hip(ceed, ierr); 150 ierr = CeedFree(&impl->h_ind_allocated); CeedChkBackend(ierr); 151 ierr = hipFree(impl->d_ind_allocated); CeedChk_Hip(ceed, ierr); 152 ierr = hipFree(impl->d_t_offsets); CeedChk_Hip(ceed, ierr); 153 ierr = hipFree(impl->d_t_indices); CeedChk_Hip(ceed, ierr); 154 ierr = hipFree(impl->d_l_vec_indices); CeedChk_Hip(ceed, ierr); 155 ierr = CeedFree(&impl); CeedChkBackend(ierr); 156 157 return CEED_ERROR_SUCCESS; 158 } 159 160 //------------------------------------------------------------------------------ 161 // Create transpose offsets and indices 162 //------------------------------------------------------------------------------ 163 static int CeedElemRestrictionOffset_Hip(const CeedElemRestriction r, 164 const CeedInt *indices) { 165 int ierr; 166 Ceed ceed; 167 ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr); 168 CeedElemRestriction_Hip *impl; 169 ierr = CeedElemRestrictionGetData(r, &impl); CeedChkBackend(ierr); 170 CeedInt num_elem, elem_size, l_size, num_comp; 171 ierr = CeedElemRestrictionGetNumElements(r, &num_elem); CeedChkBackend(ierr); 172 ierr = CeedElemRestrictionGetElementSize(r, &elem_size); CeedChkBackend(ierr); 173 ierr = CeedElemRestrictionGetLVectorSize(r, &l_size); CeedChkBackend(ierr); 174 ierr = CeedElemRestrictionGetNumComponents(r, &num_comp); CeedChkBackend(ierr); 175 176 // Count num_nodes 177 bool *is_node; 178 ierr = CeedCalloc(l_size, &is_node); CeedChkBackend(ierr); 179 const CeedInt size_indices = num_elem * elem_size; 180 for (CeedInt i = 0; i < size_indices; i++) 181 is_node[indices[i]] = 1; 182 CeedInt num_nodes = 0; 183 for (CeedInt i = 0; i < l_size; i++) 184 num_nodes += is_node[i]; 185 impl->num_nodes = num_nodes; 186 187 // L-vector offsets array 188 CeedInt *ind_to_offset, *l_vec_indices; 189 ierr = CeedCalloc(l_size, &ind_to_offset); CeedChkBackend(ierr); 190 ierr = CeedCalloc(num_nodes, &l_vec_indices); CeedChkBackend(ierr); 191 CeedInt j = 0; 192 for (CeedInt i = 0; i < l_size; i++) 193 if (is_node[i]) { 194 l_vec_indices[j] = i; 195 ind_to_offset[i] = j++; 196 } 197 ierr = CeedFree(&is_node); CeedChkBackend(ierr); 198 199 // Compute transpose offsets and indices 200 const CeedInt size_offsets = num_nodes + 1; 201 CeedInt *t_offsets; 202 ierr = CeedCalloc(size_offsets, &t_offsets); CeedChkBackend(ierr); 203 CeedInt *t_indices; 204 ierr = CeedMalloc(size_indices, &t_indices); CeedChkBackend(ierr); 205 // Count node multiplicity 206 for (CeedInt e = 0; e < num_elem; ++e) 207 for (CeedInt i = 0; i < elem_size; ++i) 208 ++t_offsets[ind_to_offset[indices[elem_size*e + i]] + 1]; 209 // Convert to running sum 210 for (CeedInt i = 1; i < size_offsets; ++i) 211 t_offsets[i] += t_offsets[i-1]; 212 // List all E-vec indices associated with L-vec node 213 for (CeedInt e = 0; e < num_elem; ++e) { 214 for (CeedInt i = 0; i < elem_size; ++i) { 215 const CeedInt lid = elem_size*e + i; 216 const CeedInt gid = indices[lid]; 217 t_indices[t_offsets[ind_to_offset[gid]]++] = lid; 218 } 219 } 220 // Reset running sum 221 for (int i = size_offsets - 1; i > 0; --i) 222 t_offsets[i] = t_offsets[i - 1]; 223 t_offsets[0] = 0; 224 225 // Copy data to device 226 // -- L-vector indices 227 ierr = hipMalloc((void **)&impl->d_l_vec_indices, num_nodes*sizeof(CeedInt)); 228 CeedChk_Hip(ceed, ierr); 229 ierr = hipMemcpy(impl->d_l_vec_indices, l_vec_indices, 230 num_nodes*sizeof(CeedInt), hipMemcpyHostToDevice); 231 CeedChk_Hip(ceed, ierr); 232 // -- Transpose offsets 233 ierr = hipMalloc((void **)&impl->d_t_offsets, size_offsets*sizeof(CeedInt)); 234 CeedChk_Hip(ceed, ierr); 235 ierr = hipMemcpy(impl->d_t_offsets, t_offsets, size_offsets*sizeof(CeedInt), 236 hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); 237 // -- Transpose indices 238 ierr = hipMalloc((void **)&impl->d_t_indices, size_indices*sizeof(CeedInt)); 239 CeedChk_Hip(ceed, ierr); 240 ierr = hipMemcpy(impl->d_t_indices, t_indices, size_indices*sizeof(CeedInt), 241 hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); 242 243 // Cleanup 244 ierr = CeedFree(&ind_to_offset); CeedChkBackend(ierr); 245 ierr = CeedFree(&l_vec_indices); CeedChkBackend(ierr); 246 ierr = CeedFree(&t_offsets); CeedChkBackend(ierr); 247 ierr = CeedFree(&t_indices); CeedChkBackend(ierr); 248 249 return CEED_ERROR_SUCCESS; 250 } 251 252 //------------------------------------------------------------------------------ 253 // Create restriction 254 //------------------------------------------------------------------------------ 255 int CeedElemRestrictionCreate_Hip(CeedMemType mtype, CeedCopyMode cmode, 256 const CeedInt *indices, 257 CeedElemRestriction r) { 258 int ierr; 259 Ceed ceed; 260 ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr); 261 CeedElemRestriction_Hip *impl; 262 ierr = CeedCalloc(1, &impl); CeedChkBackend(ierr); 263 CeedInt num_elem, num_comp, elem_size; 264 ierr = CeedElemRestrictionGetNumElements(r, &num_elem); CeedChkBackend(ierr); 265 ierr = CeedElemRestrictionGetNumComponents(r, &num_comp); CeedChkBackend(ierr); 266 ierr = CeedElemRestrictionGetElementSize(r, &elem_size); CeedChkBackend(ierr); 267 CeedInt size = num_elem * elem_size; 268 CeedInt strides[3] = {1, size, elem_size}; 269 CeedInt comp_stride = 1; 270 271 // Stride data 272 bool is_strided; 273 ierr = CeedElemRestrictionIsStrided(r, &is_strided); CeedChkBackend(ierr); 274 if (is_strided) { 275 bool has_backend_strides; 276 ierr = CeedElemRestrictionHasBackendStrides(r, &has_backend_strides); 277 CeedChkBackend(ierr); 278 if (!has_backend_strides) { 279 ierr = CeedElemRestrictionGetStrides(r, &strides); CeedChkBackend(ierr); 280 } 281 } else { 282 ierr = CeedElemRestrictionGetCompStride(r, &comp_stride); CeedChkBackend(ierr); 283 } 284 285 impl->h_ind = NULL; 286 impl->h_ind_allocated = NULL; 287 impl->d_ind = NULL; 288 impl->d_ind_allocated = NULL; 289 impl->d_t_indices = NULL; 290 impl->d_t_offsets = NULL; 291 impl->num_nodes = size; 292 ierr = CeedElemRestrictionSetData(r, impl); CeedChkBackend(ierr); 293 CeedInt layout[3] = {1, elem_size*num_elem, elem_size}; 294 ierr = CeedElemRestrictionSetELayout(r, layout); CeedChkBackend(ierr); 295 296 // Set up device indices/offset arrays 297 if (mtype == CEED_MEM_HOST) { 298 switch (cmode) { 299 case CEED_OWN_POINTER: 300 impl->h_ind_allocated = (CeedInt *)indices; 301 impl->h_ind = (CeedInt *)indices; 302 break; 303 case CEED_USE_POINTER: 304 impl->h_ind = (CeedInt *)indices; 305 break; 306 case CEED_COPY_VALUES: 307 break; 308 } 309 if (indices != NULL) { 310 ierr = hipMalloc( (void **)&impl->d_ind, size * sizeof(CeedInt)); 311 CeedChk_Hip(ceed, ierr); 312 impl->d_ind_allocated = impl->d_ind; // We own the device memory 313 ierr = hipMemcpy(impl->d_ind, indices, size * sizeof(CeedInt), 314 hipMemcpyHostToDevice); 315 CeedChk_Hip(ceed, ierr); 316 ierr = CeedElemRestrictionOffset_Hip(r, indices); CeedChkBackend(ierr); 317 } 318 } else if (mtype == CEED_MEM_DEVICE) { 319 switch (cmode) { 320 case CEED_COPY_VALUES: 321 if (indices != NULL) { 322 ierr = hipMalloc( (void **)&impl->d_ind, size * sizeof(CeedInt)); 323 CeedChk_Hip(ceed, ierr); 324 impl->d_ind_allocated = impl->d_ind; // We own the device memory 325 ierr = hipMemcpy(impl->d_ind, indices, size * sizeof(CeedInt), 326 hipMemcpyDeviceToDevice); 327 CeedChk_Hip(ceed, ierr); 328 } 329 break; 330 case CEED_OWN_POINTER: 331 impl->d_ind = (CeedInt *)indices; 332 impl->d_ind_allocated = impl->d_ind; 333 break; 334 case CEED_USE_POINTER: 335 impl->d_ind = (CeedInt *)indices; 336 } 337 if (indices != NULL) { 338 ierr = CeedElemRestrictionOffset_Hip(r, indices); CeedChkBackend(ierr); 339 } 340 } else { 341 // LCOV_EXCL_START 342 return CeedError(ceed, CEED_ERROR_BACKEND, 343 "Only MemType = HOST or DEVICE supported"); 344 // LCOV_EXCL_STOP 345 } 346 347 // Compile HIP kernels 348 CeedInt num_nodes = impl->num_nodes; 349 char *restriction_kernel_path, *restriction_kernel_source; 350 ierr = CeedPathConcatenate(ceed, __FILE__, "kernels/hip-ref-restriction.h", 351 &restriction_kernel_path); CeedChkBackend(ierr); 352 CeedDebug256(ceed, 2, "----- Loading Restriction Kernel Source -----\n"); 353 ierr = CeedLoadSourceToBuffer(ceed, restriction_kernel_path, 354 &restriction_kernel_source); 355 CeedChkBackend(ierr); 356 CeedDebug256(ceed, 2, 357 "----- Loading Restriction Kernel Source Complete! -----\n"); 358 ierr = CeedCompileHip(ceed, restriction_kernel_source, &impl->module, 8, 359 "RESTR_ELEM_SIZE", elem_size, 360 "RESTR_NUM_ELEM", num_elem, 361 "RESTR_NUM_COMP", num_comp, 362 "RESTR_NUM_NODES", num_nodes, 363 "RESTR_COMP_STRIDE", comp_stride, 364 "RESTR_STRIDE_NODES", strides[0], 365 "RESTR_STRIDE_COMP", strides[1], 366 "RESTR_STRIDE_ELEM", strides[2]); CeedChkBackend(ierr); 367 ierr = CeedGetKernelHip(ceed, impl->module, "StridedNoTranspose", 368 &impl->StridedNoTranspose); CeedChkBackend(ierr); 369 ierr = CeedGetKernelHip(ceed, impl->module, "OffsetNoTranspose", 370 &impl->OffsetNoTranspose); CeedChkBackend(ierr); 371 ierr = CeedGetKernelHip(ceed, impl->module, "StridedTranspose", 372 &impl->StridedTranspose); CeedChkBackend(ierr); 373 ierr = CeedGetKernelHip(ceed, impl->module, "OffsetTranspose", 374 &impl->OffsetTranspose); CeedChkBackend(ierr); 375 ierr = CeedFree(&restriction_kernel_path); CeedChkBackend(ierr); 376 ierr = CeedFree(&restriction_kernel_source); CeedChkBackend(ierr); 377 378 // Register backend functions 379 ierr = CeedSetBackendFunction(ceed, "ElemRestriction", r, "Apply", 380 CeedElemRestrictionApply_Hip); 381 CeedChkBackend(ierr); 382 ierr = CeedSetBackendFunction(ceed, "ElemRestriction", r, "ApplyBlock", 383 CeedElemRestrictionApplyBlock_Hip); 384 CeedChkBackend(ierr); 385 ierr = CeedSetBackendFunction(ceed, "ElemRestriction", r, "GetOffsets", 386 CeedElemRestrictionGetOffsets_Hip); 387 CeedChkBackend(ierr); 388 ierr = CeedSetBackendFunction(ceed, "ElemRestriction", r, "Destroy", 389 CeedElemRestrictionDestroy_Hip); 390 CeedChkBackend(ierr); 391 return CEED_ERROR_SUCCESS; 392 } 393 394 //------------------------------------------------------------------------------ 395 // Blocked not supported 396 //------------------------------------------------------------------------------ 397 int CeedElemRestrictionCreateBlocked_Hip(const CeedMemType mtype, 398 const CeedCopyMode cmode, const CeedInt *indices, CeedElemRestriction r) { 399 int ierr; 400 Ceed ceed; 401 ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr); 402 return CeedError(ceed, CEED_ERROR_BACKEND, 403 "Backend does not implement blocked restrictions"); 404 } 405 //------------------------------------------------------------------------------ 406