10d0321e0SJeremy L Thompson // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 20d0321e0SJeremy L Thompson // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 30d0321e0SJeremy L Thompson // All Rights reserved. See files LICENSE and NOTICE for details. 40d0321e0SJeremy L Thompson // 50d0321e0SJeremy L Thompson // This file is part of CEED, a collection of benchmarks, miniapps, software 60d0321e0SJeremy L Thompson // libraries and APIs for efficient high-order finite element and spectral 70d0321e0SJeremy L Thompson // element discretizations for exascale applications. For more information and 80d0321e0SJeremy L Thompson // source code availability see http://github.com/ceed. 90d0321e0SJeremy L Thompson // 100d0321e0SJeremy L Thompson // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 110d0321e0SJeremy L Thompson // a collaborative effort of two U.S. Department of Energy organizations (Office 120d0321e0SJeremy L Thompson // of Science and the National Nuclear Security Administration) responsible for 130d0321e0SJeremy L Thompson // the planning and preparation of a capable exascale ecosystem, including 140d0321e0SJeremy L Thompson // software, applications, hardware, advanced system engineering and early 150d0321e0SJeremy L Thompson // testbed platforms, in support of the nation's exascale computing imperative. 160d0321e0SJeremy L Thompson 170d0321e0SJeremy L Thompson #include <ceed/ceed.h> 180d0321e0SJeremy L Thompson #include <ceed/backend.h> 19437930d1SJeremy L Thompson #include <ceed/jit-tools.h> 200d0321e0SJeremy L Thompson #include <hip/hip_runtime.h> 210d0321e0SJeremy L Thompson #include <stdbool.h> 220d0321e0SJeremy L Thompson #include <stddef.h> 230d0321e0SJeremy L Thompson #include "ceed-hip-ref.h" 240d0321e0SJeremy L Thompson #include "../hip/ceed-hip-compile.h" 250d0321e0SJeremy L Thompson 260d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 270d0321e0SJeremy L Thompson // Apply restriction 280d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 290d0321e0SJeremy L Thompson static int CeedElemRestrictionApply_Hip(CeedElemRestriction r, 3046dc0734SJeremy L Thompson CeedTransposeMode t_mode, CeedVector u, 3146dc0734SJeremy L Thompson CeedVector v, CeedRequest *request) { 320d0321e0SJeremy L Thompson int ierr; 330d0321e0SJeremy L Thompson CeedElemRestriction_Hip *impl; 340d0321e0SJeremy L Thompson ierr = CeedElemRestrictionGetData(r, &impl); CeedChkBackend(ierr); 350d0321e0SJeremy L Thompson Ceed ceed; 360d0321e0SJeremy L Thompson ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr); 370d0321e0SJeremy L Thompson Ceed_Hip *data; 380d0321e0SJeremy L Thompson ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr); 39437930d1SJeremy L Thompson const CeedInt block_size = 64; 40437930d1SJeremy L Thompson const CeedInt num_nodes = impl->num_nodes; 41437930d1SJeremy L Thompson CeedInt num_elem, elem_size; 42437930d1SJeremy L Thompson CeedElemRestrictionGetNumElements(r, &num_elem); 43437930d1SJeremy L Thompson ierr = CeedElemRestrictionGetElementSize(r, &elem_size); CeedChkBackend(ierr); 440d0321e0SJeremy L Thompson hipFunction_t kernel; 450d0321e0SJeremy L Thompson 460d0321e0SJeremy L Thompson // Get vectors 470d0321e0SJeremy L Thompson const CeedScalar *d_u; 480d0321e0SJeremy L Thompson CeedScalar *d_v; 490d0321e0SJeremy L Thompson ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChkBackend(ierr); 50437930d1SJeremy L Thompson if (t_mode == CEED_TRANSPOSE) { 510d0321e0SJeremy L Thompson // Sum into for transpose mode, e-vec to l-vec 520d0321e0SJeremy L Thompson ierr = CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr); 530d0321e0SJeremy L Thompson } else { 540d0321e0SJeremy L Thompson // Overwrite for notranspose mode, l-vec to e-vec 550d0321e0SJeremy L Thompson ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr); 560d0321e0SJeremy L Thompson } 570d0321e0SJeremy L Thompson 580d0321e0SJeremy L Thompson // Restrict 59437930d1SJeremy L Thompson if (t_mode == CEED_NOTRANSPOSE) { 600d0321e0SJeremy L Thompson // L-vector -> E-vector 610d0321e0SJeremy L Thompson if (impl->d_ind) { 620d0321e0SJeremy L Thompson // -- Offsets provided 63437930d1SJeremy L Thompson kernel = impl->OffsetNoTranspose; 64437930d1SJeremy L Thompson void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; 65437930d1SJeremy L Thompson CeedInt block_size = elem_size < 256 ? (elem_size > 64 ? elem_size : 64) : 256; 66437930d1SJeremy L Thompson ierr = CeedRunKernelHip(ceed, kernel, CeedDivUpInt(num_nodes, block_size), 67437930d1SJeremy L Thompson block_size, args); CeedChkBackend(ierr); 680d0321e0SJeremy L Thompson } else { 690d0321e0SJeremy L Thompson // -- Strided restriction 70437930d1SJeremy L Thompson kernel = impl->StridedNoTranspose; 71437930d1SJeremy L Thompson void *args[] = {&num_elem, &d_u, &d_v}; 72437930d1SJeremy L Thompson CeedInt block_size = elem_size < 256 ? (elem_size > 64 ? elem_size : 64) : 256; 73437930d1SJeremy L Thompson ierr = CeedRunKernelHip(ceed, kernel, CeedDivUpInt(num_nodes, block_size), 74437930d1SJeremy L Thompson block_size, args); CeedChkBackend(ierr); 750d0321e0SJeremy L Thompson } 760d0321e0SJeremy L Thompson } else { 770d0321e0SJeremy L Thompson // E-vector -> L-vector 780d0321e0SJeremy L Thompson if (impl->d_ind) { 790d0321e0SJeremy L Thompson // -- Offsets provided 80437930d1SJeremy L Thompson kernel = impl->OffsetTranspose; 81437930d1SJeremy L Thompson void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, 82437930d1SJeremy L Thompson &impl->d_t_offsets, &d_u, &d_v 830d0321e0SJeremy L Thompson }; 84437930d1SJeremy L Thompson ierr = CeedRunKernelHip(ceed, kernel, CeedDivUpInt(num_nodes, block_size), 85437930d1SJeremy L Thompson block_size, args); CeedChkBackend(ierr); 860d0321e0SJeremy L Thompson } else { 870d0321e0SJeremy L Thompson // -- Strided restriction 88437930d1SJeremy L Thompson kernel = impl->StridedTranspose; 89437930d1SJeremy L Thompson void *args[] = {&num_elem, &d_u, &d_v}; 90437930d1SJeremy L Thompson ierr = CeedRunKernelHip(ceed, kernel, CeedDivUpInt(num_nodes, block_size), 91437930d1SJeremy L Thompson block_size, args); CeedChkBackend(ierr); 920d0321e0SJeremy L Thompson } 930d0321e0SJeremy L Thompson } 940d0321e0SJeremy L Thompson 950d0321e0SJeremy L Thompson if (request != CEED_REQUEST_IMMEDIATE && request != CEED_REQUEST_ORDERED) 960d0321e0SJeremy L Thompson *request = NULL; 970d0321e0SJeremy L Thompson 980d0321e0SJeremy L Thompson // Restore arrays 990d0321e0SJeremy L Thompson ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr); 1000d0321e0SJeremy L Thompson ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr); 1010d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 1020d0321e0SJeremy L Thompson } 1030d0321e0SJeremy L Thompson 1040d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1050d0321e0SJeremy L Thompson // Blocked not supported 1060d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1070d0321e0SJeremy L Thompson int CeedElemRestrictionApplyBlock_Hip(CeedElemRestriction r, CeedInt block, 108437930d1SJeremy L Thompson CeedTransposeMode t_mode, CeedVector u, 1090d0321e0SJeremy L Thompson CeedVector v, CeedRequest *request) { 1100d0321e0SJeremy L Thompson // LCOV_EXCL_START 1110d0321e0SJeremy L Thompson int ierr; 1120d0321e0SJeremy L Thompson Ceed ceed; 1130d0321e0SJeremy L Thompson ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr); 1140d0321e0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, 1150d0321e0SJeremy L Thompson "Backend does not implement blocked restrictions"); 1160d0321e0SJeremy L Thompson // LCOV_EXCL_STOP 1170d0321e0SJeremy L Thompson } 1180d0321e0SJeremy L Thompson 1190d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1200d0321e0SJeremy L Thompson // Get offsets 1210d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1220d0321e0SJeremy L Thompson static int CeedElemRestrictionGetOffsets_Hip(CeedElemRestriction rstr, 1230d0321e0SJeremy L Thompson CeedMemType mtype, const CeedInt **offsets) { 1240d0321e0SJeremy L Thompson int ierr; 1250d0321e0SJeremy L Thompson CeedElemRestriction_Hip *impl; 1260d0321e0SJeremy L Thompson ierr = CeedElemRestrictionGetData(rstr, &impl); CeedChkBackend(ierr); 1270d0321e0SJeremy L Thompson 1280d0321e0SJeremy L Thompson switch (mtype) { 1290d0321e0SJeremy L Thompson case CEED_MEM_HOST: 1300d0321e0SJeremy L Thompson *offsets = impl->h_ind; 1310d0321e0SJeremy L Thompson break; 1320d0321e0SJeremy L Thompson case CEED_MEM_DEVICE: 1330d0321e0SJeremy L Thompson *offsets = impl->d_ind; 1340d0321e0SJeremy L Thompson break; 1350d0321e0SJeremy L Thompson } 1360d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 1370d0321e0SJeremy L Thompson } 1380d0321e0SJeremy L Thompson 1390d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1400d0321e0SJeremy L Thompson // Destroy restriction 1410d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1420d0321e0SJeremy L Thompson static int CeedElemRestrictionDestroy_Hip(CeedElemRestriction r) { 1430d0321e0SJeremy L Thompson int ierr; 1440d0321e0SJeremy L Thompson CeedElemRestriction_Hip *impl; 1450d0321e0SJeremy L Thompson ierr = CeedElemRestrictionGetData(r, &impl); CeedChkBackend(ierr); 1460d0321e0SJeremy L Thompson 1470d0321e0SJeremy L Thompson Ceed ceed; 1480d0321e0SJeremy L Thompson ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr); 1490d0321e0SJeremy L Thompson ierr = hipModuleUnload(impl->module); CeedChk_Hip(ceed, ierr); 1500d0321e0SJeremy L Thompson ierr = CeedFree(&impl->h_ind_allocated); CeedChkBackend(ierr); 1510d0321e0SJeremy L Thompson ierr = hipFree(impl->d_ind_allocated); CeedChk_Hip(ceed, ierr); 152437930d1SJeremy L Thompson ierr = hipFree(impl->d_t_offsets); CeedChk_Hip(ceed, ierr); 153437930d1SJeremy L Thompson ierr = hipFree(impl->d_t_indices); CeedChk_Hip(ceed, ierr); 154437930d1SJeremy L Thompson ierr = hipFree(impl->d_l_vec_indices); CeedChk_Hip(ceed, ierr); 1550d0321e0SJeremy L Thompson ierr = CeedFree(&impl); CeedChkBackend(ierr); 156437930d1SJeremy L Thompson 1570d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 1580d0321e0SJeremy L Thompson } 1590d0321e0SJeremy L Thompson 1600d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1610d0321e0SJeremy L Thompson // Create transpose offsets and indices 1620d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1630d0321e0SJeremy L Thompson static int CeedElemRestrictionOffset_Hip(const CeedElemRestriction r, 1640d0321e0SJeremy L Thompson const CeedInt *indices) { 1650d0321e0SJeremy L Thompson int ierr; 1660d0321e0SJeremy L Thompson Ceed ceed; 1670d0321e0SJeremy L Thompson ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr); 1680d0321e0SJeremy L Thompson CeedElemRestriction_Hip *impl; 1690d0321e0SJeremy L Thompson ierr = CeedElemRestrictionGetData(r, &impl); CeedChkBackend(ierr); 170437930d1SJeremy L Thompson CeedInt num_elem, elem_size, l_size, num_comp; 171437930d1SJeremy L Thompson ierr = CeedElemRestrictionGetNumElements(r, &num_elem); CeedChkBackend(ierr); 172437930d1SJeremy L Thompson ierr = CeedElemRestrictionGetElementSize(r, &elem_size); CeedChkBackend(ierr); 173437930d1SJeremy L Thompson ierr = CeedElemRestrictionGetLVectorSize(r, &l_size); CeedChkBackend(ierr); 174437930d1SJeremy L Thompson ierr = CeedElemRestrictionGetNumComponents(r, &num_comp); CeedChkBackend(ierr); 1750d0321e0SJeremy L Thompson 176437930d1SJeremy L Thompson // Count num_nodes 177437930d1SJeremy L Thompson bool *is_node; 178437930d1SJeremy L Thompson ierr = CeedCalloc(l_size, &is_node); CeedChkBackend(ierr); 179437930d1SJeremy L Thompson const CeedInt size_indices = num_elem * elem_size; 180437930d1SJeremy L Thompson for (CeedInt i = 0; i < size_indices; i++) 181437930d1SJeremy L Thompson is_node[indices[i]] = 1; 182437930d1SJeremy L Thompson CeedInt num_nodes = 0; 183437930d1SJeremy L Thompson for (CeedInt i = 0; i < l_size; i++) 184437930d1SJeremy L Thompson num_nodes += is_node[i]; 185437930d1SJeremy L Thompson impl->num_nodes = num_nodes; 1860d0321e0SJeremy L Thompson 1870d0321e0SJeremy L Thompson // L-vector offsets array 188437930d1SJeremy L Thompson CeedInt *ind_to_offset, *l_vec_indices; 189437930d1SJeremy L Thompson ierr = CeedCalloc(l_size, &ind_to_offset); CeedChkBackend(ierr); 190437930d1SJeremy L Thompson ierr = CeedCalloc(num_nodes, &l_vec_indices); CeedChkBackend(ierr); 1910d0321e0SJeremy L Thompson CeedInt j = 0; 192437930d1SJeremy L Thompson for (CeedInt i = 0; i < l_size; i++) 193437930d1SJeremy L Thompson if (is_node[i]) { 194437930d1SJeremy L Thompson l_vec_indices[j] = i; 1950d0321e0SJeremy L Thompson ind_to_offset[i] = j++; 1960d0321e0SJeremy L Thompson } 197437930d1SJeremy L Thompson ierr = CeedFree(&is_node); CeedChkBackend(ierr); 1980d0321e0SJeremy L Thompson 1990d0321e0SJeremy L Thompson // Compute transpose offsets and indices 200437930d1SJeremy L Thompson const CeedInt size_offsets = num_nodes + 1; 201437930d1SJeremy L Thompson CeedInt *t_offsets; 202437930d1SJeremy L Thompson ierr = CeedCalloc(size_offsets, &t_offsets); CeedChkBackend(ierr); 203437930d1SJeremy L Thompson CeedInt *t_indices; 204437930d1SJeremy L Thompson ierr = CeedMalloc(size_indices, &t_indices); CeedChkBackend(ierr); 2050d0321e0SJeremy L Thompson // Count node multiplicity 206437930d1SJeremy L Thompson for (CeedInt e = 0; e < num_elem; ++e) 207437930d1SJeremy L Thompson for (CeedInt i = 0; i < elem_size; ++i) 208437930d1SJeremy L Thompson ++t_offsets[ind_to_offset[indices[elem_size*e + i]] + 1]; 2090d0321e0SJeremy L Thompson // Convert to running sum 210437930d1SJeremy L Thompson for (CeedInt i = 1; i < size_offsets; ++i) 211437930d1SJeremy L Thompson t_offsets[i] += t_offsets[i-1]; 2120d0321e0SJeremy L Thompson // List all E-vec indices associated with L-vec node 213437930d1SJeremy L Thompson for (CeedInt e = 0; e < num_elem; ++e) { 214437930d1SJeremy L Thompson for (CeedInt i = 0; i < elem_size; ++i) { 215437930d1SJeremy L Thompson const CeedInt lid = elem_size*e + i; 2160d0321e0SJeremy L Thompson const CeedInt gid = indices[lid]; 217437930d1SJeremy L Thompson t_indices[t_offsets[ind_to_offset[gid]]++] = lid; 2180d0321e0SJeremy L Thompson } 2190d0321e0SJeremy L Thompson } 2200d0321e0SJeremy L Thompson // Reset running sum 221437930d1SJeremy L Thompson for (int i = size_offsets - 1; i > 0; --i) 222437930d1SJeremy L Thompson t_offsets[i] = t_offsets[i - 1]; 223437930d1SJeremy L Thompson t_offsets[0] = 0; 2240d0321e0SJeremy L Thompson 2250d0321e0SJeremy L Thompson // Copy data to device 2260d0321e0SJeremy L Thompson // -- L-vector indices 227437930d1SJeremy L Thompson ierr = hipMalloc((void **)&impl->d_l_vec_indices, num_nodes*sizeof(CeedInt)); 2280d0321e0SJeremy L Thompson CeedChk_Hip(ceed, ierr); 229437930d1SJeremy L Thompson ierr = hipMemcpy(impl->d_l_vec_indices, l_vec_indices, 230437930d1SJeremy L Thompson num_nodes*sizeof(CeedInt), hipMemcpyHostToDevice); 2310d0321e0SJeremy L Thompson CeedChk_Hip(ceed, ierr); 2320d0321e0SJeremy L Thompson // -- Transpose offsets 233437930d1SJeremy L Thompson ierr = hipMalloc((void **)&impl->d_t_offsets, size_offsets*sizeof(CeedInt)); 2340d0321e0SJeremy L Thompson CeedChk_Hip(ceed, ierr); 235437930d1SJeremy L Thompson ierr = hipMemcpy(impl->d_t_offsets, t_offsets, size_offsets*sizeof(CeedInt), 2360d0321e0SJeremy L Thompson hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); 2370d0321e0SJeremy L Thompson // -- Transpose indices 238437930d1SJeremy L Thompson ierr = hipMalloc((void **)&impl->d_t_indices, size_indices*sizeof(CeedInt)); 2390d0321e0SJeremy L Thompson CeedChk_Hip(ceed, ierr); 240437930d1SJeremy L Thompson ierr = hipMemcpy(impl->d_t_indices, t_indices, size_indices*sizeof(CeedInt), 2410d0321e0SJeremy L Thompson hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); 2420d0321e0SJeremy L Thompson 2430d0321e0SJeremy L Thompson // Cleanup 2440d0321e0SJeremy L Thompson ierr = CeedFree(&ind_to_offset); CeedChkBackend(ierr); 245437930d1SJeremy L Thompson ierr = CeedFree(&l_vec_indices); CeedChkBackend(ierr); 246437930d1SJeremy L Thompson ierr = CeedFree(&t_offsets); CeedChkBackend(ierr); 247437930d1SJeremy L Thompson ierr = CeedFree(&t_indices); CeedChkBackend(ierr); 248437930d1SJeremy L Thompson 2490d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 2500d0321e0SJeremy L Thompson } 2510d0321e0SJeremy L Thompson 2520d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 2530d0321e0SJeremy L Thompson // Create restriction 2540d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 2550d0321e0SJeremy L Thompson int CeedElemRestrictionCreate_Hip(CeedMemType mtype, CeedCopyMode cmode, 2560d0321e0SJeremy L Thompson const CeedInt *indices, 2570d0321e0SJeremy L Thompson CeedElemRestriction r) { 2580d0321e0SJeremy L Thompson int ierr; 2590d0321e0SJeremy L Thompson Ceed ceed; 2600d0321e0SJeremy L Thompson ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr); 2610d0321e0SJeremy L Thompson CeedElemRestriction_Hip *impl; 2620d0321e0SJeremy L Thompson ierr = CeedCalloc(1, &impl); CeedChkBackend(ierr); 263437930d1SJeremy L Thompson CeedInt num_elem, num_comp, elem_size; 264437930d1SJeremy L Thompson ierr = CeedElemRestrictionGetNumElements(r, &num_elem); CeedChkBackend(ierr); 265437930d1SJeremy L Thompson ierr = CeedElemRestrictionGetNumComponents(r, &num_comp); CeedChkBackend(ierr); 266437930d1SJeremy L Thompson ierr = CeedElemRestrictionGetElementSize(r, &elem_size); CeedChkBackend(ierr); 267437930d1SJeremy L Thompson CeedInt size = num_elem * elem_size; 268437930d1SJeremy L Thompson CeedInt strides[3] = {1, size, elem_size}; 269437930d1SJeremy L Thompson CeedInt comp_stride = 1; 2700d0321e0SJeremy L Thompson 2710d0321e0SJeremy L Thompson // Stride data 272437930d1SJeremy L Thompson bool is_strided; 273437930d1SJeremy L Thompson ierr = CeedElemRestrictionIsStrided(r, &is_strided); CeedChkBackend(ierr); 274437930d1SJeremy L Thompson if (is_strided) { 275437930d1SJeremy L Thompson bool has_backend_strides; 276437930d1SJeremy L Thompson ierr = CeedElemRestrictionHasBackendStrides(r, &has_backend_strides); 2770d0321e0SJeremy L Thompson CeedChkBackend(ierr); 278437930d1SJeremy L Thompson if (!has_backend_strides) { 2790d0321e0SJeremy L Thompson ierr = CeedElemRestrictionGetStrides(r, &strides); CeedChkBackend(ierr); 2800d0321e0SJeremy L Thompson } 2810d0321e0SJeremy L Thompson } else { 282437930d1SJeremy L Thompson ierr = CeedElemRestrictionGetCompStride(r, &comp_stride); CeedChkBackend(ierr); 2830d0321e0SJeremy L Thompson } 2840d0321e0SJeremy L Thompson 2850d0321e0SJeremy L Thompson impl->h_ind = NULL; 2860d0321e0SJeremy L Thompson impl->h_ind_allocated = NULL; 2870d0321e0SJeremy L Thompson impl->d_ind = NULL; 2880d0321e0SJeremy L Thompson impl->d_ind_allocated = NULL; 289437930d1SJeremy L Thompson impl->d_t_indices = NULL; 290437930d1SJeremy L Thompson impl->d_t_offsets = NULL; 291437930d1SJeremy L Thompson impl->num_nodes = size; 2920d0321e0SJeremy L Thompson ierr = CeedElemRestrictionSetData(r, impl); CeedChkBackend(ierr); 293437930d1SJeremy L Thompson CeedInt layout[3] = {1, elem_size*num_elem, elem_size}; 2940d0321e0SJeremy L Thompson ierr = CeedElemRestrictionSetELayout(r, layout); CeedChkBackend(ierr); 2950d0321e0SJeremy L Thompson 2960d0321e0SJeremy L Thompson // Set up device indices/offset arrays 2970d0321e0SJeremy L Thompson if (mtype == CEED_MEM_HOST) { 2980d0321e0SJeremy L Thompson switch (cmode) { 2990d0321e0SJeremy L Thompson case CEED_OWN_POINTER: 3000d0321e0SJeremy L Thompson impl->h_ind_allocated = (CeedInt *)indices; 3010d0321e0SJeremy L Thompson impl->h_ind = (CeedInt *)indices; 3020d0321e0SJeremy L Thompson break; 3030d0321e0SJeremy L Thompson case CEED_USE_POINTER: 3040d0321e0SJeremy L Thompson impl->h_ind = (CeedInt *)indices; 3050d0321e0SJeremy L Thompson break; 3060d0321e0SJeremy L Thompson case CEED_COPY_VALUES: 3070d0321e0SJeremy L Thompson break; 3080d0321e0SJeremy L Thompson } 3090d0321e0SJeremy L Thompson if (indices != NULL) { 3100d0321e0SJeremy L Thompson ierr = hipMalloc( (void **)&impl->d_ind, size * sizeof(CeedInt)); 3110d0321e0SJeremy L Thompson CeedChk_Hip(ceed, ierr); 3120d0321e0SJeremy L Thompson impl->d_ind_allocated = impl->d_ind; // We own the device memory 3130d0321e0SJeremy L Thompson ierr = hipMemcpy(impl->d_ind, indices, size * sizeof(CeedInt), 3140d0321e0SJeremy L Thompson hipMemcpyHostToDevice); 3150d0321e0SJeremy L Thompson CeedChk_Hip(ceed, ierr); 3160d0321e0SJeremy L Thompson ierr = CeedElemRestrictionOffset_Hip(r, indices); CeedChkBackend(ierr); 3170d0321e0SJeremy L Thompson } 3180d0321e0SJeremy L Thompson } else if (mtype == CEED_MEM_DEVICE) { 3190d0321e0SJeremy L Thompson switch (cmode) { 3200d0321e0SJeremy L Thompson case CEED_COPY_VALUES: 3210d0321e0SJeremy L Thompson if (indices != NULL) { 3220d0321e0SJeremy L Thompson ierr = hipMalloc( (void **)&impl->d_ind, size * sizeof(CeedInt)); 3230d0321e0SJeremy L Thompson CeedChk_Hip(ceed, ierr); 3240d0321e0SJeremy L Thompson impl->d_ind_allocated = impl->d_ind; // We own the device memory 3250d0321e0SJeremy L Thompson ierr = hipMemcpy(impl->d_ind, indices, size * sizeof(CeedInt), 3260d0321e0SJeremy L Thompson hipMemcpyDeviceToDevice); 3270d0321e0SJeremy L Thompson CeedChk_Hip(ceed, ierr); 3280d0321e0SJeremy L Thompson } 3290d0321e0SJeremy L Thompson break; 3300d0321e0SJeremy L Thompson case CEED_OWN_POINTER: 3310d0321e0SJeremy L Thompson impl->d_ind = (CeedInt *)indices; 3320d0321e0SJeremy L Thompson impl->d_ind_allocated = impl->d_ind; 3330d0321e0SJeremy L Thompson break; 3340d0321e0SJeremy L Thompson case CEED_USE_POINTER: 3350d0321e0SJeremy L Thompson impl->d_ind = (CeedInt *)indices; 3360d0321e0SJeremy L Thompson } 3370d0321e0SJeremy L Thompson if (indices != NULL) { 3380d0321e0SJeremy L Thompson ierr = CeedElemRestrictionOffset_Hip(r, indices); CeedChkBackend(ierr); 3390d0321e0SJeremy L Thompson } 3400d0321e0SJeremy L Thompson } else { 3410d0321e0SJeremy L Thompson // LCOV_EXCL_START 3420d0321e0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, 3430d0321e0SJeremy L Thompson "Only MemType = HOST or DEVICE supported"); 3440d0321e0SJeremy L Thompson // LCOV_EXCL_STOP 3450d0321e0SJeremy L Thompson } 3460d0321e0SJeremy L Thompson 3470d0321e0SJeremy L Thompson // Compile HIP kernels 348437930d1SJeremy L Thompson CeedInt num_nodes = impl->num_nodes; 349437930d1SJeremy L Thompson char *restriction_kernel_path, *restriction_kernel_source; 350437930d1SJeremy L Thompson ierr = CeedPathConcatenate(ceed, __FILE__, "kernels/hip-ref-restriction.h", 351437930d1SJeremy L Thompson &restriction_kernel_path); CeedChkBackend(ierr); 35246dc0734SJeremy L Thompson CeedDebug256(ceed, 2, "----- Loading Restriction Kernel Source -----\n"); 353437930d1SJeremy L Thompson ierr = CeedLoadSourceToBuffer(ceed, restriction_kernel_path, 354437930d1SJeremy L Thompson &restriction_kernel_source); 355437930d1SJeremy L Thompson CeedChkBackend(ierr); 35646dc0734SJeremy L Thompson CeedDebug256(ceed, 2, 35746dc0734SJeremy L Thompson "----- Loading Restriction Kernel Source Complete! -----\n"); 358437930d1SJeremy L Thompson ierr = CeedCompileHip(ceed, restriction_kernel_source, &impl->module, 8, 359*d7d111ecSJeremy L Thompson "RESTR_ELEM_SIZE", elem_size, 360*d7d111ecSJeremy L Thompson "RESTR_NUM_ELEM", num_elem, 361*d7d111ecSJeremy L Thompson "RESTR_NUM_COMP", num_comp, 362*d7d111ecSJeremy L Thompson "RESTR_NUM_NODES", num_nodes, 363*d7d111ecSJeremy L Thompson "RESTR_COMP_STRIDE", comp_stride, 364*d7d111ecSJeremy L Thompson "RESTR_STRIDE_NODES", strides[0], 365*d7d111ecSJeremy L Thompson "RESTR_STRIDE_COMP", strides[1], 366*d7d111ecSJeremy L Thompson "RESTR_STRIDE_ELEM", strides[2]); CeedChkBackend(ierr); 367437930d1SJeremy L Thompson ierr = CeedGetKernelHip(ceed, impl->module, "StridedNoTranspose", 368437930d1SJeremy L Thompson &impl->StridedNoTranspose); CeedChkBackend(ierr); 369437930d1SJeremy L Thompson ierr = CeedGetKernelHip(ceed, impl->module, "OffsetNoTranspose", 370437930d1SJeremy L Thompson &impl->OffsetNoTranspose); CeedChkBackend(ierr); 371437930d1SJeremy L Thompson ierr = CeedGetKernelHip(ceed, impl->module, "StridedTranspose", 372437930d1SJeremy L Thompson &impl->StridedTranspose); CeedChkBackend(ierr); 373437930d1SJeremy L Thompson ierr = CeedGetKernelHip(ceed, impl->module, "OffsetTranspose", 374437930d1SJeremy L Thompson &impl->OffsetTranspose); CeedChkBackend(ierr); 375437930d1SJeremy L Thompson ierr = CeedFree(&restriction_kernel_path); CeedChkBackend(ierr); 376437930d1SJeremy L Thompson ierr = CeedFree(&restriction_kernel_source); CeedChkBackend(ierr); 3770d0321e0SJeremy L Thompson 3780d0321e0SJeremy L Thompson // Register backend functions 3790d0321e0SJeremy L Thompson ierr = CeedSetBackendFunction(ceed, "ElemRestriction", r, "Apply", 3800d0321e0SJeremy L Thompson CeedElemRestrictionApply_Hip); 3810d0321e0SJeremy L Thompson CeedChkBackend(ierr); 3820d0321e0SJeremy L Thompson ierr = CeedSetBackendFunction(ceed, "ElemRestriction", r, "ApplyBlock", 3830d0321e0SJeremy L Thompson CeedElemRestrictionApplyBlock_Hip); 3840d0321e0SJeremy L Thompson CeedChkBackend(ierr); 3850d0321e0SJeremy L Thompson ierr = CeedSetBackendFunction(ceed, "ElemRestriction", r, "GetOffsets", 3860d0321e0SJeremy L Thompson CeedElemRestrictionGetOffsets_Hip); 3870d0321e0SJeremy L Thompson CeedChkBackend(ierr); 3880d0321e0SJeremy L Thompson ierr = CeedSetBackendFunction(ceed, "ElemRestriction", r, "Destroy", 3890d0321e0SJeremy L Thompson CeedElemRestrictionDestroy_Hip); 3900d0321e0SJeremy L Thompson CeedChkBackend(ierr); 3910d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 3920d0321e0SJeremy L Thompson } 3930d0321e0SJeremy L Thompson 3940d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 3950d0321e0SJeremy L Thompson // Blocked not supported 3960d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 3970d0321e0SJeremy L Thompson int CeedElemRestrictionCreateBlocked_Hip(const CeedMemType mtype, 3980d0321e0SJeremy L Thompson const CeedCopyMode cmode, const CeedInt *indices, CeedElemRestriction r) { 3990d0321e0SJeremy L Thompson int ierr; 4000d0321e0SJeremy L Thompson Ceed ceed; 4010d0321e0SJeremy L Thompson ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr); 4020d0321e0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, 4030d0321e0SJeremy L Thompson "Backend does not implement blocked restrictions"); 4040d0321e0SJeremy L Thompson } 4050d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 406