10d0321e0SJeremy L Thompson // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 20d0321e0SJeremy L Thompson // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 30d0321e0SJeremy L Thompson // All Rights reserved. See files LICENSE and NOTICE for details. 40d0321e0SJeremy L Thompson // 50d0321e0SJeremy L Thompson // This file is part of CEED, a collection of benchmarks, miniapps, software 60d0321e0SJeremy L Thompson // libraries and APIs for efficient high-order finite element and spectral 70d0321e0SJeremy L Thompson // element discretizations for exascale applications. For more information and 80d0321e0SJeremy L Thompson // source code availability see http://github.com/ceed. 90d0321e0SJeremy L Thompson // 100d0321e0SJeremy L Thompson // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 110d0321e0SJeremy L Thompson // a collaborative effort of two U.S. Department of Energy organizations (Office 120d0321e0SJeremy L Thompson // of Science and the National Nuclear Security Administration) responsible for 130d0321e0SJeremy L Thompson // the planning and preparation of a capable exascale ecosystem, including 140d0321e0SJeremy L Thompson // software, applications, hardware, advanced system engineering and early 150d0321e0SJeremy L Thompson // testbed platforms, in support of the nation's exascale computing imperative. 160d0321e0SJeremy L Thompson 170d0321e0SJeremy L Thompson #include <ceed/ceed.h> 180d0321e0SJeremy L Thompson #include <ceed/backend.h> 19*437930d1SJeremy L Thompson #include <ceed/jit-tools.h> 200d0321e0SJeremy L Thompson #include <hip/hip_runtime.h> 210d0321e0SJeremy L Thompson #include <stdbool.h> 220d0321e0SJeremy L Thompson #include <stddef.h> 230d0321e0SJeremy L Thompson #include "ceed-hip-ref.h" 240d0321e0SJeremy L Thompson #include "../hip/ceed-hip-compile.h" 250d0321e0SJeremy L Thompson 260d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 270d0321e0SJeremy L Thompson // Apply restriction 280d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 290d0321e0SJeremy L Thompson static int CeedElemRestrictionApply_Hip(CeedElemRestriction r, 30*437930d1SJeremy L Thompson CeedTransposeMode t_mode, CeedVector u, CeedVector v, CeedRequest *request) { 310d0321e0SJeremy L Thompson int ierr; 320d0321e0SJeremy L Thompson CeedElemRestriction_Hip *impl; 330d0321e0SJeremy L Thompson ierr = CeedElemRestrictionGetData(r, &impl); CeedChkBackend(ierr); 340d0321e0SJeremy L Thompson Ceed ceed; 350d0321e0SJeremy L Thompson ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr); 360d0321e0SJeremy L Thompson Ceed_Hip *data; 370d0321e0SJeremy L Thompson ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr); 38*437930d1SJeremy L Thompson const CeedInt block_size = 64; 39*437930d1SJeremy L Thompson const CeedInt num_nodes = impl->num_nodes; 40*437930d1SJeremy L Thompson CeedInt num_elem, elem_size; 41*437930d1SJeremy L Thompson CeedElemRestrictionGetNumElements(r, &num_elem); 42*437930d1SJeremy L Thompson ierr = CeedElemRestrictionGetElementSize(r, &elem_size); CeedChkBackend(ierr); 430d0321e0SJeremy L Thompson hipFunction_t kernel; 440d0321e0SJeremy L Thompson 450d0321e0SJeremy L Thompson // Get vectors 460d0321e0SJeremy L Thompson const CeedScalar *d_u; 470d0321e0SJeremy L Thompson CeedScalar *d_v; 480d0321e0SJeremy L Thompson ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChkBackend(ierr); 49*437930d1SJeremy L Thompson if (t_mode == CEED_TRANSPOSE) { 500d0321e0SJeremy L Thompson // Sum into for transpose mode, e-vec to l-vec 510d0321e0SJeremy L Thompson ierr = CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr); 520d0321e0SJeremy L Thompson } else { 530d0321e0SJeremy L Thompson // Overwrite for notranspose mode, l-vec to e-vec 540d0321e0SJeremy L Thompson ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr); 550d0321e0SJeremy L Thompson } 560d0321e0SJeremy L Thompson 570d0321e0SJeremy L Thompson // Restrict 58*437930d1SJeremy L Thompson if (t_mode == CEED_NOTRANSPOSE) { 590d0321e0SJeremy L Thompson // L-vector -> E-vector 600d0321e0SJeremy L Thompson if (impl->d_ind) { 610d0321e0SJeremy L Thompson // -- Offsets provided 62*437930d1SJeremy L Thompson kernel = impl->OffsetNoTranspose; 63*437930d1SJeremy L Thompson void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; 64*437930d1SJeremy L Thompson CeedInt block_size = elem_size < 256 ? (elem_size > 64 ? elem_size : 64) : 256; 65*437930d1SJeremy L Thompson ierr = CeedRunKernelHip(ceed, kernel, CeedDivUpInt(num_nodes, block_size), 66*437930d1SJeremy L Thompson block_size, args); CeedChkBackend(ierr); 670d0321e0SJeremy L Thompson } else { 680d0321e0SJeremy L Thompson // -- Strided restriction 69*437930d1SJeremy L Thompson kernel = impl->StridedNoTranspose; 70*437930d1SJeremy L Thompson void *args[] = {&num_elem, &d_u, &d_v}; 71*437930d1SJeremy L Thompson CeedInt block_size = elem_size < 256 ? (elem_size > 64 ? elem_size : 64) : 256; 72*437930d1SJeremy L Thompson ierr = CeedRunKernelHip(ceed, kernel, CeedDivUpInt(num_nodes, block_size), 73*437930d1SJeremy L Thompson block_size, args); CeedChkBackend(ierr); 740d0321e0SJeremy L Thompson } 750d0321e0SJeremy L Thompson } else { 760d0321e0SJeremy L Thompson // E-vector -> L-vector 770d0321e0SJeremy L Thompson if (impl->d_ind) { 780d0321e0SJeremy L Thompson // -- Offsets provided 79*437930d1SJeremy L Thompson kernel = impl->OffsetTranspose; 80*437930d1SJeremy L Thompson void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, 81*437930d1SJeremy L Thompson &impl->d_t_offsets, &d_u, &d_v 820d0321e0SJeremy L Thompson }; 83*437930d1SJeremy L Thompson ierr = CeedRunKernelHip(ceed, kernel, CeedDivUpInt(num_nodes, block_size), 84*437930d1SJeremy L Thompson block_size, args); CeedChkBackend(ierr); 850d0321e0SJeremy L Thompson } else { 860d0321e0SJeremy L Thompson // -- Strided restriction 87*437930d1SJeremy L Thompson kernel = impl->StridedTranspose; 88*437930d1SJeremy L Thompson void *args[] = {&num_elem, &d_u, &d_v}; 89*437930d1SJeremy L Thompson ierr = CeedRunKernelHip(ceed, kernel, CeedDivUpInt(num_nodes, block_size), 90*437930d1SJeremy L Thompson block_size, args); CeedChkBackend(ierr); 910d0321e0SJeremy L Thompson } 920d0321e0SJeremy L Thompson } 930d0321e0SJeremy L Thompson 940d0321e0SJeremy L Thompson if (request != CEED_REQUEST_IMMEDIATE && request != CEED_REQUEST_ORDERED) 950d0321e0SJeremy L Thompson *request = NULL; 960d0321e0SJeremy L Thompson 970d0321e0SJeremy L Thompson // Restore arrays 980d0321e0SJeremy L Thompson ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr); 990d0321e0SJeremy L Thompson ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr); 1000d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 1010d0321e0SJeremy L Thompson } 1020d0321e0SJeremy L Thompson 1030d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1040d0321e0SJeremy L Thompson // Blocked not supported 1050d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1060d0321e0SJeremy L Thompson int CeedElemRestrictionApplyBlock_Hip(CeedElemRestriction r, CeedInt block, 107*437930d1SJeremy L Thompson CeedTransposeMode t_mode, CeedVector u, 1080d0321e0SJeremy L Thompson CeedVector v, CeedRequest *request) { 1090d0321e0SJeremy L Thompson // LCOV_EXCL_START 1100d0321e0SJeremy L Thompson int ierr; 1110d0321e0SJeremy L Thompson Ceed ceed; 1120d0321e0SJeremy L Thompson ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr); 1130d0321e0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, 1140d0321e0SJeremy L Thompson "Backend does not implement blocked restrictions"); 1150d0321e0SJeremy L Thompson // LCOV_EXCL_STOP 1160d0321e0SJeremy L Thompson } 1170d0321e0SJeremy L Thompson 1180d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1190d0321e0SJeremy L Thompson // Get offsets 1200d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1210d0321e0SJeremy L Thompson static int CeedElemRestrictionGetOffsets_Hip(CeedElemRestriction rstr, 1220d0321e0SJeremy L Thompson CeedMemType mtype, const CeedInt **offsets) { 1230d0321e0SJeremy L Thompson int ierr; 1240d0321e0SJeremy L Thompson CeedElemRestriction_Hip *impl; 1250d0321e0SJeremy L Thompson ierr = CeedElemRestrictionGetData(rstr, &impl); CeedChkBackend(ierr); 1260d0321e0SJeremy L Thompson 1270d0321e0SJeremy L Thompson switch (mtype) { 1280d0321e0SJeremy L Thompson case CEED_MEM_HOST: 1290d0321e0SJeremy L Thompson *offsets = impl->h_ind; 1300d0321e0SJeremy L Thompson break; 1310d0321e0SJeremy L Thompson case CEED_MEM_DEVICE: 1320d0321e0SJeremy L Thompson *offsets = impl->d_ind; 1330d0321e0SJeremy L Thompson break; 1340d0321e0SJeremy L Thompson } 1350d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 1360d0321e0SJeremy L Thompson } 1370d0321e0SJeremy L Thompson 1380d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1390d0321e0SJeremy L Thompson // Destroy restriction 1400d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1410d0321e0SJeremy L Thompson static int CeedElemRestrictionDestroy_Hip(CeedElemRestriction r) { 1420d0321e0SJeremy L Thompson int ierr; 1430d0321e0SJeremy L Thompson CeedElemRestriction_Hip *impl; 1440d0321e0SJeremy L Thompson ierr = CeedElemRestrictionGetData(r, &impl); CeedChkBackend(ierr); 1450d0321e0SJeremy L Thompson 1460d0321e0SJeremy L Thompson Ceed ceed; 1470d0321e0SJeremy L Thompson ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr); 1480d0321e0SJeremy L Thompson ierr = hipModuleUnload(impl->module); CeedChk_Hip(ceed, ierr); 1490d0321e0SJeremy L Thompson ierr = CeedFree(&impl->h_ind_allocated); CeedChkBackend(ierr); 1500d0321e0SJeremy L Thompson ierr = hipFree(impl->d_ind_allocated); CeedChk_Hip(ceed, ierr); 151*437930d1SJeremy L Thompson ierr = hipFree(impl->d_t_offsets); CeedChk_Hip(ceed, ierr); 152*437930d1SJeremy L Thompson ierr = hipFree(impl->d_t_indices); CeedChk_Hip(ceed, ierr); 153*437930d1SJeremy L Thompson ierr = hipFree(impl->d_l_vec_indices); CeedChk_Hip(ceed, ierr); 1540d0321e0SJeremy L Thompson ierr = CeedFree(&impl); CeedChkBackend(ierr); 155*437930d1SJeremy L Thompson 1560d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 1570d0321e0SJeremy L Thompson } 1580d0321e0SJeremy L Thompson 1590d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1600d0321e0SJeremy L Thompson // Create transpose offsets and indices 1610d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1620d0321e0SJeremy L Thompson static int CeedElemRestrictionOffset_Hip(const CeedElemRestriction r, 1630d0321e0SJeremy L Thompson const CeedInt *indices) { 1640d0321e0SJeremy L Thompson int ierr; 1650d0321e0SJeremy L Thompson Ceed ceed; 1660d0321e0SJeremy L Thompson ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr); 1670d0321e0SJeremy L Thompson CeedElemRestriction_Hip *impl; 1680d0321e0SJeremy L Thompson ierr = CeedElemRestrictionGetData(r, &impl); CeedChkBackend(ierr); 169*437930d1SJeremy L Thompson CeedInt num_elem, elem_size, l_size, num_comp; 170*437930d1SJeremy L Thompson ierr = CeedElemRestrictionGetNumElements(r, &num_elem); CeedChkBackend(ierr); 171*437930d1SJeremy L Thompson ierr = CeedElemRestrictionGetElementSize(r, &elem_size); CeedChkBackend(ierr); 172*437930d1SJeremy L Thompson ierr = CeedElemRestrictionGetLVectorSize(r, &l_size); CeedChkBackend(ierr); 173*437930d1SJeremy L Thompson ierr = CeedElemRestrictionGetNumComponents(r, &num_comp); CeedChkBackend(ierr); 1740d0321e0SJeremy L Thompson 175*437930d1SJeremy L Thompson // Count num_nodes 176*437930d1SJeremy L Thompson bool *is_node; 177*437930d1SJeremy L Thompson ierr = CeedCalloc(l_size, &is_node); CeedChkBackend(ierr); 178*437930d1SJeremy L Thompson const CeedInt size_indices = num_elem * elem_size; 179*437930d1SJeremy L Thompson for (CeedInt i = 0; i < size_indices; i++) 180*437930d1SJeremy L Thompson is_node[indices[i]] = 1; 181*437930d1SJeremy L Thompson CeedInt num_nodes = 0; 182*437930d1SJeremy L Thompson for (CeedInt i = 0; i < l_size; i++) 183*437930d1SJeremy L Thompson num_nodes += is_node[i]; 184*437930d1SJeremy L Thompson impl->num_nodes = num_nodes; 1850d0321e0SJeremy L Thompson 1860d0321e0SJeremy L Thompson // L-vector offsets array 187*437930d1SJeremy L Thompson CeedInt *ind_to_offset, *l_vec_indices; 188*437930d1SJeremy L Thompson ierr = CeedCalloc(l_size, &ind_to_offset); CeedChkBackend(ierr); 189*437930d1SJeremy L Thompson ierr = CeedCalloc(num_nodes, &l_vec_indices); CeedChkBackend(ierr); 1900d0321e0SJeremy L Thompson CeedInt j = 0; 191*437930d1SJeremy L Thompson for (CeedInt i = 0; i < l_size; i++) 192*437930d1SJeremy L Thompson if (is_node[i]) { 193*437930d1SJeremy L Thompson l_vec_indices[j] = i; 1940d0321e0SJeremy L Thompson ind_to_offset[i] = j++; 1950d0321e0SJeremy L Thompson } 196*437930d1SJeremy L Thompson ierr = CeedFree(&is_node); CeedChkBackend(ierr); 1970d0321e0SJeremy L Thompson 1980d0321e0SJeremy L Thompson // Compute transpose offsets and indices 199*437930d1SJeremy L Thompson const CeedInt size_offsets = num_nodes + 1; 200*437930d1SJeremy L Thompson CeedInt *t_offsets; 201*437930d1SJeremy L Thompson ierr = CeedCalloc(size_offsets, &t_offsets); CeedChkBackend(ierr); 202*437930d1SJeremy L Thompson CeedInt *t_indices; 203*437930d1SJeremy L Thompson ierr = CeedMalloc(size_indices, &t_indices); CeedChkBackend(ierr); 2040d0321e0SJeremy L Thompson // Count node multiplicity 205*437930d1SJeremy L Thompson for (CeedInt e = 0; e < num_elem; ++e) 206*437930d1SJeremy L Thompson for (CeedInt i = 0; i < elem_size; ++i) 207*437930d1SJeremy L Thompson ++t_offsets[ind_to_offset[indices[elem_size*e + i]] + 1]; 2080d0321e0SJeremy L Thompson // Convert to running sum 209*437930d1SJeremy L Thompson for (CeedInt i = 1; i < size_offsets; ++i) 210*437930d1SJeremy L Thompson t_offsets[i] += t_offsets[i-1]; 2110d0321e0SJeremy L Thompson // List all E-vec indices associated with L-vec node 212*437930d1SJeremy L Thompson for (CeedInt e = 0; e < num_elem; ++e) { 213*437930d1SJeremy L Thompson for (CeedInt i = 0; i < elem_size; ++i) { 214*437930d1SJeremy L Thompson const CeedInt lid = elem_size*e + i; 2150d0321e0SJeremy L Thompson const CeedInt gid = indices[lid]; 216*437930d1SJeremy L Thompson t_indices[t_offsets[ind_to_offset[gid]]++] = lid; 2170d0321e0SJeremy L Thompson } 2180d0321e0SJeremy L Thompson } 2190d0321e0SJeremy L Thompson // Reset running sum 220*437930d1SJeremy L Thompson for (int i = size_offsets - 1; i > 0; --i) 221*437930d1SJeremy L Thompson t_offsets[i] = t_offsets[i - 1]; 222*437930d1SJeremy L Thompson t_offsets[0] = 0; 2230d0321e0SJeremy L Thompson 2240d0321e0SJeremy L Thompson // Copy data to device 2250d0321e0SJeremy L Thompson // -- L-vector indices 226*437930d1SJeremy L Thompson ierr = hipMalloc((void **)&impl->d_l_vec_indices, num_nodes*sizeof(CeedInt)); 2270d0321e0SJeremy L Thompson CeedChk_Hip(ceed, ierr); 228*437930d1SJeremy L Thompson ierr = hipMemcpy(impl->d_l_vec_indices, l_vec_indices, 229*437930d1SJeremy L Thompson num_nodes*sizeof(CeedInt), hipMemcpyHostToDevice); 2300d0321e0SJeremy L Thompson CeedChk_Hip(ceed, ierr); 2310d0321e0SJeremy L Thompson // -- Transpose offsets 232*437930d1SJeremy L Thompson ierr = hipMalloc((void **)&impl->d_t_offsets, size_offsets*sizeof(CeedInt)); 2330d0321e0SJeremy L Thompson CeedChk_Hip(ceed, ierr); 234*437930d1SJeremy L Thompson ierr = hipMemcpy(impl->d_t_offsets, t_offsets, size_offsets*sizeof(CeedInt), 2350d0321e0SJeremy L Thompson hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); 2360d0321e0SJeremy L Thompson // -- Transpose indices 237*437930d1SJeremy L Thompson ierr = hipMalloc((void **)&impl->d_t_indices, size_indices*sizeof(CeedInt)); 2380d0321e0SJeremy L Thompson CeedChk_Hip(ceed, ierr); 239*437930d1SJeremy L Thompson ierr = hipMemcpy(impl->d_t_indices, t_indices, size_indices*sizeof(CeedInt), 2400d0321e0SJeremy L Thompson hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); 2410d0321e0SJeremy L Thompson 2420d0321e0SJeremy L Thompson // Cleanup 2430d0321e0SJeremy L Thompson ierr = CeedFree(&ind_to_offset); CeedChkBackend(ierr); 244*437930d1SJeremy L Thompson ierr = CeedFree(&l_vec_indices); CeedChkBackend(ierr); 245*437930d1SJeremy L Thompson ierr = CeedFree(&t_offsets); CeedChkBackend(ierr); 246*437930d1SJeremy L Thompson ierr = CeedFree(&t_indices); CeedChkBackend(ierr); 247*437930d1SJeremy L Thompson 2480d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 2490d0321e0SJeremy L Thompson } 2500d0321e0SJeremy L Thompson 2510d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 2520d0321e0SJeremy L Thompson // Create restriction 2530d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 2540d0321e0SJeremy L Thompson int CeedElemRestrictionCreate_Hip(CeedMemType mtype, CeedCopyMode cmode, 2550d0321e0SJeremy L Thompson const CeedInt *indices, 2560d0321e0SJeremy L Thompson CeedElemRestriction r) { 2570d0321e0SJeremy L Thompson int ierr; 2580d0321e0SJeremy L Thompson Ceed ceed; 2590d0321e0SJeremy L Thompson ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr); 2600d0321e0SJeremy L Thompson CeedElemRestriction_Hip *impl; 2610d0321e0SJeremy L Thompson ierr = CeedCalloc(1, &impl); CeedChkBackend(ierr); 262*437930d1SJeremy L Thompson CeedInt num_elem, num_comp, elem_size; 263*437930d1SJeremy L Thompson ierr = CeedElemRestrictionGetNumElements(r, &num_elem); CeedChkBackend(ierr); 264*437930d1SJeremy L Thompson ierr = CeedElemRestrictionGetNumComponents(r, &num_comp); CeedChkBackend(ierr); 265*437930d1SJeremy L Thompson ierr = CeedElemRestrictionGetElementSize(r, &elem_size); CeedChkBackend(ierr); 266*437930d1SJeremy L Thompson CeedInt size = num_elem * elem_size; 267*437930d1SJeremy L Thompson CeedInt strides[3] = {1, size, elem_size}; 268*437930d1SJeremy L Thompson CeedInt comp_stride = 1; 2690d0321e0SJeremy L Thompson 2700d0321e0SJeremy L Thompson // Stride data 271*437930d1SJeremy L Thompson bool is_strided; 272*437930d1SJeremy L Thompson ierr = CeedElemRestrictionIsStrided(r, &is_strided); CeedChkBackend(ierr); 273*437930d1SJeremy L Thompson if (is_strided) { 274*437930d1SJeremy L Thompson bool has_backend_strides; 275*437930d1SJeremy L Thompson ierr = CeedElemRestrictionHasBackendStrides(r, &has_backend_strides); 2760d0321e0SJeremy L Thompson CeedChkBackend(ierr); 277*437930d1SJeremy L Thompson if (!has_backend_strides) { 2780d0321e0SJeremy L Thompson ierr = CeedElemRestrictionGetStrides(r, &strides); CeedChkBackend(ierr); 2790d0321e0SJeremy L Thompson } 2800d0321e0SJeremy L Thompson } else { 281*437930d1SJeremy L Thompson ierr = CeedElemRestrictionGetCompStride(r, &comp_stride); CeedChkBackend(ierr); 2820d0321e0SJeremy L Thompson } 2830d0321e0SJeremy L Thompson 2840d0321e0SJeremy L Thompson impl->h_ind = NULL; 2850d0321e0SJeremy L Thompson impl->h_ind_allocated = NULL; 2860d0321e0SJeremy L Thompson impl->d_ind = NULL; 2870d0321e0SJeremy L Thompson impl->d_ind_allocated = NULL; 288*437930d1SJeremy L Thompson impl->d_t_indices = NULL; 289*437930d1SJeremy L Thompson impl->d_t_offsets = NULL; 290*437930d1SJeremy L Thompson impl->num_nodes = size; 2910d0321e0SJeremy L Thompson ierr = CeedElemRestrictionSetData(r, impl); CeedChkBackend(ierr); 292*437930d1SJeremy L Thompson CeedInt layout[3] = {1, elem_size*num_elem, elem_size}; 2930d0321e0SJeremy L Thompson ierr = CeedElemRestrictionSetELayout(r, layout); CeedChkBackend(ierr); 2940d0321e0SJeremy L Thompson 2950d0321e0SJeremy L Thompson // Set up device indices/offset arrays 2960d0321e0SJeremy L Thompson if (mtype == CEED_MEM_HOST) { 2970d0321e0SJeremy L Thompson switch (cmode) { 2980d0321e0SJeremy L Thompson case CEED_OWN_POINTER: 2990d0321e0SJeremy L Thompson impl->h_ind_allocated = (CeedInt *)indices; 3000d0321e0SJeremy L Thompson impl->h_ind = (CeedInt *)indices; 3010d0321e0SJeremy L Thompson break; 3020d0321e0SJeremy L Thompson case CEED_USE_POINTER: 3030d0321e0SJeremy L Thompson impl->h_ind = (CeedInt *)indices; 3040d0321e0SJeremy L Thompson break; 3050d0321e0SJeremy L Thompson case CEED_COPY_VALUES: 3060d0321e0SJeremy L Thompson break; 3070d0321e0SJeremy L Thompson } 3080d0321e0SJeremy L Thompson if (indices != NULL) { 3090d0321e0SJeremy L Thompson ierr = hipMalloc( (void **)&impl->d_ind, size * sizeof(CeedInt)); 3100d0321e0SJeremy L Thompson CeedChk_Hip(ceed, ierr); 3110d0321e0SJeremy L Thompson impl->d_ind_allocated = impl->d_ind; // We own the device memory 3120d0321e0SJeremy L Thompson ierr = hipMemcpy(impl->d_ind, indices, size * sizeof(CeedInt), 3130d0321e0SJeremy L Thompson hipMemcpyHostToDevice); 3140d0321e0SJeremy L Thompson CeedChk_Hip(ceed, ierr); 3150d0321e0SJeremy L Thompson ierr = CeedElemRestrictionOffset_Hip(r, indices); CeedChkBackend(ierr); 3160d0321e0SJeremy L Thompson } 3170d0321e0SJeremy L Thompson } else if (mtype == CEED_MEM_DEVICE) { 3180d0321e0SJeremy L Thompson switch (cmode) { 3190d0321e0SJeremy L Thompson case CEED_COPY_VALUES: 3200d0321e0SJeremy L Thompson if (indices != NULL) { 3210d0321e0SJeremy L Thompson ierr = hipMalloc( (void **)&impl->d_ind, size * sizeof(CeedInt)); 3220d0321e0SJeremy L Thompson CeedChk_Hip(ceed, ierr); 3230d0321e0SJeremy L Thompson impl->d_ind_allocated = impl->d_ind; // We own the device memory 3240d0321e0SJeremy L Thompson ierr = hipMemcpy(impl->d_ind, indices, size * sizeof(CeedInt), 3250d0321e0SJeremy L Thompson hipMemcpyDeviceToDevice); 3260d0321e0SJeremy L Thompson CeedChk_Hip(ceed, ierr); 3270d0321e0SJeremy L Thompson } 3280d0321e0SJeremy L Thompson break; 3290d0321e0SJeremy L Thompson case CEED_OWN_POINTER: 3300d0321e0SJeremy L Thompson impl->d_ind = (CeedInt *)indices; 3310d0321e0SJeremy L Thompson impl->d_ind_allocated = impl->d_ind; 3320d0321e0SJeremy L Thompson break; 3330d0321e0SJeremy L Thompson case CEED_USE_POINTER: 3340d0321e0SJeremy L Thompson impl->d_ind = (CeedInt *)indices; 3350d0321e0SJeremy L Thompson } 3360d0321e0SJeremy L Thompson if (indices != NULL) { 3370d0321e0SJeremy L Thompson ierr = CeedElemRestrictionOffset_Hip(r, indices); CeedChkBackend(ierr); 3380d0321e0SJeremy L Thompson } 3390d0321e0SJeremy L Thompson } else { 3400d0321e0SJeremy L Thompson // LCOV_EXCL_START 3410d0321e0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, 3420d0321e0SJeremy L Thompson "Only MemType = HOST or DEVICE supported"); 3430d0321e0SJeremy L Thompson // LCOV_EXCL_STOP 3440d0321e0SJeremy L Thompson } 3450d0321e0SJeremy L Thompson 3460d0321e0SJeremy L Thompson // Compile HIP kernels 347*437930d1SJeremy L Thompson CeedInt num_nodes = impl->num_nodes; 348*437930d1SJeremy L Thompson char *restriction_kernel_path, *restriction_kernel_source; 349*437930d1SJeremy L Thompson ierr = CeedPathConcatenate(ceed, __FILE__, "kernels/hip-ref-restriction.h", 350*437930d1SJeremy L Thompson &restriction_kernel_path); CeedChkBackend(ierr); 351*437930d1SJeremy L Thompson ierr = CeedLoadSourceToBuffer(ceed, restriction_kernel_path, 352*437930d1SJeremy L Thompson &restriction_kernel_source); 353*437930d1SJeremy L Thompson CeedChkBackend(ierr); 354*437930d1SJeremy L Thompson ierr = CeedCompileHip(ceed, restriction_kernel_source, &impl->module, 8, 355*437930d1SJeremy L Thompson "RESTRICTION_ELEMSIZE", elem_size, 356*437930d1SJeremy L Thompson "RESTRICTION_NELEM", num_elem, 357*437930d1SJeremy L Thompson "RESTRICTION_NCOMP", num_comp, 358*437930d1SJeremy L Thompson "RESTRICTION_NNODES", num_nodes, 359*437930d1SJeremy L Thompson "RESTRICTION_COMPSTRIDE", comp_stride, 3600d0321e0SJeremy L Thompson "STRIDE_NODES", strides[0], 3610d0321e0SJeremy L Thompson "STRIDE_COMP", strides[1], 3620d0321e0SJeremy L Thompson "STRIDE_ELEM", strides[2]); CeedChkBackend(ierr); 363*437930d1SJeremy L Thompson ierr = CeedGetKernelHip(ceed, impl->module, "StridedNoTranspose", 364*437930d1SJeremy L Thompson &impl->StridedNoTranspose); CeedChkBackend(ierr); 365*437930d1SJeremy L Thompson ierr = CeedGetKernelHip(ceed, impl->module, "OffsetNoTranspose", 366*437930d1SJeremy L Thompson &impl->OffsetNoTranspose); CeedChkBackend(ierr); 367*437930d1SJeremy L Thompson ierr = CeedGetKernelHip(ceed, impl->module, "StridedTranspose", 368*437930d1SJeremy L Thompson &impl->StridedTranspose); CeedChkBackend(ierr); 369*437930d1SJeremy L Thompson ierr = CeedGetKernelHip(ceed, impl->module, "OffsetTranspose", 370*437930d1SJeremy L Thompson &impl->OffsetTranspose); CeedChkBackend(ierr); 371*437930d1SJeremy L Thompson ierr = CeedFree(&restriction_kernel_path); CeedChkBackend(ierr); 372*437930d1SJeremy L Thompson ierr = CeedFree(&restriction_kernel_source); CeedChkBackend(ierr); 3730d0321e0SJeremy L Thompson 3740d0321e0SJeremy L Thompson // Register backend functions 3750d0321e0SJeremy L Thompson ierr = CeedSetBackendFunction(ceed, "ElemRestriction", r, "Apply", 3760d0321e0SJeremy L Thompson CeedElemRestrictionApply_Hip); 3770d0321e0SJeremy L Thompson CeedChkBackend(ierr); 3780d0321e0SJeremy L Thompson ierr = CeedSetBackendFunction(ceed, "ElemRestriction", r, "ApplyBlock", 3790d0321e0SJeremy L Thompson CeedElemRestrictionApplyBlock_Hip); 3800d0321e0SJeremy L Thompson CeedChkBackend(ierr); 3810d0321e0SJeremy L Thompson ierr = CeedSetBackendFunction(ceed, "ElemRestriction", r, "GetOffsets", 3820d0321e0SJeremy L Thompson CeedElemRestrictionGetOffsets_Hip); 3830d0321e0SJeremy L Thompson CeedChkBackend(ierr); 3840d0321e0SJeremy L Thompson ierr = CeedSetBackendFunction(ceed, "ElemRestriction", r, "Destroy", 3850d0321e0SJeremy L Thompson CeedElemRestrictionDestroy_Hip); 3860d0321e0SJeremy L Thompson CeedChkBackend(ierr); 3870d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 3880d0321e0SJeremy L Thompson } 3890d0321e0SJeremy L Thompson 3900d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 3910d0321e0SJeremy L Thompson // Blocked not supported 3920d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 3930d0321e0SJeremy L Thompson int CeedElemRestrictionCreateBlocked_Hip(const CeedMemType mtype, 3940d0321e0SJeremy L Thompson const CeedCopyMode cmode, const CeedInt *indices, CeedElemRestriction r) { 3950d0321e0SJeremy L Thompson int ierr; 3960d0321e0SJeremy L Thompson Ceed ceed; 3970d0321e0SJeremy L Thompson ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr); 3980d0321e0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, 3990d0321e0SJeremy L Thompson "Backend does not implement blocked restrictions"); 4000d0321e0SJeremy L Thompson } 4010d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 402