13d8e8822SJeremy L Thompson // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. 23d8e8822SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 30d0321e0SJeremy L Thompson // 43d8e8822SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 50d0321e0SJeremy L Thompson // 63d8e8822SJeremy L Thompson // This file is part of CEED: http://github.com/ceed 70d0321e0SJeremy L Thompson 80d0321e0SJeremy L Thompson #include <ceed/backend.h> 9*2b730f8bSJeremy L Thompson #include <ceed/ceed.h> 10437930d1SJeremy L Thompson #include <ceed/jit-tools.h> 110d0321e0SJeremy L Thompson #include <hip/hip_runtime.h> 120d0321e0SJeremy L Thompson #include <stdbool.h> 130d0321e0SJeremy L Thompson #include <stddef.h> 1444d7a66cSJeremy L Thompson #include <string.h> 15*2b730f8bSJeremy L Thompson 160d0321e0SJeremy L Thompson #include "../hip/ceed-hip-compile.h" 17*2b730f8bSJeremy L Thompson #include "ceed-hip-ref.h" 180d0321e0SJeremy L Thompson 190d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 200d0321e0SJeremy L Thompson // Apply restriction 210d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 22*2b730f8bSJeremy L Thompson static int CeedElemRestrictionApply_Hip(CeedElemRestriction r, CeedTransposeMode t_mode, CeedVector u, CeedVector v, CeedRequest *request) { 230d0321e0SJeremy L Thompson CeedElemRestriction_Hip *impl; 24*2b730f8bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); 250d0321e0SJeremy L Thompson Ceed ceed; 26*2b730f8bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); 270d0321e0SJeremy L Thompson Ceed_Hip *data; 28*2b730f8bSJeremy L Thompson CeedCallBackend(CeedGetData(ceed, &data)); 29437930d1SJeremy L Thompson const CeedInt block_size = 64; 30437930d1SJeremy L Thompson const CeedInt num_nodes = impl->num_nodes; 31437930d1SJeremy L Thompson CeedInt num_elem, elem_size; 32437930d1SJeremy L Thompson CeedElemRestrictionGetNumElements(r, &num_elem); 33*2b730f8bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetElementSize(r, &elem_size)); 340d0321e0SJeremy L Thompson hipFunction_t kernel; 350d0321e0SJeremy L Thompson 360d0321e0SJeremy L Thompson // Get vectors 370d0321e0SJeremy L Thompson const CeedScalar *d_u; 380d0321e0SJeremy L Thompson CeedScalar *d_v; 39*2b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 40437930d1SJeremy L Thompson if (t_mode == CEED_TRANSPOSE) { 410d0321e0SJeremy L Thompson // Sum into for transpose mode, e-vec to l-vec 42*2b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 430d0321e0SJeremy L Thompson } else { 440d0321e0SJeremy L Thompson // Overwrite for notranspose mode, l-vec to e-vec 45*2b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 460d0321e0SJeremy L Thompson } 470d0321e0SJeremy L Thompson 480d0321e0SJeremy L Thompson // Restrict 49437930d1SJeremy L Thompson if (t_mode == CEED_NOTRANSPOSE) { 500d0321e0SJeremy L Thompson // L-vector -> E-vector 510d0321e0SJeremy L Thompson if (impl->d_ind) { 520d0321e0SJeremy L Thompson // -- Offsets provided 53437930d1SJeremy L Thompson kernel = impl->OffsetNoTranspose; 54437930d1SJeremy L Thompson void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; 55437930d1SJeremy L Thompson CeedInt block_size = elem_size < 256 ? (elem_size > 64 ? elem_size : 64) : 256; 56*2b730f8bSJeremy L Thompson CeedCallBackend(CeedRunKernelHip(ceed, kernel, CeedDivUpInt(num_nodes, block_size), block_size, args)); 570d0321e0SJeremy L Thompson } else { 580d0321e0SJeremy L Thompson // -- Strided restriction 59437930d1SJeremy L Thompson kernel = impl->StridedNoTranspose; 60437930d1SJeremy L Thompson void *args[] = {&num_elem, &d_u, &d_v}; 61437930d1SJeremy L Thompson CeedInt block_size = elem_size < 256 ? (elem_size > 64 ? elem_size : 64) : 256; 62*2b730f8bSJeremy L Thompson CeedCallBackend(CeedRunKernelHip(ceed, kernel, CeedDivUpInt(num_nodes, block_size), block_size, args)); 630d0321e0SJeremy L Thompson } 640d0321e0SJeremy L Thompson } else { 650d0321e0SJeremy L Thompson // E-vector -> L-vector 660d0321e0SJeremy L Thompson if (impl->d_ind) { 670d0321e0SJeremy L Thompson // -- Offsets provided 68437930d1SJeremy L Thompson kernel = impl->OffsetTranspose; 69*2b730f8bSJeremy L Thompson void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &d_u, &d_v}; 70*2b730f8bSJeremy L Thompson CeedCallBackend(CeedRunKernelHip(ceed, kernel, CeedDivUpInt(num_nodes, block_size), block_size, args)); 710d0321e0SJeremy L Thompson } else { 720d0321e0SJeremy L Thompson // -- Strided restriction 73437930d1SJeremy L Thompson kernel = impl->StridedTranspose; 74437930d1SJeremy L Thompson void *args[] = {&num_elem, &d_u, &d_v}; 75*2b730f8bSJeremy L Thompson CeedCallBackend(CeedRunKernelHip(ceed, kernel, CeedDivUpInt(num_nodes, block_size), block_size, args)); 760d0321e0SJeremy L Thompson } 770d0321e0SJeremy L Thompson } 780d0321e0SJeremy L Thompson 79*2b730f8bSJeremy L Thompson if (request != CEED_REQUEST_IMMEDIATE && request != CEED_REQUEST_ORDERED) *request = NULL; 800d0321e0SJeremy L Thompson 810d0321e0SJeremy L Thompson // Restore arrays 82*2b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 83*2b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 840d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 850d0321e0SJeremy L Thompson } 860d0321e0SJeremy L Thompson 870d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 880d0321e0SJeremy L Thompson // Blocked not supported 890d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 90*2b730f8bSJeremy L Thompson int CeedElemRestrictionApplyBlock_Hip(CeedElemRestriction r, CeedInt block, CeedTransposeMode t_mode, CeedVector u, CeedVector v, 91*2b730f8bSJeremy L Thompson CeedRequest *request) { 920d0321e0SJeremy L Thompson // LCOV_EXCL_START 930d0321e0SJeremy L Thompson Ceed ceed; 94*2b730f8bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); 95*2b730f8bSJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement blocked restrictions"); 960d0321e0SJeremy L Thompson // LCOV_EXCL_STOP 970d0321e0SJeremy L Thompson } 980d0321e0SJeremy L Thompson 990d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1000d0321e0SJeremy L Thompson // Get offsets 1010d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 102*2b730f8bSJeremy L Thompson static int CeedElemRestrictionGetOffsets_Hip(CeedElemRestriction rstr, CeedMemType mtype, const CeedInt **offsets) { 1030d0321e0SJeremy L Thompson CeedElemRestriction_Hip *impl; 104*2b730f8bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 1050d0321e0SJeremy L Thompson 1060d0321e0SJeremy L Thompson switch (mtype) { 1070d0321e0SJeremy L Thompson case CEED_MEM_HOST: 1080d0321e0SJeremy L Thompson *offsets = impl->h_ind; 1090d0321e0SJeremy L Thompson break; 1100d0321e0SJeremy L Thompson case CEED_MEM_DEVICE: 1110d0321e0SJeremy L Thompson *offsets = impl->d_ind; 1120d0321e0SJeremy L Thompson break; 1130d0321e0SJeremy L Thompson } 1140d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 1150d0321e0SJeremy L Thompson } 1160d0321e0SJeremy L Thompson 1170d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1180d0321e0SJeremy L Thompson // Destroy restriction 1190d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1200d0321e0SJeremy L Thompson static int CeedElemRestrictionDestroy_Hip(CeedElemRestriction r) { 1210d0321e0SJeremy L Thompson CeedElemRestriction_Hip *impl; 122*2b730f8bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); 1230d0321e0SJeremy L Thompson 1240d0321e0SJeremy L Thompson Ceed ceed; 125*2b730f8bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); 126*2b730f8bSJeremy L Thompson CeedCallHip(ceed, hipModuleUnload(impl->module)); 127*2b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&impl->h_ind_allocated)); 128*2b730f8bSJeremy L Thompson CeedCallHip(ceed, hipFree(impl->d_ind_allocated)); 129*2b730f8bSJeremy L Thompson CeedCallHip(ceed, hipFree(impl->d_t_offsets)); 130*2b730f8bSJeremy L Thompson CeedCallHip(ceed, hipFree(impl->d_t_indices)); 131*2b730f8bSJeremy L Thompson CeedCallHip(ceed, hipFree(impl->d_l_vec_indices)); 132*2b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&impl)); 133437930d1SJeremy L Thompson 1340d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 1350d0321e0SJeremy L Thompson } 1360d0321e0SJeremy L Thompson 1370d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 1380d0321e0SJeremy L Thompson // Create transpose offsets and indices 1390d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 140*2b730f8bSJeremy L Thompson static int CeedElemRestrictionOffset_Hip(const CeedElemRestriction r, const CeedInt *indices) { 1410d0321e0SJeremy L Thompson Ceed ceed; 142*2b730f8bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); 1430d0321e0SJeremy L Thompson CeedElemRestriction_Hip *impl; 144*2b730f8bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); 145e79b91d9SJeremy L Thompson CeedSize l_size; 146e79b91d9SJeremy L Thompson CeedInt num_elem, elem_size, num_comp; 147*2b730f8bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumElements(r, &num_elem)); 148*2b730f8bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetElementSize(r, &elem_size)); 149*2b730f8bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetLVectorSize(r, &l_size)); 150*2b730f8bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumComponents(r, &num_comp)); 1510d0321e0SJeremy L Thompson 152437930d1SJeremy L Thompson // Count num_nodes 153437930d1SJeremy L Thompson bool *is_node; 154*2b730f8bSJeremy L Thompson CeedCallBackend(CeedCalloc(l_size, &is_node)); 155437930d1SJeremy L Thompson const CeedInt size_indices = num_elem * elem_size; 156*2b730f8bSJeremy L Thompson for (CeedInt i = 0; i < size_indices; i++) is_node[indices[i]] = 1; 157437930d1SJeremy L Thompson CeedInt num_nodes = 0; 158*2b730f8bSJeremy L Thompson for (CeedInt i = 0; i < l_size; i++) num_nodes += is_node[i]; 159437930d1SJeremy L Thompson impl->num_nodes = num_nodes; 1600d0321e0SJeremy L Thompson 1610d0321e0SJeremy L Thompson // L-vector offsets array 162437930d1SJeremy L Thompson CeedInt *ind_to_offset, *l_vec_indices; 163*2b730f8bSJeremy L Thompson CeedCallBackend(CeedCalloc(l_size, &ind_to_offset)); 164*2b730f8bSJeremy L Thompson CeedCallBackend(CeedCalloc(num_nodes, &l_vec_indices)); 1650d0321e0SJeremy L Thompson CeedInt j = 0; 166*2b730f8bSJeremy L Thompson for (CeedInt i = 0; i < l_size; i++) { 167437930d1SJeremy L Thompson if (is_node[i]) { 168437930d1SJeremy L Thompson l_vec_indices[j] = i; 1690d0321e0SJeremy L Thompson ind_to_offset[i] = j++; 1700d0321e0SJeremy L Thompson } 171*2b730f8bSJeremy L Thompson } 172*2b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&is_node)); 1730d0321e0SJeremy L Thompson 1740d0321e0SJeremy L Thompson // Compute transpose offsets and indices 175437930d1SJeremy L Thompson const CeedInt size_offsets = num_nodes + 1; 176437930d1SJeremy L Thompson CeedInt *t_offsets; 177*2b730f8bSJeremy L Thompson CeedCallBackend(CeedCalloc(size_offsets, &t_offsets)); 178437930d1SJeremy L Thompson CeedInt *t_indices; 179*2b730f8bSJeremy L Thompson CeedCallBackend(CeedMalloc(size_indices, &t_indices)); 1800d0321e0SJeremy L Thompson // Count node multiplicity 181*2b730f8bSJeremy L Thompson for (CeedInt e = 0; e < num_elem; ++e) { 182*2b730f8bSJeremy L Thompson for (CeedInt i = 0; i < elem_size; ++i) ++t_offsets[ind_to_offset[indices[elem_size * e + i]] + 1]; 183*2b730f8bSJeremy L Thompson } 1840d0321e0SJeremy L Thompson // Convert to running sum 185*2b730f8bSJeremy L Thompson for (CeedInt i = 1; i < size_offsets; ++i) t_offsets[i] += t_offsets[i - 1]; 1860d0321e0SJeremy L Thompson // List all E-vec indices associated with L-vec node 187437930d1SJeremy L Thompson for (CeedInt e = 0; e < num_elem; ++e) { 188437930d1SJeremy L Thompson for (CeedInt i = 0; i < elem_size; ++i) { 189437930d1SJeremy L Thompson const CeedInt lid = elem_size * e + i; 1900d0321e0SJeremy L Thompson const CeedInt gid = indices[lid]; 191437930d1SJeremy L Thompson t_indices[t_offsets[ind_to_offset[gid]]++] = lid; 1920d0321e0SJeremy L Thompson } 1930d0321e0SJeremy L Thompson } 1940d0321e0SJeremy L Thompson // Reset running sum 195*2b730f8bSJeremy L Thompson for (int i = size_offsets - 1; i > 0; --i) t_offsets[i] = t_offsets[i - 1]; 196437930d1SJeremy L Thompson t_offsets[0] = 0; 1970d0321e0SJeremy L Thompson 1980d0321e0SJeremy L Thompson // Copy data to device 1990d0321e0SJeremy L Thompson // -- L-vector indices 200*2b730f8bSJeremy L Thompson CeedCallHip(ceed, hipMalloc((void **)&impl->d_l_vec_indices, num_nodes * sizeof(CeedInt))); 201*2b730f8bSJeremy L Thompson CeedCallHip(ceed, hipMemcpy(impl->d_l_vec_indices, l_vec_indices, num_nodes * sizeof(CeedInt), hipMemcpyHostToDevice)); 2020d0321e0SJeremy L Thompson // -- Transpose offsets 203*2b730f8bSJeremy L Thompson CeedCallHip(ceed, hipMalloc((void **)&impl->d_t_offsets, size_offsets * sizeof(CeedInt))); 204*2b730f8bSJeremy L Thompson CeedCallHip(ceed, hipMemcpy(impl->d_t_offsets, t_offsets, size_offsets * sizeof(CeedInt), hipMemcpyHostToDevice)); 2050d0321e0SJeremy L Thompson // -- Transpose indices 206*2b730f8bSJeremy L Thompson CeedCallHip(ceed, hipMalloc((void **)&impl->d_t_indices, size_indices * sizeof(CeedInt))); 207*2b730f8bSJeremy L Thompson CeedCallHip(ceed, hipMemcpy(impl->d_t_indices, t_indices, size_indices * sizeof(CeedInt), hipMemcpyHostToDevice)); 2080d0321e0SJeremy L Thompson 2090d0321e0SJeremy L Thompson // Cleanup 210*2b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&ind_to_offset)); 211*2b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&l_vec_indices)); 212*2b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&t_offsets)); 213*2b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&t_indices)); 214437930d1SJeremy L Thompson 2150d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 2160d0321e0SJeremy L Thompson } 2170d0321e0SJeremy L Thompson 2180d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 2190d0321e0SJeremy L Thompson // Create restriction 2200d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 221*2b730f8bSJeremy L Thompson int CeedElemRestrictionCreate_Hip(CeedMemType mtype, CeedCopyMode cmode, const CeedInt *indices, CeedElemRestriction r) { 2220d0321e0SJeremy L Thompson Ceed ceed; 223*2b730f8bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); 2240d0321e0SJeremy L Thompson CeedElemRestriction_Hip *impl; 225*2b730f8bSJeremy L Thompson CeedCallBackend(CeedCalloc(1, &impl)); 226437930d1SJeremy L Thompson CeedInt num_elem, num_comp, elem_size; 227*2b730f8bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumElements(r, &num_elem)); 228*2b730f8bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumComponents(r, &num_comp)); 229*2b730f8bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetElementSize(r, &elem_size)); 230437930d1SJeremy L Thompson CeedInt size = num_elem * elem_size; 231437930d1SJeremy L Thompson CeedInt strides[3] = {1, size, elem_size}; 232437930d1SJeremy L Thompson CeedInt comp_stride = 1; 2330d0321e0SJeremy L Thompson 2340d0321e0SJeremy L Thompson // Stride data 235437930d1SJeremy L Thompson bool is_strided; 236*2b730f8bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionIsStrided(r, &is_strided)); 237437930d1SJeremy L Thompson if (is_strided) { 238437930d1SJeremy L Thompson bool has_backend_strides; 239*2b730f8bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionHasBackendStrides(r, &has_backend_strides)); 240437930d1SJeremy L Thompson if (!has_backend_strides) { 241*2b730f8bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetStrides(r, &strides)); 2420d0321e0SJeremy L Thompson } 2430d0321e0SJeremy L Thompson } else { 244*2b730f8bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetCompStride(r, &comp_stride)); 2450d0321e0SJeremy L Thompson } 2460d0321e0SJeremy L Thompson 2470d0321e0SJeremy L Thompson impl->h_ind = NULL; 2480d0321e0SJeremy L Thompson impl->h_ind_allocated = NULL; 2490d0321e0SJeremy L Thompson impl->d_ind = NULL; 2500d0321e0SJeremy L Thompson impl->d_ind_allocated = NULL; 251437930d1SJeremy L Thompson impl->d_t_indices = NULL; 252437930d1SJeremy L Thompson impl->d_t_offsets = NULL; 253437930d1SJeremy L Thompson impl->num_nodes = size; 254*2b730f8bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionSetData(r, impl)); 255437930d1SJeremy L Thompson CeedInt layout[3] = {1, elem_size * num_elem, elem_size}; 256*2b730f8bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionSetELayout(r, layout)); 2570d0321e0SJeremy L Thompson 2580d0321e0SJeremy L Thompson // Set up device indices/offset arrays 2590d0321e0SJeremy L Thompson if (mtype == CEED_MEM_HOST) { 2600d0321e0SJeremy L Thompson switch (cmode) { 2610d0321e0SJeremy L Thompson case CEED_OWN_POINTER: 2620d0321e0SJeremy L Thompson impl->h_ind_allocated = (CeedInt *)indices; 2630d0321e0SJeremy L Thompson impl->h_ind = (CeedInt *)indices; 2640d0321e0SJeremy L Thompson break; 2650d0321e0SJeremy L Thompson case CEED_USE_POINTER: 2660d0321e0SJeremy L Thompson impl->h_ind = (CeedInt *)indices; 2670d0321e0SJeremy L Thompson break; 2680d0321e0SJeremy L Thompson case CEED_COPY_VALUES: 26944d7a66cSJeremy L Thompson if (indices != NULL) { 270*2b730f8bSJeremy L Thompson CeedCallBackend(CeedMalloc(elem_size * num_elem, &impl->h_ind_allocated)); 27144d7a66cSJeremy L Thompson memcpy(impl->h_ind_allocated, indices, elem_size * num_elem * sizeof(CeedInt)); 27244d7a66cSJeremy L Thompson impl->h_ind = impl->h_ind_allocated; 27344d7a66cSJeremy L Thompson } 2740d0321e0SJeremy L Thompson break; 2750d0321e0SJeremy L Thompson } 2760d0321e0SJeremy L Thompson if (indices != NULL) { 277*2b730f8bSJeremy L Thompson CeedCallHip(ceed, hipMalloc((void **)&impl->d_ind, size * sizeof(CeedInt))); 2780d0321e0SJeremy L Thompson impl->d_ind_allocated = impl->d_ind; // We own the device memory 279*2b730f8bSJeremy L Thompson CeedCallHip(ceed, hipMemcpy(impl->d_ind, indices, size * sizeof(CeedInt), hipMemcpyHostToDevice)); 280*2b730f8bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionOffset_Hip(r, indices)); 2810d0321e0SJeremy L Thompson } 2820d0321e0SJeremy L Thompson } else if (mtype == CEED_MEM_DEVICE) { 2830d0321e0SJeremy L Thompson switch (cmode) { 2840d0321e0SJeremy L Thompson case CEED_COPY_VALUES: 2850d0321e0SJeremy L Thompson if (indices != NULL) { 286*2b730f8bSJeremy L Thompson CeedCallHip(ceed, hipMalloc((void **)&impl->d_ind, size * sizeof(CeedInt))); 2870d0321e0SJeremy L Thompson impl->d_ind_allocated = impl->d_ind; // We own the device memory 288*2b730f8bSJeremy L Thompson CeedCallHip(ceed, hipMemcpy(impl->d_ind, indices, size * sizeof(CeedInt), hipMemcpyDeviceToDevice)); 2890d0321e0SJeremy L Thompson } 2900d0321e0SJeremy L Thompson break; 2910d0321e0SJeremy L Thompson case CEED_OWN_POINTER: 2920d0321e0SJeremy L Thompson impl->d_ind = (CeedInt *)indices; 2930d0321e0SJeremy L Thompson impl->d_ind_allocated = impl->d_ind; 2940d0321e0SJeremy L Thompson break; 2950d0321e0SJeremy L Thompson case CEED_USE_POINTER: 2960d0321e0SJeremy L Thompson impl->d_ind = (CeedInt *)indices; 2970d0321e0SJeremy L Thompson } 2980d0321e0SJeremy L Thompson if (indices != NULL) { 299*2b730f8bSJeremy L Thompson CeedCallBackend(CeedMalloc(elem_size * num_elem, &impl->h_ind_allocated)); 300*2b730f8bSJeremy L Thompson CeedCallHip(ceed, hipMemcpy(impl->h_ind_allocated, impl->d_ind, elem_size * num_elem * sizeof(CeedInt), hipMemcpyDeviceToHost)); 30144d7a66cSJeremy L Thompson impl->h_ind = impl->h_ind_allocated; 302*2b730f8bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionOffset_Hip(r, indices)); 3030d0321e0SJeremy L Thompson } 3040d0321e0SJeremy L Thompson } else { 3050d0321e0SJeremy L Thompson // LCOV_EXCL_START 306*2b730f8bSJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "Only MemType = HOST or DEVICE supported"); 3070d0321e0SJeremy L Thompson // LCOV_EXCL_STOP 3080d0321e0SJeremy L Thompson } 3090d0321e0SJeremy L Thompson 3100d0321e0SJeremy L Thompson // Compile HIP kernels 311437930d1SJeremy L Thompson CeedInt num_nodes = impl->num_nodes; 312437930d1SJeremy L Thompson char *restriction_kernel_path, *restriction_kernel_source; 313*2b730f8bSJeremy L Thompson CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-restriction.h", &restriction_kernel_path)); 31446dc0734SJeremy L Thompson CeedDebug256(ceed, 2, "----- Loading Restriction Kernel Source -----\n"); 315*2b730f8bSJeremy L Thompson CeedCallBackend(CeedLoadSourceToBuffer(ceed, restriction_kernel_path, &restriction_kernel_source)); 316*2b730f8bSJeremy L Thompson CeedDebug256(ceed, 2, "----- Loading Restriction Kernel Source Complete! -----\n"); 317*2b730f8bSJeremy L Thompson CeedCallBackend(CeedCompileHip(ceed, restriction_kernel_source, &impl->module, 8, "RESTR_ELEM_SIZE", elem_size, "RESTR_NUM_ELEM", num_elem, 318*2b730f8bSJeremy L Thompson "RESTR_NUM_COMP", num_comp, "RESTR_NUM_NODES", num_nodes, "RESTR_COMP_STRIDE", comp_stride, "RESTR_STRIDE_NODES", 319*2b730f8bSJeremy L Thompson strides[0], "RESTR_STRIDE_COMP", strides[1], "RESTR_STRIDE_ELEM", strides[2])); 320*2b730f8bSJeremy L Thompson CeedCallBackend(CeedGetKernelHip(ceed, impl->module, "StridedNoTranspose", &impl->StridedNoTranspose)); 321*2b730f8bSJeremy L Thompson CeedCallBackend(CeedGetKernelHip(ceed, impl->module, "OffsetNoTranspose", &impl->OffsetNoTranspose)); 322*2b730f8bSJeremy L Thompson CeedCallBackend(CeedGetKernelHip(ceed, impl->module, "StridedTranspose", &impl->StridedTranspose)); 323*2b730f8bSJeremy L Thompson CeedCallBackend(CeedGetKernelHip(ceed, impl->module, "OffsetTranspose", &impl->OffsetTranspose)); 324*2b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&restriction_kernel_path)); 325*2b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&restriction_kernel_source)); 3260d0321e0SJeremy L Thompson 3270d0321e0SJeremy L Thompson // Register backend functions 328*2b730f8bSJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "Apply", CeedElemRestrictionApply_Hip)); 329*2b730f8bSJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "ApplyBlock", CeedElemRestrictionApplyBlock_Hip)); 330*2b730f8bSJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "GetOffsets", CeedElemRestrictionGetOffsets_Hip)); 331*2b730f8bSJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "Destroy", CeedElemRestrictionDestroy_Hip)); 3320d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 3330d0321e0SJeremy L Thompson } 3340d0321e0SJeremy L Thompson 3350d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 3360d0321e0SJeremy L Thompson // Blocked not supported 3370d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 338*2b730f8bSJeremy L Thompson int CeedElemRestrictionCreateBlocked_Hip(const CeedMemType mtype, const CeedCopyMode cmode, const CeedInt *indices, CeedElemRestriction r) { 3390d0321e0SJeremy L Thompson Ceed ceed; 340*2b730f8bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); 341*2b730f8bSJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement blocked restrictions"); 3420d0321e0SJeremy L Thompson } 3430d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 344