1*0d0321e0SJeremy L Thompson // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 2*0d0321e0SJeremy L Thompson // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 3*0d0321e0SJeremy L Thompson // All Rights reserved. See files LICENSE and NOTICE for details. 4*0d0321e0SJeremy L Thompson // 5*0d0321e0SJeremy L Thompson // This file is part of CEED, a collection of benchmarks, miniapps, software 6*0d0321e0SJeremy L Thompson // libraries and APIs for efficient high-order finite element and spectral 7*0d0321e0SJeremy L Thompson // element discretizations for exascale applications. For more information and 8*0d0321e0SJeremy L Thompson // source code availability see http://github.com/ceed. 9*0d0321e0SJeremy L Thompson // 10*0d0321e0SJeremy L Thompson // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11*0d0321e0SJeremy L Thompson // a collaborative effort of two U.S. Department of Energy organizations (Office 12*0d0321e0SJeremy L Thompson // of Science and the National Nuclear Security Administration) responsible for 13*0d0321e0SJeremy L Thompson // the planning and preparation of a capable exascale ecosystem, including 14*0d0321e0SJeremy L Thompson // software, applications, hardware, advanced system engineering and early 15*0d0321e0SJeremy L Thompson // testbed platforms, in support of the nation's exascale computing imperative. 16*0d0321e0SJeremy L Thompson 17*0d0321e0SJeremy L Thompson #include <ceed/ceed.h> 18*0d0321e0SJeremy L Thompson #include <ceed/backend.h> 19*0d0321e0SJeremy L Thompson #include <hip/hip_runtime.h> 20*0d0321e0SJeremy L Thompson #include <stdbool.h> 21*0d0321e0SJeremy L Thompson #include <stddef.h> 22*0d0321e0SJeremy L Thompson #include "ceed-hip-ref.h" 23*0d0321e0SJeremy L Thompson #include "../hip/ceed-hip-compile.h" 24*0d0321e0SJeremy L Thompson 25*0d0321e0SJeremy L Thompson // *INDENT-OFF* 26*0d0321e0SJeremy L Thompson static const char *restrictionkernels = QUOTE( 27*0d0321e0SJeremy L Thompson 28*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 29*0d0321e0SJeremy L Thompson // L-vector -> E-vector, strided 30*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 31*0d0321e0SJeremy L Thompson extern "C" __global__ void noTrStrided(const CeedInt nelem, 32*0d0321e0SJeremy L Thompson const CeedScalar *__restrict__ u, 33*0d0321e0SJeremy L Thompson CeedScalar *__restrict__ v) { 34*0d0321e0SJeremy L Thompson for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; 35*0d0321e0SJeremy L Thompson node < nelem*RESTRICTION_ELEMSIZE; 36*0d0321e0SJeremy L Thompson node += blockDim.x * gridDim.x) { 37*0d0321e0SJeremy L Thompson const CeedInt locNode = node % RESTRICTION_ELEMSIZE; 38*0d0321e0SJeremy L Thompson const CeedInt elem = node / RESTRICTION_ELEMSIZE; 39*0d0321e0SJeremy L Thompson 40*0d0321e0SJeremy L Thompson for (CeedInt comp = 0; comp < RESTRICTION_NCOMP; ++comp) 41*0d0321e0SJeremy L Thompson v[locNode + comp*RESTRICTION_ELEMSIZE*RESTRICTION_NELEM + 42*0d0321e0SJeremy L Thompson elem*RESTRICTION_ELEMSIZE] = 43*0d0321e0SJeremy L Thompson u[locNode*STRIDE_NODES + comp*STRIDE_COMP + elem*STRIDE_ELEM]; 44*0d0321e0SJeremy L Thompson } 45*0d0321e0SJeremy L Thompson } 46*0d0321e0SJeremy L Thompson 47*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 48*0d0321e0SJeremy L Thompson // L-vector -> E-vector, offsets provided 49*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 50*0d0321e0SJeremy L Thompson extern "C" __global__ void noTrOffset(const CeedInt nelem, 51*0d0321e0SJeremy L Thompson const CeedInt *__restrict__ indices, 52*0d0321e0SJeremy L Thompson const CeedScalar *__restrict__ u, 53*0d0321e0SJeremy L Thompson CeedScalar *__restrict__ v) { 54*0d0321e0SJeremy L Thompson for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; 55*0d0321e0SJeremy L Thompson node < nelem*RESTRICTION_ELEMSIZE; 56*0d0321e0SJeremy L Thompson node += blockDim.x * gridDim.x) { 57*0d0321e0SJeremy L Thompson const CeedInt ind = indices[node]; 58*0d0321e0SJeremy L Thompson const CeedInt locNode = node % RESTRICTION_ELEMSIZE; 59*0d0321e0SJeremy L Thompson const CeedInt elem = node / RESTRICTION_ELEMSIZE; 60*0d0321e0SJeremy L Thompson 61*0d0321e0SJeremy L Thompson for (CeedInt comp = 0; comp < RESTRICTION_NCOMP; ++comp) 62*0d0321e0SJeremy L Thompson v[locNode + comp*RESTRICTION_ELEMSIZE*RESTRICTION_NELEM + 63*0d0321e0SJeremy L Thompson elem*RESTRICTION_ELEMSIZE] = 64*0d0321e0SJeremy L Thompson u[ind + comp*RESTRICTION_COMPSTRIDE]; 65*0d0321e0SJeremy L Thompson } 66*0d0321e0SJeremy L Thompson } 67*0d0321e0SJeremy L Thompson 68*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 69*0d0321e0SJeremy L Thompson // E-vector -> L-vector, strided 70*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 71*0d0321e0SJeremy L Thompson extern "C" __global__ void trStrided(const CeedInt nelem, 72*0d0321e0SJeremy L Thompson const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { 73*0d0321e0SJeremy L Thompson for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; 74*0d0321e0SJeremy L Thompson node < nelem*RESTRICTION_ELEMSIZE; 75*0d0321e0SJeremy L Thompson node += blockDim.x * gridDim.x) { 76*0d0321e0SJeremy L Thompson const CeedInt locNode = node % RESTRICTION_ELEMSIZE; 77*0d0321e0SJeremy L Thompson const CeedInt elem = node / RESTRICTION_ELEMSIZE; 78*0d0321e0SJeremy L Thompson 79*0d0321e0SJeremy L Thompson for (CeedInt comp = 0; comp < RESTRICTION_NCOMP; ++comp) 80*0d0321e0SJeremy L Thompson v[locNode*STRIDE_NODES + comp*STRIDE_COMP + elem*STRIDE_ELEM] += 81*0d0321e0SJeremy L Thompson u[locNode + comp*RESTRICTION_ELEMSIZE*RESTRICTION_NELEM + 82*0d0321e0SJeremy L Thompson elem*RESTRICTION_ELEMSIZE]; 83*0d0321e0SJeremy L Thompson } 84*0d0321e0SJeremy L Thompson } 85*0d0321e0SJeremy L Thompson 86*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 87*0d0321e0SJeremy L Thompson // E-vector -> L-vector, offsets provided 88*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 89*0d0321e0SJeremy L Thompson extern "C" __global__ void trOffset(const CeedInt *__restrict__ lvec_indices, 90*0d0321e0SJeremy L Thompson const CeedInt *__restrict__ tindices, 91*0d0321e0SJeremy L Thompson const CeedInt *__restrict__ toffsets, 92*0d0321e0SJeremy L Thompson const CeedScalar *__restrict__ u, 93*0d0321e0SJeremy L Thompson CeedScalar *__restrict__ v) { 94*0d0321e0SJeremy L Thompson CeedScalar value[RESTRICTION_NCOMP]; 95*0d0321e0SJeremy L Thompson 96*0d0321e0SJeremy L Thompson for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; 97*0d0321e0SJeremy L Thompson i < RESTRICTION_NNODES; 98*0d0321e0SJeremy L Thompson i += blockDim.x * gridDim.x) { 99*0d0321e0SJeremy L Thompson const CeedInt ind = lvec_indices[i]; 100*0d0321e0SJeremy L Thompson const CeedInt rng1 = toffsets[i]; 101*0d0321e0SJeremy L Thompson const CeedInt rngN = toffsets[i+1]; 102*0d0321e0SJeremy L Thompson 103*0d0321e0SJeremy L Thompson for (CeedInt comp = 0; comp < RESTRICTION_NCOMP; ++comp) 104*0d0321e0SJeremy L Thompson value[comp] = 0.0; 105*0d0321e0SJeremy L Thompson 106*0d0321e0SJeremy L Thompson for (CeedInt j = rng1; j < rngN; ++j) { 107*0d0321e0SJeremy L Thompson const CeedInt tind = tindices[j]; 108*0d0321e0SJeremy L Thompson CeedInt locNode = tind % RESTRICTION_ELEMSIZE; 109*0d0321e0SJeremy L Thompson CeedInt elem = tind / RESTRICTION_ELEMSIZE; 110*0d0321e0SJeremy L Thompson 111*0d0321e0SJeremy L Thompson for (CeedInt comp = 0; comp < RESTRICTION_NCOMP; ++comp) 112*0d0321e0SJeremy L Thompson value[comp] += u[locNode + comp*RESTRICTION_ELEMSIZE*RESTRICTION_NELEM + 113*0d0321e0SJeremy L Thompson elem*RESTRICTION_ELEMSIZE]; 114*0d0321e0SJeremy L Thompson } 115*0d0321e0SJeremy L Thompson 116*0d0321e0SJeremy L Thompson for (CeedInt comp = 0; comp < RESTRICTION_NCOMP; ++comp) 117*0d0321e0SJeremy L Thompson v[ind + comp*RESTRICTION_COMPSTRIDE] += value[comp]; 118*0d0321e0SJeremy L Thompson } 119*0d0321e0SJeremy L Thompson } 120*0d0321e0SJeremy L Thompson 121*0d0321e0SJeremy L Thompson ); 122*0d0321e0SJeremy L Thompson // *INDENT-ON* 123*0d0321e0SJeremy L Thompson 124*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 125*0d0321e0SJeremy L Thompson // Apply restriction 126*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 127*0d0321e0SJeremy L Thompson static int CeedElemRestrictionApply_Hip(CeedElemRestriction r, 128*0d0321e0SJeremy L Thompson CeedTransposeMode tmode, CeedVector u, CeedVector v, CeedRequest *request) { 129*0d0321e0SJeremy L Thompson int ierr; 130*0d0321e0SJeremy L Thompson CeedElemRestriction_Hip *impl; 131*0d0321e0SJeremy L Thompson ierr = CeedElemRestrictionGetData(r, &impl); CeedChkBackend(ierr); 132*0d0321e0SJeremy L Thompson Ceed ceed; 133*0d0321e0SJeremy L Thompson ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr); 134*0d0321e0SJeremy L Thompson Ceed_Hip *data; 135*0d0321e0SJeremy L Thompson ierr = CeedGetData(ceed, &data); CeedChkBackend(ierr); 136*0d0321e0SJeremy L Thompson const CeedInt blocksize = 64; 137*0d0321e0SJeremy L Thompson const CeedInt nnodes = impl->nnodes; 138*0d0321e0SJeremy L Thompson CeedInt nelem, elemsize; 139*0d0321e0SJeremy L Thompson CeedElemRestrictionGetNumElements(r, &nelem); 140*0d0321e0SJeremy L Thompson ierr = CeedElemRestrictionGetElementSize(r, &elemsize); CeedChkBackend(ierr); 141*0d0321e0SJeremy L Thompson hipFunction_t kernel; 142*0d0321e0SJeremy L Thompson 143*0d0321e0SJeremy L Thompson // Get vectors 144*0d0321e0SJeremy L Thompson const CeedScalar *d_u; 145*0d0321e0SJeremy L Thompson CeedScalar *d_v; 146*0d0321e0SJeremy L Thompson ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChkBackend(ierr); 147*0d0321e0SJeremy L Thompson if (tmode == CEED_TRANSPOSE) { 148*0d0321e0SJeremy L Thompson // Sum into for transpose mode, e-vec to l-vec 149*0d0321e0SJeremy L Thompson ierr = CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr); 150*0d0321e0SJeremy L Thompson } else { 151*0d0321e0SJeremy L Thompson // Overwrite for notranspose mode, l-vec to e-vec 152*0d0321e0SJeremy L Thompson ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr); 153*0d0321e0SJeremy L Thompson } 154*0d0321e0SJeremy L Thompson 155*0d0321e0SJeremy L Thompson // Restrict 156*0d0321e0SJeremy L Thompson if (tmode == CEED_NOTRANSPOSE) { 157*0d0321e0SJeremy L Thompson // L-vector -> E-vector 158*0d0321e0SJeremy L Thompson if (impl->d_ind) { 159*0d0321e0SJeremy L Thompson // -- Offsets provided 160*0d0321e0SJeremy L Thompson kernel = impl->noTrOffset; 161*0d0321e0SJeremy L Thompson void *args[] = {&nelem, &impl->d_ind, &d_u, &d_v}; 162*0d0321e0SJeremy L Thompson CeedInt blocksize = elemsize<256?(elemsize>64?elemsize:64):256; 163*0d0321e0SJeremy L Thompson ierr = CeedRunKernelHip(ceed, kernel, CeedDivUpInt(nnodes, blocksize), 164*0d0321e0SJeremy L Thompson blocksize, args); CeedChkBackend(ierr); 165*0d0321e0SJeremy L Thompson } else { 166*0d0321e0SJeremy L Thompson // -- Strided restriction 167*0d0321e0SJeremy L Thompson kernel = impl->noTrStrided; 168*0d0321e0SJeremy L Thompson void *args[] = {&nelem, &d_u, &d_v}; 169*0d0321e0SJeremy L Thompson CeedInt blocksize = elemsize<256?(elemsize>64?elemsize:64):256; 170*0d0321e0SJeremy L Thompson ierr = CeedRunKernelHip(ceed, kernel, CeedDivUpInt(nnodes, blocksize), 171*0d0321e0SJeremy L Thompson blocksize, args); CeedChkBackend(ierr); 172*0d0321e0SJeremy L Thompson } 173*0d0321e0SJeremy L Thompson } else { 174*0d0321e0SJeremy L Thompson // E-vector -> L-vector 175*0d0321e0SJeremy L Thompson if (impl->d_ind) { 176*0d0321e0SJeremy L Thompson // -- Offsets provided 177*0d0321e0SJeremy L Thompson kernel = impl->trOffset; 178*0d0321e0SJeremy L Thompson void *args[] = {&impl->d_lvec_indices, &impl->d_tindices, 179*0d0321e0SJeremy L Thompson &impl->d_toffsets, &d_u, &d_v 180*0d0321e0SJeremy L Thompson }; 181*0d0321e0SJeremy L Thompson ierr = CeedRunKernelHip(ceed, kernel, CeedDivUpInt(nnodes, blocksize), 182*0d0321e0SJeremy L Thompson blocksize, args); CeedChkBackend(ierr); 183*0d0321e0SJeremy L Thompson } else { 184*0d0321e0SJeremy L Thompson // -- Strided restriction 185*0d0321e0SJeremy L Thompson kernel = impl->trStrided; 186*0d0321e0SJeremy L Thompson void *args[] = {&nelem, &d_u, &d_v}; 187*0d0321e0SJeremy L Thompson ierr = CeedRunKernelHip(ceed, kernel, CeedDivUpInt(nnodes, blocksize), 188*0d0321e0SJeremy L Thompson blocksize, args); CeedChkBackend(ierr); 189*0d0321e0SJeremy L Thompson } 190*0d0321e0SJeremy L Thompson } 191*0d0321e0SJeremy L Thompson 192*0d0321e0SJeremy L Thompson if (request != CEED_REQUEST_IMMEDIATE && request != CEED_REQUEST_ORDERED) 193*0d0321e0SJeremy L Thompson *request = NULL; 194*0d0321e0SJeremy L Thompson 195*0d0321e0SJeremy L Thompson // Restore arrays 196*0d0321e0SJeremy L Thompson ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr); 197*0d0321e0SJeremy L Thompson ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr); 198*0d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 199*0d0321e0SJeremy L Thompson } 200*0d0321e0SJeremy L Thompson 201*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 202*0d0321e0SJeremy L Thompson // Blocked not supported 203*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 204*0d0321e0SJeremy L Thompson int CeedElemRestrictionApplyBlock_Hip(CeedElemRestriction r, CeedInt block, 205*0d0321e0SJeremy L Thompson CeedTransposeMode tmode, CeedVector u, 206*0d0321e0SJeremy L Thompson CeedVector v, CeedRequest *request) { 207*0d0321e0SJeremy L Thompson // LCOV_EXCL_START 208*0d0321e0SJeremy L Thompson int ierr; 209*0d0321e0SJeremy L Thompson Ceed ceed; 210*0d0321e0SJeremy L Thompson ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr); 211*0d0321e0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, 212*0d0321e0SJeremy L Thompson "Backend does not implement blocked restrictions"); 213*0d0321e0SJeremy L Thompson // LCOV_EXCL_STOP 214*0d0321e0SJeremy L Thompson } 215*0d0321e0SJeremy L Thompson 216*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 217*0d0321e0SJeremy L Thompson // Get offsets 218*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 219*0d0321e0SJeremy L Thompson static int CeedElemRestrictionGetOffsets_Hip(CeedElemRestriction rstr, 220*0d0321e0SJeremy L Thompson CeedMemType mtype, const CeedInt **offsets) { 221*0d0321e0SJeremy L Thompson int ierr; 222*0d0321e0SJeremy L Thompson CeedElemRestriction_Hip *impl; 223*0d0321e0SJeremy L Thompson ierr = CeedElemRestrictionGetData(rstr, &impl); CeedChkBackend(ierr); 224*0d0321e0SJeremy L Thompson 225*0d0321e0SJeremy L Thompson switch (mtype) { 226*0d0321e0SJeremy L Thompson case CEED_MEM_HOST: 227*0d0321e0SJeremy L Thompson *offsets = impl->h_ind; 228*0d0321e0SJeremy L Thompson break; 229*0d0321e0SJeremy L Thompson case CEED_MEM_DEVICE: 230*0d0321e0SJeremy L Thompson *offsets = impl->d_ind; 231*0d0321e0SJeremy L Thompson break; 232*0d0321e0SJeremy L Thompson } 233*0d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 234*0d0321e0SJeremy L Thompson } 235*0d0321e0SJeremy L Thompson 236*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 237*0d0321e0SJeremy L Thompson // Destroy restriction 238*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 239*0d0321e0SJeremy L Thompson static int CeedElemRestrictionDestroy_Hip(CeedElemRestriction r) { 240*0d0321e0SJeremy L Thompson int ierr; 241*0d0321e0SJeremy L Thompson CeedElemRestriction_Hip *impl; 242*0d0321e0SJeremy L Thompson ierr = CeedElemRestrictionGetData(r, &impl); CeedChkBackend(ierr); 243*0d0321e0SJeremy L Thompson 244*0d0321e0SJeremy L Thompson Ceed ceed; 245*0d0321e0SJeremy L Thompson ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr); 246*0d0321e0SJeremy L Thompson ierr = hipModuleUnload(impl->module); CeedChk_Hip(ceed, ierr); 247*0d0321e0SJeremy L Thompson ierr = CeedFree(&impl->h_ind_allocated); CeedChkBackend(ierr); 248*0d0321e0SJeremy L Thompson ierr = hipFree(impl->d_ind_allocated); CeedChk_Hip(ceed, ierr); 249*0d0321e0SJeremy L Thompson ierr = hipFree(impl->d_toffsets); CeedChk_Hip(ceed, ierr); 250*0d0321e0SJeremy L Thompson ierr = hipFree(impl->d_tindices); CeedChk_Hip(ceed, ierr); 251*0d0321e0SJeremy L Thompson ierr = hipFree(impl->d_lvec_indices); CeedChk_Hip(ceed, ierr); 252*0d0321e0SJeremy L Thompson 253*0d0321e0SJeremy L Thompson ierr = CeedFree(&impl); CeedChkBackend(ierr); 254*0d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 255*0d0321e0SJeremy L Thompson } 256*0d0321e0SJeremy L Thompson 257*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 258*0d0321e0SJeremy L Thompson // Create transpose offsets and indices 259*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 260*0d0321e0SJeremy L Thompson static int CeedElemRestrictionOffset_Hip(const CeedElemRestriction r, 261*0d0321e0SJeremy L Thompson const CeedInt *indices) { 262*0d0321e0SJeremy L Thompson int ierr; 263*0d0321e0SJeremy L Thompson Ceed ceed; 264*0d0321e0SJeremy L Thompson ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr); 265*0d0321e0SJeremy L Thompson CeedElemRestriction_Hip *impl; 266*0d0321e0SJeremy L Thompson ierr = CeedElemRestrictionGetData(r, &impl); CeedChkBackend(ierr); 267*0d0321e0SJeremy L Thompson CeedInt nelem, elemsize, lsize, ncomp; 268*0d0321e0SJeremy L Thompson ierr = CeedElemRestrictionGetNumElements(r, &nelem); CeedChkBackend(ierr); 269*0d0321e0SJeremy L Thompson ierr = CeedElemRestrictionGetElementSize(r, &elemsize); CeedChkBackend(ierr); 270*0d0321e0SJeremy L Thompson ierr = CeedElemRestrictionGetLVectorSize(r, &lsize); CeedChkBackend(ierr); 271*0d0321e0SJeremy L Thompson ierr = CeedElemRestrictionGetNumComponents(r, &ncomp); CeedChkBackend(ierr); 272*0d0321e0SJeremy L Thompson 273*0d0321e0SJeremy L Thompson // Count nnodes 274*0d0321e0SJeremy L Thompson bool *isNode; 275*0d0321e0SJeremy L Thompson ierr = CeedCalloc(lsize, &isNode); CeedChkBackend(ierr); 276*0d0321e0SJeremy L Thompson const CeedInt sizeIndices = nelem * elemsize; 277*0d0321e0SJeremy L Thompson for (CeedInt i = 0; i < sizeIndices; i++) 278*0d0321e0SJeremy L Thompson isNode[indices[i]] = 1; 279*0d0321e0SJeremy L Thompson CeedInt nnodes = 0; 280*0d0321e0SJeremy L Thompson for (CeedInt i = 0; i < lsize; i++) 281*0d0321e0SJeremy L Thompson nnodes += isNode[i]; 282*0d0321e0SJeremy L Thompson impl->nnodes = nnodes; 283*0d0321e0SJeremy L Thompson 284*0d0321e0SJeremy L Thompson // L-vector offsets array 285*0d0321e0SJeremy L Thompson CeedInt *ind_to_offset, *lvec_indices; 286*0d0321e0SJeremy L Thompson ierr = CeedCalloc(lsize, &ind_to_offset); CeedChkBackend(ierr); 287*0d0321e0SJeremy L Thompson ierr = CeedCalloc(nnodes, &lvec_indices); CeedChkBackend(ierr); 288*0d0321e0SJeremy L Thompson CeedInt j = 0; 289*0d0321e0SJeremy L Thompson for (CeedInt i = 0; i < lsize; i++) 290*0d0321e0SJeremy L Thompson if (isNode[i]) { 291*0d0321e0SJeremy L Thompson lvec_indices[j] = i; 292*0d0321e0SJeremy L Thompson ind_to_offset[i] = j++; 293*0d0321e0SJeremy L Thompson } 294*0d0321e0SJeremy L Thompson ierr = CeedFree(&isNode); CeedChkBackend(ierr); 295*0d0321e0SJeremy L Thompson 296*0d0321e0SJeremy L Thompson // Compute transpose offsets and indices 297*0d0321e0SJeremy L Thompson const CeedInt sizeOffsets = nnodes + 1; 298*0d0321e0SJeremy L Thompson CeedInt *toffsets; 299*0d0321e0SJeremy L Thompson ierr = CeedCalloc(sizeOffsets, &toffsets); CeedChkBackend(ierr); 300*0d0321e0SJeremy L Thompson CeedInt *tindices; 301*0d0321e0SJeremy L Thompson ierr = CeedMalloc(sizeIndices, &tindices); CeedChkBackend(ierr); 302*0d0321e0SJeremy L Thompson // Count node multiplicity 303*0d0321e0SJeremy L Thompson for (CeedInt e = 0; e < nelem; ++e) 304*0d0321e0SJeremy L Thompson for (CeedInt i = 0; i < elemsize; ++i) 305*0d0321e0SJeremy L Thompson ++toffsets[ind_to_offset[indices[elemsize*e + i]] + 1]; 306*0d0321e0SJeremy L Thompson // Convert to running sum 307*0d0321e0SJeremy L Thompson for (CeedInt i = 1; i < sizeOffsets; ++i) 308*0d0321e0SJeremy L Thompson toffsets[i] += toffsets[i-1]; 309*0d0321e0SJeremy L Thompson // List all E-vec indices associated with L-vec node 310*0d0321e0SJeremy L Thompson for (CeedInt e = 0; e < nelem; ++e) { 311*0d0321e0SJeremy L Thompson for (CeedInt i = 0; i < elemsize; ++i) { 312*0d0321e0SJeremy L Thompson const CeedInt lid = elemsize*e + i; 313*0d0321e0SJeremy L Thompson const CeedInt gid = indices[lid]; 314*0d0321e0SJeremy L Thompson tindices[toffsets[ind_to_offset[gid]]++] = lid; 315*0d0321e0SJeremy L Thompson } 316*0d0321e0SJeremy L Thompson } 317*0d0321e0SJeremy L Thompson // Reset running sum 318*0d0321e0SJeremy L Thompson for (int i = sizeOffsets - 1; i > 0; --i) 319*0d0321e0SJeremy L Thompson toffsets[i] = toffsets[i - 1]; 320*0d0321e0SJeremy L Thompson toffsets[0] = 0; 321*0d0321e0SJeremy L Thompson 322*0d0321e0SJeremy L Thompson // Copy data to device 323*0d0321e0SJeremy L Thompson // -- L-vector indices 324*0d0321e0SJeremy L Thompson ierr = hipMalloc((void **)&impl->d_lvec_indices, nnodes*sizeof(CeedInt)); 325*0d0321e0SJeremy L Thompson CeedChk_Hip(ceed, ierr); 326*0d0321e0SJeremy L Thompson ierr = hipMemcpy(impl->d_lvec_indices, lvec_indices, 327*0d0321e0SJeremy L Thompson nnodes*sizeof(CeedInt), hipMemcpyHostToDevice); 328*0d0321e0SJeremy L Thompson CeedChk_Hip(ceed, ierr); 329*0d0321e0SJeremy L Thompson // -- Transpose offsets 330*0d0321e0SJeremy L Thompson ierr = hipMalloc((void **)&impl->d_toffsets, sizeOffsets*sizeof(CeedInt)); 331*0d0321e0SJeremy L Thompson CeedChk_Hip(ceed, ierr); 332*0d0321e0SJeremy L Thompson ierr = hipMemcpy(impl->d_toffsets, toffsets, sizeOffsets*sizeof(CeedInt), 333*0d0321e0SJeremy L Thompson hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); 334*0d0321e0SJeremy L Thompson // -- Transpose indices 335*0d0321e0SJeremy L Thompson ierr = hipMalloc((void **)&impl->d_tindices, sizeIndices*sizeof(CeedInt)); 336*0d0321e0SJeremy L Thompson CeedChk_Hip(ceed, ierr); 337*0d0321e0SJeremy L Thompson ierr = hipMemcpy(impl->d_tindices, tindices, sizeIndices*sizeof(CeedInt), 338*0d0321e0SJeremy L Thompson hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); 339*0d0321e0SJeremy L Thompson 340*0d0321e0SJeremy L Thompson // Cleanup 341*0d0321e0SJeremy L Thompson ierr = CeedFree(&ind_to_offset); CeedChkBackend(ierr); 342*0d0321e0SJeremy L Thompson ierr = CeedFree(&lvec_indices); CeedChkBackend(ierr); 343*0d0321e0SJeremy L Thompson ierr = CeedFree(&toffsets); CeedChkBackend(ierr); 344*0d0321e0SJeremy L Thompson ierr = CeedFree(&tindices); CeedChkBackend(ierr); 345*0d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 346*0d0321e0SJeremy L Thompson } 347*0d0321e0SJeremy L Thompson 348*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 349*0d0321e0SJeremy L Thompson // Create restriction 350*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 351*0d0321e0SJeremy L Thompson int CeedElemRestrictionCreate_Hip(CeedMemType mtype, CeedCopyMode cmode, 352*0d0321e0SJeremy L Thompson const CeedInt *indices, 353*0d0321e0SJeremy L Thompson CeedElemRestriction r) { 354*0d0321e0SJeremy L Thompson int ierr; 355*0d0321e0SJeremy L Thompson Ceed ceed; 356*0d0321e0SJeremy L Thompson ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr); 357*0d0321e0SJeremy L Thompson CeedElemRestriction_Hip *impl; 358*0d0321e0SJeremy L Thompson ierr = CeedCalloc(1, &impl); CeedChkBackend(ierr); 359*0d0321e0SJeremy L Thompson CeedInt nelem, ncomp, elemsize; 360*0d0321e0SJeremy L Thompson ierr = CeedElemRestrictionGetNumElements(r, &nelem); CeedChkBackend(ierr); 361*0d0321e0SJeremy L Thompson ierr = CeedElemRestrictionGetNumComponents(r, &ncomp); CeedChkBackend(ierr); 362*0d0321e0SJeremy L Thompson ierr = CeedElemRestrictionGetElementSize(r, &elemsize); CeedChkBackend(ierr); 363*0d0321e0SJeremy L Thompson CeedInt size = nelem * elemsize; 364*0d0321e0SJeremy L Thompson CeedInt strides[3] = {1, size, elemsize}; 365*0d0321e0SJeremy L Thompson CeedInt compstride = 1; 366*0d0321e0SJeremy L Thompson 367*0d0321e0SJeremy L Thompson // Stride data 368*0d0321e0SJeremy L Thompson bool isStrided; 369*0d0321e0SJeremy L Thompson ierr = CeedElemRestrictionIsStrided(r, &isStrided); CeedChkBackend(ierr); 370*0d0321e0SJeremy L Thompson if (isStrided) { 371*0d0321e0SJeremy L Thompson bool backendstrides; 372*0d0321e0SJeremy L Thompson ierr = CeedElemRestrictionHasBackendStrides(r, &backendstrides); 373*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 374*0d0321e0SJeremy L Thompson if (!backendstrides) { 375*0d0321e0SJeremy L Thompson ierr = CeedElemRestrictionGetStrides(r, &strides); CeedChkBackend(ierr); 376*0d0321e0SJeremy L Thompson } 377*0d0321e0SJeremy L Thompson } else { 378*0d0321e0SJeremy L Thompson ierr = CeedElemRestrictionGetCompStride(r, &compstride); CeedChkBackend(ierr); 379*0d0321e0SJeremy L Thompson } 380*0d0321e0SJeremy L Thompson 381*0d0321e0SJeremy L Thompson impl->h_ind = NULL; 382*0d0321e0SJeremy L Thompson impl->h_ind_allocated = NULL; 383*0d0321e0SJeremy L Thompson impl->d_ind = NULL; 384*0d0321e0SJeremy L Thompson impl->d_ind_allocated = NULL; 385*0d0321e0SJeremy L Thompson impl->d_tindices = NULL; 386*0d0321e0SJeremy L Thompson impl->d_toffsets = NULL; 387*0d0321e0SJeremy L Thompson impl->nnodes = size; 388*0d0321e0SJeremy L Thompson ierr = CeedElemRestrictionSetData(r, impl); CeedChkBackend(ierr); 389*0d0321e0SJeremy L Thompson CeedInt layout[3] = {1, elemsize*nelem, elemsize}; 390*0d0321e0SJeremy L Thompson ierr = CeedElemRestrictionSetELayout(r, layout); CeedChkBackend(ierr); 391*0d0321e0SJeremy L Thompson 392*0d0321e0SJeremy L Thompson // Set up device indices/offset arrays 393*0d0321e0SJeremy L Thompson if (mtype == CEED_MEM_HOST) { 394*0d0321e0SJeremy L Thompson switch (cmode) { 395*0d0321e0SJeremy L Thompson case CEED_OWN_POINTER: 396*0d0321e0SJeremy L Thompson impl->h_ind_allocated = (CeedInt *)indices; 397*0d0321e0SJeremy L Thompson impl->h_ind = (CeedInt *)indices; 398*0d0321e0SJeremy L Thompson break; 399*0d0321e0SJeremy L Thompson case CEED_USE_POINTER: 400*0d0321e0SJeremy L Thompson impl->h_ind = (CeedInt *)indices; 401*0d0321e0SJeremy L Thompson break; 402*0d0321e0SJeremy L Thompson case CEED_COPY_VALUES: 403*0d0321e0SJeremy L Thompson break; 404*0d0321e0SJeremy L Thompson } 405*0d0321e0SJeremy L Thompson if (indices != NULL) { 406*0d0321e0SJeremy L Thompson ierr = hipMalloc( (void **)&impl->d_ind, size * sizeof(CeedInt)); 407*0d0321e0SJeremy L Thompson CeedChk_Hip(ceed, ierr); 408*0d0321e0SJeremy L Thompson impl->d_ind_allocated = impl->d_ind; // We own the device memory 409*0d0321e0SJeremy L Thompson ierr = hipMemcpy(impl->d_ind, indices, size * sizeof(CeedInt), 410*0d0321e0SJeremy L Thompson hipMemcpyHostToDevice); 411*0d0321e0SJeremy L Thompson CeedChk_Hip(ceed, ierr); 412*0d0321e0SJeremy L Thompson ierr = CeedElemRestrictionOffset_Hip(r, indices); CeedChkBackend(ierr); 413*0d0321e0SJeremy L Thompson } 414*0d0321e0SJeremy L Thompson } else if (mtype == CEED_MEM_DEVICE) { 415*0d0321e0SJeremy L Thompson switch (cmode) { 416*0d0321e0SJeremy L Thompson case CEED_COPY_VALUES: 417*0d0321e0SJeremy L Thompson if (indices != NULL) { 418*0d0321e0SJeremy L Thompson ierr = hipMalloc( (void **)&impl->d_ind, size * sizeof(CeedInt)); 419*0d0321e0SJeremy L Thompson CeedChk_Hip(ceed, ierr); 420*0d0321e0SJeremy L Thompson impl->d_ind_allocated = impl->d_ind; // We own the device memory 421*0d0321e0SJeremy L Thompson ierr = hipMemcpy(impl->d_ind, indices, size * sizeof(CeedInt), 422*0d0321e0SJeremy L Thompson hipMemcpyDeviceToDevice); 423*0d0321e0SJeremy L Thompson CeedChk_Hip(ceed, ierr); 424*0d0321e0SJeremy L Thompson } 425*0d0321e0SJeremy L Thompson break; 426*0d0321e0SJeremy L Thompson case CEED_OWN_POINTER: 427*0d0321e0SJeremy L Thompson impl->d_ind = (CeedInt *)indices; 428*0d0321e0SJeremy L Thompson impl->d_ind_allocated = impl->d_ind; 429*0d0321e0SJeremy L Thompson break; 430*0d0321e0SJeremy L Thompson case CEED_USE_POINTER: 431*0d0321e0SJeremy L Thompson impl->d_ind = (CeedInt *)indices; 432*0d0321e0SJeremy L Thompson } 433*0d0321e0SJeremy L Thompson if (indices != NULL) { 434*0d0321e0SJeremy L Thompson ierr = CeedElemRestrictionOffset_Hip(r, indices); CeedChkBackend(ierr); 435*0d0321e0SJeremy L Thompson } 436*0d0321e0SJeremy L Thompson } else { 437*0d0321e0SJeremy L Thompson // LCOV_EXCL_START 438*0d0321e0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, 439*0d0321e0SJeremy L Thompson "Only MemType = HOST or DEVICE supported"); 440*0d0321e0SJeremy L Thompson // LCOV_EXCL_STOP 441*0d0321e0SJeremy L Thompson } 442*0d0321e0SJeremy L Thompson 443*0d0321e0SJeremy L Thompson // Compile HIP kernels 444*0d0321e0SJeremy L Thompson CeedInt nnodes = impl->nnodes; 445*0d0321e0SJeremy L Thompson ierr = CeedCompileHip(ceed, restrictionkernels, &impl->module, 8, 446*0d0321e0SJeremy L Thompson "RESTRICTION_ELEMSIZE", elemsize, 447*0d0321e0SJeremy L Thompson "RESTRICTION_NELEM", nelem, 448*0d0321e0SJeremy L Thompson "RESTRICTION_NCOMP", ncomp, 449*0d0321e0SJeremy L Thompson "RESTRICTION_NNODES", nnodes, 450*0d0321e0SJeremy L Thompson "RESTRICTION_COMPSTRIDE", compstride, 451*0d0321e0SJeremy L Thompson "STRIDE_NODES", strides[0], 452*0d0321e0SJeremy L Thompson "STRIDE_COMP", strides[1], 453*0d0321e0SJeremy L Thompson "STRIDE_ELEM", strides[2]); CeedChkBackend(ierr); 454*0d0321e0SJeremy L Thompson ierr = CeedGetKernelHip(ceed, impl->module, "noTrStrided", 455*0d0321e0SJeremy L Thompson &impl->noTrStrided); CeedChkBackend(ierr); 456*0d0321e0SJeremy L Thompson ierr = CeedGetKernelHip(ceed, impl->module, "noTrOffset", &impl->noTrOffset); 457*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 458*0d0321e0SJeremy L Thompson ierr = CeedGetKernelHip(ceed, impl->module, "trStrided", &impl->trStrided); 459*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 460*0d0321e0SJeremy L Thompson ierr = CeedGetKernelHip(ceed, impl->module, "trOffset", &impl->trOffset); 461*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 462*0d0321e0SJeremy L Thompson 463*0d0321e0SJeremy L Thompson // Register backend functions 464*0d0321e0SJeremy L Thompson ierr = CeedSetBackendFunction(ceed, "ElemRestriction", r, "Apply", 465*0d0321e0SJeremy L Thompson CeedElemRestrictionApply_Hip); 466*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 467*0d0321e0SJeremy L Thompson ierr = CeedSetBackendFunction(ceed, "ElemRestriction", r, "ApplyBlock", 468*0d0321e0SJeremy L Thompson CeedElemRestrictionApplyBlock_Hip); 469*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 470*0d0321e0SJeremy L Thompson ierr = CeedSetBackendFunction(ceed, "ElemRestriction", r, "GetOffsets", 471*0d0321e0SJeremy L Thompson CeedElemRestrictionGetOffsets_Hip); 472*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 473*0d0321e0SJeremy L Thompson ierr = CeedSetBackendFunction(ceed, "ElemRestriction", r, "Destroy", 474*0d0321e0SJeremy L Thompson CeedElemRestrictionDestroy_Hip); 475*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 476*0d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 477*0d0321e0SJeremy L Thompson } 478*0d0321e0SJeremy L Thompson 479*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 480*0d0321e0SJeremy L Thompson // Blocked not supported 481*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 482*0d0321e0SJeremy L Thompson int CeedElemRestrictionCreateBlocked_Hip(const CeedMemType mtype, 483*0d0321e0SJeremy L Thompson const CeedCopyMode cmode, const CeedInt *indices, CeedElemRestriction r) { 484*0d0321e0SJeremy L Thompson int ierr; 485*0d0321e0SJeremy L Thompson Ceed ceed; 486*0d0321e0SJeremy L Thompson ierr = CeedElemRestrictionGetCeed(r, &ceed); CeedChkBackend(ierr); 487*0d0321e0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, 488*0d0321e0SJeremy L Thompson "Backend does not implement blocked restrictions"); 489*0d0321e0SJeremy L Thompson } 490*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 491