15aed82e4SJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 23d8e8822SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 37d8d0e25Snbeams // 43d8e8822SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 57d8d0e25Snbeams // 63d8e8822SJeremy L Thompson // This file is part of CEED: http://github.com/ceed 77d8d0e25Snbeams 849aac155SJeremy L Thompson #include <ceed.h> 9ec3da8bcSJed Brown #include <ceed/backend.h> 10437930d1SJeremy L Thompson #include <ceed/jit-tools.h> 1149aac155SJeremy L Thompson #include <stdbool.h> 123d576824SJeremy L Thompson #include <stddef.h> 13111870feSJeremy L Thompson #include <string.h> 14c85e8640SSebastian Grimberg #include <hip/hip_runtime.h> 152b730f8bSJeremy L Thompson 167fcac036SJeremy L Thompson #include "../hip/ceed-hip-common.h" 177d8d0e25Snbeams #include "../hip/ceed-hip-compile.h" 182b730f8bSJeremy L Thompson #include "ceed-hip-shared.h" 197d8d0e25Snbeams 207d8d0e25Snbeams //------------------------------------------------------------------------------ 219e31c45bSnbeams // Compute a block size based on required minimum threads 229e31c45bSnbeams //------------------------------------------------------------------------------ 239e31c45bSnbeams static CeedInt ComputeBlockSizeFromRequirement(const CeedInt required) { 249e31c45bSnbeams CeedInt maxSize = 1024; // Max total threads per block 259e31c45bSnbeams CeedInt currentSize = 64; // Start with one group 269e31c45bSnbeams 279e31c45bSnbeams while (currentSize < maxSize) { 282b730f8bSJeremy L Thompson if (currentSize > required) break; 292b730f8bSJeremy L Thompson else currentSize = currentSize * 2; 309e31c45bSnbeams } 319e31c45bSnbeams return currentSize; 329e31c45bSnbeams } 339e31c45bSnbeams 349e31c45bSnbeams //------------------------------------------------------------------------------ 359e31c45bSnbeams // Compute required thread block sizes for basis kernels given P, Q, dim, and 369e201c85SYohann // num_comp (num_comp not currently used, but may be again in other basis 379e201c85SYohann // parallelization options) 389e31c45bSnbeams //------------------------------------------------------------------------------ 392b730f8bSJeremy L Thompson static int ComputeBasisThreadBlockSizes(const CeedInt dim, const CeedInt P_1d, const CeedInt Q_1d, const CeedInt num_comp, CeedInt *block_sizes) { 409e31c45bSnbeams // Note that this will use the same block sizes for all dimensions when compiling, 419e31c45bSnbeams // but as each basis object is defined for a particular dimension, we will never 429e31c45bSnbeams // call any kernels except the ones for the dimension for which we have computed the 439e31c45bSnbeams // block sizes. 44437930d1SJeremy L Thompson const CeedInt thread_1d = CeedIntMax(P_1d, Q_1d); 45b7453713SJeremy L Thompson 469e31c45bSnbeams switch (dim) { 479e31c45bSnbeams case 1: { 489e31c45bSnbeams // Interp kernels: 49437930d1SJeremy L Thompson block_sizes[0] = 256; 509e31c45bSnbeams 519e31c45bSnbeams // Grad kernels: 52437930d1SJeremy L Thompson block_sizes[1] = 256; 539e31c45bSnbeams 549e31c45bSnbeams // Weight kernels: 55437930d1SJeremy L Thompson block_sizes[2] = 256; 569e31c45bSnbeams } break; 579e31c45bSnbeams case 2: { 589e31c45bSnbeams // Interp kernels: 599e201c85SYohann CeedInt required = thread_1d * thread_1d; 60b7453713SJeremy L Thompson 619e201c85SYohann block_sizes[0] = CeedIntMax(256, ComputeBlockSizeFromRequirement(required)); 629e31c45bSnbeams 639e31c45bSnbeams // Grad kernels: currently use same required minimum threads 649e201c85SYohann block_sizes[1] = CeedIntMax(256, ComputeBlockSizeFromRequirement(required)); 659e31c45bSnbeams 669e31c45bSnbeams // Weight kernels: 67437930d1SJeremy L Thompson required = CeedIntMax(64, Q_1d * Q_1d); 689e201c85SYohann block_sizes[2] = CeedIntMax(256, ComputeBlockSizeFromRequirement(required)); 699e31c45bSnbeams 709e31c45bSnbeams } break; 719e31c45bSnbeams case 3: { 729e31c45bSnbeams // Interp kernels: 739e201c85SYohann CeedInt required = thread_1d * thread_1d; 74b7453713SJeremy L Thompson 759e201c85SYohann block_sizes[0] = CeedIntMax(256, ComputeBlockSizeFromRequirement(required)); 769e31c45bSnbeams 779e31c45bSnbeams // Grad kernels: currently use same required minimum threads 789e201c85SYohann block_sizes[1] = CeedIntMax(256, ComputeBlockSizeFromRequirement(required)); 799e31c45bSnbeams 809e31c45bSnbeams // Weight kernels: 81437930d1SJeremy L Thompson required = Q_1d * Q_1d * Q_1d; 829e201c85SYohann block_sizes[2] = CeedIntMax(256, ComputeBlockSizeFromRequirement(required)); 839e31c45bSnbeams } 849e31c45bSnbeams } 85e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 869e31c45bSnbeams } 879e31c45bSnbeams 889e31c45bSnbeams //------------------------------------------------------------------------------ 897d8d0e25Snbeams // Apply basis 907d8d0e25Snbeams //------------------------------------------------------------------------------ 91db2becc9SJeremy L Thompson static int CeedBasisApplyTensorCore_Hip_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, 92db2becc9SJeremy L Thompson CeedEvalMode eval_mode, CeedVector u, CeedVector v) { 937d8d0e25Snbeams Ceed ceed; 946dbfb411Snbeams Ceed_Hip *ceed_Hip; 95437930d1SJeremy L Thompson CeedInt dim, num_comp; 96b7453713SJeremy L Thompson const CeedScalar *d_u; 97b7453713SJeremy L Thompson CeedScalar *d_v; 98b7453713SJeremy L Thompson CeedBasis_Hip_shared *data; 99b7453713SJeremy L Thompson 100b7453713SJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 101b7453713SJeremy L Thompson CeedCallBackend(CeedGetData(ceed, &ceed_Hip)); 102b7453713SJeremy L Thompson CeedCallBackend(CeedBasisGetData(basis, &data)); 1032b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 1042b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 1057d8d0e25Snbeams 1069ea2cfd9SJeremy L Thompson // Get read/write access to u, v 1076574a04fSJeremy L Thompson if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 1086574a04fSJeremy L Thompson else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 109759e0bc3SJeremy L Thompson if (apply_add) { 110759e0bc3SJeremy L Thompson CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 111759e0bc3SJeremy L Thompson } else { 112759e0bc3SJeremy L Thompson CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 113759e0bc3SJeremy L Thompson } 1147d8d0e25Snbeams 1157d8d0e25Snbeams // Apply basis operation 116437930d1SJeremy L Thompson switch (eval_mode) { 1177d8d0e25Snbeams case CEED_EVAL_INTERP: { 118437930d1SJeremy L Thompson CeedInt P_1d, Q_1d; 119437930d1SJeremy L Thompson CeedInt block_size = data->block_sizes[0]; 120b7453713SJeremy L Thompson 1214cbc44e0SJeremy L Thompson CeedCheck(data->d_interp_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; interp_1d not set", CeedEvalModes[eval_mode]); 1222b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 1232b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 124437930d1SJeremy L Thompson CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); 1252b730f8bSJeremy L Thompson void *interp_args[] = {(void *)&num_elem, &data->d_interp_1d, &d_u, &d_v}; 126b7453713SJeremy L Thompson 1277d8d0e25Snbeams if (dim == 1) { 128437930d1SJeremy L Thompson CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64; 129437930d1SJeremy L Thompson elems_per_block = elems_per_block > 0 ? elems_per_block : 1; 130a8d440fbSJeremy L Thompson CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 131437930d1SJeremy L Thompson CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); 132b2165e7aSSebastian Grimberg 1339e201c85SYohann if (t_mode == CEED_TRANSPOSE) { 134e9c76bddSJeremy L Thompson CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, NULL, grid, thread_1d, 1, 135db2becc9SJeremy L Thompson elems_per_block, shared_mem, interp_args)); 1369e201c85SYohann } else { 137e9c76bddSJeremy L Thompson CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Interp, NULL, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args)); 1389e201c85SYohann } 1397d8d0e25Snbeams } else if (dim == 2) { 1409e31c45bSnbeams // Check if required threads is small enough to do multiple elems 1412b730f8bSJeremy L Thompson const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1); 142a8d440fbSJeremy L Thompson CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 1432b730f8bSJeremy L Thompson CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); 144b2165e7aSSebastian Grimberg 1459e201c85SYohann if (t_mode == CEED_TRANSPOSE) { 146e9c76bddSJeremy L Thompson CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, NULL, grid, thread_1d, 147e9c76bddSJeremy L Thompson thread_1d, elems_per_block, shared_mem, interp_args)); 1489e201c85SYohann } else { 149e9c76bddSJeremy L Thompson CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Interp, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); 1509e201c85SYohann } 1517d8d0e25Snbeams } else if (dim == 3) { 1522b730f8bSJeremy L Thompson const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1); 153a8d440fbSJeremy L Thompson CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 1542b730f8bSJeremy L Thompson CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); 155b2165e7aSSebastian Grimberg 1569e201c85SYohann if (t_mode == CEED_TRANSPOSE) { 157e9c76bddSJeremy L Thompson CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, NULL, grid, thread_1d, 158e9c76bddSJeremy L Thompson thread_1d, elems_per_block, shared_mem, interp_args)); 1599e201c85SYohann } else { 160e9c76bddSJeremy L Thompson CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Interp, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); 1619e201c85SYohann } 1627d8d0e25Snbeams } 1637d8d0e25Snbeams } break; 1647d8d0e25Snbeams case CEED_EVAL_GRAD: { 165437930d1SJeremy L Thompson CeedInt P_1d, Q_1d; 166437930d1SJeremy L Thompson CeedInt block_size = data->block_sizes[1]; 167b7453713SJeremy L Thompson 1684cbc44e0SJeremy L Thompson CeedCheck(data->d_grad_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; grad_1d not set", CeedEvalModes[eval_mode]); 1692b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 1702b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 171437930d1SJeremy L Thompson CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); 1729e201c85SYohann CeedScalar *d_grad_1d = data->d_grad_1d; 173b7453713SJeremy L Thompson 1749e201c85SYohann if (data->d_collo_grad_1d) { 1759e201c85SYohann d_grad_1d = data->d_collo_grad_1d; 1769e201c85SYohann } 1772b730f8bSJeremy L Thompson void *grad_args[] = {(void *)&num_elem, &data->d_interp_1d, &d_grad_1d, &d_u, &d_v}; 178aa4002adSJeremy L Thompson 1797d8d0e25Snbeams if (dim == 1) { 180437930d1SJeremy L Thompson CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64; 181437930d1SJeremy L Thompson elems_per_block = elems_per_block > 0 ? elems_per_block : 1; 182a8d440fbSJeremy L Thompson CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 183437930d1SJeremy L Thompson CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); 184b2165e7aSSebastian Grimberg 1859e201c85SYohann if (t_mode == CEED_TRANSPOSE) { 186e9c76bddSJeremy L Thompson CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, NULL, grid, thread_1d, 1, 187db2becc9SJeremy L Thompson elems_per_block, shared_mem, grad_args)); 1889e201c85SYohann } else { 189e9c76bddSJeremy L Thompson CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Grad, NULL, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args)); 1909e201c85SYohann } 1917d8d0e25Snbeams } else if (dim == 2) { 1929e31c45bSnbeams // Check if required threads is small enough to do multiple elems 1932b730f8bSJeremy L Thompson const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1); 194a8d440fbSJeremy L Thompson CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 1952b730f8bSJeremy L Thompson CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); 196b2165e7aSSebastian Grimberg 1979e201c85SYohann if (t_mode == CEED_TRANSPOSE) { 198e9c76bddSJeremy L Thompson CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, NULL, grid, thread_1d, thread_1d, 199db2becc9SJeremy L Thompson elems_per_block, shared_mem, grad_args)); 2009e201c85SYohann } else { 201e9c76bddSJeremy L Thompson CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Grad, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); 2029e201c85SYohann } 2037d8d0e25Snbeams } else if (dim == 3) { 2042b730f8bSJeremy L Thompson const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1); 205a8d440fbSJeremy L Thompson CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 2062b730f8bSJeremy L Thompson CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); 207b2165e7aSSebastian Grimberg 2089e201c85SYohann if (t_mode == CEED_TRANSPOSE) { 209e9c76bddSJeremy L Thompson CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, NULL, grid, thread_1d, thread_1d, 210db2becc9SJeremy L Thompson elems_per_block, shared_mem, grad_args)); 2119e201c85SYohann } else { 212e9c76bddSJeremy L Thompson CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Grad, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); 2139e201c85SYohann } 2147d8d0e25Snbeams } 2157d8d0e25Snbeams } break; 2167d8d0e25Snbeams case CEED_EVAL_WEIGHT: { 217437930d1SJeremy L Thompson CeedInt Q_1d; 218437930d1SJeremy L Thompson CeedInt block_size = data->block_sizes[2]; 219b7453713SJeremy L Thompson 220097cc795SJames Wright CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights_1d not set", CeedEvalModes[eval_mode]); 2212b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 222437930d1SJeremy L Thompson void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v}; 223b7453713SJeremy L Thompson 2247d8d0e25Snbeams if (dim == 1) { 225437930d1SJeremy L Thompson const CeedInt opt_elems = block_size / Q_1d; 226437930d1SJeremy L Thompson const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; 227a8d440fbSJeremy L Thompson const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 228b2165e7aSSebastian Grimberg 229eb7e6cafSJeremy L Thompson CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Weight, grid_size, Q_1d, elems_per_block, 1, weight_args)); 2307d8d0e25Snbeams } else if (dim == 2) { 231437930d1SJeremy L Thompson const CeedInt opt_elems = block_size / (Q_1d * Q_1d); 232437930d1SJeremy L Thompson const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; 233a8d440fbSJeremy L Thompson const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 234b2165e7aSSebastian Grimberg 235eb7e6cafSJeremy L Thompson CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Weight, grid_size, Q_1d, Q_1d, elems_per_block, weight_args)); 2367d8d0e25Snbeams } else if (dim == 3) { 2379e201c85SYohann const CeedInt opt_elems = block_size / (Q_1d * Q_1d); 2389e201c85SYohann const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; 239a8d440fbSJeremy L Thompson const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 240b2165e7aSSebastian Grimberg 241eb7e6cafSJeremy L Thompson CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Weight, grid_size, Q_1d, Q_1d, elems_per_block, weight_args)); 2427d8d0e25Snbeams } 2437d8d0e25Snbeams } break; 2449ea2cfd9SJeremy L Thompson case CEED_EVAL_NONE: /* handled separately below */ 2459ea2cfd9SJeremy L Thompson break; 2467d8d0e25Snbeams // LCOV_EXCL_START 2477d8d0e25Snbeams case CEED_EVAL_DIV: 2487d8d0e25Snbeams case CEED_EVAL_CURL: 249bcbe1c99SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); 2507d8d0e25Snbeams // LCOV_EXCL_STOP 2517d8d0e25Snbeams } 2527d8d0e25Snbeams 2539ea2cfd9SJeremy L Thompson // Restore vectors, cover CEED_EVAL_NONE 2542b730f8bSJeremy L Thompson CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 2559ea2cfd9SJeremy L Thompson if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 2569ea2cfd9SJeremy L Thompson if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 2579bc66399SJeremy L Thompson CeedCallBackend(CeedDestroy(&ceed)); 258e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 2597d8d0e25Snbeams } 2607d8d0e25Snbeams 261db2becc9SJeremy L Thompson int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 262db2becc9SJeremy L Thompson CeedVector v) { 263db2becc9SJeremy L Thompson CeedCallBackend(CeedBasisApplyTensorCore_Hip_shared(basis, false, num_elem, t_mode, eval_mode, u, v)); 264db2becc9SJeremy L Thompson return CEED_ERROR_SUCCESS; 265db2becc9SJeremy L Thompson } 266db2becc9SJeremy L Thompson 267db2becc9SJeremy L Thompson int CeedBasisApplyAddTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 268db2becc9SJeremy L Thompson CeedVector v) { 269db2becc9SJeremy L Thompson CeedCallBackend(CeedBasisApplyTensorCore_Hip_shared(basis, true, num_elem, t_mode, eval_mode, u, v)); 270db2becc9SJeremy L Thompson return CEED_ERROR_SUCCESS; 271db2becc9SJeremy L Thompson } 272db2becc9SJeremy L Thompson 2737d8d0e25Snbeams //------------------------------------------------------------------------------ 2741dda9c1aSJeremy L Thompson // Basis apply - tensor AtPoints 2751dda9c1aSJeremy L Thompson //------------------------------------------------------------------------------ 276db2becc9SJeremy L Thompson static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, const CeedInt *num_points, 277db2becc9SJeremy L Thompson CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { 2781dda9c1aSJeremy L Thompson Ceed ceed; 2791dda9c1aSJeremy L Thompson CeedInt Q_1d, dim, max_num_points = num_points[0]; 2801dda9c1aSJeremy L Thompson const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; 2811dda9c1aSJeremy L Thompson const CeedScalar *d_x, *d_u; 2821dda9c1aSJeremy L Thompson CeedScalar *d_v; 2831dda9c1aSJeremy L Thompson CeedBasis_Hip_shared *data; 2841dda9c1aSJeremy L Thompson 2851dda9c1aSJeremy L Thompson CeedCallBackend(CeedBasisGetData(basis, &data)); 2861dda9c1aSJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 2871dda9c1aSJeremy L Thompson CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 2881dda9c1aSJeremy L Thompson 2891dda9c1aSJeremy L Thompson // Weight handled separately 2901dda9c1aSJeremy L Thompson if (eval_mode == CEED_EVAL_WEIGHT) { 2915a5594ffSJeremy L Thompson CeedCallBackend(CeedVectorSetValue(v, 1.0)); 2921dda9c1aSJeremy L Thompson return CEED_ERROR_SUCCESS; 2931dda9c1aSJeremy L Thompson } 2941dda9c1aSJeremy L Thompson 2959bc66399SJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 2969bc66399SJeremy L Thompson 297111870feSJeremy L Thompson // Check padded to uniform number of points per elem 298111870feSJeremy L Thompson for (CeedInt i = 1; i < num_elem; i++) max_num_points = CeedIntMax(max_num_points, num_points[i]); 299111870feSJeremy L Thompson { 300111870feSJeremy L Thompson CeedInt num_comp, q_comp; 301111870feSJeremy L Thompson CeedSize len, len_required; 302111870feSJeremy L Thompson 303111870feSJeremy L Thompson CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 304111870feSJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp)); 305111870feSJeremy L Thompson CeedCallBackend(CeedVectorGetLength(is_transpose ? u : v, &len)); 306111870feSJeremy L Thompson len_required = (CeedSize)num_comp * (CeedSize)q_comp * (CeedSize)num_elem * (CeedSize)max_num_points; 307111870feSJeremy L Thompson CeedCheck(len >= len_required, ceed, CEED_ERROR_BACKEND, 308111870feSJeremy L Thompson "Vector at points must be padded to the same number of points in each element for BasisApplyAtPoints on GPU backends." 309111870feSJeremy L Thompson " Found %" CeedSize_FMT ", Required %" CeedSize_FMT, 310111870feSJeremy L Thompson len, len_required); 311111870feSJeremy L Thompson } 312111870feSJeremy L Thompson 313111870feSJeremy L Thompson // Move num_points array to device 314111870feSJeremy L Thompson if (is_transpose) { 315111870feSJeremy L Thompson const CeedInt num_bytes = num_elem * sizeof(CeedInt); 316111870feSJeremy L Thompson 317111870feSJeremy L Thompson if (num_elem != data->num_elem_at_points) { 318111870feSJeremy L Thompson data->num_elem_at_points = num_elem; 319111870feSJeremy L Thompson 320111870feSJeremy L Thompson if (data->d_points_per_elem) CeedCallHip(ceed, hipFree(data->d_points_per_elem)); 321111870feSJeremy L Thompson CeedCallHip(ceed, hipMalloc((void **)&data->d_points_per_elem, num_bytes)); 322111870feSJeremy L Thompson CeedCallBackend(CeedFree(&data->h_points_per_elem)); 323111870feSJeremy L Thompson CeedCallBackend(CeedCalloc(num_elem, &data->h_points_per_elem)); 324111870feSJeremy L Thompson } 3259e511c80SJeremy L Thompson if (memcmp(data->h_points_per_elem, num_points, num_bytes)) { 326111870feSJeremy L Thompson memcpy(data->h_points_per_elem, num_points, num_bytes); 327111870feSJeremy L Thompson CeedCallHip(ceed, hipMemcpy(data->d_points_per_elem, num_points, num_bytes, hipMemcpyHostToDevice)); 328111870feSJeremy L Thompson } 329111870feSJeremy L Thompson } 330111870feSJeremy L Thompson 3311dda9c1aSJeremy L Thompson // Build kernels if needed 3321dda9c1aSJeremy L Thompson if (data->num_points != max_num_points) { 3331dda9c1aSJeremy L Thompson CeedInt P_1d; 3341dda9c1aSJeremy L Thompson 3351dda9c1aSJeremy L Thompson CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 3361dda9c1aSJeremy L Thompson data->num_points = max_num_points; 3371dda9c1aSJeremy L Thompson 3381dda9c1aSJeremy L Thompson // -- Create interp matrix to Chebyshev coefficients 3391dda9c1aSJeremy L Thompson if (!data->d_chebyshev_interp_1d) { 3401dda9c1aSJeremy L Thompson CeedSize interp_bytes; 3411dda9c1aSJeremy L Thompson CeedScalar *chebyshev_interp_1d; 3421dda9c1aSJeremy L Thompson 3431dda9c1aSJeremy L Thompson interp_bytes = P_1d * Q_1d * sizeof(CeedScalar); 3441dda9c1aSJeremy L Thompson CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d)); 3455a5594ffSJeremy L Thompson CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d)); 3461dda9c1aSJeremy L Thompson CeedCallHip(ceed, hipMalloc((void **)&data->d_chebyshev_interp_1d, interp_bytes)); 3471dda9c1aSJeremy L Thompson CeedCallHip(ceed, hipMemcpy(data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, hipMemcpyHostToDevice)); 3481dda9c1aSJeremy L Thompson CeedCallBackend(CeedFree(&chebyshev_interp_1d)); 3491dda9c1aSJeremy L Thompson } 3501dda9c1aSJeremy L Thompson 3511dda9c1aSJeremy L Thompson // -- Compile kernels 3529e1d4b82SJeremy L Thompson const char basis_kernel_source[] = "// AtPoints basis source\n#include <ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h>\n"; 3531dda9c1aSJeremy L Thompson CeedInt num_comp; 3541dda9c1aSJeremy L Thompson 3551dda9c1aSJeremy L Thompson if (data->moduleAtPoints) CeedCallHip(ceed, hipModuleUnload(data->moduleAtPoints)); 3561dda9c1aSJeremy L Thompson CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 357*6b92dc4bSJeremy L Thompson CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->moduleAtPoints, 9, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_T_1D", 3589e1d4b82SJeremy L Thompson CeedIntMax(Q_1d, P_1d), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), 3599e1d4b82SJeremy L Thompson "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS", max_num_points, "BASIS_INTERP_BLOCK_SIZE", 3609e1d4b82SJeremy L Thompson data->block_sizes[0])); 3611dda9c1aSJeremy L Thompson CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints)); 36281ae6159SJeremy L Thompson CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "InterpTransposeAtPoints", &data->InterpTransposeAtPoints)); 363a8772291SJeremy L Thompson CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "InterpTransposeAddAtPoints", &data->InterpTransposeAddAtPoints)); 3641dda9c1aSJeremy L Thompson CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints)); 36581ae6159SJeremy L Thompson CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "GradTransposeAtPoints", &data->GradTransposeAtPoints)); 366a8772291SJeremy L Thompson CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "GradTransposeAddAtPoints", &data->GradTransposeAddAtPoints)); 3671dda9c1aSJeremy L Thompson } 3681dda9c1aSJeremy L Thompson 3691dda9c1aSJeremy L Thompson // Get read/write access to u, v 3701dda9c1aSJeremy L Thompson CeedCallBackend(CeedVectorGetArrayRead(x_ref, CEED_MEM_DEVICE, &d_x)); 3711dda9c1aSJeremy L Thompson if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 3721dda9c1aSJeremy L Thompson else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 37311ac676fSZach Atkins if (apply_add) { 37411ac676fSZach Atkins CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 37511ac676fSZach Atkins } else { 37611ac676fSZach Atkins CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 3771dda9c1aSJeremy L Thompson } 3781dda9c1aSJeremy L Thompson 3791dda9c1aSJeremy L Thompson // Basis action 3801dda9c1aSJeremy L Thompson switch (eval_mode) { 3811dda9c1aSJeremy L Thompson case CEED_EVAL_INTERP: { 3829e1d4b82SJeremy L Thompson CeedInt P_1d, Q_1d; 3839e1d4b82SJeremy L Thompson CeedInt block_size = data->block_sizes[0]; 3841dda9c1aSJeremy L Thompson 3859e1d4b82SJeremy L Thompson CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 3869e1d4b82SJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 3879e1d4b82SJeremy L Thompson CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); 3889e1d4b82SJeremy L Thompson void *interp_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; 3899e1d4b82SJeremy L Thompson 3909e1d4b82SJeremy L Thompson if (dim == 1) { 3919e1d4b82SJeremy L Thompson CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64; 3929e1d4b82SJeremy L Thompson elems_per_block = elems_per_block > 0 ? elems_per_block : 1; 393a8d440fbSJeremy L Thompson CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 3949e1d4b82SJeremy L Thompson CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); 3959e1d4b82SJeremy L Thompson 396af0e6e89SJeremy L Thompson if (is_transpose) { 397e9c76bddSJeremy L Thompson CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->InterpTransposeAddAtPoints : data->InterpTransposeAtPoints, NULL, grid, 398af0e6e89SJeremy L Thompson thread_1d, 1, elems_per_block, shared_mem, interp_args)); 399af0e6e89SJeremy L Thompson } else { 400e9c76bddSJeremy L Thompson CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->InterpAtPoints, NULL, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args)); 401af0e6e89SJeremy L Thompson } 4029e1d4b82SJeremy L Thompson } else if (dim == 2) { 4039e1d4b82SJeremy L Thompson // Check if required threads is small enough to do multiple elems 4049e1d4b82SJeremy L Thompson const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1); 405a8d440fbSJeremy L Thompson CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 4069e1d4b82SJeremy L Thompson CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); 4079e1d4b82SJeremy L Thompson 408af0e6e89SJeremy L Thompson if (is_transpose) { 409e9c76bddSJeremy L Thompson CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->InterpTransposeAddAtPoints : data->InterpTransposeAtPoints, NULL, grid, 410af0e6e89SJeremy L Thompson thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); 411af0e6e89SJeremy L Thompson } else { 412af0e6e89SJeremy L Thompson CeedCallBackend( 413e9c76bddSJeremy L Thompson CeedRunKernelDimShared_Hip(ceed, data->InterpAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); 414af0e6e89SJeremy L Thompson } 4159e1d4b82SJeremy L Thompson } else if (dim == 3) { 416b4280a96SJeremy L Thompson const CeedInt elems_per_block = 1; 417a8d440fbSJeremy L Thompson CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 4189e1d4b82SJeremy L Thompson CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); 4199e1d4b82SJeremy L Thompson 420af0e6e89SJeremy L Thompson if (is_transpose) { 421e9c76bddSJeremy L Thompson CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->InterpTransposeAddAtPoints : data->InterpTransposeAtPoints, NULL, grid, 422af0e6e89SJeremy L Thompson thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); 423af0e6e89SJeremy L Thompson } else { 424af0e6e89SJeremy L Thompson CeedCallBackend( 425e9c76bddSJeremy L Thompson CeedRunKernelDimShared_Hip(ceed, data->InterpAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); 426af0e6e89SJeremy L Thompson } 4279e1d4b82SJeremy L Thompson } 4281dda9c1aSJeremy L Thompson } break; 4291dda9c1aSJeremy L Thompson case CEED_EVAL_GRAD: { 4309e1d4b82SJeremy L Thompson CeedInt P_1d, Q_1d; 4319e1d4b82SJeremy L Thompson CeedInt block_size = data->block_sizes[0]; 4321dda9c1aSJeremy L Thompson 4339e1d4b82SJeremy L Thompson CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 4349e1d4b82SJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 4359e1d4b82SJeremy L Thompson CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); 4369e1d4b82SJeremy L Thompson void *grad_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; 4379e1d4b82SJeremy L Thompson 4389e1d4b82SJeremy L Thompson if (dim == 1) { 4399e1d4b82SJeremy L Thompson CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64; 4409e1d4b82SJeremy L Thompson elems_per_block = elems_per_block > 0 ? elems_per_block : 1; 441a8d440fbSJeremy L Thompson CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 4429e1d4b82SJeremy L Thompson CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); 4439e1d4b82SJeremy L Thompson 444af0e6e89SJeremy L Thompson if (is_transpose) { 445e9c76bddSJeremy L Thompson CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAddAtPoints : data->GradTransposeAtPoints, NULL, grid, 446e9c76bddSJeremy L Thompson thread_1d, 1, elems_per_block, shared_mem, grad_args)); 447af0e6e89SJeremy L Thompson } else { 448e9c76bddSJeremy L Thompson CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->GradAtPoints, NULL, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args)); 449af0e6e89SJeremy L Thompson } 4509e1d4b82SJeremy L Thompson } else if (dim == 2) { 4519e1d4b82SJeremy L Thompson // Check if required threads is small enough to do multiple elems 4529e1d4b82SJeremy L Thompson const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1); 453a8d440fbSJeremy L Thompson CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 4549e1d4b82SJeremy L Thompson CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); 4559e1d4b82SJeremy L Thompson 456af0e6e89SJeremy L Thompson if (is_transpose) { 457e9c76bddSJeremy L Thompson CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAddAtPoints : data->GradTransposeAtPoints, NULL, grid, 458e9c76bddSJeremy L Thompson thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); 459af0e6e89SJeremy L Thompson } else { 460e9c76bddSJeremy L Thompson CeedCallBackend( 461e9c76bddSJeremy L Thompson CeedRunKernelDimShared_Hip(ceed, data->GradAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); 462af0e6e89SJeremy L Thompson } 4639e1d4b82SJeremy L Thompson } else if (dim == 3) { 464b4280a96SJeremy L Thompson const CeedInt elems_per_block = 1; 465a8d440fbSJeremy L Thompson CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 4669e1d4b82SJeremy L Thompson CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); 4679e1d4b82SJeremy L Thompson 468af0e6e89SJeremy L Thompson if (is_transpose) { 469e9c76bddSJeremy L Thompson CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAddAtPoints : data->GradTransposeAtPoints, NULL, grid, 470e9c76bddSJeremy L Thompson thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); 471af0e6e89SJeremy L Thompson } else { 472e9c76bddSJeremy L Thompson CeedCallBackend( 473e9c76bddSJeremy L Thompson CeedRunKernelDimShared_Hip(ceed, data->GradAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); 474af0e6e89SJeremy L Thompson } 4759e1d4b82SJeremy L Thompson } 4761dda9c1aSJeremy L Thompson } break; 4771dda9c1aSJeremy L Thompson case CEED_EVAL_WEIGHT: 4781dda9c1aSJeremy L Thompson case CEED_EVAL_NONE: /* handled separately below */ 4791dda9c1aSJeremy L Thompson break; 4801dda9c1aSJeremy L Thompson // LCOV_EXCL_START 4811dda9c1aSJeremy L Thompson case CEED_EVAL_DIV: 4821dda9c1aSJeremy L Thompson case CEED_EVAL_CURL: 4831dda9c1aSJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); 4841dda9c1aSJeremy L Thompson // LCOV_EXCL_STOP 4851dda9c1aSJeremy L Thompson } 4861dda9c1aSJeremy L Thompson 4871dda9c1aSJeremy L Thompson // Restore vectors, cover CEED_EVAL_NONE 4881dda9c1aSJeremy L Thompson CeedCallBackend(CeedVectorRestoreArrayRead(x_ref, &d_x)); 4891dda9c1aSJeremy L Thompson CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 4901dda9c1aSJeremy L Thompson if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 4911dda9c1aSJeremy L Thompson if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 4921dda9c1aSJeremy L Thompson return CEED_ERROR_SUCCESS; 4931dda9c1aSJeremy L Thompson } 4941dda9c1aSJeremy L Thompson 495db2becc9SJeremy L Thompson static int CeedBasisApplyAtPoints_Hip_shared(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, 496db2becc9SJeremy L Thompson CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { 497db2becc9SJeremy L Thompson CeedCallBackend(CeedBasisApplyAtPointsCore_Hip_shared(basis, false, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); 498db2becc9SJeremy L Thompson return CEED_ERROR_SUCCESS; 499db2becc9SJeremy L Thompson } 500db2becc9SJeremy L Thompson 501db2becc9SJeremy L Thompson static int CeedBasisApplyAddAtPoints_Hip_shared(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, 502db2becc9SJeremy L Thompson CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { 503db2becc9SJeremy L Thompson CeedCallBackend(CeedBasisApplyAtPointsCore_Hip_shared(basis, true, num_elem, num_points, t_mode, eval_mode, x_ref, u, v)); 504db2becc9SJeremy L Thompson return CEED_ERROR_SUCCESS; 505db2becc9SJeremy L Thompson } 506db2becc9SJeremy L Thompson 5071dda9c1aSJeremy L Thompson //------------------------------------------------------------------------------ 5086c13bbcbSJeremy L Thompson // Apply basis 5096c13bbcbSJeremy L Thompson //------------------------------------------------------------------------------ 5106c13bbcbSJeremy L Thompson static int CeedBasisApplyNonTensorCore_Hip_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, 5116c13bbcbSJeremy L Thompson CeedEvalMode eval_mode, CeedVector u, CeedVector v) { 5126c13bbcbSJeremy L Thompson Ceed ceed; 5136c13bbcbSJeremy L Thompson Ceed_Hip *ceed_Hip; 5146c13bbcbSJeremy L Thompson CeedInt dim, num_comp; 5156c13bbcbSJeremy L Thompson const CeedScalar *d_u; 5166c13bbcbSJeremy L Thompson CeedScalar *d_v; 5176c13bbcbSJeremy L Thompson CeedBasis_Hip_shared *data; 5186c13bbcbSJeremy L Thompson 5196c13bbcbSJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 5206c13bbcbSJeremy L Thompson CeedCallBackend(CeedGetData(ceed, &ceed_Hip)); 5216c13bbcbSJeremy L Thompson CeedCallBackend(CeedBasisGetData(basis, &data)); 5226c13bbcbSJeremy L Thompson CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 5236c13bbcbSJeremy L Thompson CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 5246c13bbcbSJeremy L Thompson 5256c13bbcbSJeremy L Thompson // Get read/write access to u, v 5266c13bbcbSJeremy L Thompson if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u)); 5276c13bbcbSJeremy L Thompson else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode"); 52811ac676fSZach Atkins if (apply_add) { 52911ac676fSZach Atkins CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v)); 53011ac676fSZach Atkins } else { 53111ac676fSZach Atkins CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); 53211ac676fSZach Atkins } 5336c13bbcbSJeremy L Thompson 5346c13bbcbSJeremy L Thompson // Apply basis operation 5356c13bbcbSJeremy L Thompson switch (eval_mode) { 5366c13bbcbSJeremy L Thompson case CEED_EVAL_INTERP: { 5376c13bbcbSJeremy L Thompson CeedInt P, Q; 5386c13bbcbSJeremy L Thompson 5394cbc44e0SJeremy L Thompson CeedCheck(data->d_interp_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; interp not set", CeedEvalModes[eval_mode]); 5406c13bbcbSJeremy L Thompson CeedCallBackend(CeedBasisGetNumNodes(basis, &P)); 5416c13bbcbSJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q)); 5426c13bbcbSJeremy L Thompson CeedInt thread = CeedIntMax(Q, P); 5436c13bbcbSJeremy L Thompson void *interp_args[] = {(void *)&num_elem, &data->d_interp_1d, &d_u, &d_v}; 5446c13bbcbSJeremy L Thompson 5456c13bbcbSJeremy L Thompson { 5466c13bbcbSJeremy L Thompson CeedInt elems_per_block = 64 * thread > 256 ? 256 / thread : 64; 5476c13bbcbSJeremy L Thompson elems_per_block = elems_per_block > 0 ? elems_per_block : 1; 5486c13bbcbSJeremy L Thompson CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 5496c13bbcbSJeremy L Thompson CeedInt shared_mem = elems_per_block * thread * sizeof(CeedScalar); 5506c13bbcbSJeremy L Thompson 5516c13bbcbSJeremy L Thompson if (t_mode == CEED_TRANSPOSE) { 552e9c76bddSJeremy L Thompson CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, NULL, grid, thread, 1, 5536c13bbcbSJeremy L Thompson elems_per_block, shared_mem, interp_args)); 5546c13bbcbSJeremy L Thompson } else { 555e9c76bddSJeremy L Thompson CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Interp, NULL, grid, thread, 1, elems_per_block, shared_mem, interp_args)); 5566c13bbcbSJeremy L Thompson } 5576c13bbcbSJeremy L Thompson } 5586c13bbcbSJeremy L Thompson } break; 5596c13bbcbSJeremy L Thompson case CEED_EVAL_GRAD: { 5606c13bbcbSJeremy L Thompson CeedInt P, Q; 5616c13bbcbSJeremy L Thompson 5624cbc44e0SJeremy L Thompson CeedCheck(data->d_grad_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; grad not set", CeedEvalModes[eval_mode]); 5636c13bbcbSJeremy L Thompson CeedCallBackend(CeedBasisGetNumNodes(basis, &P)); 5646c13bbcbSJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q)); 5656c13bbcbSJeremy L Thompson CeedInt thread = CeedIntMax(Q, P); 5662d217acfSJeremy L Thompson void *grad_args[] = {(void *)&num_elem, &data->d_grad_1d, &d_u, &d_v}; 5676c13bbcbSJeremy L Thompson 5686c13bbcbSJeremy L Thompson { 5696c13bbcbSJeremy L Thompson CeedInt elems_per_block = 64 * thread > 256 ? 256 / thread : 64; 5706c13bbcbSJeremy L Thompson elems_per_block = elems_per_block > 0 ? elems_per_block : 1; 5716c13bbcbSJeremy L Thompson CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 5726c13bbcbSJeremy L Thompson CeedInt shared_mem = elems_per_block * thread * sizeof(CeedScalar); 5736c13bbcbSJeremy L Thompson 5746c13bbcbSJeremy L Thompson if (t_mode == CEED_TRANSPOSE) { 575e9c76bddSJeremy L Thompson CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, NULL, grid, thread, 1, 576e9c76bddSJeremy L Thompson elems_per_block, shared_mem, grad_args)); 5776c13bbcbSJeremy L Thompson } else { 578e9c76bddSJeremy L Thompson CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Grad, NULL, grid, thread, 1, elems_per_block, shared_mem, grad_args)); 5796c13bbcbSJeremy L Thompson } 5806c13bbcbSJeremy L Thompson } 5816c13bbcbSJeremy L Thompson } break; 5826c13bbcbSJeremy L Thompson case CEED_EVAL_WEIGHT: { 58397011eabSZach Atkins CeedInt P, Q; 5846c13bbcbSJeremy L Thompson 5854cbc44e0SJeremy L Thompson CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights not set", CeedEvalModes[eval_mode]); 58697011eabSZach Atkins CeedCallBackend(CeedBasisGetNumNodes(basis, &P)); 5872d217acfSJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q)); 58897011eabSZach Atkins CeedInt thread = CeedIntMax(Q, P); 5896c13bbcbSJeremy L Thompson void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v}; 5906c13bbcbSJeremy L Thompson 5916c13bbcbSJeremy L Thompson { 59297011eabSZach Atkins CeedInt elems_per_block = 64 * thread > 256 ? 256 / thread : 64; 59397011eabSZach Atkins elems_per_block = elems_per_block > 0 ? elems_per_block : 1; 5946c13bbcbSJeremy L Thompson const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0); 5956c13bbcbSJeremy L Thompson 59697011eabSZach Atkins CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Weight, grid_size, thread, elems_per_block, 1, weight_args)); 5976c13bbcbSJeremy L Thompson } 5986c13bbcbSJeremy L Thompson } break; 5996c13bbcbSJeremy L Thompson case CEED_EVAL_NONE: /* handled separately below */ 6006c13bbcbSJeremy L Thompson break; 6016c13bbcbSJeremy L Thompson // LCOV_EXCL_START 6026c13bbcbSJeremy L Thompson case CEED_EVAL_DIV: 6036c13bbcbSJeremy L Thompson case CEED_EVAL_CURL: 6046c13bbcbSJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); 6056c13bbcbSJeremy L Thompson // LCOV_EXCL_STOP 6066c13bbcbSJeremy L Thompson } 6076c13bbcbSJeremy L Thompson 6086c13bbcbSJeremy L Thompson // Restore vectors, cover CEED_EVAL_NONE 6096c13bbcbSJeremy L Thompson CeedCallBackend(CeedVectorRestoreArray(v, &d_v)); 6106c13bbcbSJeremy L Thompson if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u)); 6116c13bbcbSJeremy L Thompson if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u)); 6126c13bbcbSJeremy L Thompson CeedCallBackend(CeedDestroy(&ceed)); 6136c13bbcbSJeremy L Thompson return CEED_ERROR_SUCCESS; 6146c13bbcbSJeremy L Thompson } 6156c13bbcbSJeremy L Thompson 6166c13bbcbSJeremy L Thompson int CeedBasisApplyNonTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 6176c13bbcbSJeremy L Thompson CeedVector v) { 6186c13bbcbSJeremy L Thompson CeedCallBackend(CeedBasisApplyNonTensorCore_Hip_shared(basis, false, num_elem, t_mode, eval_mode, u, v)); 6196c13bbcbSJeremy L Thompson return CEED_ERROR_SUCCESS; 6206c13bbcbSJeremy L Thompson } 6216c13bbcbSJeremy L Thompson 6226c13bbcbSJeremy L Thompson int CeedBasisApplyAddNonTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, 6236c13bbcbSJeremy L Thompson CeedVector v) { 6246c13bbcbSJeremy L Thompson CeedCallBackend(CeedBasisApplyNonTensorCore_Hip_shared(basis, true, num_elem, t_mode, eval_mode, u, v)); 6256c13bbcbSJeremy L Thompson return CEED_ERROR_SUCCESS; 6266c13bbcbSJeremy L Thompson } 6276c13bbcbSJeremy L Thompson 6286c13bbcbSJeremy L Thompson //------------------------------------------------------------------------------ 6297d8d0e25Snbeams // Destroy basis 6307d8d0e25Snbeams //------------------------------------------------------------------------------ 6317d8d0e25Snbeams static int CeedBasisDestroy_Hip_shared(CeedBasis basis) { 6327d8d0e25Snbeams Ceed ceed; 6337d8d0e25Snbeams CeedBasis_Hip_shared *data; 634b7453713SJeremy L Thompson 635b7453713SJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 6362b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetData(basis, &data)); 6372b730f8bSJeremy L Thompson CeedCallHip(ceed, hipModuleUnload(data->module)); 6381dda9c1aSJeremy L Thompson if (data->moduleAtPoints) CeedCallHip(ceed, hipModuleUnload(data->moduleAtPoints)); 639097cc795SJames Wright if (data->d_q_weight_1d) CeedCallHip(ceed, hipFree(data->d_q_weight_1d)); 640111870feSJeremy L Thompson CeedCallBackend(CeedFree(&data->h_points_per_elem)); 641111870feSJeremy L Thompson if (data->d_points_per_elem) CeedCallHip(ceed, hipFree(data->d_points_per_elem)); 6422b730f8bSJeremy L Thompson CeedCallHip(ceed, hipFree(data->d_interp_1d)); 6432b730f8bSJeremy L Thompson CeedCallHip(ceed, hipFree(data->d_grad_1d)); 6442b730f8bSJeremy L Thompson CeedCallHip(ceed, hipFree(data->d_collo_grad_1d)); 6451dda9c1aSJeremy L Thompson CeedCallHip(ceed, hipFree(data->d_chebyshev_interp_1d)); 6462b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&data)); 647e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 6487d8d0e25Snbeams } 6497d8d0e25Snbeams 6507d8d0e25Snbeams //------------------------------------------------------------------------------ 6517d8d0e25Snbeams // Create tensor basis 6527d8d0e25Snbeams //------------------------------------------------------------------------------ 6532b730f8bSJeremy L Thompson int CeedBasisCreateTensorH1_Hip_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, 6546574a04fSJeremy L Thompson const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) { 6557d8d0e25Snbeams Ceed ceed; 656b7453713SJeremy L Thompson CeedInt num_comp; 657b7453713SJeremy L Thompson const CeedInt q_bytes = Q_1d * sizeof(CeedScalar); 658397164e9SSebastian Grimberg const CeedInt interp_bytes = q_bytes * P_1d; 6597d8d0e25Snbeams CeedBasis_Hip_shared *data; 660b7453713SJeremy L Thompson 661b7453713SJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 6622b730f8bSJeremy L Thompson CeedCallBackend(CeedCalloc(1, &data)); 6637d8d0e25Snbeams 6647d8d0e25Snbeams // Copy basis data to GPU 665097cc795SJames Wright if (q_weight_1d) { 666b7453713SJeremy L Thompson CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight_1d, q_bytes)); 667b7453713SJeremy L Thompson CeedCallHip(ceed, hipMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, hipMemcpyHostToDevice)); 668097cc795SJames Wright } 669397164e9SSebastian Grimberg CeedCallHip(ceed, hipMalloc((void **)&data->d_interp_1d, interp_bytes)); 670397164e9SSebastian Grimberg CeedCallHip(ceed, hipMemcpy(data->d_interp_1d, interp_1d, interp_bytes, hipMemcpyHostToDevice)); 671397164e9SSebastian Grimberg CeedCallHip(ceed, hipMalloc((void **)&data->d_grad_1d, interp_bytes)); 672397164e9SSebastian Grimberg CeedCallHip(ceed, hipMemcpy(data->d_grad_1d, grad_1d, interp_bytes, hipMemcpyHostToDevice)); 6737d8d0e25Snbeams 6747d8d0e25Snbeams // Compute collocated gradient and copy to GPU 675437930d1SJeremy L Thompson data->d_collo_grad_1d = NULL; 6769e201c85SYohann bool has_collocated_grad = dim == 3 && Q_1d >= P_1d; 677b7453713SJeremy L Thompson 6789e201c85SYohann if (has_collocated_grad) { 679437930d1SJeremy L Thompson CeedScalar *collo_grad_1d; 680b7453713SJeremy L Thompson 6812b730f8bSJeremy L Thompson CeedCallBackend(CeedMalloc(Q_1d * Q_1d, &collo_grad_1d)); 6822b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetCollocatedGrad(basis, collo_grad_1d)); 683b7453713SJeremy L Thompson CeedCallHip(ceed, hipMalloc((void **)&data->d_collo_grad_1d, q_bytes * Q_1d)); 684b7453713SJeremy L Thompson CeedCallHip(ceed, hipMemcpy(data->d_collo_grad_1d, collo_grad_1d, q_bytes * Q_1d, hipMemcpyHostToDevice)); 6852b730f8bSJeremy L Thompson CeedCallBackend(CeedFree(&collo_grad_1d)); 6867d8d0e25Snbeams } 6877d8d0e25Snbeams 6889e31c45bSnbeams // Set number of threads per block for basis kernels 6892b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 6902b730f8bSJeremy L Thompson CeedCallBackend(ComputeBasisThreadBlockSizes(dim, P_1d, Q_1d, num_comp, data->block_sizes)); 6919e31c45bSnbeams 6929e31c45bSnbeams // Compile basis kernels 6939c25dd66SJeremy L Thompson const char basis_kernel_source[] = "// Tensor basis source\n#include <ceed/jit-source/hip/hip-shared-basis-tensor.h>\n"; 6949c25dd66SJeremy L Thompson 695*6b92dc4bSJeremy L Thompson CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 11, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_T_1D", 696eb7e6cafSJeremy L Thompson CeedIntMax(Q_1d, P_1d), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), 697eb7e6cafSJeremy L Thompson "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_INTERP_BLOCK_SIZE", data->block_sizes[0], "BASIS_GRAD_BLOCK_SIZE", 6982b730f8bSJeremy L Thompson data->block_sizes[1], "BASIS_WEIGHT_BLOCK_SIZE", data->block_sizes[2], "BASIS_HAS_COLLOCATED_GRAD", 6992b730f8bSJeremy L Thompson has_collocated_grad)); 700eb7e6cafSJeremy L Thompson CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp)); 701eb7e6cafSJeremy L Thompson CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); 702db2becc9SJeremy L Thompson CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "InterpTransposeAdd", &data->InterpTransposeAdd)); 703eb7e6cafSJeremy L Thompson CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Grad", &data->Grad)); 704eb7e6cafSJeremy L Thompson CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "GradTranspose", &data->GradTranspose)); 705db2becc9SJeremy L Thompson CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "GradTransposeAdd", &data->GradTransposeAdd)); 706eb7e6cafSJeremy L Thompson CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight)); 7077d8d0e25Snbeams 7082b730f8bSJeremy L Thompson CeedCallBackend(CeedBasisSetData(basis, data)); 7097d8d0e25Snbeams 7107d8d0e25Snbeams // Register backend functions 7112b730f8bSJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyTensor_Hip_shared)); 712db2becc9SJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddTensor_Hip_shared)); 7131dda9c1aSJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Hip_shared)); 714db2becc9SJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAddAtPoints", CeedBasisApplyAddAtPoints_Hip_shared)); 7152b730f8bSJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Hip_shared)); 7169bc66399SJeremy L Thompson CeedCallBackend(CeedDestroy(&ceed)); 717e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 7187d8d0e25Snbeams } 7192a86cc9dSSebastian Grimberg 7207d8d0e25Snbeams //------------------------------------------------------------------------------ 7216c13bbcbSJeremy L Thompson // Create non-tensor basis 7226c13bbcbSJeremy L Thompson //------------------------------------------------------------------------------ 7236c13bbcbSJeremy L Thompson int CeedBasisCreateH1_Hip_shared(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, 7246c13bbcbSJeremy L Thompson const CeedScalar *grad, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { 7256c13bbcbSJeremy L Thompson Ceed ceed; 7266c13bbcbSJeremy L Thompson CeedInt num_comp, q_comp_interp, q_comp_grad; 7276c13bbcbSJeremy L Thompson const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); 7286c13bbcbSJeremy L Thompson CeedBasis_Hip_shared *data; 7296c13bbcbSJeremy L Thompson 7306c13bbcbSJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); 731fda26546SJeremy L Thompson 732fda26546SJeremy L Thompson // Check shared memory size 733fda26546SJeremy L Thompson { 734fda26546SJeremy L Thompson Ceed_Hip *hip_data; 735fda26546SJeremy L Thompson 736fda26546SJeremy L Thompson CeedCallBackend(CeedGetData(ceed, &hip_data)); 737fda26546SJeremy L Thompson if (((size_t)num_nodes * (size_t)num_qpts * (size_t)dim + (size_t)CeedIntMax(num_nodes, num_qpts)) * sizeof(CeedScalar) > 738fda26546SJeremy L Thompson hip_data->device_prop.sharedMemPerBlock) { 739fda26546SJeremy L Thompson CeedCallBackend(CeedBasisCreateH1Fallback(ceed, topo, dim, num_nodes, num_qpts, interp, grad, q_ref, q_weight, basis)); 740fda26546SJeremy L Thompson return CEED_ERROR_SUCCESS; 741fda26546SJeremy L Thompson } 742fda26546SJeremy L Thompson } 743fda26546SJeremy L Thompson 7446c13bbcbSJeremy L Thompson CeedCallBackend(CeedCalloc(1, &data)); 7456c13bbcbSJeremy L Thompson 7466c13bbcbSJeremy L Thompson // Copy basis data to GPU 7476c13bbcbSJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); 7486c13bbcbSJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad)); 7496c13bbcbSJeremy L Thompson if (q_weight) { 7506c13bbcbSJeremy L Thompson CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight_1d, q_bytes)); 7516c13bbcbSJeremy L Thompson CeedCallHip(ceed, hipMemcpy(data->d_q_weight_1d, q_weight, q_bytes, hipMemcpyHostToDevice)); 7526c13bbcbSJeremy L Thompson } 7536c13bbcbSJeremy L Thompson if (interp) { 7546c13bbcbSJeremy L Thompson const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; 7556c13bbcbSJeremy L Thompson 7566c13bbcbSJeremy L Thompson CeedCallHip(ceed, hipMalloc((void **)&data->d_interp_1d, interp_bytes)); 7576c13bbcbSJeremy L Thompson CeedCallHip(ceed, hipMemcpy(data->d_interp_1d, interp, interp_bytes, hipMemcpyHostToDevice)); 7586c13bbcbSJeremy L Thompson } 7596c13bbcbSJeremy L Thompson if (grad) { 7606c13bbcbSJeremy L Thompson const CeedInt grad_bytes = q_bytes * num_nodes * q_comp_grad; 7616c13bbcbSJeremy L Thompson 7626c13bbcbSJeremy L Thompson CeedCallHip(ceed, hipMalloc((void **)&data->d_grad_1d, grad_bytes)); 7636c13bbcbSJeremy L Thompson CeedCallHip(ceed, hipMemcpy(data->d_grad_1d, grad, grad_bytes, hipMemcpyHostToDevice)); 7646c13bbcbSJeremy L Thompson } 7656c13bbcbSJeremy L Thompson 7666c13bbcbSJeremy L Thompson // Compile basis kernels 7676c13bbcbSJeremy L Thompson const char basis_kernel_source[] = "// Non-tensor basis source\n#include <ceed/jit-source/hip/hip-shared-basis-nontensor.h>\n"; 7686c13bbcbSJeremy L Thompson 7696c13bbcbSJeremy L Thompson CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); 7702d217acfSJeremy L Thompson CeedCallBackend(ComputeBasisThreadBlockSizes(dim, num_nodes, num_qpts, num_comp, data->block_sizes)); 771*6b92dc4bSJeremy L Thompson CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 6, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_T_1D", 7722d217acfSJeremy L Thompson CeedIntMax(num_qpts, num_nodes), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_INTERP_BLOCK_SIZE", 7732d217acfSJeremy L Thompson data->block_sizes[0])); 7746c13bbcbSJeremy L Thompson CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp)); 7756c13bbcbSJeremy L Thompson CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); 7766c13bbcbSJeremy L Thompson CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "InterpTransposeAdd", &data->InterpTransposeAdd)); 7776c13bbcbSJeremy L Thompson CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Grad", &data->Grad)); 7786c13bbcbSJeremy L Thompson CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "GradTranspose", &data->GradTranspose)); 7796c13bbcbSJeremy L Thompson CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "GradTransposeAdd", &data->GradTransposeAdd)); 7806c13bbcbSJeremy L Thompson CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight)); 7816c13bbcbSJeremy L Thompson 7826c13bbcbSJeremy L Thompson CeedCallBackend(CeedBasisSetData(basis, data)); 7836c13bbcbSJeremy L Thompson 7846c13bbcbSJeremy L Thompson // Register backend functions 7856c13bbcbSJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Hip_shared)); 7866c13bbcbSJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Hip_shared)); 7876c13bbcbSJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Hip_shared)); 7886c13bbcbSJeremy L Thompson CeedCallBackend(CeedDestroy(&ceed)); 7896c13bbcbSJeremy L Thompson return CEED_ERROR_SUCCESS; 7906c13bbcbSJeremy L Thompson } 7916c13bbcbSJeremy L Thompson 7926c13bbcbSJeremy L Thompson //------------------------------------------------------------------------------ 793