17d8d0e25Snbeams // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 27d8d0e25Snbeams // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 37d8d0e25Snbeams // All Rights reserved. See files LICENSE and NOTICE for details. 47d8d0e25Snbeams // 57d8d0e25Snbeams // This file is part of CEED, a collection of benchmarks, miniapps, software 67d8d0e25Snbeams // libraries and APIs for efficient high-order finite element and spectral 77d8d0e25Snbeams // element discretizations for exascale applications. For more information and 87d8d0e25Snbeams // source code availability see http://github.com/ceed. 97d8d0e25Snbeams // 107d8d0e25Snbeams // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 117d8d0e25Snbeams // a collaborative effort of two U.S. Department of Energy organizations (Office 127d8d0e25Snbeams // of Science and the National Nuclear Security Administration) responsible for 137d8d0e25Snbeams // the planning and preparation of a capable exascale ecosystem, including 147d8d0e25Snbeams // software, applications, hardware, advanced system engineering and early 157d8d0e25Snbeams // testbed platforms, in support of the nation's exascale computing imperative. 167d8d0e25Snbeams 17ec3da8bcSJed Brown #include <ceed/ceed.h> 18ec3da8bcSJed Brown #include <ceed/backend.h> 19437930d1SJeremy L Thompson #include <ceed/jit-tools.h> 203d576824SJeremy L Thompson #include <hip/hip_runtime.h> 213d576824SJeremy L Thompson #include <stddef.h> 227d8d0e25Snbeams #include "ceed-hip-shared.h" 237fcac036SJeremy L Thompson #include "../hip/ceed-hip-common.h" 247d8d0e25Snbeams #include "../hip/ceed-hip-compile.h" 257d8d0e25Snbeams 267d8d0e25Snbeams //------------------------------------------------------------------------------ 279e31c45bSnbeams // Compute a block size based on required minimum threads 289e31c45bSnbeams //------------------------------------------------------------------------------ 299e31c45bSnbeams static CeedInt ComputeBlockSizeFromRequirement(const CeedInt required) { 309e31c45bSnbeams CeedInt maxSize = 1024; // Max total threads per block 319e31c45bSnbeams CeedInt currentSize = 64; // Start with one group 329e31c45bSnbeams 339e31c45bSnbeams while(currentSize < maxSize) { 349e31c45bSnbeams if (currentSize > required) 359e31c45bSnbeams break; 369e31c45bSnbeams else 379e31c45bSnbeams currentSize = currentSize * 2; 389e31c45bSnbeams } 399e31c45bSnbeams return currentSize; 409e31c45bSnbeams } 419e31c45bSnbeams 429e31c45bSnbeams //------------------------------------------------------------------------------ 439e31c45bSnbeams // Compute required thread block sizes for basis kernels given P, Q, dim, and 44437930d1SJeremy L Thompson // num_comp 459e31c45bSnbeams //------------------------------------------------------------------------------ 46437930d1SJeremy L Thompson static int ComputeBasisThreadBlockSizes(const CeedInt dim, const CeedInt P_1d, 47437930d1SJeremy L Thompson const CeedInt Q_1d, 48437930d1SJeremy L Thompson const CeedInt num_comp, CeedInt *block_sizes) { 499e31c45bSnbeams 509e31c45bSnbeams // Note that this will use the same block sizes for all dimensions when compiling, 519e31c45bSnbeams // but as each basis object is defined for a particular dimension, we will never 529e31c45bSnbeams // call any kernels except the ones for the dimension for which we have computed the 539e31c45bSnbeams // block sizes. 54437930d1SJeremy L Thompson const CeedInt thread_1d = CeedIntMax(P_1d, Q_1d); 559e31c45bSnbeams switch (dim) { 569e31c45bSnbeams case 1: { 579e31c45bSnbeams // Interp kernels: 58437930d1SJeremy L Thompson block_sizes[0] = 256; 599e31c45bSnbeams 609e31c45bSnbeams // Grad kernels: 61437930d1SJeremy L Thompson block_sizes[1] = 256; 629e31c45bSnbeams 639e31c45bSnbeams // Weight kernels: 64437930d1SJeremy L Thompson block_sizes[2] = 256; 659e31c45bSnbeams 669e31c45bSnbeams } break; 679e31c45bSnbeams case 2: { 689e31c45bSnbeams // Interp kernels: 69437930d1SJeremy L Thompson CeedInt required = thread_1d * thread_1d * num_comp; 70437930d1SJeremy L Thompson block_sizes[0] = ComputeBlockSizeFromRequirement(required); 719e31c45bSnbeams 729e31c45bSnbeams // Grad kernels: currently use same required minimum threads 73437930d1SJeremy L Thompson block_sizes[1] = ComputeBlockSizeFromRequirement(required); 749e31c45bSnbeams 759e31c45bSnbeams // Weight kernels: 76437930d1SJeremy L Thompson required = CeedIntMax(64, Q_1d * Q_1d); 77437930d1SJeremy L Thompson block_sizes[2] = ComputeBlockSizeFromRequirement(required); 789e31c45bSnbeams 799e31c45bSnbeams } break; 809e31c45bSnbeams case 3: { 819e31c45bSnbeams // Interp kernels: 82437930d1SJeremy L Thompson CeedInt required = thread_1d * thread_1d * num_comp; 83437930d1SJeremy L Thompson block_sizes[0] = ComputeBlockSizeFromRequirement(required); 849e31c45bSnbeams 859e31c45bSnbeams // Grad kernels: currently use same required minimum threads 86437930d1SJeremy L Thompson block_sizes[1] = ComputeBlockSizeFromRequirement(required); 879e31c45bSnbeams 889e31c45bSnbeams // Weight kernels: 89437930d1SJeremy L Thompson required = Q_1d * Q_1d * Q_1d; 90437930d1SJeremy L Thompson block_sizes[2] = ComputeBlockSizeFromRequirement(required); 919e31c45bSnbeams } 929e31c45bSnbeams } 939e31c45bSnbeams 94e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 959e31c45bSnbeams } 969e31c45bSnbeams 979e31c45bSnbeams //------------------------------------------------------------------------------ 987d8d0e25Snbeams // Apply basis 997d8d0e25Snbeams //------------------------------------------------------------------------------ 100437930d1SJeremy L Thompson int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, 101437930d1SJeremy L Thompson CeedTransposeMode t_mode, 102437930d1SJeremy L Thompson CeedEvalMode eval_mode, CeedVector u, 1037d8d0e25Snbeams CeedVector v) { 1047d8d0e25Snbeams int ierr; 1057d8d0e25Snbeams Ceed ceed; 106e15f9bd0SJeremy L Thompson ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 1076dbfb411Snbeams Ceed_Hip *ceed_Hip; 108e15f9bd0SJeremy L Thompson CeedGetData(ceed, &ceed_Hip); CeedChkBackend(ierr); 1097d8d0e25Snbeams CeedBasis_Hip_shared *data; 110e15f9bd0SJeremy L Thompson CeedBasisGetData(basis, &data); CeedChkBackend(ierr); 111437930d1SJeremy L Thompson const CeedInt transpose = t_mode == CEED_TRANSPOSE; 112437930d1SJeremy L Thompson CeedInt dim, num_comp; 113e15f9bd0SJeremy L Thompson ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); 114437930d1SJeremy L Thompson ierr = CeedBasisGetNumComponents(basis, &num_comp); CeedChkBackend(ierr); 1157d8d0e25Snbeams 1167d8d0e25Snbeams // Read vectors 1177d8d0e25Snbeams const CeedScalar *d_u; 1187d8d0e25Snbeams CeedScalar *d_v; 119437930d1SJeremy L Thompson if (eval_mode != CEED_EVAL_WEIGHT) { 120e15f9bd0SJeremy L Thompson ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChkBackend(ierr); 1217d8d0e25Snbeams } 1229c774eddSJeremy L Thompson ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr); 1237d8d0e25Snbeams 1247d8d0e25Snbeams // Clear v for transpose mode 125437930d1SJeremy L Thompson if (t_mode == CEED_TRANSPOSE) { 1267d8d0e25Snbeams CeedInt length; 127e15f9bd0SJeremy L Thompson ierr = CeedVectorGetLength(v, &length); CeedChkBackend(ierr); 128e15f9bd0SJeremy L Thompson ierr = hipMemset(d_v, 0, length * sizeof(CeedScalar)); CeedChkBackend(ierr); 1297d8d0e25Snbeams } 1307d8d0e25Snbeams 1317d8d0e25Snbeams // Apply basis operation 132437930d1SJeremy L Thompson switch (eval_mode) { 1337d8d0e25Snbeams case CEED_EVAL_INTERP: { 134437930d1SJeremy L Thompson CeedInt P_1d, Q_1d; 135437930d1SJeremy L Thompson CeedInt block_size = data->block_sizes[0]; 136437930d1SJeremy L Thompson ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr); 137437930d1SJeremy L Thompson ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr); 138437930d1SJeremy L Thompson CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); 139437930d1SJeremy L Thompson void *interp_args[] = {(void *) &num_elem, (void *) &transpose, &data->d_interp_1d, 1407d8d0e25Snbeams &d_u, &d_v 1417d8d0e25Snbeams }; 1427d8d0e25Snbeams if (dim == 1) { 143437930d1SJeremy L Thompson CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64; 144437930d1SJeremy L Thompson elems_per_block = elems_per_block > 0 ? elems_per_block : 1; 145437930d1SJeremy L Thompson CeedInt grid = num_elem / elems_per_block + 146437930d1SJeremy L Thompson ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 147437930d1SJeremy L Thompson CeedInt shared_mem = elems_per_block*thread_1d*sizeof(CeedScalar); 148437930d1SJeremy L Thompson ierr = CeedRunKernelDimSharedHip(ceed, data->Interp, grid, thread_1d, 1, 149437930d1SJeremy L Thompson elems_per_block, shared_mem, 150437930d1SJeremy L Thompson interp_args); CeedChkBackend(ierr); 1517d8d0e25Snbeams } else if (dim == 2) { 1529e31c45bSnbeams // Check if required threads is small enough to do multiple elems 153437930d1SJeremy L Thompson const CeedInt elems_per_block = CeedIntMax(block_size / 154437930d1SJeremy L Thompson (thread_1d*thread_1d*num_comp), 1); 155437930d1SJeremy L Thompson CeedInt grid = num_elem / elems_per_block + 156437930d1SJeremy L Thompson ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 157437930d1SJeremy L Thompson CeedInt shared_mem = num_comp*elems_per_block*thread_1d*thread_1d*sizeof( 158437930d1SJeremy L Thompson CeedScalar); 159437930d1SJeremy L Thompson ierr = CeedRunKernelDimSharedHip(ceed, data->Interp, grid, thread_1d, thread_1d, 160437930d1SJeremy L Thompson num_comp*elems_per_block, shared_mem, 161437930d1SJeremy L Thompson interp_args); CeedChkBackend(ierr); 1627d8d0e25Snbeams } else if (dim == 3) { 163437930d1SJeremy L Thompson CeedInt elems_per_block = 1; 164437930d1SJeremy L Thompson CeedInt grid = num_elem / elems_per_block + 165437930d1SJeremy L Thompson ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 166437930d1SJeremy L Thompson CeedInt shared_mem = num_comp*elems_per_block*thread_1d*thread_1d*sizeof( 167437930d1SJeremy L Thompson CeedScalar); 168437930d1SJeremy L Thompson ierr = CeedRunKernelDimSharedHip(ceed, data->Interp, grid, thread_1d, thread_1d, 169437930d1SJeremy L Thompson num_comp*elems_per_block, shared_mem, 170437930d1SJeremy L Thompson interp_args); CeedChkBackend(ierr); 1717d8d0e25Snbeams } 1727d8d0e25Snbeams } break; 1737d8d0e25Snbeams case CEED_EVAL_GRAD: { 174437930d1SJeremy L Thompson CeedInt P_1d, Q_1d; 175437930d1SJeremy L Thompson CeedInt block_size = data->block_sizes[1]; 176437930d1SJeremy L Thompson ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr); 177437930d1SJeremy L Thompson ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr); 178437930d1SJeremy L Thompson CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); 179437930d1SJeremy L Thompson void *grad_args[] = {(void *) &num_elem, (void *) &transpose, &data->d_interp_1d, 180437930d1SJeremy L Thompson &data->d_grad_1d, &d_u, &d_v 1817d8d0e25Snbeams }; 1827d8d0e25Snbeams if (dim == 1) { 183437930d1SJeremy L Thompson CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64; 184437930d1SJeremy L Thompson elems_per_block = elems_per_block > 0 ? elems_per_block : 1; 185437930d1SJeremy L Thompson CeedInt grid = num_elem / elems_per_block + 186437930d1SJeremy L Thompson ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 187437930d1SJeremy L Thompson CeedInt shared_mem = elems_per_block*thread_1d*sizeof(CeedScalar); 188437930d1SJeremy L Thompson ierr = CeedRunKernelDimSharedHip(ceed, data->Grad, grid, thread_1d, 1, 189437930d1SJeremy L Thompson elems_per_block, shared_mem, grad_args); 190e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1917d8d0e25Snbeams } else if (dim == 2) { 1929e31c45bSnbeams // Check if required threads is small enough to do multiple elems 193437930d1SJeremy L Thompson const CeedInt elems_per_block = CeedIntMax(block_size/ 194437930d1SJeremy L Thompson (thread_1d*thread_1d*num_comp), 1); 195437930d1SJeremy L Thompson CeedInt grid = num_elem / elems_per_block + 196437930d1SJeremy L Thompson ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 197437930d1SJeremy L Thompson CeedInt shared_mem = num_comp*elems_per_block*thread_1d*thread_1d*sizeof( 198437930d1SJeremy L Thompson CeedScalar); 199437930d1SJeremy L Thompson ierr = CeedRunKernelDimSharedHip(ceed, data->Grad, grid, thread_1d, thread_1d, 200437930d1SJeremy L Thompson num_comp*elems_per_block, shared_mem, 201437930d1SJeremy L Thompson grad_args); CeedChkBackend(ierr); 2027d8d0e25Snbeams } else if (dim == 3) { 203437930d1SJeremy L Thompson CeedInt elems_per_block = 1; 204437930d1SJeremy L Thompson CeedInt grid = num_elem / elems_per_block + 205437930d1SJeremy L Thompson ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 206437930d1SJeremy L Thompson CeedInt shared_mem = num_comp*elems_per_block*thread_1d*thread_1d*sizeof( 207437930d1SJeremy L Thompson CeedScalar); 208437930d1SJeremy L Thompson ierr = CeedRunKernelDimSharedHip(ceed, data->Grad, grid, thread_1d, thread_1d, 209437930d1SJeremy L Thompson num_comp*elems_per_block, shared_mem, 210437930d1SJeremy L Thompson grad_args); CeedChkBackend(ierr); 2117d8d0e25Snbeams } 2127d8d0e25Snbeams } break; 2137d8d0e25Snbeams case CEED_EVAL_WEIGHT: { 214437930d1SJeremy L Thompson CeedInt Q_1d; 215437930d1SJeremy L Thompson CeedInt block_size = data->block_sizes[2]; 216437930d1SJeremy L Thompson ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr); 217437930d1SJeremy L Thompson void *weight_args[] = {(void *) &num_elem, (void *) &data->d_q_weight_1d, &d_v}; 2187d8d0e25Snbeams if (dim == 1) { 219437930d1SJeremy L Thompson const CeedInt opt_elems = block_size / Q_1d; 220437930d1SJeremy L Thompson const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; 221437930d1SJeremy L Thompson const CeedInt grid_size = num_elem / elems_per_block + 222437930d1SJeremy L Thompson ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 223437930d1SJeremy L Thompson ierr = CeedRunKernelDimHip(ceed, data->Weight, grid_size, Q_1d, 224437930d1SJeremy L Thompson elems_per_block, 1, weight_args); 225e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 2267d8d0e25Snbeams } else if (dim == 2) { 227437930d1SJeremy L Thompson const CeedInt opt_elems = block_size / (Q_1d * Q_1d); 228437930d1SJeremy L Thompson const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; 229437930d1SJeremy L Thompson const CeedInt grid_size = num_elem / elems_per_block + 230437930d1SJeremy L Thompson ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 231437930d1SJeremy L Thompson ierr = CeedRunKernelDimHip(ceed, data->Weight, grid_size, Q_1d, Q_1d, 232437930d1SJeremy L Thompson elems_per_block, weight_args); 233e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 2347d8d0e25Snbeams } else if (dim == 3) { 235437930d1SJeremy L Thompson const CeedInt grid_size = num_elem; 236437930d1SJeremy L Thompson ierr = CeedRunKernelDimHip(ceed, data->Weight, grid_size, Q_1d, Q_1d, Q_1d, 237437930d1SJeremy L Thompson weight_args); 238e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 2397d8d0e25Snbeams } 2407d8d0e25Snbeams } break; 2417d8d0e25Snbeams // LCOV_EXCL_START 2427d8d0e25Snbeams // Evaluate the divergence to/from the quadrature points 2437d8d0e25Snbeams case CEED_EVAL_DIV: 244e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); 2457d8d0e25Snbeams // Evaluate the curl to/from the quadrature points 2467d8d0e25Snbeams case CEED_EVAL_CURL: 247e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); 2487d8d0e25Snbeams // Take no action, BasisApply should not have been called 2497d8d0e25Snbeams case CEED_EVAL_NONE: 250e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, 2517d8d0e25Snbeams "CEED_EVAL_NONE does not make sense in this context"); 2527d8d0e25Snbeams // LCOV_EXCL_STOP 2537d8d0e25Snbeams } 2547d8d0e25Snbeams 2557d8d0e25Snbeams // Restore vectors 256437930d1SJeremy L Thompson if (eval_mode != CEED_EVAL_WEIGHT) { 257e15f9bd0SJeremy L Thompson ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr); 2587d8d0e25Snbeams } 259e15f9bd0SJeremy L Thompson ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr); 260e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 2617d8d0e25Snbeams } 2627d8d0e25Snbeams 2637d8d0e25Snbeams //------------------------------------------------------------------------------ 2647d8d0e25Snbeams // Destroy basis 2657d8d0e25Snbeams //------------------------------------------------------------------------------ 2667d8d0e25Snbeams static int CeedBasisDestroy_Hip_shared(CeedBasis basis) { 2677d8d0e25Snbeams int ierr; 2687d8d0e25Snbeams Ceed ceed; 269e15f9bd0SJeremy L Thompson ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 2707d8d0e25Snbeams 2717d8d0e25Snbeams CeedBasis_Hip_shared *data; 272e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr); 2737d8d0e25Snbeams 2747d8d0e25Snbeams CeedChk_Hip(ceed, hipModuleUnload(data->module)); 2757d8d0e25Snbeams 276437930d1SJeremy L Thompson ierr = hipFree(data->d_q_weight_1d); CeedChk_Hip(ceed, ierr); 277437930d1SJeremy L Thompson ierr = hipFree(data->d_interp_1d); CeedChk_Hip(ceed, ierr); 278437930d1SJeremy L Thompson ierr = hipFree(data->d_grad_1d); CeedChk_Hip(ceed, ierr); 279437930d1SJeremy L Thompson ierr = hipFree(data->d_collo_grad_1d); CeedChk_Hip(ceed, ierr); 280e15f9bd0SJeremy L Thompson ierr = CeedFree(&data); CeedChkBackend(ierr); 2817d8d0e25Snbeams 282e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 2837d8d0e25Snbeams } 2847d8d0e25Snbeams 2857d8d0e25Snbeams //------------------------------------------------------------------------------ 2867d8d0e25Snbeams // Create tensor basis 2877d8d0e25Snbeams //------------------------------------------------------------------------------ 288437930d1SJeremy L Thompson int CeedBasisCreateTensorH1_Hip_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, 289437930d1SJeremy L Thompson const CeedScalar *interp_1d, 290437930d1SJeremy L Thompson const CeedScalar *grad_1d, 291437930d1SJeremy L Thompson const CeedScalar *q_ref1d, 292437930d1SJeremy L Thompson const CeedScalar *q_weight_1d, 2937d8d0e25Snbeams CeedBasis basis) { 2947d8d0e25Snbeams int ierr; 2957d8d0e25Snbeams Ceed ceed; 296e15f9bd0SJeremy L Thompson ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 2977d8d0e25Snbeams CeedBasis_Hip_shared *data; 298e15f9bd0SJeremy L Thompson ierr = CeedCalloc(1, &data); CeedChkBackend(ierr); 2997d8d0e25Snbeams 3007d8d0e25Snbeams // Copy basis data to GPU 301437930d1SJeremy L Thompson const CeedInt qBytes = Q_1d * sizeof(CeedScalar); 302437930d1SJeremy L Thompson ierr = hipMalloc((void **)&data->d_q_weight_1d, qBytes); 303437930d1SJeremy L Thompson CeedChk_Hip(ceed, ierr); 304437930d1SJeremy L Thompson ierr = hipMemcpy(data->d_q_weight_1d, q_weight_1d, qBytes, 3057d8d0e25Snbeams hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); 3067d8d0e25Snbeams 307437930d1SJeremy L Thompson const CeedInt iBytes = qBytes * P_1d; 308437930d1SJeremy L Thompson ierr = hipMalloc((void **)&data->d_interp_1d, iBytes); CeedChk_Hip(ceed, ierr); 309437930d1SJeremy L Thompson ierr = hipMemcpy(data->d_interp_1d, interp_1d, iBytes, 3107d8d0e25Snbeams hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); 3117d8d0e25Snbeams 312437930d1SJeremy L Thompson ierr = hipMalloc((void **)&data->d_grad_1d, iBytes); CeedChk_Hip(ceed, ierr); 313437930d1SJeremy L Thompson ierr = hipMemcpy(data->d_grad_1d, grad_1d, iBytes, 3147d8d0e25Snbeams hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); 3157d8d0e25Snbeams 3167d8d0e25Snbeams // Compute collocated gradient and copy to GPU 317437930d1SJeremy L Thompson data->d_collo_grad_1d = NULL; 318437930d1SJeremy L Thompson if (dim == 3 && Q_1d >= P_1d) { 319437930d1SJeremy L Thompson CeedScalar *collo_grad_1d; 320437930d1SJeremy L Thompson ierr = CeedMalloc(Q_1d*Q_1d, &collo_grad_1d); CeedChkBackend(ierr); 321437930d1SJeremy L Thompson ierr = CeedBasisGetCollocatedGrad(basis, collo_grad_1d); CeedChkBackend(ierr); 322437930d1SJeremy L Thompson ierr = hipMalloc((void **)&data->d_collo_grad_1d, qBytes * Q_1d); 3237d8d0e25Snbeams CeedChk_Hip(ceed, ierr); 324437930d1SJeremy L Thompson ierr = hipMemcpy(data->d_collo_grad_1d, collo_grad_1d, qBytes * Q_1d, 3257d8d0e25Snbeams hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); 326437930d1SJeremy L Thompson ierr = CeedFree(&collo_grad_1d); CeedChkBackend(ierr); 3277d8d0e25Snbeams } 3287d8d0e25Snbeams 3299e31c45bSnbeams // Set number of threads per block for basis kernels 330437930d1SJeremy L Thompson CeedInt num_comp; 331437930d1SJeremy L Thompson ierr = CeedBasisGetNumComponents(basis, &num_comp); CeedChkBackend(ierr); 332437930d1SJeremy L Thompson ierr = ComputeBasisThreadBlockSizes(dim, P_1d, Q_1d, num_comp, 333437930d1SJeremy L Thompson data->block_sizes); 334e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 3359e31c45bSnbeams 3369e31c45bSnbeams // Compile basis kernels 337437930d1SJeremy L Thompson char *basis_kernel_path, *basis_kernel_source; 338437930d1SJeremy L Thompson ierr = CeedPathConcatenate(ceed, __FILE__, "kernels/hip-shared-basis.h", 339437930d1SJeremy L Thompson &basis_kernel_path); CeedChkBackend(ierr); 340*46dc0734SJeremy L Thompson CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source -----\n"); 341437930d1SJeremy L Thompson ierr = CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source); 342437930d1SJeremy L Thompson CeedChkBackend(ierr); 343*46dc0734SJeremy L Thompson CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source Complete! -----\n"); 344437930d1SJeremy L Thompson ierr = CeedCompileHip(ceed, basis_kernel_source, &data->module, 11, 345437930d1SJeremy L Thompson "Q1D", Q_1d, 346437930d1SJeremy L Thompson "P1D", P_1d, 347437930d1SJeremy L Thompson "T1D", CeedIntMax(Q_1d, P_1d), 348437930d1SJeremy L Thompson "BASIS_BUF_LEN", num_comp * CeedIntPow(Q_1d > P_1d ? 349437930d1SJeremy L Thompson Q_1d : P_1d, dim), 3507d8d0e25Snbeams "BASIS_DIM", dim, 351437930d1SJeremy L Thompson "BASIS_NCOMP", num_comp, 352437930d1SJeremy L Thompson "BASIS_ELEMSIZE", CeedIntPow(P_1d, dim), 353437930d1SJeremy L Thompson "BASIS_NQPT", CeedIntPow(Q_1d, dim), 354437930d1SJeremy L Thompson "INTERP_BLKSIZE", data->block_sizes[0], 355437930d1SJeremy L Thompson "GRAD_BLKSIZE", data->block_sizes[1], 356437930d1SJeremy L Thompson "WEIGHT_BLKSIZE", data->block_sizes[2] 357e15f9bd0SJeremy L Thompson ); CeedChkBackend(ierr); 358437930d1SJeremy L Thompson ierr = CeedGetKernelHip(ceed, data->module, "Interp", &data->Interp); 359e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 360437930d1SJeremy L Thompson ierr = CeedGetKernelHip(ceed, data->module, "Grad", &data->Grad); 361e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 362437930d1SJeremy L Thompson ierr = CeedGetKernelHip(ceed, data->module, "Weight", &data->Weight); 363e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 364437930d1SJeremy L Thompson ierr = CeedFree(&basis_kernel_path); CeedChkBackend(ierr); 365437930d1SJeremy L Thompson ierr = CeedFree(&basis_kernel_source); CeedChkBackend(ierr); 3667d8d0e25Snbeams 367e15f9bd0SJeremy L Thompson ierr = CeedBasisSetData(basis, data); CeedChkBackend(ierr); 3687d8d0e25Snbeams 3697d8d0e25Snbeams // Register backend functions 3707d8d0e25Snbeams ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply", 3717d8d0e25Snbeams CeedBasisApplyTensor_Hip_shared); 372e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 3737d8d0e25Snbeams ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", 374e15f9bd0SJeremy L Thompson CeedBasisDestroy_Hip_shared); CeedChkBackend(ierr); 375e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 3767d8d0e25Snbeams } 3777d8d0e25Snbeams //------------------------------------------------------------------------------ 378