17d8d0e25Snbeams // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 27d8d0e25Snbeams // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 37d8d0e25Snbeams // All Rights reserved. See files LICENSE and NOTICE for details. 47d8d0e25Snbeams // 57d8d0e25Snbeams // This file is part of CEED, a collection of benchmarks, miniapps, software 67d8d0e25Snbeams // libraries and APIs for efficient high-order finite element and spectral 77d8d0e25Snbeams // element discretizations for exascale applications. For more information and 87d8d0e25Snbeams // source code availability see http://github.com/ceed. 97d8d0e25Snbeams // 107d8d0e25Snbeams // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 117d8d0e25Snbeams // a collaborative effort of two U.S. Department of Energy organizations (Office 127d8d0e25Snbeams // of Science and the National Nuclear Security Administration) responsible for 137d8d0e25Snbeams // the planning and preparation of a capable exascale ecosystem, including 147d8d0e25Snbeams // software, applications, hardware, advanced system engineering and early 157d8d0e25Snbeams // testbed platforms, in support of the nation's exascale computing imperative. 167d8d0e25Snbeams 17ec3da8bcSJed Brown #include <ceed/ceed.h> 18ec3da8bcSJed Brown #include <ceed/backend.h> 19*437930d1SJeremy L Thompson #include <ceed/jit-tools.h> 203d576824SJeremy L Thompson #include <hip/hip_runtime.h> 213d576824SJeremy L Thompson #include <stddef.h> 227d8d0e25Snbeams #include "ceed-hip-shared.h" 237fcac036SJeremy L Thompson #include "../hip/ceed-hip-common.h" 247d8d0e25Snbeams #include "../hip/ceed-hip-compile.h" 257d8d0e25Snbeams 267d8d0e25Snbeams //------------------------------------------------------------------------------ 279e31c45bSnbeams // Compute a block size based on required minimum threads 289e31c45bSnbeams //------------------------------------------------------------------------------ 299e31c45bSnbeams static CeedInt ComputeBlockSizeFromRequirement(const CeedInt required) { 309e31c45bSnbeams CeedInt maxSize = 1024; // Max total threads per block 319e31c45bSnbeams CeedInt currentSize = 64; // Start with one group 329e31c45bSnbeams 339e31c45bSnbeams while(currentSize < maxSize) { 349e31c45bSnbeams if (currentSize > required) 359e31c45bSnbeams break; 369e31c45bSnbeams else 379e31c45bSnbeams currentSize = currentSize * 2; 389e31c45bSnbeams } 399e31c45bSnbeams return currentSize; 409e31c45bSnbeams } 419e31c45bSnbeams 429e31c45bSnbeams //------------------------------------------------------------------------------ 439e31c45bSnbeams // Compute required thread block sizes for basis kernels given P, Q, dim, and 44*437930d1SJeremy L Thompson // num_comp 459e31c45bSnbeams //------------------------------------------------------------------------------ 46*437930d1SJeremy L Thompson static int ComputeBasisThreadBlockSizes(const CeedInt dim, const CeedInt P_1d, 47*437930d1SJeremy L Thompson const CeedInt Q_1d, 48*437930d1SJeremy L Thompson const CeedInt num_comp, CeedInt *block_sizes) { 499e31c45bSnbeams 509e31c45bSnbeams // Note that this will use the same block sizes for all dimensions when compiling, 519e31c45bSnbeams // but as each basis object is defined for a particular dimension, we will never 529e31c45bSnbeams // call any kernels except the ones for the dimension for which we have computed the 539e31c45bSnbeams // block sizes. 54*437930d1SJeremy L Thompson const CeedInt thread_1d = CeedIntMax(P_1d, Q_1d); 559e31c45bSnbeams switch (dim) { 569e31c45bSnbeams case 1: { 579e31c45bSnbeams // Interp kernels: 58*437930d1SJeremy L Thompson block_sizes[0] = 256; 599e31c45bSnbeams 609e31c45bSnbeams // Grad kernels: 61*437930d1SJeremy L Thompson block_sizes[1] = 256; 629e31c45bSnbeams 639e31c45bSnbeams // Weight kernels: 64*437930d1SJeremy L Thompson block_sizes[2] = 256; 659e31c45bSnbeams 669e31c45bSnbeams } break; 679e31c45bSnbeams case 2: { 689e31c45bSnbeams // Interp kernels: 69*437930d1SJeremy L Thompson CeedInt required = thread_1d * thread_1d * num_comp; 70*437930d1SJeremy L Thompson block_sizes[0] = ComputeBlockSizeFromRequirement(required); 719e31c45bSnbeams 729e31c45bSnbeams // Grad kernels: currently use same required minimum threads 73*437930d1SJeremy L Thompson block_sizes[1] = ComputeBlockSizeFromRequirement(required); 749e31c45bSnbeams 759e31c45bSnbeams // Weight kernels: 76*437930d1SJeremy L Thompson required = CeedIntMax(64, Q_1d * Q_1d); 77*437930d1SJeremy L Thompson block_sizes[2] = ComputeBlockSizeFromRequirement(required); 789e31c45bSnbeams 799e31c45bSnbeams } break; 809e31c45bSnbeams case 3: { 819e31c45bSnbeams // Interp kernels: 82*437930d1SJeremy L Thompson CeedInt required = thread_1d * thread_1d * num_comp; 83*437930d1SJeremy L Thompson block_sizes[0] = ComputeBlockSizeFromRequirement(required); 849e31c45bSnbeams 859e31c45bSnbeams // Grad kernels: currently use same required minimum threads 86*437930d1SJeremy L Thompson block_sizes[1] = ComputeBlockSizeFromRequirement(required); 879e31c45bSnbeams 889e31c45bSnbeams // Weight kernels: 89*437930d1SJeremy L Thompson required = Q_1d * Q_1d * Q_1d; 90*437930d1SJeremy L Thompson block_sizes[2] = ComputeBlockSizeFromRequirement(required); 919e31c45bSnbeams } 929e31c45bSnbeams } 939e31c45bSnbeams 94e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 959e31c45bSnbeams } 969e31c45bSnbeams 979e31c45bSnbeams //------------------------------------------------------------------------------ 987d8d0e25Snbeams // Apply basis 997d8d0e25Snbeams //------------------------------------------------------------------------------ 100*437930d1SJeremy L Thompson int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, 101*437930d1SJeremy L Thompson CeedTransposeMode t_mode, 102*437930d1SJeremy L Thompson CeedEvalMode eval_mode, CeedVector u, 1037d8d0e25Snbeams CeedVector v) { 1047d8d0e25Snbeams int ierr; 1057d8d0e25Snbeams Ceed ceed; 106e15f9bd0SJeremy L Thompson ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 1076dbfb411Snbeams Ceed_Hip *ceed_Hip; 108e15f9bd0SJeremy L Thompson CeedGetData(ceed, &ceed_Hip); CeedChkBackend(ierr); 1097d8d0e25Snbeams CeedBasis_Hip_shared *data; 110e15f9bd0SJeremy L Thompson CeedBasisGetData(basis, &data); CeedChkBackend(ierr); 111*437930d1SJeremy L Thompson const CeedInt transpose = t_mode == CEED_TRANSPOSE; 112*437930d1SJeremy L Thompson CeedInt dim, num_comp; 113e15f9bd0SJeremy L Thompson ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); 114*437930d1SJeremy L Thompson ierr = CeedBasisGetNumComponents(basis, &num_comp); CeedChkBackend(ierr); 1157d8d0e25Snbeams 1167d8d0e25Snbeams // Read vectors 1177d8d0e25Snbeams const CeedScalar *d_u; 1187d8d0e25Snbeams CeedScalar *d_v; 119*437930d1SJeremy L Thompson if (eval_mode != CEED_EVAL_WEIGHT) { 120e15f9bd0SJeremy L Thompson ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChkBackend(ierr); 1217d8d0e25Snbeams } 1229c774eddSJeremy L Thompson ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr); 1237d8d0e25Snbeams 1247d8d0e25Snbeams // Clear v for transpose mode 125*437930d1SJeremy L Thompson if (t_mode == CEED_TRANSPOSE) { 1267d8d0e25Snbeams CeedInt length; 127e15f9bd0SJeremy L Thompson ierr = CeedVectorGetLength(v, &length); CeedChkBackend(ierr); 128e15f9bd0SJeremy L Thompson ierr = hipMemset(d_v, 0, length * sizeof(CeedScalar)); CeedChkBackend(ierr); 1297d8d0e25Snbeams } 1307d8d0e25Snbeams 1317d8d0e25Snbeams // Apply basis operation 132*437930d1SJeremy L Thompson switch (eval_mode) { 1337d8d0e25Snbeams case CEED_EVAL_INTERP: { 134*437930d1SJeremy L Thompson CeedInt P_1d, Q_1d; 135*437930d1SJeremy L Thompson CeedInt block_size = data->block_sizes[0]; 136*437930d1SJeremy L Thompson ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr); 137*437930d1SJeremy L Thompson ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr); 138*437930d1SJeremy L Thompson CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); 139*437930d1SJeremy L Thompson void *interp_args[] = {(void *) &num_elem, (void *) &transpose, &data->d_interp_1d, 1407d8d0e25Snbeams &d_u, &d_v 1417d8d0e25Snbeams }; 1427d8d0e25Snbeams if (dim == 1) { 143*437930d1SJeremy L Thompson CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64; 144*437930d1SJeremy L Thompson elems_per_block = elems_per_block > 0 ? elems_per_block : 1; 145*437930d1SJeremy L Thompson CeedInt grid = num_elem / elems_per_block + 146*437930d1SJeremy L Thompson ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 147*437930d1SJeremy L Thompson CeedInt shared_mem = elems_per_block*thread_1d*sizeof(CeedScalar); 148*437930d1SJeremy L Thompson ierr = CeedRunKernelDimSharedHip(ceed, data->Interp, grid, thread_1d, 1, 149*437930d1SJeremy L Thompson elems_per_block, shared_mem, 150*437930d1SJeremy L Thompson interp_args); CeedChkBackend(ierr); 1517d8d0e25Snbeams } else if (dim == 2) { 1529e31c45bSnbeams // Check if required threads is small enough to do multiple elems 153*437930d1SJeremy L Thompson const CeedInt elems_per_block = CeedIntMax(block_size / 154*437930d1SJeremy L Thompson (thread_1d*thread_1d*num_comp), 1); 155*437930d1SJeremy L Thompson CeedInt grid = num_elem / elems_per_block + 156*437930d1SJeremy L Thompson ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 157*437930d1SJeremy L Thompson CeedInt shared_mem = num_comp*elems_per_block*thread_1d*thread_1d*sizeof( 158*437930d1SJeremy L Thompson CeedScalar); 159*437930d1SJeremy L Thompson ierr = CeedRunKernelDimSharedHip(ceed, data->Interp, grid, thread_1d, thread_1d, 160*437930d1SJeremy L Thompson num_comp*elems_per_block, shared_mem, 161*437930d1SJeremy L Thompson interp_args); CeedChkBackend(ierr); 1627d8d0e25Snbeams } else if (dim == 3) { 163*437930d1SJeremy L Thompson CeedInt elems_per_block = 1; 164*437930d1SJeremy L Thompson CeedInt grid = num_elem / elems_per_block + 165*437930d1SJeremy L Thompson ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 166*437930d1SJeremy L Thompson CeedInt shared_mem = num_comp*elems_per_block*thread_1d*thread_1d*sizeof( 167*437930d1SJeremy L Thompson CeedScalar); 168*437930d1SJeremy L Thompson ierr = CeedRunKernelDimSharedHip(ceed, data->Interp, grid, thread_1d, thread_1d, 169*437930d1SJeremy L Thompson num_comp*elems_per_block, shared_mem, 170*437930d1SJeremy L Thompson interp_args); CeedChkBackend(ierr); 1717d8d0e25Snbeams } 1727d8d0e25Snbeams } break; 1737d8d0e25Snbeams case CEED_EVAL_GRAD: { 174*437930d1SJeremy L Thompson CeedInt P_1d, Q_1d; 175*437930d1SJeremy L Thompson CeedInt block_size = data->block_sizes[1]; 176*437930d1SJeremy L Thompson ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr); 177*437930d1SJeremy L Thompson ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr); 178*437930d1SJeremy L Thompson CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); 179*437930d1SJeremy L Thompson void *grad_args[] = {(void *) &num_elem, (void *) &transpose, &data->d_interp_1d, 180*437930d1SJeremy L Thompson &data->d_grad_1d, &d_u, &d_v 1817d8d0e25Snbeams }; 1827d8d0e25Snbeams if (dim == 1) { 183*437930d1SJeremy L Thompson CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64; 184*437930d1SJeremy L Thompson elems_per_block = elems_per_block > 0 ? elems_per_block : 1; 185*437930d1SJeremy L Thompson CeedInt grid = num_elem / elems_per_block + 186*437930d1SJeremy L Thompson ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 187*437930d1SJeremy L Thompson CeedInt shared_mem = elems_per_block*thread_1d*sizeof(CeedScalar); 188*437930d1SJeremy L Thompson ierr = CeedRunKernelDimSharedHip(ceed, data->Grad, grid, thread_1d, 1, 189*437930d1SJeremy L Thompson elems_per_block, shared_mem, grad_args); 190e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1917d8d0e25Snbeams } else if (dim == 2) { 1929e31c45bSnbeams // Check if required threads is small enough to do multiple elems 193*437930d1SJeremy L Thompson const CeedInt elems_per_block = CeedIntMax(block_size/ 194*437930d1SJeremy L Thompson (thread_1d*thread_1d*num_comp), 1); 195*437930d1SJeremy L Thompson CeedInt grid = num_elem / elems_per_block + 196*437930d1SJeremy L Thompson ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 197*437930d1SJeremy L Thompson CeedInt shared_mem = num_comp*elems_per_block*thread_1d*thread_1d*sizeof( 198*437930d1SJeremy L Thompson CeedScalar); 199*437930d1SJeremy L Thompson ierr = CeedRunKernelDimSharedHip(ceed, data->Grad, grid, thread_1d, thread_1d, 200*437930d1SJeremy L Thompson num_comp*elems_per_block, shared_mem, 201*437930d1SJeremy L Thompson grad_args); CeedChkBackend(ierr); 2027d8d0e25Snbeams } else if (dim == 3) { 203*437930d1SJeremy L Thompson CeedInt elems_per_block = 1; 204*437930d1SJeremy L Thompson CeedInt grid = num_elem / elems_per_block + 205*437930d1SJeremy L Thompson ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 206*437930d1SJeremy L Thompson CeedInt shared_mem = num_comp*elems_per_block*thread_1d*thread_1d*sizeof( 207*437930d1SJeremy L Thompson CeedScalar); 208*437930d1SJeremy L Thompson ierr = CeedRunKernelDimSharedHip(ceed, data->Grad, grid, thread_1d, thread_1d, 209*437930d1SJeremy L Thompson num_comp*elems_per_block, shared_mem, 210*437930d1SJeremy L Thompson grad_args); CeedChkBackend(ierr); 2117d8d0e25Snbeams } 2127d8d0e25Snbeams } break; 2137d8d0e25Snbeams case CEED_EVAL_WEIGHT: { 214*437930d1SJeremy L Thompson CeedInt Q_1d; 215*437930d1SJeremy L Thompson CeedInt block_size = data->block_sizes[2]; 216*437930d1SJeremy L Thompson ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr); 217*437930d1SJeremy L Thompson void *weight_args[] = {(void *) &num_elem, (void *) &data->d_q_weight_1d, &d_v}; 2187d8d0e25Snbeams if (dim == 1) { 219*437930d1SJeremy L Thompson const CeedInt opt_elems = block_size / Q_1d; 220*437930d1SJeremy L Thompson const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; 221*437930d1SJeremy L Thompson const CeedInt grid_size = num_elem / elems_per_block + 222*437930d1SJeremy L Thompson ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 223*437930d1SJeremy L Thompson ierr = CeedRunKernelDimHip(ceed, data->Weight, grid_size, Q_1d, 224*437930d1SJeremy L Thompson elems_per_block, 1, weight_args); 225e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 2267d8d0e25Snbeams } else if (dim == 2) { 227*437930d1SJeremy L Thompson const CeedInt opt_elems = block_size / (Q_1d * Q_1d); 228*437930d1SJeremy L Thompson const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; 229*437930d1SJeremy L Thompson const CeedInt grid_size = num_elem / elems_per_block + 230*437930d1SJeremy L Thompson ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 ); 231*437930d1SJeremy L Thompson ierr = CeedRunKernelDimHip(ceed, data->Weight, grid_size, Q_1d, Q_1d, 232*437930d1SJeremy L Thompson elems_per_block, weight_args); 233e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 2347d8d0e25Snbeams } else if (dim == 3) { 235*437930d1SJeremy L Thompson const CeedInt grid_size = num_elem; 236*437930d1SJeremy L Thompson ierr = CeedRunKernelDimHip(ceed, data->Weight, grid_size, Q_1d, Q_1d, Q_1d, 237*437930d1SJeremy L Thompson weight_args); 238e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 2397d8d0e25Snbeams } 2407d8d0e25Snbeams } break; 2417d8d0e25Snbeams // LCOV_EXCL_START 2427d8d0e25Snbeams // Evaluate the divergence to/from the quadrature points 2437d8d0e25Snbeams case CEED_EVAL_DIV: 244e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); 2457d8d0e25Snbeams // Evaluate the curl to/from the quadrature points 2467d8d0e25Snbeams case CEED_EVAL_CURL: 247e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); 2487d8d0e25Snbeams // Take no action, BasisApply should not have been called 2497d8d0e25Snbeams case CEED_EVAL_NONE: 250e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, 2517d8d0e25Snbeams "CEED_EVAL_NONE does not make sense in this context"); 2527d8d0e25Snbeams // LCOV_EXCL_STOP 2537d8d0e25Snbeams } 2547d8d0e25Snbeams 2557d8d0e25Snbeams // Restore vectors 256*437930d1SJeremy L Thompson if (eval_mode != CEED_EVAL_WEIGHT) { 257e15f9bd0SJeremy L Thompson ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr); 2587d8d0e25Snbeams } 259e15f9bd0SJeremy L Thompson ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr); 260e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 2617d8d0e25Snbeams } 2627d8d0e25Snbeams 2637d8d0e25Snbeams //------------------------------------------------------------------------------ 2647d8d0e25Snbeams // Destroy basis 2657d8d0e25Snbeams //------------------------------------------------------------------------------ 2667d8d0e25Snbeams static int CeedBasisDestroy_Hip_shared(CeedBasis basis) { 2677d8d0e25Snbeams int ierr; 2687d8d0e25Snbeams Ceed ceed; 269e15f9bd0SJeremy L Thompson ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 2707d8d0e25Snbeams 2717d8d0e25Snbeams CeedBasis_Hip_shared *data; 272e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr); 2737d8d0e25Snbeams 2747d8d0e25Snbeams CeedChk_Hip(ceed, hipModuleUnload(data->module)); 2757d8d0e25Snbeams 276*437930d1SJeremy L Thompson ierr = hipFree(data->d_q_weight_1d); CeedChk_Hip(ceed, ierr); 277*437930d1SJeremy L Thompson ierr = hipFree(data->d_interp_1d); CeedChk_Hip(ceed, ierr); 278*437930d1SJeremy L Thompson ierr = hipFree(data->d_grad_1d); CeedChk_Hip(ceed, ierr); 279*437930d1SJeremy L Thompson ierr = hipFree(data->d_collo_grad_1d); CeedChk_Hip(ceed, ierr); 280e15f9bd0SJeremy L Thompson ierr = CeedFree(&data); CeedChkBackend(ierr); 2817d8d0e25Snbeams 282e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 2837d8d0e25Snbeams } 2847d8d0e25Snbeams 2857d8d0e25Snbeams //------------------------------------------------------------------------------ 2867d8d0e25Snbeams // Create tensor basis 2877d8d0e25Snbeams //------------------------------------------------------------------------------ 288*437930d1SJeremy L Thompson int CeedBasisCreateTensorH1_Hip_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, 289*437930d1SJeremy L Thompson const CeedScalar *interp_1d, 290*437930d1SJeremy L Thompson const CeedScalar *grad_1d, 291*437930d1SJeremy L Thompson const CeedScalar *q_ref1d, 292*437930d1SJeremy L Thompson const CeedScalar *q_weight_1d, 2937d8d0e25Snbeams CeedBasis basis) { 2947d8d0e25Snbeams int ierr; 2957d8d0e25Snbeams Ceed ceed; 296e15f9bd0SJeremy L Thompson ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 2977d8d0e25Snbeams CeedBasis_Hip_shared *data; 298e15f9bd0SJeremy L Thompson ierr = CeedCalloc(1, &data); CeedChkBackend(ierr); 2997d8d0e25Snbeams 3007d8d0e25Snbeams // Copy basis data to GPU 301*437930d1SJeremy L Thompson const CeedInt qBytes = Q_1d * sizeof(CeedScalar); 302*437930d1SJeremy L Thompson ierr = hipMalloc((void **)&data->d_q_weight_1d, qBytes); 303*437930d1SJeremy L Thompson CeedChk_Hip(ceed, ierr); 304*437930d1SJeremy L Thompson ierr = hipMemcpy(data->d_q_weight_1d, q_weight_1d, qBytes, 3057d8d0e25Snbeams hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); 3067d8d0e25Snbeams 307*437930d1SJeremy L Thompson const CeedInt iBytes = qBytes * P_1d; 308*437930d1SJeremy L Thompson ierr = hipMalloc((void **)&data->d_interp_1d, iBytes); CeedChk_Hip(ceed, ierr); 309*437930d1SJeremy L Thompson ierr = hipMemcpy(data->d_interp_1d, interp_1d, iBytes, 3107d8d0e25Snbeams hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); 3117d8d0e25Snbeams 312*437930d1SJeremy L Thompson ierr = hipMalloc((void **)&data->d_grad_1d, iBytes); CeedChk_Hip(ceed, ierr); 313*437930d1SJeremy L Thompson ierr = hipMemcpy(data->d_grad_1d, grad_1d, iBytes, 3147d8d0e25Snbeams hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); 3157d8d0e25Snbeams 3167d8d0e25Snbeams // Compute collocated gradient and copy to GPU 317*437930d1SJeremy L Thompson data->d_collo_grad_1d = NULL; 318*437930d1SJeremy L Thompson if (dim == 3 && Q_1d >= P_1d) { 319*437930d1SJeremy L Thompson CeedScalar *collo_grad_1d; 320*437930d1SJeremy L Thompson ierr = CeedMalloc(Q_1d*Q_1d, &collo_grad_1d); CeedChkBackend(ierr); 321*437930d1SJeremy L Thompson ierr = CeedBasisGetCollocatedGrad(basis, collo_grad_1d); CeedChkBackend(ierr); 322*437930d1SJeremy L Thompson ierr = hipMalloc((void **)&data->d_collo_grad_1d, qBytes * Q_1d); 3237d8d0e25Snbeams CeedChk_Hip(ceed, ierr); 324*437930d1SJeremy L Thompson ierr = hipMemcpy(data->d_collo_grad_1d, collo_grad_1d, qBytes * Q_1d, 3257d8d0e25Snbeams hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); 326*437930d1SJeremy L Thompson ierr = CeedFree(&collo_grad_1d); CeedChkBackend(ierr); 3277d8d0e25Snbeams } 3287d8d0e25Snbeams 3299e31c45bSnbeams // Set number of threads per block for basis kernels 330*437930d1SJeremy L Thompson CeedInt num_comp; 331*437930d1SJeremy L Thompson ierr = CeedBasisGetNumComponents(basis, &num_comp); CeedChkBackend(ierr); 332*437930d1SJeremy L Thompson ierr = ComputeBasisThreadBlockSizes(dim, P_1d, Q_1d, num_comp, 333*437930d1SJeremy L Thompson data->block_sizes); 334e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 3359e31c45bSnbeams 3369e31c45bSnbeams // Compile basis kernels 337*437930d1SJeremy L Thompson char *basis_kernel_path, *basis_kernel_source; 338*437930d1SJeremy L Thompson ierr = CeedPathConcatenate(ceed, __FILE__, "kernels/hip-shared-basis.h", 339*437930d1SJeremy L Thompson &basis_kernel_path); CeedChkBackend(ierr); 340*437930d1SJeremy L Thompson ierr = CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source); 341*437930d1SJeremy L Thompson CeedChkBackend(ierr); 342*437930d1SJeremy L Thompson ierr = CeedCompileHip(ceed, basis_kernel_source, &data->module, 11, 343*437930d1SJeremy L Thompson "Q1D", Q_1d, 344*437930d1SJeremy L Thompson "P1D", P_1d, 345*437930d1SJeremy L Thompson "T1D", CeedIntMax(Q_1d, P_1d), 346*437930d1SJeremy L Thompson "BASIS_BUF_LEN", num_comp * CeedIntPow(Q_1d > P_1d ? 347*437930d1SJeremy L Thompson Q_1d : P_1d, dim), 3487d8d0e25Snbeams "BASIS_DIM", dim, 349*437930d1SJeremy L Thompson "BASIS_NCOMP", num_comp, 350*437930d1SJeremy L Thompson "BASIS_ELEMSIZE", CeedIntPow(P_1d, dim), 351*437930d1SJeremy L Thompson "BASIS_NQPT", CeedIntPow(Q_1d, dim), 352*437930d1SJeremy L Thompson "INTERP_BLKSIZE", data->block_sizes[0], 353*437930d1SJeremy L Thompson "GRAD_BLKSIZE", data->block_sizes[1], 354*437930d1SJeremy L Thompson "WEIGHT_BLKSIZE", data->block_sizes[2] 355e15f9bd0SJeremy L Thompson ); CeedChkBackend(ierr); 356*437930d1SJeremy L Thompson ierr = CeedGetKernelHip(ceed, data->module, "Interp", &data->Interp); 357e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 358*437930d1SJeremy L Thompson ierr = CeedGetKernelHip(ceed, data->module, "Grad", &data->Grad); 359e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 360*437930d1SJeremy L Thompson ierr = CeedGetKernelHip(ceed, data->module, "Weight", &data->Weight); 361e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 362*437930d1SJeremy L Thompson ierr = CeedFree(&basis_kernel_path); CeedChkBackend(ierr); 363*437930d1SJeremy L Thompson ierr = CeedFree(&basis_kernel_source); CeedChkBackend(ierr); 3647d8d0e25Snbeams 365e15f9bd0SJeremy L Thompson ierr = CeedBasisSetData(basis, data); CeedChkBackend(ierr); 3667d8d0e25Snbeams 3677d8d0e25Snbeams // Register backend functions 3687d8d0e25Snbeams ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply", 3697d8d0e25Snbeams CeedBasisApplyTensor_Hip_shared); 370e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 3717d8d0e25Snbeams ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", 372e15f9bd0SJeremy L Thompson CeedBasisDestroy_Hip_shared); CeedChkBackend(ierr); 373e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 3747d8d0e25Snbeams } 3757d8d0e25Snbeams //------------------------------------------------------------------------------ 376