// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at
// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights
// reserved. See files LICENSE and NOTICE for details.
//
// This file is part of CEED, a collection of benchmarks, miniapps, software
// libraries and APIs for efficient high-order finite element and spectral
// element discretizations for exascale applications. For more information and
// source code availability see http://github.com/ceed.
//
// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
// a collaborative effort of two U.S. Department of Energy organizations (Office
// of Science and the National Nuclear Security Administration) responsible for
// the planning and preparation of a capable exascale ecosystem, including
// software, applications, hardware, advanced system engineering and early
// testbed platforms, in support of the nation's exascale computing imperative.
//
//                   libCEED + PETSc Example: CEED BPs
//
// This example demonstrates a simple usage of libCEED with PETSc to solve the
// CEED BP benchmark problems, see http://ceed.exascaleproject.org/bps.
//
// The code is intentionally "raw", using only low-level communication
// primitives.
//
// Build with:
//
//     make bpsraw [PETSC_DIR=</path/to/petsc>] [CEED_DIR=</path/to/libceed>]
//
// Sample runs:
//
//     ./bpsraw -problem bp1
//     ./bpsraw -problem bp2
//     ./bpsraw -problem bp3
//     ./bpsraw -problem bp4
//     ./bpsraw -problem bp5 -ceed /cpu/self
//     ./bpsraw -problem bp6 -ceed /gpu/cuda
//
//TESTARGS -ceed {ceed_resource} -test -problem bp2 -degree 5 -q_extra 1 -ksp_max_it_clip 15,15

/// @file
/// CEED BPs example using PETSc
/// See bps.c for an implementation using DMPlex unstructured grids.
// Help string shown by PetscInitialize when run with -help.
const char help[] = "Solve CEED BPs using PETSc\n";

// NOTE(review): the angle-bracket arguments of the system #include
// directives below were stripped (text between '<' and '>' lost during
// extraction).  They presumably named the PETSc/libCEED system headers
// (petsc.h, ceed.h, stdbool.h, string.h, ...) — restore from upstream.
#include
#include
#include
#include
#include
#include "qfunctions/bps/bp1.h"
#include "qfunctions/bps/bp2.h"
#include "qfunctions/bps/bp3.h"
#include "qfunctions/bps/bp4.h"
#include "qfunctions/bps/common.h"

#if PETSC_VERSION_LT(3,12,0)
#ifdef PETSC_HAVE_CUDA
// NOTE(review): this header name was also stripped — TODO confirm against
// the upstream source before building with CUDA-enabled PETSc < 3.12.
#include
// Note: With PETSc prior to version 3.12.0, providing the source path to
// include 'cublas_v2.h' will be needed to use 'petsccuda.h'.
#endif
#endif

// Translate a PETSc memory type to the corresponding libCEED memory type.
static CeedMemType MemTypeP2C(PetscMemType mem_type) {
  return PetscMemTypeDevice(mem_type) ? CEED_MEM_DEVICE : CEED_MEM_HOST;
}

// Factor `size` into three integer factors m[0]*m[1]*m[2] == size, each the
// smallest divisor of the remaining size that is >= its cube/square root, so
// the factors are as balanced as possible.  `reverse` stores them in
// descending index order (m[2] first).
static void Split3(PetscInt size, PetscInt m[3], bool reverse) {
  for (PetscInt d=0, size_left=size; d<3; d++) {
    PetscInt try = (PetscInt)PetscCeilReal(PetscPowReal(size_left, 1./(3 - d)));
    while (try * (size_left / try) != size_left) try++;
    m[reverse ? 2-d : d] = try;
    size_left /= try;
  }
}

// Largest of the three entries of a.
static PetscInt Max3(const PetscInt a[3]) {
  return PetscMax(a[0], PetscMax(a[1], a[2]));
}

// Smallest of the three entries of a.
static PetscInt Min3(const PetscInt a[3]) {
  return PetscMin(a[0], PetscMin(a[1], a[2]));
}

// Per-direction node count owned by process i_rank in a p[0]xp[1]xp[2]
// process grid: `degree` nodes per element, plus the closing layer of nodes
// on the last process of each direction.
static void GlobalNodes(const PetscInt p[3], const PetscInt i_rank[3],
                        PetscInt degree, const PetscInt mesh_elem[3],
                        PetscInt m_nodes[3]) {
  for (int d=0; d<3; d++) m_nodes[d] = degree*mesh_elem[d] + (i_rank[d] == p[d]-1);
}

// Global index of the first node owned by process i_rank.
static PetscInt GlobalStart(const PetscInt p[3], const PetscInt i_rank[3],
                            PetscInt degree, const PetscInt mesh_elem[3]) {
  PetscInt start = 0;
  // Dumb brute-force is easier to read
  // NOTE(review): a '<...>' span was stripped here.  The fragment below
  // fuses the remainder of GlobalStart (presumably a triple loop over the
  // process grid accumulating GlobalNodes products into `start`) with the
  // opening of MatMult_Mass (its signature, locals, PetscFunctionBeginUser,
  // MatShellGetContext, and the leading VecScatterBegin call) — compare with
  // the intact sibling MatMult_Diff below.  Restore from upstream bpsraw.c.
  for (PetscInt i=0; il_to_g, X, user->X_loc, INSERT_VALUES, SCATTER_REVERSE); CHKERRQ(ierr);
  ierr = VecScatterEnd(user->l_to_g, X, user->X_loc, INSERT_VALUES, SCATTER_REVERSE); CHKERRQ(ierr);
  // Setup libCEED vectors
  ierr = VecGetArrayReadAndMemType(user->X_loc, (const PetscScalar **)&x, &x_mem_type); CHKERRQ(ierr);
  ierr = VecGetArrayAndMemType(user->Y_loc, &y, &y_mem_type); CHKERRQ(ierr);
  CeedVectorSetArray(user->x_ceed, MemTypeP2C(x_mem_type), CEED_USE_POINTER, x);
  CeedVectorSetArray(user->y_ceed,
MemTypeP2C(y_mem_type), CEED_USE_POINTER, y);
  // Apply libCEED operator
  CeedOperatorApply(user->op, user->x_ceed, user->y_ceed, CEED_REQUEST_IMMEDIATE);
  // Restore PETSc vectors
  CeedVectorTakeArray(user->x_ceed, MemTypeP2C(x_mem_type), NULL);
  CeedVectorTakeArray(user->y_ceed, MemTypeP2C(y_mem_type), NULL);
  ierr = VecRestoreArrayReadAndMemType(user->X_loc, (const PetscScalar **)&x); CHKERRQ(ierr);
  ierr = VecRestoreArrayAndMemType(user->Y_loc, &y); CHKERRQ(ierr);
  // Local-to-global: accumulate local contributions into Y (when requested)
  if (Y) {
    ierr = VecZeroEntries(Y); CHKERRQ(ierr);
    ierr = VecScatterBegin(user->l_to_g, user->Y_loc, Y, ADD_VALUES, SCATTER_FORWARD); CHKERRQ(ierr);
    ierr = VecScatterEnd(user->l_to_g, user->Y_loc, Y, ADD_VALUES, SCATTER_FORWARD); CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

// This function uses libCEED to compute the action of the Laplacian with
// Dirichlet boundary conditions
static PetscErrorCode MatMult_Diff(Mat A, Vec X, Vec Y) {
  PetscErrorCode ierr;
  User user;                          // shell-matrix context set at creation
  PetscScalar *x, *y;                 // raw local array pointers lent to libCEED
  PetscMemType x_mem_type, y_mem_type;

  PetscFunctionBeginUser;
  ierr = MatShellGetContext(A, &user); CHKERRQ(ierr);
  // Global-to-local (l_to_g_0 — presumably the scatter that excludes
  // Dirichlet nodes; confirm against the scatter setup in main)
  ierr = VecScatterBegin(user->l_to_g_0, X, user->X_loc, INSERT_VALUES, SCATTER_REVERSE); CHKERRQ(ierr);
  ierr = VecScatterEnd(user->l_to_g_0, X, user->X_loc, INSERT_VALUES, SCATTER_REVERSE); CHKERRQ(ierr);
  // Setup libCEED vectors: wrap the PETSc arrays without copying
  ierr = VecGetArrayReadAndMemType(user->X_loc, (const PetscScalar **)&x, &x_mem_type); CHKERRQ(ierr);
  ierr = VecGetArrayAndMemType(user->Y_loc, &y, &y_mem_type); CHKERRQ(ierr);
  CeedVectorSetArray(user->x_ceed, MemTypeP2C(x_mem_type), CEED_USE_POINTER, x);
  CeedVectorSetArray(user->y_ceed, MemTypeP2C(y_mem_type), CEED_USE_POINTER, y);
  // Apply libCEED operator
  CeedOperatorApply(user->op, user->x_ceed, user->y_ceed, CEED_REQUEST_IMMEDIATE);
  // Restore PETSc vectors
  CeedVectorTakeArray(user->x_ceed, MemTypeP2C(x_mem_type), NULL);
  CeedVectorTakeArray(user->y_ceed, MemTypeP2C(y_mem_type), NULL);
  ierr = VecRestoreArrayReadAndMemType(user->X_loc, (const PetscScalar
**)&x); CHKERRQ(ierr);
  ierr = VecRestoreArrayAndMemType(user->Y_loc, &y); CHKERRQ(ierr);
  // Local-to-global: first copy the Dirichlet values of X straight into Y
  // via g_to_g_D, then accumulate interior contributions via l_to_g_0.
  ierr = VecZeroEntries(Y); CHKERRQ(ierr);
  ierr = VecScatterBegin(user->g_to_g_D, X, Y, INSERT_VALUES, SCATTER_FORWARD); CHKERRQ(ierr);
  ierr = VecScatterEnd(user->g_to_g_D, X, Y, INSERT_VALUES, SCATTER_FORWARD); CHKERRQ(ierr);
  ierr = VecScatterBegin(user->l_to_g_0, user->Y_loc, Y, ADD_VALUES, SCATTER_FORWARD); CHKERRQ(ierr);
  ierr = VecScatterEnd(user->l_to_g_0, user->Y_loc, Y, ADD_VALUES, SCATTER_FORWARD); CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

// This function calculates the error in the final solution
static PetscErrorCode ComputeErrorMax(User user, CeedOperator op_error, Vec X,
                                      CeedVector target, PetscReal *max_error) {
  PetscErrorCode ierr;
  PetscScalar *x;
  PetscMemType mem_type;
  CeedVector collocated_error;   // pointwise error produced by op_error
  CeedSize length;

  PetscFunctionBeginUser;
  CeedVectorGetLength(target, &length);
  CeedVectorCreate(user->ceed, length, &collocated_error);
  // Global-to-local
  ierr = VecScatterBegin(user->l_to_g, X, user->X_loc, INSERT_VALUES, SCATTER_REVERSE); CHKERRQ(ierr);
  ierr = VecScatterEnd(user->l_to_g, X, user->X_loc, INSERT_VALUES, SCATTER_REVERSE); CHKERRQ(ierr);
  // Setup libCEED vector
  ierr = VecGetArrayReadAndMemType(user->X_loc, (const PetscScalar **)&x, &mem_type); CHKERRQ(ierr);
  CeedVectorSetArray(user->x_ceed, MemTypeP2C(mem_type), CEED_USE_POINTER, x);
  // Apply libCEED operator
  CeedOperatorApply(op_error, user->x_ceed, collocated_error, CEED_REQUEST_IMMEDIATE);
  // Restore PETSc vector
  CeedVectorTakeArray(user->x_ceed, MemTypeP2C(mem_type), NULL);
  ierr = VecRestoreArrayReadAndMemType(user->X_loc, (const PetscScalar **)&x); CHKERRQ(ierr);
  // Reduce max error
  *max_error = 0;
  const CeedScalar *e;
  CeedVectorGetArrayRead(collocated_error, CEED_MEM_HOST, &e);
  // NOTE(review): a '<...>' span was stripped here.  The fragment below
  // fuses the reduction loop (presumably `for (CeedInt i=0; i<length; i++)`
  // taking the max of |e[i]|, followed by CeedVectorRestoreArrayRead and an
  // MPI_Allreduce(..., MPI_MAX, user->comm)) into one broken statement.
  // Restore from upstream bpsraw.c.
  for (CeedInt i=0; icomm); CHKERRQ(ierr);
  // Cleanup
  CeedVectorDestroy(&collocated_error);
  PetscFunctionReturn(0);
}

int main(int argc, char **argv) {
  PetscInt ierr;
  MPI_Comm comm;
  char
ceed_resource[PETSC_MAX_PATH_LEN] = "/cpu/self";   // libCEED backend resource
  double my_rt_start, my_rt, rt_min, rt_max;       // wall-clock timings
  PetscInt degree, q_extra, local_nodes, local_elem, mesh_elem[3], m_nodes[3],
           p[3], i_rank[3], l_nodes[3], l_size, num_comp_u = 1,
           ksp_max_it_clip[2];
  PetscScalar *r;
  PetscBool test_mode, benchmark_mode, write_solution;
  PetscMPIInt size, rank;
  PetscLogStage solve_stage;
  Vec X, X_loc, rhs, rhs_loc;
  Mat mat;
  KSP ksp;
  VecScatter l_to_g, l_to_g_0, g_to_g_D;
  PetscMemType mem_type;
  User user;
  Ceed ceed;
  CeedBasis basis_x, basis_u;
  CeedElemRestriction elem_restr_x, elem_restr_u, elem_restr_u_i, elem_restr_qd_i;
  CeedQFunction qf_setup_geo, qf_setup_rhs, qf_apply, qf_error;
  CeedOperator op_setup_geo, op_setup_rhs, op_apply, op_error;
  CeedVector x_coord, q_data, rhs_ceed, target;
  CeedInt P, Q;                         // basis nodes / quadrature points in 1D
  const CeedInt dim = 3, num_comp_x = 3;
  BPType bp_choice;

  ierr = PetscInitialize(&argc, &argv, NULL, help);
  if (ierr) return ierr;
  comm = PETSC_COMM_WORLD;

  // Read command line options
  ierr = PetscOptionsBegin(comm, NULL, "CEED BPs in PETSc", NULL); CHKERRQ(ierr);
  bp_choice = CEED_BP1;
  ierr = PetscOptionsEnum("-problem", "CEED benchmark problem to solve", NULL,
                          bp_types, (PetscEnum)bp_choice, (PetscEnum *)&bp_choice,
                          NULL); CHKERRQ(ierr);
  num_comp_u = bp_options[bp_choice].num_comp_u;
  test_mode = PETSC_FALSE;
  ierr = PetscOptionsBool("-test", "Testing mode (do not print unless error is large)",
                          NULL, test_mode, &test_mode, NULL); CHKERRQ(ierr);
  benchmark_mode = PETSC_FALSE;
  ierr = PetscOptionsBool("-benchmark", "Benchmarking mode (prints benchmark statistics)",
                          NULL, benchmark_mode, &benchmark_mode, NULL); CHKERRQ(ierr);
  write_solution = PETSC_FALSE;
  ierr = PetscOptionsBool("-write_solution", "Write solution for visualization",
                          NULL, write_solution, &write_solution, NULL); CHKERRQ(ierr);
  degree = test_mode ?
3 : 1;
  // NOTE(review): the second `degree` argument below was mangled to `°ree`
  // — almost certainly `&degree` corrupted by HTML-entity decoding of
  // "&deg;".  Restore `&degree` when repairing the file.
  ierr = PetscOptionsInt("-degree", "Polynomial degree of tensor product basis",
                         NULL, degree, °ree, NULL); CHKERRQ(ierr);
  q_extra = bp_options[bp_choice].q_extra;
  ierr = PetscOptionsInt("-q_extra", "Number of extra quadrature points",
                         NULL, q_extra, &q_extra, NULL); CHKERRQ(ierr);
  ierr = PetscOptionsString("-ceed", "CEED resource specifier",
                            NULL, ceed_resource, ceed_resource,
                            sizeof(ceed_resource), NULL); CHKERRQ(ierr);
  local_nodes = 1000;
  ierr = PetscOptionsInt("-local", "Target number of locally owned nodes per process",
                         NULL, local_nodes, &local_nodes, NULL); CHKERRQ(ierr);
  PetscInt two = 2;
  ksp_max_it_clip[0] = 5;
  ksp_max_it_clip[1] = 20;
  ierr = PetscOptionsIntArray("-ksp_max_it_clip",
                              "Min and max number of iterations to use during benchmarking",
                              NULL, ksp_max_it_clip, &two, NULL); CHKERRQ(ierr);
  ierr = PetscOptionsEnd(); CHKERRQ(ierr);
  P = degree + 1;       // nodes per element edge
  Q = P + q_extra;      // quadrature points per element edge

  // Set up libCEED
  CeedInit(ceed_resource, &ceed);
  CeedMemType mem_type_backend;
  CeedGetPreferredMemType(ceed, &mem_type_backend);

  // Pick a PETSc vector type matching the backend's preferred memory space.
  VecType default_vec_type = NULL, vec_type;
  switch (mem_type_backend) {
  case CEED_MEM_HOST: default_vec_type = VECSTANDARD; break;
  case CEED_MEM_DEVICE: {
    const char *resolved;
    CeedGetResource(ceed, &resolved);
    if (strstr(resolved, "/gpu/cuda")) default_vec_type = VECCUDA;
    else if (strstr(resolved, "/gpu/hip/occa"))
      default_vec_type = VECSTANDARD; // https://github.com/CEED/libCEED/issues/678
    else if (strstr(resolved, "/gpu/hip")) default_vec_type = VECHIP;
    else default_vec_type = VECSTANDARD;
  }
  }

  // Determine size of process grid
  ierr = MPI_Comm_size(comm, &size); CHKERRQ(ierr);
  Split3(size, p, false);

  // Find a nicely composite number of elements no less than local_nodes
  for (local_elem = PetscMax(1, local_nodes / (degree*degree*degree)); ;
       local_elem++) {
    Split3(local_elem, mesh_elem, true);
    if (Max3(mesh_elem) / Min3(mesh_elem) <= 2) break;
  }

  // Find my location in the process grid
  ierr = MPI_Comm_rank(comm, &rank); CHKERRQ(ierr);
  for (int d=0,
rank_left=rank; d=m_nodes[0], ii=i-ir*m_nodes[0], i=m_nodes[1], jj=j-jr*m_nodes[1], j=m_nodes[2], kk=k-kr*m_nodes[2], kcomm = comm; user->l_to_g = l_to_g;
  // NOTE(review): the fragment above fuses several large '<...>'-stripped
  // spans.  Based on what survives (i_rank computation loop header, ir/jr/kr
  // index arithmetic, the later uses of X, X_loc, l_size, l_to_g, l_to_g_0,
  // g_to_g_D, basis_*, elem_restr_*, qf_*, op_*, x_coord, q_data, rhs_ceed,
  // target, gsize), the missing code presumably contained: locating this
  // rank in the process grid, GlobalNodes/GlobalStart sizing, creation of
  // the global vector X and local vector X_loc, construction of the three
  // VecScatters, and the libCEED basis / restriction / QFunction / operator
  // setup, ending with `user->comm = comm;`.  Irrecoverable from this copy —
  // restore from upstream bpsraw.c.  TODO confirm.
  if (bp_choice != CEED_BP1 && bp_choice != CEED_BP2) {
    // Diffusion BPs need the Dirichlet-aware scatters as well.
    user->l_to_g_0 = l_to_g_0;
    user->g_to_g_D = g_to_g_D;
  }
  user->X_loc = X_loc;
  ierr = VecDuplicate(X_loc, &user->Y_loc); CHKERRQ(ierr);
  CeedVectorCreate(ceed, l_size*num_comp_u, &user->x_ceed);
  CeedVectorCreate(ceed, l_size*num_comp_u, &user->y_ceed);
  user->op = op_apply;
  user->q_data = q_data;
  user->ceed = ceed;

  // Shell matrix whose MatMult applies the libCEED operator.
  ierr = MatCreateShell(comm, m_nodes[0]*m_nodes[1]*m_nodes[2]*num_comp_u,
                        m_nodes[0]*m_nodes[1]*m_nodes[2]*num_comp_u,
                        PETSC_DECIDE, PETSC_DECIDE, user, &mat); CHKERRQ(ierr);
  if (bp_choice == CEED_BP1 || bp_choice == CEED_BP2) {
    // Mass-matrix BPs
    ierr = MatShellSetOperation(mat, MATOP_MULT, (void(*)(void))MatMult_Mass); CHKERRQ(ierr);
  } else {
    // Diffusion BPs
    ierr = MatShellSetOperation(mat, MATOP_MULT, (void(*)(void))MatMult_Diff); CHKERRQ(ierr);
  }
  ierr = VecGetType(X, &vec_type); CHKERRQ(ierr);
  ierr = MatShellSetVecType(mat, vec_type); CHKERRQ(ierr);

  // Get RHS vector
  ierr = VecDuplicate(X, &rhs); CHKERRQ(ierr);
  ierr = VecDuplicate(X_loc, &rhs_loc); CHKERRQ(ierr);
  ierr = VecZeroEntries(rhs_loc); CHKERRQ(ierr);
  ierr = VecGetArrayAndMemType(rhs_loc, &r, &mem_type); CHKERRQ(ierr);
  CeedVectorSetArray(rhs_ceed, MemTypeP2C(mem_type), CEED_USE_POINTER, r);

  // Setup q_data, rhs, and target
  CeedOperatorApply(op_setup_geo, x_coord, q_data, CEED_REQUEST_IMMEDIATE);
  CeedOperatorApply(op_setup_rhs, x_coord, rhs_ceed, CEED_REQUEST_IMMEDIATE);
  CeedVectorDestroy(&x_coord);

  // Gather RHS
  ierr = CeedVectorTakeArray(rhs_ceed, MemTypeP2C(mem_type), NULL); CHKERRQ(ierr);
  ierr = VecRestoreArrayAndMemType(rhs_loc, &r); CHKERRQ(ierr);
  ierr = VecZeroEntries(rhs); CHKERRQ(ierr);
  ierr = VecScatterBegin(l_to_g, rhs_loc, rhs, ADD_VALUES, SCATTER_FORWARD); CHKERRQ(ierr);
  ierr = VecScatterEnd(l_to_g, rhs_loc, rhs, ADD_VALUES, SCATTER_FORWARD); CHKERRQ(ierr);
  CeedVectorDestroy(&rhs_ceed);

  ierr = KSPCreate(comm,
&ksp); CHKERRQ(ierr);
  {
    PC pc;
    ierr = KSPGetPC(ksp, &pc); CHKERRQ(ierr);
    if (bp_choice == CEED_BP1 || bp_choice == CEED_BP2) {
      // Mass BPs: rowsum (lumped) Jacobi preconditioning
      ierr = PCSetType(pc, PCJACOBI); CHKERRQ(ierr);
      ierr = PCJacobiSetType(pc, PC_JACOBI_ROWSUM); CHKERRQ(ierr);
    } else {
      // Diffusion BPs: unpreconditioned CG
      ierr = PCSetType(pc, PCNONE); CHKERRQ(ierr);
    }
    ierr = KSPSetType(ksp, KSPCG); CHKERRQ(ierr);
    ierr = KSPSetNormType(ksp, KSP_NORM_NATURAL); CHKERRQ(ierr);
    ierr = KSPSetTolerances(ksp, 1e-10, PETSC_DEFAULT, PETSC_DEFAULT,
                            PETSC_DEFAULT); CHKERRQ(ierr);
  }
  ierr = KSPSetOperators(ksp, mat, mat); CHKERRQ(ierr);

  // First run's performance log is not considered for benchmarking purposes
  ierr = KSPSetTolerances(ksp, 1e-10, PETSC_DEFAULT, PETSC_DEFAULT, 1); CHKERRQ(ierr);
  my_rt_start = MPI_Wtime();
  ierr = KSPSolve(ksp, rhs, X); CHKERRQ(ierr);
  my_rt = MPI_Wtime() - my_rt_start;
  ierr = MPI_Allreduce(MPI_IN_PLACE, &my_rt, 1, MPI_DOUBLE, MPI_MIN, comm); CHKERRQ(ierr);

  // Set maxits based on first iteration timing
  if (my_rt > 0.02) {
    ierr = KSPSetTolerances(ksp, 1e-10, PETSC_DEFAULT, PETSC_DEFAULT,
                            ksp_max_it_clip[0]); CHKERRQ(ierr);
  } else {
    ierr = KSPSetTolerances(ksp, 1e-10, PETSC_DEFAULT, PETSC_DEFAULT,
                            ksp_max_it_clip[1]); CHKERRQ(ierr);
  }
  ierr = KSPSetFromOptions(ksp); CHKERRQ(ierr);

  // Timed solve
  ierr = VecZeroEntries(X); CHKERRQ(ierr);
  ierr = PetscBarrier((PetscObject)ksp); CHKERRQ(ierr);

  // -- Performance logging
  ierr = PetscLogStageRegister("Solve Stage", &solve_stage); CHKERRQ(ierr);
  ierr = PetscLogStagePush(solve_stage); CHKERRQ(ierr);

  // -- Solve
  my_rt_start = MPI_Wtime();
  ierr = KSPSolve(ksp, rhs, X); CHKERRQ(ierr);
  my_rt = MPI_Wtime() - my_rt_start;

  // -- Performance logging
  // NOTE(review): the return of PetscLogStagePop is assigned but not passed
  // through CHKERRQ, unlike every other call here — verify upstream.
  ierr = PetscLogStagePop();

  // Output results
  {
    KSPType ksp_type;
    KSPConvergedReason reason;
    PetscReal rnorm;
    PetscInt its;
    ierr = KSPGetType(ksp, &ksp_type); CHKERRQ(ierr);
    ierr = KSPGetConvergedReason(ksp, &reason); CHKERRQ(ierr);
    ierr = KSPGetIterationNumber(ksp, &its); CHKERRQ(ierr);
    ierr = KSPGetResidualNorm(ksp, &rnorm); CHKERRQ(ierr);
    if
(!test_mode || reason < 0 || rnorm > 1e-8) {
      // Always report on failure or large residual; stay quiet in -test mode.
      ierr = PetscPrintf(comm,
                         " KSP:\n"
                         " KSP Type : %s\n"
                         " KSP Convergence : %s\n"
                         " Total KSP Iterations : %D\n"
                         " Final rnorm : %e\n",
                         ksp_type, KSPConvergedReasons[reason], its,
                         (double)rnorm); CHKERRQ(ierr);
    }
    if (!test_mode) {
      ierr = PetscPrintf(comm," Performance:\n"); CHKERRQ(ierr);
    }
    {
      PetscReal max_error;
      ierr = ComputeErrorMax(user, op_error, X, target, &max_error); CHKERRQ(ierr);
      PetscReal tol = 5e-2;
      if (!test_mode || max_error > tol) {
        ierr = MPI_Allreduce(&my_rt, &rt_min, 1, MPI_DOUBLE, MPI_MIN, comm); CHKERRQ(ierr);
        ierr = MPI_Allreduce(&my_rt, &rt_max, 1, MPI_DOUBLE, MPI_MAX, comm); CHKERRQ(ierr);
        ierr = PetscPrintf(comm,
                           " Pointwise Error (max) : %e\n"
                           " CG Solve Time : %g (%g) sec\n",
                           (double)max_error, rt_max, rt_min); CHKERRQ(ierr);
      }
    }
    if (!test_mode) {
      // NOTE(review): `gsize` is not declared in any surviving text — it was
      // presumably defined in the stripped setup section (global problem
      // size).  TODO confirm when restoring that section.
      ierr = PetscPrintf(comm,
                         " DoFs/Sec in CG : %g (%g) million\n",
                         1e-6*gsize*its/rt_max, 1e-6*gsize*its/rt_min); CHKERRQ(ierr);
    }
  }

  if (write_solution) {
    // Dump the solution for visualization (VTK unstructured grid file).
    PetscViewer vtk_viewer_soln;
    ierr = PetscViewerCreate(comm, &vtk_viewer_soln); CHKERRQ(ierr);
    ierr = PetscViewerSetType(vtk_viewer_soln, PETSCVIEWERVTK); CHKERRQ(ierr);
    ierr = PetscViewerFileSetName(vtk_viewer_soln, "solution.vtu"); CHKERRQ(ierr);
    ierr = VecView(X, vtk_viewer_soln); CHKERRQ(ierr);
    ierr = PetscViewerDestroy(&vtk_viewer_soln); CHKERRQ(ierr);
  }

  // Cleanup: PETSc objects
  ierr = VecDestroy(&rhs); CHKERRQ(ierr);
  ierr = VecDestroy(&rhs_loc); CHKERRQ(ierr);
  ierr = VecDestroy(&X); CHKERRQ(ierr);
  ierr = VecDestroy(&user->X_loc); CHKERRQ(ierr);
  ierr = VecDestroy(&user->Y_loc); CHKERRQ(ierr);
  ierr = VecScatterDestroy(&l_to_g); CHKERRQ(ierr);
  ierr = VecScatterDestroy(&l_to_g_0); CHKERRQ(ierr);
  ierr = VecScatterDestroy(&g_to_g_D); CHKERRQ(ierr);
  ierr = MatDestroy(&mat); CHKERRQ(ierr);
  ierr = KSPDestroy(&ksp); CHKERRQ(ierr);
  // Cleanup: libCEED objects
  CeedVectorDestroy(&user->x_ceed);
  CeedVectorDestroy(&user->y_ceed);
  CeedVectorDestroy(&user->q_data);
  CeedVectorDestroy(&target);
  CeedOperatorDestroy(&op_setup_geo);
  CeedOperatorDestroy(&op_setup_rhs);
CeedOperatorDestroy(&op_apply);
  CeedOperatorDestroy(&op_error);
  CeedElemRestrictionDestroy(&elem_restr_u);
  CeedElemRestrictionDestroy(&elem_restr_x);
  CeedElemRestrictionDestroy(&elem_restr_u_i);
  CeedElemRestrictionDestroy(&elem_restr_qd_i);
  CeedQFunctionDestroy(&qf_setup_geo);
  CeedQFunctionDestroy(&qf_setup_rhs);
  CeedQFunctionDestroy(&qf_apply);
  CeedQFunctionDestroy(&qf_error);
  CeedBasisDestroy(&basis_u);
  CeedBasisDestroy(&basis_x);
  CeedDestroy(&ceed);
  ierr = PetscFree(user); CHKERRQ(ierr);
  return PetscFinalize();
}