// Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at
// the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights
// reserved. See files LICENSE and NOTICE for details.
//
// This file is part of CEED, a collection of benchmarks, miniapps, software
// libraries and APIs for efficient high-order finite element and spectral
// element discretizations for exascale applications. For more information and
// source code availability see http://github.com/ceed.
//
// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
// a collaborative effort of two U.S. Department of Energy organizations (Office
// of Science and the National Nuclear Security Administration) responsible for
// the planning and preparation of a capable exascale ecosystem, including
// software, applications, hardware, advanced system engineering and early
// testbed platforms, in support of the nation's exascale computing imperative.

// libCEED + PETSc Example: CEED BPs
//
// This example demonstrates a simple usage of libCEED with PETSc to solve the
// CEED BP benchmark problems, see http://ceed.exascaleproject.org/bps.
//
// The code is intentionally "raw", using only low-level communication
// primitives.
//
// Build with:
//
//     make bpsraw [PETSC_DIR=</path/to/petsc>] [CEED_DIR=</path/to/libceed>]
//
// Sample runs:
//
//     ./bpsraw -problem bp1
//     ./bpsraw -problem bp2 -ceed /cpu/self
//     ./bpsraw -problem bp3 -ceed /gpu/occa
//     ./bpsraw -problem bp4 -ceed /cpu/occa
//     ./bpsraw -problem bp5 -ceed /omp/occa
//     ./bpsraw -problem bp6 -ceed /ocl/occa
//
//TESTARGS -ceed {ceed_resource} -test -problem bp2 -degree 5 -qextra 5

/// @file
/// CEED BPs example using PETSc
/// See bps.c for an implementation using DMPlex unstructured grids.
43 const char help[] = "Solve CEED BPs using PETSc\n"; 44 45 #include <stdbool.h> 46 #include <string.h> 47 #include <petscksp.h> 48 #include <ceed.h> 49 #include "qfunctions/bps/common.h" 50 #include "qfunctions/bps/bp1.h" 51 #include "qfunctions/bps/bp2.h" 52 #include "qfunctions/bps/bp3.h" 53 #include "qfunctions/bps/bp4.h" 54 55 static void Split3(PetscInt size, PetscInt m[3], bool reverse) { 56 for (PetscInt d=0,sizeleft=size; d<3; d++) { 57 PetscInt try = (PetscInt)PetscCeilReal(PetscPowReal(sizeleft, 1./(3 - d))); 58 while (try * (sizeleft / try) != sizeleft) try++; 59 m[reverse ? 2-d : d] = try; 60 sizeleft /= try; 61 } 62 } 63 64 static PetscInt Max3(const PetscInt a[3]) { 65 return PetscMax(a[0], PetscMax(a[1], a[2])); 66 } 67 static PetscInt Min3(const PetscInt a[3]) { 68 return PetscMin(a[0], PetscMin(a[1], a[2])); 69 } 70 static void GlobalNodes(const PetscInt p[3], const PetscInt irank[3], 71 PetscInt degree, const PetscInt melem[3], 72 PetscInt mnodes[3]) { 73 for (int d=0; d<3; d++) 74 mnodes[d] = degree*melem[d] + (irank[d] == p[d]-1); 75 } 76 static PetscInt GlobalStart(const PetscInt p[3], const PetscInt irank[3], 77 PetscInt degree, const PetscInt melem[3]) { 78 PetscInt start = 0; 79 // Dumb brute-force is easier to read 80 for (PetscInt i=0; i<p[0]; i++) { 81 for (PetscInt j=0; j<p[1]; j++) { 82 for (PetscInt k=0; k<p[2]; k++) { 83 PetscInt mnodes[3], ijkrank[] = {i,j,k}; 84 if (i == irank[0] && j == irank[1] && k == irank[2]) return start; 85 GlobalNodes(p, ijkrank, degree, melem, mnodes); 86 start += mnodes[0] * mnodes[1] * mnodes[2]; 87 } 88 } 89 } 90 return -1; 91 } 92 static int CreateRestriction(Ceed ceed, CeedInterlaceMode imode, 93 const CeedInt melem[3], CeedInt P, CeedInt ncomp, 94 CeedElemRestriction *Erestrict) { 95 const PetscInt nelem = melem[0]*melem[1]*melem[2]; 96 PetscInt mnodes[3], *idx, *idxp; 97 98 // Get indicies 99 for (int d=0; d<3; d++) mnodes[d] = melem[d]*(P-1) + 1; 100 idxp = idx = malloc(nelem*P*P*P*sizeof idx[0]); 
101 for (CeedInt i=0; i<melem[0]; i++) 102 for (CeedInt j=0; j<melem[1]; j++) 103 for (CeedInt k=0; k<melem[2]; k++,idxp += P*P*P) 104 for (CeedInt ii=0; ii<P; ii++) 105 for (CeedInt jj=0; jj<P; jj++) 106 for (CeedInt kk=0; kk<P; kk++) { 107 if (0) { // This is the C-style (i,j,k) ordering that I prefer 108 idxp[(ii*P+jj)*P+kk] = (((i*(P-1)+ii)*mnodes[1] 109 + (j*(P-1)+jj))*mnodes[2] 110 + (k*(P-1)+kk)); 111 } else { // (k,j,i) ordering for consistency with MFEM example 112 idxp[ii+P*(jj+P*kk)] = (((i*(P-1)+ii)*mnodes[1] 113 + (j*(P-1)+jj))*mnodes[2] 114 + (k*(P-1)+kk)); 115 } 116 } 117 118 // Setup CEED restriction 119 CeedElemRestrictionCreate(ceed, imode, nelem, P*P*P, 120 mnodes[0]*mnodes[1]*mnodes[2], ncomp, 121 CEED_MEM_HOST, CEED_OWN_POINTER, idx, Erestrict); 122 123 PetscFunctionReturn(0); 124 } 125 126 // Data for PETSc 127 typedef struct User_ *User; 128 struct User_ { 129 MPI_Comm comm; 130 VecScatter ltog; // Scatter for all entries 131 VecScatter ltog0; // Skip Dirichlet values 132 VecScatter gtogD; // global-to-global; only Dirichlet values 133 Vec Xloc, Yloc; 134 CeedVector xceed, yceed; 135 CeedOperator op; 136 CeedVector qdata; 137 Ceed ceed; 138 }; 139 140 // BP Options 141 typedef enum { 142 CEED_BP1 = 0, CEED_BP2 = 1, CEED_BP3 = 2, 143 CEED_BP4 = 3, CEED_BP5 = 4, CEED_BP6 = 5 144 } bpType; 145 static const char *const bpTypes[] = {"bp1","bp2","bp3","bp4","bp5","bp6", 146 "bpType","CEED_BP",0 147 }; 148 149 // BP specific data 150 typedef struct { 151 CeedInt ncompu, qdatasize, qextra; 152 CeedQFunctionUser setupgeo, setuprhs, apply, error; 153 const char *setupgeofname, *setuprhsfname, *applyfname, *errorfname; 154 CeedEvalMode inmode, outmode; 155 CeedQuadMode qmode; 156 } bpData; 157 158 bpData bpOptions[6] = { 159 [CEED_BP1] = { 160 .ncompu = 1, 161 .qdatasize = 1, 162 .qextra = 1, 163 .setupgeo = SetupMassGeo, 164 .setuprhs = SetupMassRhs, 165 .apply = Mass, 166 .error = Error, 167 .setupgeofname = SetupMassGeo_loc, 168 .setuprhsfname = 
SetupMassRhs_loc, 169 .applyfname = Mass_loc, 170 .errorfname = Error_loc, 171 .inmode = CEED_EVAL_INTERP, 172 .outmode = CEED_EVAL_INTERP, 173 .qmode = CEED_GAUSS 174 }, 175 [CEED_BP2] = { 176 .ncompu = 3, 177 .qdatasize = 1, 178 .qextra = 1, 179 .setupgeo = SetupMassGeo, 180 .setuprhs = SetupMassRhs3, 181 .apply = Mass3, 182 .error = Error3, 183 .setupgeofname = SetupMassGeo_loc, 184 .setuprhsfname = SetupMassRhs3_loc, 185 .applyfname = Mass3_loc, 186 .errorfname = Error3_loc, 187 .inmode = CEED_EVAL_INTERP, 188 .outmode = CEED_EVAL_INTERP, 189 .qmode = CEED_GAUSS 190 }, 191 [CEED_BP3] = { 192 .ncompu = 1, 193 .qdatasize = 6, 194 .qextra = 1, 195 .setupgeo = SetupDiffGeo, 196 .setuprhs = SetupDiffRhs, 197 .apply = Diff, 198 .error = Error, 199 .setupgeofname = SetupDiffGeo_loc, 200 .setuprhsfname = SetupDiffRhs_loc, 201 .applyfname = Diff_loc, 202 .errorfname = Error_loc, 203 .inmode = CEED_EVAL_GRAD, 204 .outmode = CEED_EVAL_GRAD, 205 .qmode = CEED_GAUSS 206 }, 207 [CEED_BP4] = { 208 .ncompu = 3, 209 .qdatasize = 6, 210 .qextra = 1, 211 .setupgeo = SetupDiffGeo, 212 .setuprhs = SetupDiffRhs3, 213 .apply = Diff3, 214 .error = Error3, 215 .setupgeofname = SetupDiffGeo_loc, 216 .setuprhsfname = SetupDiffRhs3_loc, 217 .applyfname = Diff_loc, 218 .errorfname = Error3_loc, 219 .inmode = CEED_EVAL_GRAD, 220 .outmode = CEED_EVAL_GRAD, 221 .qmode = CEED_GAUSS 222 }, 223 [CEED_BP5] = { 224 .ncompu = 1, 225 .qdatasize = 6, 226 .qextra = 0, 227 .setupgeo = SetupDiffGeo, 228 .setuprhs = SetupDiffRhs, 229 .apply = Diff, 230 .error = Error, 231 .setupgeofname = SetupDiffGeo_loc, 232 .setuprhsfname = SetupDiffRhs_loc, 233 .applyfname = Diff_loc, 234 .errorfname = Error_loc, 235 .inmode = CEED_EVAL_GRAD, 236 .outmode = CEED_EVAL_GRAD, 237 .qmode = CEED_GAUSS_LOBATTO 238 }, 239 [CEED_BP6] = { 240 .ncompu = 3, 241 .qdatasize = 6, 242 .qextra = 0, 243 .setupgeo = SetupDiffGeo, 244 .setuprhs = SetupDiffRhs3, 245 .apply = Diff3, 246 .error = Error3, 247 .setupgeofname = 
SetupDiffGeo_loc, 248 .setuprhsfname = SetupDiffRhs3_loc, 249 .applyfname = Diff_loc, 250 .errorfname = Error3_loc, 251 .inmode = CEED_EVAL_GRAD, 252 .outmode = CEED_EVAL_GRAD, 253 .qmode = CEED_GAUSS_LOBATTO 254 } 255 }; 256 257 // This function uses libCEED to compute the action of the mass matrix 258 static PetscErrorCode MatMult_Mass(Mat A, Vec X, Vec Y) { 259 PetscErrorCode ierr; 260 User user; 261 PetscScalar *x, *y; 262 263 PetscFunctionBeginUser; 264 ierr = MatShellGetContext(A, &user); CHKERRQ(ierr); 265 ierr = VecScatterBegin(user->ltog, X, user->Xloc, INSERT_VALUES, 266 SCATTER_REVERSE); CHKERRQ(ierr); 267 ierr = VecScatterEnd(user->ltog, X, user->Xloc, INSERT_VALUES, SCATTER_REVERSE); 268 CHKERRQ(ierr); 269 ierr = VecZeroEntries(user->Yloc); CHKERRQ(ierr); 270 271 ierr = VecGetArrayRead(user->Xloc, (const PetscScalar **)&x); CHKERRQ(ierr); 272 ierr = VecGetArray(user->Yloc, &y); CHKERRQ(ierr); 273 CeedVectorSetArray(user->xceed, CEED_MEM_HOST, CEED_USE_POINTER, x); 274 CeedVectorSetArray(user->yceed, CEED_MEM_HOST, CEED_USE_POINTER, y); 275 276 CeedOperatorApply(user->op, user->xceed, user->yceed, 277 CEED_REQUEST_IMMEDIATE); 278 ierr = CeedVectorSyncArray(user->yceed, CEED_MEM_HOST); CHKERRQ(ierr); 279 280 ierr = VecRestoreArrayRead(user->Xloc, (const PetscScalar **)&x); CHKERRQ(ierr); 281 ierr = VecRestoreArray(user->Yloc, &y); CHKERRQ(ierr); 282 283 if (Y) { 284 ierr = VecZeroEntries(Y); CHKERRQ(ierr); 285 ierr = VecScatterBegin(user->ltog, user->Yloc, Y, ADD_VALUES, SCATTER_FORWARD); 286 CHKERRQ(ierr); 287 ierr = VecScatterEnd(user->ltog, user->Yloc, Y, ADD_VALUES, SCATTER_FORWARD); 288 CHKERRQ(ierr); 289 } 290 PetscFunctionReturn(0); 291 } 292 293 // This function uses libCEED to compute the action of the Laplacian with 294 // Dirichlet boundary conditions 295 static PetscErrorCode MatMult_Diff(Mat A, Vec X, Vec Y) { 296 PetscErrorCode ierr; 297 User user; 298 PetscScalar *x, *y; 299 300 PetscFunctionBeginUser; 301 ierr = MatShellGetContext(A, 
&user); CHKERRQ(ierr); 302 303 // Global-to-local 304 ierr = VecScatterBegin(user->ltog0, X, user->Xloc, INSERT_VALUES, 305 SCATTER_REVERSE); CHKERRQ(ierr); 306 ierr = VecScatterEnd(user->ltog0, X, user->Xloc, INSERT_VALUES, 307 SCATTER_REVERSE); 308 CHKERRQ(ierr); 309 ierr = VecZeroEntries(user->Yloc); CHKERRQ(ierr); 310 311 // Setup CEED vectors 312 ierr = VecGetArrayRead(user->Xloc, (const PetscScalar **)&x); CHKERRQ(ierr); 313 ierr = VecGetArray(user->Yloc, &y); CHKERRQ(ierr); 314 CeedVectorSetArray(user->xceed, CEED_MEM_HOST, CEED_USE_POINTER, x); 315 CeedVectorSetArray(user->yceed, CEED_MEM_HOST, CEED_USE_POINTER, y); 316 317 // Apply CEED operator 318 CeedOperatorApply(user->op, user->xceed, user->yceed, 319 CEED_REQUEST_IMMEDIATE); 320 ierr = CeedVectorSyncArray(user->yceed, CEED_MEM_HOST); CHKERRQ(ierr); 321 322 // Restore PETSc vectors 323 ierr = VecRestoreArrayRead(user->Xloc, (const PetscScalar **)&x); CHKERRQ(ierr); 324 ierr = VecRestoreArray(user->Yloc, &y); CHKERRQ(ierr); 325 326 // Local-to-global 327 ierr = VecZeroEntries(Y); CHKERRQ(ierr); 328 ierr = VecScatterBegin(user->gtogD, X, Y, INSERT_VALUES, SCATTER_FORWARD); 329 CHKERRQ(ierr); 330 ierr = VecScatterEnd(user->gtogD, X, Y, INSERT_VALUES, SCATTER_FORWARD); 331 CHKERRQ(ierr); 332 ierr = VecScatterBegin(user->ltog0, user->Yloc, Y, ADD_VALUES, SCATTER_FORWARD); 333 CHKERRQ(ierr); 334 ierr = VecScatterEnd(user->ltog0, user->Yloc, Y, ADD_VALUES, SCATTER_FORWARD); 335 CHKERRQ(ierr); 336 337 PetscFunctionReturn(0); 338 } 339 340 // This function calculates the error in the final solution 341 static PetscErrorCode ComputeErrorMax(User user, CeedOperator operror, Vec X, 342 CeedVector target, PetscReal *maxerror) { 343 PetscErrorCode ierr; 344 PetscScalar *x; 345 CeedVector collocated_error; 346 CeedInt length; 347 348 PetscFunctionBeginUser; 349 CeedVectorGetLength(target, &length); 350 CeedVectorCreate(user->ceed, length, &collocated_error); 351 352 // Global-to-local 353 ierr = 
VecScatterBegin(user->ltog, X, user->Xloc, INSERT_VALUES, 354 SCATTER_REVERSE); CHKERRQ(ierr); 355 ierr = VecScatterEnd(user->ltog, X, user->Xloc, INSERT_VALUES, SCATTER_REVERSE); 356 CHKERRQ(ierr); 357 358 // Setup CEED vector 359 ierr = VecGetArrayRead(user->Xloc, (const PetscScalar **)&x); CHKERRQ(ierr); 360 CeedVectorSetArray(user->xceed, CEED_MEM_HOST, CEED_USE_POINTER, x); 361 362 // Apply CEED operator 363 CeedOperatorApply(operror, user->xceed, collocated_error, 364 CEED_REQUEST_IMMEDIATE); 365 366 // Restore PETSc vector 367 VecRestoreArrayRead(user->Xloc, (const PetscScalar **)&x); CHKERRQ(ierr); 368 369 // Reduce max error 370 *maxerror = 0; 371 const CeedScalar *e; 372 CeedVectorGetArrayRead(collocated_error, CEED_MEM_HOST, &e); 373 for (CeedInt i=0; i<length; i++) { 374 *maxerror = PetscMax(*maxerror, PetscAbsScalar(e[i])); 375 } 376 CeedVectorRestoreArrayRead(collocated_error, &e); 377 ierr = MPI_Allreduce(MPI_IN_PLACE, maxerror, 378 1, MPIU_REAL, MPIU_MAX, user->comm); CHKERRQ(ierr); 379 380 // Cleanup 381 CeedVectorDestroy(&collocated_error); 382 383 PetscFunctionReturn(0); 384 } 385 386 int main(int argc, char **argv) { 387 PetscInt ierr; 388 MPI_Comm comm; 389 char ceedresource[PETSC_MAX_PATH_LEN] = "/cpu/self"; 390 double my_rt_start, my_rt, rt_min, rt_max; 391 PetscInt degree, qextra, localnodes, localelem, melem[3], mnodes[3], p[3], 392 irank[3], lnodes[3], lsize, ncompu = 1; 393 PetscScalar *r; 394 PetscBool test_mode, benchmark_mode, write_solution; 395 PetscMPIInt size, rank; 396 Vec X, Xloc, rhs, rhsloc; 397 Mat mat; 398 KSP ksp; 399 VecScatter ltog, ltog0, gtogD; 400 User user; 401 Ceed ceed; 402 CeedBasis basisx, basisu; 403 CeedElemRestriction Erestrictx, Erestrictu, Erestrictui, Erestrictqdi; 404 CeedQFunction qfsetupgeo, qfsetuprhs, qfapply, qferror; 405 CeedOperator opsetupgeo, opsetuprhs, opapply, operror; 406 CeedVector xcoord, qdata, rhsceed, target; 407 CeedInt P, Q; 408 const CeedInt dim = 3, ncompx = 3; 409 bpType bpchoice; 410 
411 ierr = PetscInitialize(&argc, &argv, NULL, help); 412 if (ierr) return ierr; 413 comm = PETSC_COMM_WORLD; 414 415 // Read command line options 416 ierr = PetscOptionsBegin(comm, NULL, "CEED BPs in PETSc", NULL); CHKERRQ(ierr); 417 bpchoice = CEED_BP1; 418 ierr = PetscOptionsEnum("-problem", 419 "CEED benchmark problem to solve", NULL, 420 bpTypes, (PetscEnum)bpchoice, (PetscEnum *)&bpchoice, 421 NULL); CHKERRQ(ierr); 422 ncompu = bpOptions[bpchoice].ncompu; 423 test_mode = PETSC_FALSE; 424 ierr = PetscOptionsBool("-test", 425 "Testing mode (do not print unless error is large)", 426 NULL, test_mode, &test_mode, NULL); CHKERRQ(ierr); 427 benchmark_mode = PETSC_FALSE; 428 ierr = PetscOptionsBool("-benchmark", 429 "Benchmarking mode (prints benchmark statistics)", 430 NULL, benchmark_mode, &benchmark_mode, NULL); 431 CHKERRQ(ierr); 432 write_solution = PETSC_FALSE; 433 ierr = PetscOptionsBool("-write_solution", 434 "Write solution for visualization", 435 NULL, write_solution, &write_solution, NULL); 436 CHKERRQ(ierr); 437 degree = test_mode ? 
3 : 1; 438 ierr = PetscOptionsInt("-degree", "Polynomial degree of tensor product basis", 439 NULL, degree, °ree, NULL); CHKERRQ(ierr); 440 qextra = bpOptions[bpchoice].qextra; 441 ierr = PetscOptionsInt("-qextra", "Number of extra quadrature points", 442 NULL, qextra, &qextra, NULL); CHKERRQ(ierr); 443 ierr = PetscOptionsString("-ceed", "CEED resource specifier", 444 NULL, ceedresource, ceedresource, 445 sizeof(ceedresource), NULL); CHKERRQ(ierr); 446 localnodes = 1000; 447 ierr = PetscOptionsInt("-local", 448 "Target number of locally owned nodes per process", 449 NULL, localnodes, &localnodes, NULL); CHKERRQ(ierr); 450 ierr = PetscOptionsEnd(); CHKERRQ(ierr); 451 P = degree + 1; 452 Q = P + qextra; 453 454 // Determine size of process grid 455 ierr = MPI_Comm_size(comm, &size); CHKERRQ(ierr); 456 Split3(size, p, false); 457 458 // Find a nicely composite number of elements no less than localnodes 459 for (localelem = PetscMax(1, localnodes / (degree*degree*degree)); ; 460 localelem++) { 461 Split3(localelem, melem, true); 462 if (Max3(melem) / Min3(melem) <= 2) break; 463 } 464 465 // Find my location in the process grid 466 ierr = MPI_Comm_rank(comm, &rank); CHKERRQ(ierr); 467 for (int d=0,rankleft=rank; d<dim; d++) { 468 const int pstride[3] = {p[1] *p[2], p[2], 1}; 469 irank[d] = rankleft / pstride[d]; 470 rankleft -= irank[d] * pstride[d]; 471 } 472 473 GlobalNodes(p, irank, degree, melem, mnodes); 474 475 // Setup global vector 476 ierr = VecCreate(comm, &X); CHKERRQ(ierr); 477 ierr = VecSetSizes(X, mnodes[0]*mnodes[1]*mnodes[2]*ncompu, PETSC_DECIDE); 478 CHKERRQ(ierr); 479 ierr = VecSetUp(X); CHKERRQ(ierr); 480 481 // Set up libCEED 482 CeedInit(ceedresource, &ceed); 483 484 // Print summary 485 CeedInt gsize; 486 ierr = VecGetSize(X, &gsize); CHKERRQ(ierr); 487 if (!test_mode) { 488 const char *usedresource; 489 CeedGetResource(ceed, &usedresource); 490 ierr = PetscPrintf(comm, 491 "\n-- CEED Benchmark Problem %d -- libCEED + PETSc --\n" 492 " libCEED:\n" 
493 " libCEED Backend : %s\n" 494 " Mesh:\n" 495 " Number of 1D Basis Nodes (p) : %d\n" 496 " Number of 1D Quadrature Points (q) : %d\n" 497 " Global nodes : %D\n" 498 " Process Decomposition : %D %D %D\n" 499 " Local Elements : %D = %D %D %D\n" 500 " Owned nodes : %D = %D %D %D\n" 501 " DoF per node : %D\n", 502 bpchoice+1, usedresource, P, Q, gsize/ncompu, p[0], 503 p[1], p[2], localelem, melem[0], melem[1], melem[2], 504 mnodes[0]*mnodes[1]*mnodes[2], mnodes[0], mnodes[1], 505 mnodes[2], ncompu); CHKERRQ(ierr); 506 } 507 508 { 509 lsize = 1; 510 for (int d=0; d<dim; d++) { 511 lnodes[d] = melem[d]*degree + 1; 512 lsize *= lnodes[d]; 513 } 514 ierr = VecCreate(PETSC_COMM_SELF, &Xloc); CHKERRQ(ierr); 515 ierr = VecSetSizes(Xloc, lsize*ncompu, PETSC_DECIDE); CHKERRQ(ierr); 516 ierr = VecSetUp(Xloc); CHKERRQ(ierr); 517 518 // Create local-to-global scatter 519 PetscInt *ltogind, *ltogind0, *locind, l0count; 520 IS ltogis, ltogis0, locis; 521 PetscInt gstart[2][2][2], gmnodes[2][2][2][dim]; 522 523 for (int i=0; i<2; i++) { 524 for (int j=0; j<2; j++) { 525 for (int k=0; k<2; k++) { 526 PetscInt ijkrank[3] = {irank[0]+i, irank[1]+j, irank[2]+k}; 527 gstart[i][j][k] = GlobalStart(p, ijkrank, degree, melem); 528 GlobalNodes(p, ijkrank, degree, melem, gmnodes[i][j][k]); 529 } 530 } 531 } 532 533 ierr = PetscMalloc1(lsize, <ogind); CHKERRQ(ierr); 534 ierr = PetscMalloc1(lsize, <ogind0); CHKERRQ(ierr); 535 ierr = PetscMalloc1(lsize, &locind); CHKERRQ(ierr); 536 l0count = 0; 537 for (PetscInt i=0,ir,ii; ir=i>=mnodes[0], ii=i-ir*mnodes[0], i<lnodes[0]; i++) 538 for (PetscInt j=0,jr,jj; jr=j>=mnodes[1], jj=j-jr*mnodes[1], j<lnodes[1]; j++) 539 for (PetscInt k=0,kr,kk; kr=k>=mnodes[2], kk=k-kr*mnodes[2], k<lnodes[2]; k++) { 540 PetscInt here = (i*lnodes[1]+j)*lnodes[2]+k; 541 ltogind[here] = 542 gstart[ir][jr][kr] + (ii*gmnodes[ir][jr][kr][1]+jj)*gmnodes[ir][jr][kr][2]+kk; 543 if ((irank[0] == 0 && i == 0) 544 || (irank[1] == 0 && j == 0) 545 || (irank[2] == 0 && k == 0) 546 
|| (irank[0]+1 == p[0] && i+1 == lnodes[0]) 547 || (irank[1]+1 == p[1] && j+1 == lnodes[1]) 548 || (irank[2]+1 == p[2] && k+1 == lnodes[2])) 549 continue; 550 ltogind0[l0count] = ltogind[here]; 551 locind[l0count++] = here; 552 } 553 ierr = ISCreateBlock(comm, ncompu, lsize, ltogind, PETSC_OWN_POINTER, 554 <ogis); CHKERRQ(ierr); 555 ierr = VecScatterCreate(Xloc, NULL, X, ltogis, <og); CHKERRQ(ierr); 556 CHKERRQ(ierr); 557 ierr = ISCreateBlock(comm, ncompu, l0count, ltogind0, PETSC_OWN_POINTER, 558 <ogis0); CHKERRQ(ierr); 559 ierr = ISCreateBlock(comm, ncompu, l0count, locind, PETSC_OWN_POINTER, 560 &locis); CHKERRQ(ierr); 561 ierr = VecScatterCreate(Xloc, locis, X, ltogis0, <og0); CHKERRQ(ierr); 562 { 563 // Create global-to-global scatter for Dirichlet values (everything not in 564 // ltogis0, which is the range of ltog0) 565 PetscInt xstart, xend, *indD, countD = 0; 566 IS isD; 567 const PetscScalar *x; 568 ierr = VecZeroEntries(Xloc); CHKERRQ(ierr); 569 ierr = VecSet(X, 1.0); CHKERRQ(ierr); 570 ierr = VecScatterBegin(ltog0, Xloc, X, INSERT_VALUES, SCATTER_FORWARD); 571 CHKERRQ(ierr); 572 ierr = VecScatterEnd(ltog0, Xloc, X, INSERT_VALUES, SCATTER_FORWARD); 573 CHKERRQ(ierr); 574 ierr = VecGetOwnershipRange(X, &xstart, &xend); CHKERRQ(ierr); 575 ierr = PetscMalloc1(xend-xstart, &indD); CHKERRQ(ierr); 576 ierr = VecGetArrayRead(X, &x); CHKERRQ(ierr); 577 for (PetscInt i=0; i<xend-xstart; i++) { 578 if (x[i] == 1.) 
indD[countD++] = xstart + i; 579 } 580 ierr = VecRestoreArrayRead(X, &x); CHKERRQ(ierr); 581 ierr = ISCreateGeneral(comm, countD, indD, PETSC_COPY_VALUES, &isD); 582 CHKERRQ(ierr); 583 ierr = PetscFree(indD); CHKERRQ(ierr); 584 ierr = VecScatterCreate(X, isD, X, isD, >ogD); CHKERRQ(ierr); 585 ierr = ISDestroy(&isD); CHKERRQ(ierr); 586 } 587 ierr = ISDestroy(<ogis); CHKERRQ(ierr); 588 ierr = ISDestroy(<ogis0); CHKERRQ(ierr); 589 ierr = ISDestroy(&locis); CHKERRQ(ierr); 590 } 591 592 // CEED bases 593 CeedBasisCreateTensorH1Lagrange(ceed, dim, ncompu, P, Q, 594 bpOptions[bpchoice].qmode, &basisu); 595 CeedBasisCreateTensorH1Lagrange(ceed, dim, ncompx, 2, Q, 596 bpOptions[bpchoice].qmode, &basisx); 597 598 // CEED restrictions 599 CreateRestriction(ceed, CEED_INTERLACED, melem, P, ncompu, &Erestrictu); 600 CreateRestriction(ceed, CEED_NONINTERLACED, melem, 2, dim, &Erestrictx); 601 CeedInt nelem = melem[0]*melem[1]*melem[2]; 602 CeedElemRestrictionCreateStrided(ceed, nelem, Q*Q*Q, nelem*Q*Q*Q, ncompu, 603 CEED_STRIDES_BACKEND, &Erestrictui); 604 CeedElemRestrictionCreateStrided(ceed, nelem, Q*Q*Q, nelem*Q*Q*Q, 605 bpOptions[bpchoice].qdatasize, 606 CEED_STRIDES_BACKEND, &Erestrictqdi); 607 { 608 CeedScalar *xloc; 609 CeedInt shape[3] = {melem[0]+1, melem[1]+1, melem[2]+1}, len = 610 shape[0]*shape[1]*shape[2]; 611 xloc = malloc(len*ncompx*sizeof xloc[0]); 612 for (CeedInt i=0; i<shape[0]; i++) { 613 for (CeedInt j=0; j<shape[1]; j++) { 614 for (CeedInt k=0; k<shape[2]; k++) { 615 xloc[((i*shape[1]+j)*shape[2]+k) + 0*len] = 1.*(irank[0]*melem[0]+i) / 616 (p[0]*melem[0]); 617 xloc[((i*shape[1]+j)*shape[2]+k) + 1*len] = 1.*(irank[1]*melem[1]+j) / 618 (p[1]*melem[1]); 619 xloc[((i*shape[1]+j)*shape[2]+k) + 2*len] = 1.*(irank[2]*melem[2]+k) / 620 (p[2]*melem[2]); 621 } 622 } 623 } 624 CeedVectorCreate(ceed, len*ncompx, &xcoord); 625 CeedVectorSetArray(xcoord, CEED_MEM_HOST, CEED_OWN_POINTER, xloc); 626 } 627 628 // Create the Qfunction that builds the operator quadrature 
data 629 CeedQFunctionCreateInterior(ceed, 1, bpOptions[bpchoice].setupgeo, 630 bpOptions[bpchoice].setupgeofname, &qfsetupgeo); 631 CeedQFunctionAddInput(qfsetupgeo, "dx", ncompx*dim, CEED_EVAL_GRAD); 632 CeedQFunctionAddInput(qfsetupgeo, "weight", 1, CEED_EVAL_WEIGHT); 633 CeedQFunctionAddOutput(qfsetupgeo, "qdata", bpOptions[bpchoice].qdatasize, 634 CEED_EVAL_NONE); 635 636 // Create the Qfunction that sets up the RHS and true solution 637 CeedQFunctionCreateInterior(ceed, 1, bpOptions[bpchoice].setuprhs, 638 bpOptions[bpchoice].setuprhsfname, &qfsetuprhs); 639 CeedQFunctionAddInput(qfsetuprhs, "x", ncompx, CEED_EVAL_INTERP); 640 CeedQFunctionAddInput(qfsetuprhs, "dx", ncompx*dim, CEED_EVAL_GRAD); 641 CeedQFunctionAddInput(qfsetuprhs, "weight", 1, CEED_EVAL_WEIGHT); 642 CeedQFunctionAddOutput(qfsetuprhs, "true_soln", ncompu, CEED_EVAL_NONE); 643 CeedQFunctionAddOutput(qfsetuprhs, "rhs", ncompu, CEED_EVAL_INTERP); 644 645 // Set up PDE operator 646 CeedQFunctionCreateInterior(ceed, 1, bpOptions[bpchoice].apply, 647 bpOptions[bpchoice].applyfname, &qfapply); 648 // Add inputs and outputs 649 CeedInt inscale = bpOptions[bpchoice].inmode==CEED_EVAL_GRAD ? 3 : 1; 650 CeedInt outscale = bpOptions[bpchoice].outmode==CEED_EVAL_GRAD ? 
3 : 1; 651 CeedQFunctionAddInput(qfapply, "u", ncompu*inscale, 652 bpOptions[bpchoice].inmode); 653 CeedQFunctionAddInput(qfapply, "qdata", bpOptions[bpchoice].qdatasize, 654 CEED_EVAL_NONE); 655 CeedQFunctionAddOutput(qfapply, "v", ncompu*outscale, 656 bpOptions[bpchoice].outmode); 657 658 // Create the error qfunction 659 CeedQFunctionCreateInterior(ceed, 1, bpOptions[bpchoice].error, 660 bpOptions[bpchoice].errorfname, &qferror); 661 CeedQFunctionAddInput(qferror, "u", ncompu, CEED_EVAL_INTERP); 662 CeedQFunctionAddInput(qferror, "true_soln", ncompu, CEED_EVAL_NONE); 663 CeedQFunctionAddOutput(qferror, "error", ncompu, CEED_EVAL_NONE); 664 665 // Create the persistent vectors that will be needed in setup 666 CeedInt nqpts; 667 CeedBasisGetNumQuadraturePoints(basisu, &nqpts); 668 CeedVectorCreate(ceed, bpOptions[bpchoice].qdatasize*nelem*nqpts, &qdata); 669 CeedVectorCreate(ceed, nelem*nqpts*ncompu, &target); 670 CeedVectorCreate(ceed, lsize*ncompu, &rhsceed); 671 672 // Create the operator that builds the quadrature data for the ceed operator 673 CeedOperatorCreate(ceed, qfsetupgeo, CEED_QFUNCTION_NONE, 674 CEED_QFUNCTION_NONE, &opsetupgeo); 675 CeedOperatorSetField(opsetupgeo, "dx", Erestrictx, basisx, 676 CEED_VECTOR_ACTIVE); 677 CeedOperatorSetField(opsetupgeo, "weight", CEED_ELEMRESTRICTION_NONE, basisx, 678 CEED_VECTOR_NONE); 679 CeedOperatorSetField(opsetupgeo, "qdata", Erestrictqdi, 680 CEED_BASIS_COLLOCATED, CEED_VECTOR_ACTIVE); 681 682 // Create the operator that builds the RHS and true solution 683 CeedOperatorCreate(ceed, qfsetuprhs, CEED_QFUNCTION_NONE, 684 CEED_QFUNCTION_NONE, &opsetuprhs); 685 CeedOperatorSetField(opsetuprhs, "x", Erestrictx, basisx, 686 CEED_VECTOR_ACTIVE); 687 CeedOperatorSetField(opsetuprhs, "dx", Erestrictx, basisx, 688 CEED_VECTOR_ACTIVE); 689 CeedOperatorSetField(opsetuprhs, "weight", CEED_ELEMRESTRICTION_NONE, basisx, 690 CEED_VECTOR_NONE); 691 CeedOperatorSetField(opsetuprhs, "true_soln", Erestrictui, 692 
CEED_BASIS_COLLOCATED, target); 693 CeedOperatorSetField(opsetuprhs, "rhs", Erestrictu, basisu, 694 CEED_VECTOR_ACTIVE); 695 696 // Create the mass or diff operator 697 CeedOperatorCreate(ceed, qfapply, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, 698 &opapply); 699 CeedOperatorSetField(opapply, "u", Erestrictu, basisu, CEED_VECTOR_ACTIVE); 700 CeedOperatorSetField(opapply, "qdata", Erestrictqdi, CEED_BASIS_COLLOCATED, 701 qdata); 702 CeedOperatorSetField(opapply, "v", Erestrictu, basisu, CEED_VECTOR_ACTIVE); 703 704 // Create the error operator 705 CeedOperatorCreate(ceed, qferror, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, 706 &operror); 707 CeedOperatorSetField(operror, "u", Erestrictu, basisu, CEED_VECTOR_ACTIVE); 708 CeedOperatorSetField(operror, "true_soln", Erestrictui, 709 CEED_BASIS_COLLOCATED, target); 710 CeedOperatorSetField(operror, "error", Erestrictui, CEED_BASIS_COLLOCATED, 711 CEED_VECTOR_ACTIVE); 712 713 // Set up Mat 714 ierr = PetscMalloc1(1, &user); CHKERRQ(ierr); 715 user->comm = comm; 716 user->ltog = ltog; 717 if (bpchoice != CEED_BP1 && bpchoice != CEED_BP2) { 718 user->ltog0 = ltog0; 719 user->gtogD = gtogD; 720 } 721 user->Xloc = Xloc; 722 ierr = VecDuplicate(Xloc, &user->Yloc); CHKERRQ(ierr); 723 CeedVectorCreate(ceed, lsize*ncompu, &user->xceed); 724 CeedVectorCreate(ceed, lsize*ncompu, &user->yceed); 725 user->op = opapply; 726 user->qdata = qdata; 727 user->ceed = ceed; 728 729 ierr = MatCreateShell(comm, mnodes[0]*mnodes[1]*mnodes[2]*ncompu, 730 mnodes[0]*mnodes[1]*mnodes[2]*ncompu, 731 PETSC_DECIDE, PETSC_DECIDE, user, &mat); CHKERRQ(ierr); 732 if (bpchoice == CEED_BP1 || bpchoice == CEED_BP2) { 733 ierr = MatShellSetOperation(mat, MATOP_MULT, (void(*)(void))MatMult_Mass); 734 CHKERRQ(ierr); 735 } else { 736 ierr = MatShellSetOperation(mat, MATOP_MULT, (void(*)(void))MatMult_Diff); 737 CHKERRQ(ierr); 738 } 739 ierr = MatCreateVecs(mat, &rhs, NULL); CHKERRQ(ierr); 740 741 // Get RHS vector 742 ierr = VecDuplicate(Xloc, &rhsloc); 
CHKERRQ(ierr); 743 ierr = VecZeroEntries(rhsloc); CHKERRQ(ierr); 744 ierr = VecGetArray(rhsloc, &r); CHKERRQ(ierr); 745 CeedVectorSetArray(rhsceed, CEED_MEM_HOST, CEED_USE_POINTER, r); 746 747 // Setup qdata, rhs, and target 748 CeedOperatorApply(opsetupgeo, xcoord, qdata, CEED_REQUEST_IMMEDIATE); 749 CeedOperatorApply(opsetuprhs, xcoord, rhsceed, CEED_REQUEST_IMMEDIATE); 750 ierr = CeedVectorSyncArray(rhsceed, CEED_MEM_HOST); CHKERRQ(ierr); 751 CeedVectorDestroy(&xcoord); 752 753 // Gather RHS 754 ierr = VecRestoreArray(rhsloc, &r); CHKERRQ(ierr); 755 ierr = VecZeroEntries(rhs); CHKERRQ(ierr); 756 ierr = VecScatterBegin(ltog, rhsloc, rhs, ADD_VALUES, SCATTER_FORWARD); 757 CHKERRQ(ierr); 758 ierr = VecScatterEnd(ltog, rhsloc, rhs, ADD_VALUES, SCATTER_FORWARD); 759 CHKERRQ(ierr); 760 CeedVectorDestroy(&rhsceed); 761 762 ierr = KSPCreate(comm, &ksp); CHKERRQ(ierr); 763 { 764 PC pc; 765 ierr = KSPGetPC(ksp, &pc); CHKERRQ(ierr); 766 if (bpchoice == CEED_BP1 || bpchoice == CEED_BP2) { 767 ierr = PCSetType(pc, PCJACOBI); CHKERRQ(ierr); 768 ierr = PCJacobiSetType(pc, PC_JACOBI_ROWSUM); CHKERRQ(ierr); 769 } else { 770 ierr = PCSetType(pc, PCNONE); CHKERRQ(ierr); 771 } 772 ierr = KSPSetType(ksp, KSPCG); CHKERRQ(ierr); 773 ierr = KSPSetNormType(ksp, KSP_NORM_NATURAL); CHKERRQ(ierr); 774 ierr = KSPSetTolerances(ksp, 1e-10, PETSC_DEFAULT, PETSC_DEFAULT, 775 PETSC_DEFAULT); CHKERRQ(ierr); 776 } 777 ierr = KSPSetFromOptions(ksp); CHKERRQ(ierr); 778 ierr = KSPSetOperators(ksp, mat, mat); CHKERRQ(ierr); 779 // First run, if benchmarking 780 if (benchmark_mode) { 781 ierr = KSPSetTolerances(ksp, 1e-10, PETSC_DEFAULT, PETSC_DEFAULT, 1); 782 CHKERRQ(ierr); 783 my_rt_start = MPI_Wtime(); 784 ierr = KSPSolve(ksp, rhs, X); CHKERRQ(ierr); 785 my_rt = MPI_Wtime() - my_rt_start; 786 ierr = MPI_Allreduce(MPI_IN_PLACE, &my_rt, 1, MPI_DOUBLE, MPI_MIN, comm); 787 CHKERRQ(ierr); 788 // Set maxits based on first iteration timing 789 if (my_rt > 0.02) { 790 ierr = KSPSetTolerances(ksp, 1e-10, 
PETSC_DEFAULT, PETSC_DEFAULT, 5); 791 CHKERRQ(ierr); 792 } else { 793 ierr = KSPSetTolerances(ksp, 1e-10, PETSC_DEFAULT, PETSC_DEFAULT, 20); 794 CHKERRQ(ierr); 795 } 796 } 797 // Timed solve 798 ierr = PetscBarrier((PetscObject)ksp); CHKERRQ(ierr); 799 my_rt_start = MPI_Wtime(); 800 ierr = KSPSolve(ksp, rhs, X); CHKERRQ(ierr); 801 my_rt = MPI_Wtime() - my_rt_start; 802 { 803 KSPType ksptype; 804 KSPConvergedReason reason; 805 PetscReal rnorm; 806 PetscInt its; 807 ierr = KSPGetType(ksp, &ksptype); CHKERRQ(ierr); 808 ierr = KSPGetConvergedReason(ksp, &reason); CHKERRQ(ierr); 809 ierr = KSPGetIterationNumber(ksp, &its); CHKERRQ(ierr); 810 ierr = KSPGetResidualNorm(ksp, &rnorm); CHKERRQ(ierr); 811 if (!test_mode || reason < 0 || rnorm > 1e-8) { 812 ierr = MPI_Allreduce(&my_rt, &rt_min, 1, MPI_DOUBLE, MPI_MIN, comm); 813 CHKERRQ(ierr); 814 ierr = MPI_Allreduce(&my_rt, &rt_max, 1, MPI_DOUBLE, MPI_MAX, comm); 815 CHKERRQ(ierr); 816 ierr = PetscPrintf(comm, 817 " KSP:\n" 818 " KSP Type : %s\n" 819 " KSP Convergence : %s\n" 820 " Total KSP Iterations : %D\n" 821 " Final rnorm : %e\n", 822 ksptype, KSPConvergedReasons[reason], its, 823 (double)rnorm); CHKERRQ(ierr); 824 ierr = PetscPrintf(comm, 825 " Performance:\n" 826 " CG Solve Time : %g (%g) sec\n" 827 " DoFs/Sec in CG : %g (%g) million\n", 828 rt_max, rt_min, 1e-6*gsize*its/rt_max, 829 1e-6*gsize*its/rt_min); CHKERRQ(ierr); 830 } 831 if (benchmark_mode && (!test_mode)) { 832 ierr = MPI_Allreduce(&my_rt, &rt_min, 1, MPI_DOUBLE, MPI_MIN, comm); 833 CHKERRQ(ierr); 834 ierr = MPI_Allreduce(&my_rt, &rt_max, 1, MPI_DOUBLE, MPI_MAX, comm); 835 CHKERRQ(ierr); 836 ierr = PetscPrintf(comm, 837 " Performance:\n" 838 " CG Solve Time : %g (%g) sec\n" 839 " DoFs/Sec in CG : %g (%g) million\n", 840 rt_max, rt_min, 1e-6*gsize*its/rt_max, 841 1e-6*gsize*its/rt_min); CHKERRQ(ierr); 842 } 843 } 844 845 { 846 PetscReal maxerror; 847 ierr = ComputeErrorMax(user, operror, X, target, &maxerror); CHKERRQ(ierr); 848 PetscReal tol = 5e-2; 849 
if (!test_mode || maxerror > tol) { 850 ierr = PetscPrintf(comm, 851 " Pointwise Error (max) : %e\n", 852 (double)maxerror); CHKERRQ(ierr); 853 } 854 } 855 856 if (write_solution) { 857 PetscViewer vtkviewersoln; 858 859 ierr = PetscViewerCreate(comm, &vtkviewersoln); CHKERRQ(ierr); 860 ierr = PetscViewerSetType(vtkviewersoln, PETSCVIEWERVTK); CHKERRQ(ierr); 861 ierr = PetscViewerFileSetName(vtkviewersoln, "solution.vtk"); CHKERRQ(ierr); 862 ierr = VecView(X, vtkviewersoln); CHKERRQ(ierr); 863 ierr = PetscViewerDestroy(&vtkviewersoln); CHKERRQ(ierr); 864 } 865 866 ierr = VecDestroy(&rhs); CHKERRQ(ierr); 867 ierr = VecDestroy(&rhsloc); CHKERRQ(ierr); 868 ierr = VecDestroy(&X); CHKERRQ(ierr); 869 ierr = VecDestroy(&user->Xloc); CHKERRQ(ierr); 870 ierr = VecDestroy(&user->Yloc); CHKERRQ(ierr); 871 ierr = VecScatterDestroy(<og); CHKERRQ(ierr); 872 ierr = VecScatterDestroy(<og0); CHKERRQ(ierr); 873 ierr = VecScatterDestroy(>ogD); CHKERRQ(ierr); 874 ierr = MatDestroy(&mat); CHKERRQ(ierr); 875 ierr = KSPDestroy(&ksp); CHKERRQ(ierr); 876 877 CeedVectorDestroy(&user->xceed); 878 CeedVectorDestroy(&user->yceed); 879 CeedVectorDestroy(&user->qdata); 880 CeedVectorDestroy(&target); 881 CeedOperatorDestroy(&opsetupgeo); 882 CeedOperatorDestroy(&opsetuprhs); 883 CeedOperatorDestroy(&opapply); 884 CeedOperatorDestroy(&operror); 885 CeedElemRestrictionDestroy(&Erestrictu); 886 CeedElemRestrictionDestroy(&Erestrictx); 887 CeedElemRestrictionDestroy(&Erestrictui); 888 CeedElemRestrictionDestroy(&Erestrictqdi); 889 CeedQFunctionDestroy(&qfsetupgeo); 890 CeedQFunctionDestroy(&qfsetuprhs); 891 CeedQFunctionDestroy(&qfapply); 892 CeedQFunctionDestroy(&qferror); 893 CeedBasisDestroy(&basisu); 894 CeedBasisDestroy(&basisx); 895 CeedDestroy(&ceed); 896 ierr = PetscFree(user); CHKERRQ(ierr); 897 return PetscFinalize(); 898 } 899