1 static char help[] = "Benchmarking various accessing methods of DMDA vectors on host\n\n"; 2 3 /* 4 On a machine with AMD EPYC-7452 CPUs, we got this data using one MPI rank and a serial-only Kokkos: 5 Time (sec.), on Mar. 1, 2022 6 ------------------------------------------ 7 n PETSc C Kokkos 8 ------------------------------------------ 9 32 4.6464E-05 4.7451E-05 1.6880E-04 10 64 2.5654E-04 2.5164E-04 5.6780E-04 11 128 1.9383E-03 1.8878E-03 4.7938E-03 12 256 1.4450E-02 1.3619E-02 3.7778E-02 13 512 1.1580E-01 1.1551E-01 2.8428E-01 14 1024 1.4179E+00 1.3772E+00 2.2873E+00 15 16 Overall, C is -2% ~ 5% faster than PETSc. But Kokkos is 1.6~3.6x slower than PETSc 17 */ 18 19 #include <petscdmda_kokkos.hpp> 20 #include <petscdm.h> 21 #include <petscdmda.h> 22 23 using Kokkos::Iterate; 24 using Kokkos::MDRangePolicy; 25 using Kokkos::Rank; 26 using PetscScalarKokkosOffsetView3D = Kokkos::Experimental::OffsetView<PetscScalar ***, Kokkos::LayoutRight, Kokkos::HostSpace>; 27 using ConstPetscScalarKokkosOffsetView3D = Kokkos::Experimental::OffsetView<const PetscScalar ***, Kokkos::LayoutRight, Kokkos::HostSpace>; 28 29 /* PETSc multi-dimensional array access */ 30 static PetscErrorCode Update1(DM da, const PetscScalar ***__restrict__ x1, PetscScalar ***__restrict__ y1, PetscInt nwarm, PetscInt nloop, PetscLogDouble *avgTime) 31 { 32 PetscInt it, i, j, k; 33 PetscLogDouble tstart = 0.0, tend; 34 PetscInt xm, ym, zm, xs, ys, zs, gxm, gym, gzm, gxs, gys, gzs; 35 36 PetscFunctionBegin; 37 PetscCall(DMDAGetCorners(da, &xs, &ys, &zs, &xm, &ym, &zm)); 38 PetscCall(DMDAGetGhostCorners(da, &gxs, &gys, &gzs, &gxm, &gym, &gzm)); 39 for (it = 0; it < nwarm + nloop; it++) { 40 if (it == nwarm) PetscCall(PetscTime(&tstart)); 41 for (k = zs; k < zs + zm; k++) { 42 for (j = ys; j < ys + ym; j++) { 43 for (i = xs; i < xs + xm; i++) y1[k][j][i] = 6 * x1[k][j][i] - x1[k - 1][j][i] - x1[k][j - 1][i] - x1[k][j][i - 1] - x1[k + 1][j][i] - x1[k][j + 1][i] - x1[k][j][i + 1]; 44 } 45 } 46 } 47 PetscCall(PetscTime(&tend)); 48 *avgTime = (tend - tstart) / nloop; 49 PetscFunctionReturn(PETSC_SUCCESS); 50 } 51 52 /* C multi-dimensional array access */ 53 static PetscErrorCode Update2(DM da, const PetscScalar *__restrict__ x2, PetscScalar *__restrict__ y2, PetscInt nwarm, PetscInt nloop, PetscLogDouble *avgTime) 54 { 55 PetscInt it, i, j, k; 56 PetscLogDouble tstart = 0.0, tend; 57 PetscInt xm, ym, zm, xs, ys, zs, gxm, gym, gzm, gxs, gys, gzs; 58 59 PetscFunctionBegin; 60 PetscCall(DMDAGetCorners(da, &xs, &ys, &zs, &xm, &ym, &zm)); 61 PetscCall(DMDAGetGhostCorners(da, &gxs, &gys, &gzs, &gxm, &gym, &gzm)); 62 #define X2(k, j, i) x2[(k - gzs) * gym * gxm + (j - gys) * gxm + (i - gxs)] 63 #define Y2(k, j, i) y2[(k - zs) * ym * xm + (j - ys) * xm + (i - xs)] 64 for (it = 0; it < nwarm + nloop; it++) { 65 if (it == nwarm) PetscCall(PetscTime(&tstart)); 66 for (k = zs; k < zs + zm; k++) { 67 for (j = ys; j < ys + ym; j++) { 68 for (i = xs; i < xs + xm; i++) Y2(k, j, i) = 6 * X2(k, j, i) - X2(k - 1, j, i) - X2(k, j - 1, i) - X2(k, j, i - 1) - X2(k + 1, j, i) - X2(k, j + 1, i) - X2(k, j, i + 1); 69 } 70 } 71 } 72 PetscCall(PetscTime(&tend)); 73 *avgTime = (tend - tstart) / nloop; 74 #undef X2 75 #undef Y2 76 PetscFunctionReturn(PETSC_SUCCESS); 77 } 78 79 int main(int argc, char **argv) 80 { 81 DM da; 82 PetscInt xm, ym, zm, xs, ys, zs, gxm, gym, gzm, gxs, gys, gzs; 83 PetscInt dof = 1, sw = 1; 84 DMBoundaryType bx = DM_BOUNDARY_PERIODIC, by = DM_BOUNDARY_PERIODIC, bz = DM_BOUNDARY_PERIODIC; 85 DMDAStencilType st = DMDA_STENCIL_STAR; 86 Vec x, y; /* local/global vectors of the da */ 87 PetscRandom rctx; 88 const PetscScalar ***x1; 89 PetscScalar ***y1; /* arrays of g, ll */ 90 const PetscScalar *x2; 91 PetscScalar *y2; 92 ConstPetscScalarKokkosOffsetView3D x3; 93 PetscScalarKokkosOffsetView3D y3; 94 PetscLogDouble tstart = 0.0, tend = 0.0, avgTime = 0.0; 95 PetscInt nwarm = 2, nloop = 10; 96 PetscInt min = 32, max = 32 * 8; /* min and max sizes of the grids to sample */ 97 98 PetscFunctionBeginUser; 99 PetscCall(PetscInitialize(&argc, &argv, nullptr, help)); 100 PetscCall(PetscRandomCreate(PETSC_COMM_WORLD, &rctx)); 101 PetscCall(PetscOptionsGetInt(NULL, NULL, "-min", &min, NULL)); 102 PetscCall(PetscOptionsGetInt(NULL, NULL, "-max", &max, NULL)); 103 104 for (PetscInt len = min; len <= max; len = len * 2) { 105 PetscCall(DMDACreate3d(PETSC_COMM_WORLD, bx, by, bz, st, len, len, len, PETSC_DECIDE, PETSC_DECIDE, PETSC_DECIDE, dof, sw, 0, 0, 0, &da)); 106 PetscCall(DMSetFromOptions(da)); 107 PetscCall(DMSetUp(da)); 108 109 PetscCall(DMDAGetCorners(da, &xs, &ys, &zs, &xm, &ym, &zm)); 110 PetscCall(DMDAGetGhostCorners(da, &gxs, &gys, &gzs, &gxm, &gym, &gzm)); 111 PetscCall(DMCreateLocalVector(da, &x)); /* Create local x and global y */ 112 PetscCall(DMCreateGlobalVector(da, &y)); 113 114 /* Access with petsc multi-dimensional arrays */ 115 PetscCall(VecSetRandom(x, rctx)); 116 PetscCall(VecSet(y, 0.0)); 117 PetscCall(DMDAVecGetArrayRead(da, x, &x1)); 118 PetscCall(DMDAVecGetArray(da, y, &y1)); 119 PetscCall(Update1(da, x1, y1, nwarm, nloop, &avgTime)); 120 PetscCall(DMDAVecRestoreArrayRead(da, x, &x1)); 121 PetscCall(DMDAVecRestoreArray(da, y, &y1)); 122 PetscCall(PetscTime(&tend)); 123 PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%4d^3 -- PETSc average time = %e\n", static_cast<int>(len), avgTime)); 124 125 /* Access with C multi-dimensional arrays */ 126 PetscCall(VecSetRandom(x, rctx)); 127 PetscCall(VecSet(y, 0.0)); 128 PetscCall(VecGetArrayRead(x, &x2)); 129 PetscCall(VecGetArray(y, &y2)); 130 PetscCall(Update2(da, x2, y2, nwarm, nloop, &avgTime)); 131 PetscCall(VecRestoreArrayRead(x, &x2)); 132 PetscCall(VecRestoreArray(y, &y2)); 133 PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%4d^3 -- C average time = %e\n", static_cast<int>(len), avgTime)); 134 135 /* Access with Kokkos multi-dimensional OffsetViews */ 136 PetscCall(VecSet(y, 0.0)); 137 PetscCall(VecSetRandom(x, rctx)); 138 PetscCall(DMDAVecGetKokkosOffsetView(da, x, &x3)); 139 PetscCall(DMDAVecGetKokkosOffsetView(da, y, &y3)); 140 141 for (PetscInt it = 0; it < nwarm + nloop; it++) { 142 if (it == nwarm) PetscCall(PetscTime(&tstart)); 143 Kokkos::parallel_for( 144 "stencil", MDRangePolicy<Kokkos::DefaultHostExecutionSpace, Rank<3, Iterate::Right, Iterate::Right>>({zs, ys, xs}, {zs + zm, ys + ym, xs + xm}), 145 KOKKOS_LAMBDA(PetscInt k, PetscInt j, PetscInt i) { y3(k, j, i) = 6 * x3(k, j, i) - x3(k - 1, j, i) - x3(k, j - 1, i) - x3(k, j, i - 1) - x3(k + 1, j, i) - x3(k, j + 1, i) - x3(k, j, i + 1); }); 146 } 147 PetscCall(PetscTime(&tend)); 148 PetscCall(DMDAVecRestoreKokkosOffsetView(da, x, &x3)); 149 PetscCall(DMDAVecRestoreKokkosOffsetView(da, y, &y3)); 150 avgTime = (tend - tstart) / nloop; 151 PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%4d^3 -- Kokkos average time = %e\n", static_cast<int>(len), avgTime)); 152 153 PetscCall(VecDestroy(&x)); 154 PetscCall(VecDestroy(&y)); 155 PetscCall(DMDestroy(&da)); 156 } 157 PetscCall(PetscRandomDestroy(&rctx)); 158 PetscCall(PetscFinalize()); 159 return 0; 160 } 161 162 /*TEST 163 build: 164 requires: kokkos_kernels 165 166 test: 167 suffix: 1 168 requires: kokkos_kernels 169 args: -min 32 -max 32 -dm_vec_type kokkos 170 filter: grep -v "time" 171 172 TEST*/ 173