1d16a9515SJunchao Zhang static char help[] = "Benchmarking various accessing methods of DMDA vectors on host\n\n";
2d16a9515SJunchao Zhang
3d16a9515SJunchao Zhang /*
4d16a9515SJunchao Zhang On a machine with AMD EPYC-7452 CPUs, we got this data using one MPI rank and a serial-only Kokkos:
5d16a9515SJunchao Zhang Time (sec.), on Mar. 1, 2022
6d16a9515SJunchao Zhang ------------------------------------------
7d16a9515SJunchao Zhang n PETSc C Kokkos
8d16a9515SJunchao Zhang ------------------------------------------
9d16a9515SJunchao Zhang 32 4.6464E-05 4.7451E-05 1.6880E-04
10d16a9515SJunchao Zhang 64 2.5654E-04 2.5164E-04 5.6780E-04
11d16a9515SJunchao Zhang 128 1.9383E-03 1.8878E-03 4.7938E-03
12d16a9515SJunchao Zhang 256 1.4450E-02 1.3619E-02 3.7778E-02
13d16a9515SJunchao Zhang 512 1.1580E-01 1.1551E-01 2.8428E-01
14d16a9515SJunchao Zhang 1024 1.4179E+00 1.3772E+00 2.2873E+00
15d16a9515SJunchao Zhang
16d16a9515SJunchao Zhang Overall, C is -2% ~ 5% faster than PETSc. But Kokkos is 1.6~3.6x slower than PETSc
17d16a9515SJunchao Zhang */
18d16a9515SJunchao Zhang
19d16a9515SJunchao Zhang #include <petscdmda_kokkos.hpp>
20d16a9515SJunchao Zhang #include <petscdm.h>
21d16a9515SJunchao Zhang #include <petscdmda.h>
2245402d8aSJunchao Zhang #include <Kokkos_DualView.hpp>
23d16a9515SJunchao Zhang
24209362dfSJunchao Zhang using Kokkos::Iterate;
25209362dfSJunchao Zhang using Kokkos::MDRangePolicy;
26209362dfSJunchao Zhang using Kokkos::Rank;
2745402d8aSJunchao Zhang using HostMirrorMemorySpace = Kokkos::DualView<PetscScalar *>::host_mirror_space::memory_space;
2845402d8aSJunchao Zhang using PetscScalarKokkosOffsetView3D = Kokkos::Experimental::OffsetView<PetscScalar ***, Kokkos::LayoutRight, HostMirrorMemorySpace>;
2945402d8aSJunchao Zhang using ConstPetscScalarKokkosOffsetView3D = Kokkos::Experimental::OffsetView<const PetscScalar ***, Kokkos::LayoutRight, HostMirrorMemorySpace>;
30d16a9515SJunchao Zhang
31d16a9515SJunchao Zhang /* PETSc multi-dimensional array access */
Update1(DM da,const PetscScalar *** __restrict__ x1,PetscScalar *** __restrict__ y1,PetscInt nwarm,PetscInt nloop,PetscLogDouble * avgTime)32d71ae5a4SJacob Faibussowitsch static PetscErrorCode Update1(DM da, const PetscScalar ***__restrict__ x1, PetscScalar ***__restrict__ y1, PetscInt nwarm, PetscInt nloop, PetscLogDouble *avgTime)
33d71ae5a4SJacob Faibussowitsch {
34d16a9515SJunchao Zhang PetscInt it, i, j, k;
35d16a9515SJunchao Zhang PetscLogDouble tstart = 0.0, tend;
36d16a9515SJunchao Zhang PetscInt xm, ym, zm, xs, ys, zs, gxm, gym, gzm, gxs, gys, gzs;
37d16a9515SJunchao Zhang
38d16a9515SJunchao Zhang PetscFunctionBegin;
399566063dSJacob Faibussowitsch PetscCall(DMDAGetCorners(da, &xs, &ys, &zs, &xm, &ym, &zm));
409566063dSJacob Faibussowitsch PetscCall(DMDAGetGhostCorners(da, &gxs, &gys, &gzs, &gxm, &gym, &gzm));
41d16a9515SJunchao Zhang for (it = 0; it < nwarm + nloop; it++) {
429566063dSJacob Faibussowitsch if (it == nwarm) PetscCall(PetscTime(&tstart));
43d16a9515SJunchao Zhang for (k = zs; k < zs + zm; k++) {
44d16a9515SJunchao Zhang for (j = ys; j < ys + ym; j++) {
45ad540459SPierre Jolivet for (i = xs; i < xs + xm; i++) y1[k][j][i] = 6 * x1[k][j][i] - x1[k - 1][j][i] - x1[k][j - 1][i] - x1[k][j][i - 1] - x1[k + 1][j][i] - x1[k][j + 1][i] - x1[k][j][i + 1];
46d16a9515SJunchao Zhang }
47d16a9515SJunchao Zhang }
48d16a9515SJunchao Zhang }
499566063dSJacob Faibussowitsch PetscCall(PetscTime(&tend));
50d16a9515SJunchao Zhang *avgTime = (tend - tstart) / nloop;
513ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
52d16a9515SJunchao Zhang }
53d16a9515SJunchao Zhang
54d16a9515SJunchao Zhang /* C multi-dimensional array access */
Update2(DM da,const PetscScalar * __restrict__ x2,PetscScalar * __restrict__ y2,PetscInt nwarm,PetscInt nloop,PetscLogDouble * avgTime)55d71ae5a4SJacob Faibussowitsch static PetscErrorCode Update2(DM da, const PetscScalar *__restrict__ x2, PetscScalar *__restrict__ y2, PetscInt nwarm, PetscInt nloop, PetscLogDouble *avgTime)
56d71ae5a4SJacob Faibussowitsch {
57d16a9515SJunchao Zhang PetscInt it, i, j, k;
58d16a9515SJunchao Zhang PetscLogDouble tstart = 0.0, tend;
59d16a9515SJunchao Zhang PetscInt xm, ym, zm, xs, ys, zs, gxm, gym, gzm, gxs, gys, gzs;
60d16a9515SJunchao Zhang
61d16a9515SJunchao Zhang PetscFunctionBegin;
629566063dSJacob Faibussowitsch PetscCall(DMDAGetCorners(da, &xs, &ys, &zs, &xm, &ym, &zm));
639566063dSJacob Faibussowitsch PetscCall(DMDAGetGhostCorners(da, &gxs, &gys, &gzs, &gxm, &gym, &gzm));
64d16a9515SJunchao Zhang #define X2(k, j, i) x2[(k - gzs) * gym * gxm + (j - gys) * gxm + (i - gxs)]
65d16a9515SJunchao Zhang #define Y2(k, j, i) y2[(k - zs) * ym * xm + (j - ys) * xm + (i - xs)]
66d16a9515SJunchao Zhang for (it = 0; it < nwarm + nloop; it++) {
679566063dSJacob Faibussowitsch if (it == nwarm) PetscCall(PetscTime(&tstart));
68d16a9515SJunchao Zhang for (k = zs; k < zs + zm; k++) {
69d16a9515SJunchao Zhang for (j = ys; j < ys + ym; j++) {
70ad540459SPierre Jolivet for (i = xs; i < xs + xm; i++) Y2(k, j, i) = 6 * X2(k, j, i) - X2(k - 1, j, i) - X2(k, j - 1, i) - X2(k, j, i - 1) - X2(k + 1, j, i) - X2(k, j + 1, i) - X2(k, j, i + 1);
71d16a9515SJunchao Zhang }
72d16a9515SJunchao Zhang }
73d16a9515SJunchao Zhang }
749566063dSJacob Faibussowitsch PetscCall(PetscTime(&tend));
75d16a9515SJunchao Zhang *avgTime = (tend - tstart) / nloop;
76d16a9515SJunchao Zhang #undef X2
77d16a9515SJunchao Zhang #undef Y2
783ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
79d16a9515SJunchao Zhang }
80d16a9515SJunchao Zhang
main(int argc,char ** argv)81d71ae5a4SJacob Faibussowitsch int main(int argc, char **argv)
82d71ae5a4SJacob Faibussowitsch {
83d16a9515SJunchao Zhang DM da;
84d16a9515SJunchao Zhang PetscInt xm, ym, zm, xs, ys, zs, gxm, gym, gzm, gxs, gys, gzs;
85d16a9515SJunchao Zhang PetscInt dof = 1, sw = 1;
86d16a9515SJunchao Zhang DMBoundaryType bx = DM_BOUNDARY_PERIODIC, by = DM_BOUNDARY_PERIODIC, bz = DM_BOUNDARY_PERIODIC;
87d16a9515SJunchao Zhang DMDAStencilType st = DMDA_STENCIL_STAR;
88d16a9515SJunchao Zhang Vec x, y; /* local/global vectors of the da */
89d16a9515SJunchao Zhang PetscRandom rctx;
90d16a9515SJunchao Zhang const PetscScalar ***x1;
91d16a9515SJunchao Zhang PetscScalar ***y1; /* arrays of g, ll */
92d16a9515SJunchao Zhang const PetscScalar *x2;
93d16a9515SJunchao Zhang PetscScalar *y2;
94d16a9515SJunchao Zhang ConstPetscScalarKokkosOffsetView3D x3;
95d16a9515SJunchao Zhang PetscScalarKokkosOffsetView3D y3;
96d16a9515SJunchao Zhang PetscLogDouble tstart = 0.0, tend = 0.0, avgTime = 0.0;
97d16a9515SJunchao Zhang PetscInt nwarm = 2, nloop = 10;
98d16a9515SJunchao Zhang PetscInt min = 32, max = 32 * 8; /* min and max sizes of the grids to sample */
99d16a9515SJunchao Zhang
100327415f7SBarry Smith PetscFunctionBeginUser;
101c8025a54SPierre Jolivet PetscCall(PetscInitialize(&argc, &argv, nullptr, help));
1029566063dSJacob Faibussowitsch PetscCall(PetscRandomCreate(PETSC_COMM_WORLD, &rctx));
1039566063dSJacob Faibussowitsch PetscCall(PetscOptionsGetInt(NULL, NULL, "-min", &min, NULL));
1049566063dSJacob Faibussowitsch PetscCall(PetscOptionsGetInt(NULL, NULL, "-max", &max, NULL));
105d16a9515SJunchao Zhang
106d16a9515SJunchao Zhang for (PetscInt len = min; len <= max; len = len * 2) {
1079566063dSJacob Faibussowitsch PetscCall(DMDACreate3d(PETSC_COMM_WORLD, bx, by, bz, st, len, len, len, PETSC_DECIDE, PETSC_DECIDE, PETSC_DECIDE, dof, sw, 0, 0, 0, &da));
1089566063dSJacob Faibussowitsch PetscCall(DMSetFromOptions(da));
1099566063dSJacob Faibussowitsch PetscCall(DMSetUp(da));
110d16a9515SJunchao Zhang
1119566063dSJacob Faibussowitsch PetscCall(DMDAGetCorners(da, &xs, &ys, &zs, &xm, &ym, &zm));
1129566063dSJacob Faibussowitsch PetscCall(DMDAGetGhostCorners(da, &gxs, &gys, &gzs, &gxm, &gym, &gzm));
1139566063dSJacob Faibussowitsch PetscCall(DMCreateLocalVector(da, &x)); /* Create local x and global y */
1149566063dSJacob Faibussowitsch PetscCall(DMCreateGlobalVector(da, &y));
115d16a9515SJunchao Zhang
116f0b74427SPierre Jolivet /* Access with PETSc multi-dimensional arrays */
1179566063dSJacob Faibussowitsch PetscCall(VecSetRandom(x, rctx));
1189566063dSJacob Faibussowitsch PetscCall(VecSet(y, 0.0));
1199566063dSJacob Faibussowitsch PetscCall(DMDAVecGetArrayRead(da, x, &x1));
1209566063dSJacob Faibussowitsch PetscCall(DMDAVecGetArray(da, y, &y1));
1219566063dSJacob Faibussowitsch PetscCall(Update1(da, x1, y1, nwarm, nloop, &avgTime));
1229566063dSJacob Faibussowitsch PetscCall(DMDAVecRestoreArrayRead(da, x, &x1));
1239566063dSJacob Faibussowitsch PetscCall(DMDAVecRestoreArray(da, y, &y1));
1249566063dSJacob Faibussowitsch PetscCall(PetscTime(&tend));
125309cecaaSJunchao Zhang PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%4d^3 -- PETSc average time = %e\n", static_cast<int>(len), avgTime));
126d16a9515SJunchao Zhang
127d16a9515SJunchao Zhang /* Access with C multi-dimensional arrays */
1289566063dSJacob Faibussowitsch PetscCall(VecSetRandom(x, rctx));
1299566063dSJacob Faibussowitsch PetscCall(VecSet(y, 0.0));
1309566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(x, &x2));
1319566063dSJacob Faibussowitsch PetscCall(VecGetArray(y, &y2));
1329566063dSJacob Faibussowitsch PetscCall(Update2(da, x2, y2, nwarm, nloop, &avgTime));
1339566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(x, &x2));
1349566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(y, &y2));
135309cecaaSJunchao Zhang PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%4d^3 -- C average time = %e\n", static_cast<int>(len), avgTime));
136d16a9515SJunchao Zhang
137d16a9515SJunchao Zhang /* Access with Kokkos multi-dimensional OffsetViews */
1389566063dSJacob Faibussowitsch PetscCall(VecSet(y, 0.0));
1399566063dSJacob Faibussowitsch PetscCall(VecSetRandom(x, rctx));
1409566063dSJacob Faibussowitsch PetscCall(DMDAVecGetKokkosOffsetView(da, x, &x3));
1419566063dSJacob Faibussowitsch PetscCall(DMDAVecGetKokkosOffsetView(da, y, &y3));
142d16a9515SJunchao Zhang
143d16a9515SJunchao Zhang for (PetscInt it = 0; it < nwarm + nloop; it++) {
1449566063dSJacob Faibussowitsch if (it == nwarm) PetscCall(PetscTime(&tstart));
1459371c9d4SSatish Balay Kokkos::parallel_for(
1469371c9d4SSatish Balay "stencil", MDRangePolicy<Kokkos::DefaultHostExecutionSpace, Rank<3, Iterate::Right, Iterate::Right>>({zs, ys, xs}, {zs + zm, ys + ym, xs + xm}),
1479371c9d4SSatish Balay KOKKOS_LAMBDA(PetscInt k, PetscInt j, PetscInt i) { y3(k, j, i) = 6 * x3(k, j, i) - x3(k - 1, j, i) - x3(k, j - 1, i) - x3(k, j, i - 1) - x3(k + 1, j, i) - x3(k, j + 1, i) - x3(k, j, i + 1); });
148d16a9515SJunchao Zhang }
1499566063dSJacob Faibussowitsch PetscCall(PetscTime(&tend));
1509566063dSJacob Faibussowitsch PetscCall(DMDAVecRestoreKokkosOffsetView(da, x, &x3));
1519566063dSJacob Faibussowitsch PetscCall(DMDAVecRestoreKokkosOffsetView(da, y, &y3));
152d16a9515SJunchao Zhang avgTime = (tend - tstart) / nloop;
153309cecaaSJunchao Zhang PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%4d^3 -- Kokkos average time = %e\n", static_cast<int>(len), avgTime));
154d16a9515SJunchao Zhang
1559566063dSJacob Faibussowitsch PetscCall(VecDestroy(&x));
1569566063dSJacob Faibussowitsch PetscCall(VecDestroy(&y));
1579566063dSJacob Faibussowitsch PetscCall(DMDestroy(&da));
158d16a9515SJunchao Zhang }
1599566063dSJacob Faibussowitsch PetscCall(PetscRandomDestroy(&rctx));
1609566063dSJacob Faibussowitsch PetscCall(PetscFinalize());
161b122ec5aSJacob Faibussowitsch return 0;
162d16a9515SJunchao Zhang }
163d16a9515SJunchao Zhang
164d16a9515SJunchao Zhang /*TEST
165d16a9515SJunchao Zhang build:
166d16a9515SJunchao Zhang requires: kokkos_kernels
167d16a9515SJunchao Zhang
168d16a9515SJunchao Zhang test:
169d16a9515SJunchao Zhang suffix: 1
170d16a9515SJunchao Zhang requires: kokkos_kernels
171d16a9515SJunchao Zhang args: -min 32 -max 32 -dm_vec_type kokkos
172d16a9515SJunchao Zhang filter: grep -v "time"
173*3886731fSPierre Jolivet output_file: output/empty.out
174d16a9515SJunchao Zhang
175d16a9515SJunchao Zhang TEST*/
176