xref: /libCEED/backends/magma/tuning/tuning.cpp (revision acc0bb127f9d52b89fa0cb7f74c98dc79acc3cb0)
1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
#include <ceed.h>
#include <algorithm>
#include <array>
#include <chrono>
#include <cstdio>
#include <iostream>
#include <random>
#include <utility>
#include <vector>
15 
// clang-format off
// Benchmark cases, one {P, Q, dim} triplet per row. The current set covers some standard H1 spaces on triangles
// (dim = 2) and tetrahedra (dim = 3); further quadrature rules and element types can be appended later.
static constexpr std::array<std::array<int, 3>, 11> PQ_VALUES = {{
    // dim = 2
    { 3,  1, 2},
    { 6,  3, 2},
    {10,  6, 2},
    {15, 12, 2},
    {21, 16, 2},
    {28, 25, 2},
    {36, 33, 2},
    // dim = 3
    { 4,  1, 3},
    {10,  4, 3},
    {20, 11, 3},
    {35, 24, 3},
}};
// clang-format on
24 
// Problem sizes, one {N, NUM_TRIALS} pair per row: N is the element count handed to CeedBasisApply and NUM_TRIALS
// is how many timed repetitions are run at that size (fewer trials as N grows).
static constexpr std::array<std::pair<int, int>, 7> N_VALUES = {{
    {1024, 200},
    {5120, 200},
    {10240, 100},
    {51200, 100},
    {102400, 50},
    {512000, 50},
    {1024000, 25},
}};
28 
// Timing vocabulary: a steady clock for measuring intervals, and a double-valued duration whose count() is in seconds.
using Clock    = std::chrono::steady_clock;
using Duration = std::chrono::duration<double, std::chrono::seconds::period>;
31 
32 int main(int argc, char **argv) {
33   Ceed ceed;
34 
35   std::random_device               rand_device;
36   std::default_random_engine       rand_engine(rand_device());
37   std::uniform_real_distribution<> rand_dist(0.0, 1.0);
38   auto                             generate_random = [&rand_dist, &rand_engine]() { return rand_dist(rand_engine); };
39 
40   if (argc < 2) {
41     printf("Usage: ./tuning <CEED_RESOURCE>");
42     return 1;
43   }
44   CeedInit(argv[1], &ceed);
45   CeedSetErrorHandler(ceed, CeedErrorStore);
46 
47   for (const auto [P, Q, dim] : PQ_VALUES) {
48     CeedBasis  basis;
49     CeedVector u, v;
50 
51     std::vector<double> q_ref(dim * Q, 0.0), q_weight(Q, 0.0), interp(P * Q), grad(P * Q * dim);
52     std::generate(interp.begin(), interp.end(), generate_random);
53     std::generate(grad.begin(), grad.end(), generate_random);
54 
55     CeedBasisCreateH1(ceed, (dim < 3) ? CEED_TOPOLOGY_TRIANGLE : CEED_TOPOLOGY_TET, 1, P, Q, interp.data(), grad.data(), q_ref.data(),
56                       q_weight.data(), &basis);
57 
58     for (const auto [N, NUM_TRIALS] : N_VALUES) {
59       double data_interp_n = 0.0, data_interp_t = 0.0, data_grad_n = 0.0, data_grad_t = 0.0;
60       int    ierr;
61 
62       // Interp
63       {
64         CeedVectorCreate(ceed, P * N, &u);
65         CeedVectorCreate(ceed, Q * N, &v);
66 
67         // NoTranspose
68         CeedVectorSetValue(u, 1.0);
69         CeedVectorSetValue(v, 0.0);
70         ierr = CeedBasisApply(basis, N, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, u, v);
71         if (!ierr) {
72           const auto start = Clock::now();
73           for (int trial = 0; trial < NUM_TRIALS; trial++) {
74             CeedBasisApply(basis, N, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, u, v);
75           }
76           data_interp_n = std::chrono::duration_cast<Duration>(Clock::now() - start).count();
77         }
78 
79         // Transpose
80         CeedVectorSetValue(u, 1.0);
81         CeedVectorSetValue(v, 0.0);
82         ierr = CeedBasisApply(basis, N, CEED_TRANSPOSE, CEED_EVAL_INTERP, v, u);
83         if (!ierr) {
84           const auto start = Clock::now();
85           for (int trial = 0; trial < NUM_TRIALS; trial++) {
86             CeedBasisApply(basis, N, CEED_TRANSPOSE, CEED_EVAL_INTERP, v, u);
87           }
88           data_interp_t = std::chrono::duration_cast<Duration>(Clock::now() - start).count();
89         }
90 
91         CeedVectorDestroy(&u);
92         CeedVectorDestroy(&v);
93       }
94 
95       // Grad
96       {
97         CeedVectorCreate(ceed, P * N, &u);
98         CeedVectorCreate(ceed, dim * Q * N, &v);
99 
100         // NoTranspose
101         CeedVectorSetValue(u, 1.0);
102         CeedVectorSetValue(v, 0.0);
103         ierr = CeedBasisApply(basis, N, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, u, v);
104         if (!ierr) {
105           const auto start = Clock::now();
106           for (int trial = 0; trial < NUM_TRIALS; trial++) {
107             CeedBasisApply(basis, N, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, u, v);
108           }
109           data_grad_n = std::chrono::duration_cast<Duration>(Clock::now() - start).count();
110         }
111 
112         // Transpose
113         CeedVectorSetValue(u, 1.0);
114         CeedVectorSetValue(v, 0.0);
115         ierr = CeedBasisApply(basis, N, CEED_TRANSPOSE, CEED_EVAL_GRAD, v, u);
116         if (!ierr) {
117           const auto start = Clock::now();
118           for (int trial = 0; trial < NUM_TRIALS; trial++) {
119             CeedBasisApply(basis, N, CEED_TRANSPOSE, CEED_EVAL_GRAD, v, u);
120           }
121           data_grad_t = std::chrono::duration_cast<Duration>(Clock::now() - start).count();
122         }
123 
124         CeedVectorDestroy(&u);
125         CeedVectorDestroy(&v);
126       }
127 
128       // Postprocess and log the data
129       const double  interp_flops = P * Q * (double)N;
130       const double  grad_flops   = P * Q * dim * (double)N;
131       constexpr int width = 12, precision = 2;
132       // clang-format off
133       std::printf("%-*d%-*d%-*d%-*d%-*d%*.*f\n",
134                   width, P, width, Q, width, N, width, 1, width, 0, width, precision,
135                   (data_interp_n > 0.0) ? 1e-6 * NUM_TRIALS * interp_flops / data_interp_n : 0.0);
136       std::printf("%-*d%-*d%-*d%-*d%-*d%*.*f\n",
137                   width, P, width, Q, width, N, width, 1, width, 1, width, precision,
138                   (data_interp_t > 0.0) ? 1e-6 * NUM_TRIALS * interp_flops / data_interp_t : 0.0);
139       std::printf("%-*d%-*d%-*d%-*d%-*d%*.*f\n",
140                   width, P, width, Q, width, N, width, dim, width, 0, width, precision,
141                   (data_grad_n > 0.0) ? 1e-6 * NUM_TRIALS * grad_flops / data_grad_n : 0.0);
142       std::printf("%-*d%-*d%-*d%-*d%-*d%*.*f\n",
143                   width, P, width, Q, width, N, width, dim, width, 1, width, precision,
144                   (data_grad_t > 0.0) ? 1e-6 * NUM_TRIALS * grad_flops / data_grad_t : 0.0);
145       // clang-format on
146     }
147 
148     CeedBasisDestroy(&basis);
149   }
150 
151   CeedDestroy(&ceed);
152   return 0;
153 }
154