xref: /libCEED/backends/magma/tuning/generate_tuning.py (revision 26bdecf31bec54c17c7af5a516affbb56f2e8d19)
1#!/usr/bin/env python3
2
3# Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
4# Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
5# All Rights reserved. See files LICENSE and NOTICE for details.
6#
7# This file is part of CEED, a collection of benchmarks, miniapps, software
8# libraries and APIs for efficient high-order finite element and spectral
9# element discretizations for exascale applications. For more information and
10# source code availability see http://github.com/ceed
11#
12# The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
13# a collaborative effort of two U.S. Department of Energy organizations (Office
14# of Science and the National Nuclear Security Administration) responsible for
15# the planning and preparation of a capable exascale ecosystem, including
16# software, applications, hardware, advanced system engineering and early
17# testbed platforms, in support of the nation's exascale computing imperative.
18
19import argparse
20import os
21import io
22import re
23import subprocess
24import pandas as pd
25import time
26
27script_dir = os.path.dirname(os.path.realpath(__file__))
28
29
def build(nb, build_cmd):
    """Rebuild libCEED and the tuning executable for block size ``nb``.

    The ``#define CEED_AUTOTUNE_RTC_NB`` line of
    ``../ceed-magma-gemm-selector.cpp`` (relative to this script) is rewritten
    so that it defines ``nb``. ``build_cmd`` is then run three directories
    above this script, followed by ``make tuning`` in this script's directory.

    Args:
        nb: Block size written after ``CEED_AUTOTUNE_RTC_NB``.
        build_cmd: Build command, either an argument list or a single string
            such as ``"make -j8"``.

    Raises:
        subprocess.CalledProcessError: If either build step exits with a
            non-zero status.
    """
    # Imported locally so this function does not depend on a file-level import.
    import shlex

    selector_path = f"{script_dir}/../ceed-magma-gemm-selector.cpp"
    with open(selector_path, 'r') as f:
        source = f.read()
    # '.' does not match newlines, so only the line holding the define is
    # replaced; everything else in the file is left untouched.
    source = re.sub(
        '.*(#define CEED_AUTOTUNE_RTC_NB).*',
        r'\1' + f" {nb}",
        source)
    with open(selector_path, 'w') as f:
        f.write(source)

    # Without a shell, a plain string is taken as the program name, so a
    # command carrying arguments must be split into an argument list first.
    if isinstance(build_cmd, str):
        build_cmd = shlex.split(build_cmd)

    # check=True: stop on a failed build rather than benchmarking a stale
    # executable left over from a previous value of nb.
    subprocess.run(build_cmd, cwd=f"{script_dir}/../../..", check=True)
    subprocess.run(["make", "tuning"], cwd=f"{script_dir}", check=True)
40    subprocess.run(["make", "tuning"], cwd=f"{script_dir}")
41
42
def benchmark(backend):
    """Run the ``tuning`` executable for ``backend`` and parse its output.

    The executable's standard output is read as whitespace-separated records
    without a header row.

    Args:
        backend: Resource specifier passed to ``tuning`` as its only argument.

    Returns:
        A ``pandas.DataFrame`` with the columns ``P``, ``N``, ``Q``,
        ``Q_COMP``, ``TRANS`` and ``MFLOPS``.
    """
    # NOTE(review): assumes `make tuning` leaves the executable next to this
    # script (build() runs it with cwd=script_dir). Running it from there makes
    # the script independent of the caller's working directory.
    result = subprocess.run(
        [os.path.join(script_dir, "tuning"), f"{backend}"],
        cwd=script_dir,
        capture_output=True)
    # sep=r'\s+' is the documented equivalent of the deprecated
    # delim_whitespace=True option.
    return pd.read_csv(io.StringIO(result.stdout.decode('utf-8')), header=None,
                       sep=r'\s+', names=['P', 'N', 'Q', 'Q_COMP', 'TRANS', 'MFLOPS'])
47
48
def merge_best(best, data_nb, nb):
    """Fold the benchmark results for block size ``nb`` into ``best``.

    Args:
        best: Frame of the best results so far (with an ``NB`` column), or
            ``None`` before the first block size has been benchmarked.
        data_nb: Benchmark results for ``nb``; rows must line up with ``best``.
        nb: Block size that produced ``data_nb``.

    Returns:
        The frame of best results, where every row whose ``MFLOPS`` value in
        ``data_nb`` is strictly larger now records ``nb`` and that value.
    """
    if best is None:
        best = pd.DataFrame(data_nb).copy()
        best['NB'] = nb
        return best
    faster = data_nb['MFLOPS'] > best['MFLOPS']
    best.loc[faster, 'NB'] = nb
    best.loc[faster, 'MFLOPS'] = data_nb.loc[faster, 'MFLOPS']
    return best


def format_rtc_table(data, trans, arch):
    """Return the C++ initializer for the rows of ``data`` with ``TRANS == trans``.

    Each selected row becomes one ``{P, N, Q, Q_COMP, NB}`` record. The values
    are emitted directly as integers (right-aligned per column) instead of
    being recovered from ``DataFrame.to_string`` output, so a record can never
    start with a stray comma and an empty selection yields an empty
    initializer.

    Args:
        data: Frame with the columns ``P``, ``N``, ``Q``, ``Q_COMP``, ``NB``
            and ``TRANS``.
        trans: ``1`` selects the rows for ``drtc_t_<arch>``, ``0`` the rows
            for ``drtc_n_<arch>``.
        arch: Architecture name used as the suffix of the variable name.

    Returns:
        The separator comment line, the variable definition and its records,
        terminated by ``"};\\n"``.
    """
    columns = ['P', 'N', 'Q', 'Q_COMP', 'NB']
    selected = data.loc[data['TRANS'] == trans, columns]
    records = [[str(int(value)) for value in record]
               for record in selected.itertuples(index=False, name=None)]
    widths = [max(len(record[i]) for record in records)
              for i in range(len(columns))] if records else []
    lines = ["    {" + ", ".join(value.rjust(width)
                                 for value, width in zip(record, widths)) + "}"
             for record in records]
    body = ",\n".join(lines)
    suffix = 't' if trans else 'n'
    return (
        "////////////////////////////////////////////////////////////////////////////////\n"
        + f"std::vector<std::array<int, RECORD_LENGTH_RTC> > drtc_{suffix}_{arch}"
        + " = {\n"
        + (body + "\n" if body else "")
        + "};\n")


def main():
    """Benchmark every block size up to ``-max-nb`` and write ``<arch>_rtc.h``.

    For each NB from 1 to ``-max-nb`` the code is rebuilt and benchmarked, and
    the NB with the highest ``MFLOPS`` value is kept per row. The result is
    written next to this script as two tables, one for ``TRANS == 1`` and one
    for ``TRANS == 0``.
    """
    # Command line arguments
    parser = argparse.ArgumentParser("MAGMA RTC autotuning")
    parser.add_argument(
        "-arch",
        help="Device architecture name for tuning data",
        required=True)
    parser.add_argument(
        "-max-nb",
        help="Maximum block size NB to consider for autotuning",
        default=32,
        type=int)
    parser.add_argument(
        "-ceed",
        help="Ceed resource specifier",
        default="/cpu/self")
    parser.add_argument(
        "-build-cmd",
        help="Command used to build libCEED from the source root directory",
        default="make")
    args = parser.parse_args()

    # With no block size to try there would be no data to write at all.
    if args.max_nb < 1:
        parser.error("-max-nb must be at least 1")

    best = None
    for nb in range(1, args.max_nb + 1):
        # Rebuild the code for the given value of NB
        build(nb, args.build_cmd)

        # Run the benchmarks
        start = time.perf_counter()
        data_nb = benchmark(args.ceed)
        print(
            f"Finished benchmarks for NB = {nb}, backend = {args.ceed} ({time.perf_counter() - start} s)")

        # Save the data for the highest performing NB
        best = merge_best(best, data_nb, nb)

    # Print the results
    with open(f"{script_dir}/{args.arch}_rtc.h", 'w') as f:
        f.write(
            "////////////////////////////////////////////////////////////////////////////////\n")
        f.write(f"// auto-generated from data on {args.arch}\n\n")
        f.write(format_rtc_table(best, 1, args.arch))
        f.write("\n")
        f.write(format_rtc_table(best, 0, args.arch))


if __name__ == "__main__":
    main()
123