#!/usr/bin/env python3

# Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
# All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
#
# SPDX-License-Identifier: BSD-2-Clause
#
# This file is part of CEED:  http://github.com/ceed
"""Autotune the MAGMA RTC block size NB and emit a C++ tuning-data header.

For each candidate NB the script patches two backend sources, rebuilds,
runs the ``tuning`` executable, and keeps, per benchmark row, the NB that
gave the best MFLOPS. The result is written to ``<arch>_rtc.h`` next to
this script.
"""

import argparse
import os
import glob
import re
import shlex
import shutil
import subprocess
import time

import pandas as pd

script_dir = os.path.dirname(os.path.realpath(__file__))

# Columns of the whitespace-separated records read from the tuning log.
LOG_COLUMNS = ["P", "Q", "N", "Q_COMP", "TRANS", "MFLOPS"]

# Columns written, in this order, for every record of the generated tables.
TABLE_COLUMNS = ["P", "Q", "N", "Q_COMP", "NB"]

# C++ type of the generated tables.
# NOTE(review): the template argument was missing from the emitted text
# ("std::vector > name"), which is not valid C++. It is restored here as an
# array of ints of length RECORD_LENGTH_RTC -- confirm the macro name against
# the source file that includes the generated header.
TABLE_TYPE = "std::vector<std::array<int, RECORD_LENGTH_RTC> >"

SEPARATOR = "////////////////////////////////////////////////////////////////////////////////\n"


def substitute_define(text, pattern, value):
    """Return *text* with every line matching *pattern* replaced.

    *pattern* must capture the ``#define NAME`` part in group 1; each matching
    line becomes ``<group 1> <value>``. A callable replacement is used so that
    *value* is never interpreted as a regex replacement template.
    """
    return re.sub(pattern, lambda m: f"{m.group(1)} {value}", text)


def next_nb(nb):
    """Return the next block size to try after *nb*.

    The search is thinned out to save time: 1, 2, 4, 6, 8, then steps of 4.
    """
    if nb < 2:
        return nb * 2
    if nb < 8:
        return nb + 2
    return nb + 4


def update_best(best, data_nb, nb):
    """Merge the results for block size *nb* into the running best table.

    Args:
        best: Best results so far (with an ``NB`` column), or ``None`` when
            *data_nb* is the first set of results.
        data_nb: Results for block size *nb*, row-aligned with *best*.
        nb: Block size that produced *data_nb*.

    Returns:
        The table of best results. A row is only replaced when the new
        MFLOPS exceeds the stored one by more than 5%.
    """
    if best is None:
        # Copy so that adding the NB column never touches the caller's frame.
        best = data_nb.copy()
        best["NB"] = nb
        return best
    improved = data_nb["MFLOPS"] > 1.05 * best["MFLOPS"]
    best.loc[improved, "NB"] = nb
    best.loc[improved, "MFLOPS"] = data_nb.loc[improved, "MFLOPS"]
    return best


def format_rows(table):
    """Return the C++ initializer lines for the records in *table*.

    Every line is `` {P, Q, N, Q_COMP, NB}`` followed by ``,`` except for the
    last one. An empty table yields no lines.
    """
    if table.empty:
        # to_string() would return an "Empty DataFrame" banner, not rows.
        return []
    rows = table.to_string(
        header=False,
        index=False,
        justify="right",
        columns=TABLE_COLUMNS).split("\n")
    last = len(rows) - 1
    return [
        # Insert a comma after every number that is followed by whitespace.
        " {" + re.sub(r"([0-9])(\s+)", r"\1,\2", row) +
        ("},\n" if i < last else "}\n")
        for i, row in enumerate(rows)
    ]


def write_tuning_header(path, arch, data):
    """Write the tuning tables for *arch* to the C++ header at *path*.

    Two tables are written: ``drtc_t_<arch>`` from the rows with
    ``TRANS == 1`` and ``drtc_n_<arch>`` from the rows with ``TRANS == 0``.
    """
    with open(path, "w") as f:
        f.write(SEPARATOR)
        f.write(f"// auto-generated from data on {arch}\n\n")

        f.write(SEPARATOR)
        f.write(f"{TABLE_TYPE} drtc_t_{arch}" + " = {\n")
        f.writelines(format_rows(data.loc[data["TRANS"] == 1]))
        f.write("};\n\n")

        f.write(SEPARATOR)
        f.write(f"{TABLE_TYPE} drtc_n_{arch}" + " = {\n")
        f.writelines(format_rows(data.loc[data["TRANS"] == 0]))
        f.write("};\n")


def benchmark(nb, build_cmd, backend, log):
    """Rebuild with block size *nb*, run the tuning executable, parse its log.

    ``ceed-magma.h`` and ``ceed-magma-gemm-selector.cpp`` (one directory above
    this script) are patched for the build and restored afterwards, also when
    patching or building fails.

    Args:
        nb: Value given to ``CEED_AUTOTUNE_RTC_NB``.
        build_cmd: Build command run three directories above this script; a
            string is split into arguments with shell-like quoting rules.
        backend: Resource string passed to the tuning executable. When it
            contains ``"hip"`` the queue sync macro is mapped to
            ``hipDeviceSynchronize()``, otherwise ``cudaDeviceSynchronize()``.
        log: Path of the file receiving the executable's stdout and stderr.

    Returns:
        pandas.DataFrame with the columns P, Q, N, Q_COMP, TRANS, MFLOPS.

    Raises:
        subprocess.CalledProcessError: If one of the build steps fails.
    """
    ceed_magma_h = f"{script_dir}/../ceed-magma.h"
    ceed_magma_gemm_selector_cpp = f"{script_dir}/../ceed-magma-gemm-selector.cpp"
    sync_call = (
        "hipDeviceSynchronize()" if "hip" in backend
        else "cudaDeviceSynchronize()")
    patches = [
        (ceed_magma_h,
         r".*(#define ceed_magma_queue_sync\(\.\.\.\)).*",
         sync_call),
        (ceed_magma_gemm_selector_cpp,
         r".*(#define CEED_AUTOTUNE_RTC_NB).*",
         nb),
    ]
    if isinstance(build_cmd, str):
        # Without a shell a plain string is taken as the program name, so a
        # command with arguments ("make -j8") must be split first.
        build_args = shlex.split(build_cmd)
    else:
        build_args = list(build_cmd)

    # Build for new NB
    backed_up = []
    try:
        for path, pattern, value in patches:
            shutil.copyfile(path, path + ".backup")
            backed_up.append(path)
            with open(path, "r") as f:
                text = f.read()
            with open(path, "w") as f:
                f.write(substitute_define(text, pattern, value))
        # A failed build must stop here: otherwise a stale executable would
        # be benchmarked and its numbers recorded for this NB.
        subprocess.run(build_args, cwd=f"{script_dir}/../../..", check=True)
        subprocess.run(["make", "tuning", "OPT=-O0"],
                       cwd=f"{script_dir}", check=True)
    finally:
        # Always put the original sources back.
        for path in backed_up:
            shutil.move(path + ".backup", path)

    # Run the benchmark
    with open(log, "w") as f:
        subprocess.run(
            [f"{script_dir}/tuning", f"{backend}"], stdout=f, stderr=f)

    # sep=r"\s+" replaces delim_whitespace=True, which newer pandas rejects.
    return pd.read_csv(log, header=None, sep=r"\s+", names=LOG_COLUMNS)


def main():
    """Parse the command line, benchmark every candidate NB, write the header."""
    # Command line arguments
    parser = argparse.ArgumentParser("MAGMA RTC autotuning")
    parser.add_argument(
        "-arch",
        help="Device architecture name for tuning data",
        required=True)
    parser.add_argument(
        "-max-nb",
        help="Maximum block size NB to consider for autotuning",
        default=32,
        type=int)
    parser.add_argument(
        "-build-cmd",
        help="Command used to build libCEED from the source root directory",
        default="make")
    parser.add_argument(
        "-ceed",
        help="Ceed resource specifier",
        default="/cpu/self")
    args = parser.parse_args()
    if args.max_nb < 1:
        # The search starts at NB = 1; nothing would be benchmarked otherwise.
        parser.error("-max-nb must be at least 1")

    data = None
    nb = 1
    while nb <= args.max_nb:
        # Run the benchmarks
        start = time.perf_counter()
        data_nb = benchmark(nb, args.build_cmd, args.ceed,
                            f"{script_dir}/output-nb-{nb}.txt")
        print(
            f"Finished benchmarks for NB = {nb}, backend = {args.ceed} ({time.perf_counter() - start} s)")

        # Save the data for the highest performing NB
        data = update_best(data, data_nb, nb)

        # Speed up the search by considering only some values on NB
        nb = next_nb(nb)

    # Print the results
    write_tuning_header(f"{script_dir}/{args.arch}_rtc.h", args.arch, data)


if __name__ == "__main__":
    main()