1#!/usr/bin/env python3 2 3# Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 4# Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 5# All Rights reserved. See files LICENSE and NOTICE for details. 6# 7# This file is part of CEED, a collection of benchmarks, miniapps, software 8# libraries and APIs for efficient high-order finite element and spectral 9# element discretizations for exascale applications. For more information and 10# source code availability see http://github.com/ceed 11# 12# The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 13# a collaborative effort of two U.S. Department of Energy organizations (Office 14# of Science and the National Nuclear Security Administration) responsible for 15# the planning and preparation of a capable exascale ecosystem, including 16# software, applications, hardware, advanced system engineering and early 17# testbed platforms, in support of the nation's exascale computing imperative. 
import argparse
import os
import glob
import re
import shlex
import shutil
import subprocess
import pandas as pd
import time

# Directory containing this script; all paths below are relative to it.
script_dir = os.path.dirname(os.path.realpath(__file__))

# Banner line delimiting sections of the generated header.
BANNER = "////////////////////////////////////////////////////////////////////////////////\n"


def patch_file(path, pattern, replacement):
    """Apply the regex substitution pattern -> replacement in place to path."""
    with open(path, "r") as f:
        data = f.read()
    with open(path, "w") as f:
        f.write(re.sub(pattern, replacement, data))


def benchmark(nb, build_cmd, backend, log):
    """Build libCEED with RTC block size nb and run the tuning benchmark.

    Args:
        nb: candidate block size compiled into the MAGMA backend sources.
        build_cmd: command used to build libCEED from the source root
            directory; a string is split with shlex so flags (e.g. "-j8")
            are passed through correctly.
        backend: Ceed resource specifier, e.g. "/gpu/cuda/magma".
        log: path of the file capturing benchmark stdout/stderr, parsed
            afterwards as whitespace-delimited CSV.

    Returns:
        pandas.DataFrame with columns P, Q, N, Q_COMP, TRANS, MFLOPS.

    Raises:
        subprocess.CalledProcessError: if the build or the benchmark fails.
    """
    ceed_magma_h = f"{script_dir}/../ceed-magma.h"
    ceed_magma_gemm_selector_cpp = f"{script_dir}/../ceed-magma-gemm-selector.cpp"

    # Back up the sources we are about to edit so the tree can be restored
    # even if the build fails part way through.
    shutil.copyfile(ceed_magma_h, ceed_magma_h + ".backup")
    shutil.copyfile(ceed_magma_gemm_selector_cpp,
                    ceed_magma_gemm_selector_cpp + ".backup")
    try:
        # Force a device-wide synchronization on queue sync so the kernel
        # timings measured by the benchmark are accurate.
        sync = ("hipDeviceSynchronize()" if "hip" in backend
                else "cudaDeviceSynchronize()")
        patch_file(ceed_magma_h,
                   r".*(#define ceed_magma_queue_sync\(\.\.\.\)).*",
                   r"\1 " + sync)
        patch_file(ceed_magma_gemm_selector_cpp,
                   ".*(#define CEED_AUTOTUNE_RTC_NB).*",
                   r"\1 " + f"{nb}")

        # Rebuild the library and the tuning driver; check=True ensures we
        # never silently benchmark a stale binary after a failed build.
        cmd = shlex.split(build_cmd) if isinstance(build_cmd, str) else build_cmd
        subprocess.run(cmd, cwd=f"{script_dir}/../../..", check=True)
        subprocess.run(["make", "tuning", "OPT=-O0"], cwd=script_dir, check=True)
    finally:
        # Restore the pristine sources regardless of build success.
        shutil.move(ceed_magma_h + ".backup", ceed_magma_h)
        shutil.move(ceed_magma_gemm_selector_cpp + ".backup",
                    ceed_magma_gemm_selector_cpp)

    # Run the benchmark, capturing stdout and stderr to the log file.
    with open(log, "w") as f:
        subprocess.run([f"{script_dir}/tuning", f"{backend}"],
                       stdout=f, stderr=f, check=True)
    # sep=r"\s+" is the non-deprecated equivalent of delim_whitespace=True.
    return pd.read_csv(log, header=None, sep=r"\s+",
                       names=["P", "Q", "N", "Q_COMP", "TRANS", "MFLOPS"])


def write_rtc_table(f, data, trans, name):
    """Write the records with TRANS == trans as a C++ vector initializer.

    Args:
        f: writable text file object for the generated header.
        data: DataFrame holding the best NB per (P, Q, N, Q_COMP, TRANS).
        trans: 1 for the transposed table, 0 for the non-transposed one.
        name: C++ identifier of the emitted std::vector.
    """
    rows = data.loc[data["TRANS"] == trans].to_string(
        header=False, index=False, justify="right",
        columns=["P", "Q", "N", "Q_COMP", "NB"]).split("\n")
    f.write(BANNER)
    f.write(f"std::vector<std::array<int, RECORD_LENGTH_RTC> > {name}" +
            " = {\n")
    for i, row in enumerate(rows):
        # Insert commas between the whitespace-separated numbers; the last
        # initializer-list entry has no trailing comma.
        f.write(" {" + re.sub(r"([0-9])(\s+)", r"\1,\2", row) +
                ("},\n" if i < len(rows) - 1 else "}\n"))
    f.write("};\n")


if __name__ == "__main__":
    # Command line arguments
    parser = argparse.ArgumentParser("MAGMA RTC autotuning")
    parser.add_argument(
        "-arch",
        help="Device architecture name for tuning data",
        required=True)
    parser.add_argument(
        "-max-nb",
        help="Maximum block size NB to consider for autotuning",
        default=32,
        type=int)
    parser.add_argument(
        "-build-cmd",
        help="Command used to build libCEED from the source root directory",
        default="make")
    parser.add_argument(
        "-ceed",
        help="Ceed resource specifier",
        default="/cpu/self")
    args = parser.parse_args()

    data = None
    nb = 1
    while nb <= args.max_nb:
        # Run the benchmarks for this candidate NB
        start = time.perf_counter()
        data_nb = benchmark(nb, args.build_cmd, args.ceed,
                            f"{script_dir}/output-nb-{nb}.txt")
        print(
            f"Finished benchmarks for NB = {nb}, backend = {args.ceed} ({time.perf_counter() - start} s)")

        # Save the data for the highest performing NB: a new NB replaces
        # the incumbent only where it is more than 5% faster (hysteresis
        # avoids churn from measurement noise).
        if data is None:
            data = pd.DataFrame(data_nb)
            data["NB"] = nb
        else:
            idx = data_nb["MFLOPS"] > 1.05 * data["MFLOPS"]
            data.loc[idx, "NB"] = nb
            data.loc[idx, "MFLOPS"] = data_nb.loc[idx, "MFLOPS"]

        # Speed up the search by considering only some values of NB:
        # 1, 2, 4, 6, 8, 12, 16, 20, 24, 28, 32, ...
        if nb < 2:
            nb *= 2
        elif nb < 8:
            nb += 2
        else:
            nb += 4

    # Print the results: one C++ table per TRANS value.
    with open(f"{script_dir}/{args.arch}_rtc.h", "w") as f:
        f.write(BANNER)
        f.write(f"// auto-generated from data on {args.arch}\n\n")
        write_rtc_table(f, data, 1, f"drtc_t_{args.arch}")
        f.write("\n")
        write_rtc_table(f, data, 0, f"drtc_n_{args.arch}")