#!/usr/bin/env python3

# Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
# Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
# All Rights reserved. See files LICENSE and NOTICE for details.
#
# This file is part of CEED, a collection of benchmarks, miniapps, software
# libraries and APIs for efficient high-order finite element and spectral
# element discretizations for exascale applications. For more information and
# source code availability see http://github.com/ceed
#
# The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
# a collaborative effort of two U.S. Department of Energy organizations (Office
# of Science and the National Nuclear Security Administration) responsible for
# the planning and preparation of a capable exascale ecosystem, including
# software, applications, hardware, advanced system engineering and early
# testbed platforms, in support of the nation's exascale computing imperative.
import argparse
import io
import os
import re
import shlex
import subprocess
import time

import pandas as pd

script_dir = os.path.dirname(os.path.realpath(__file__))

# Names given to the whitespace-separated columns printed by the tuning binary.
BENCHMARK_COLUMNS = ['P', 'N', 'Q', 'Q_COMP', 'TRANS', 'MFLOPS']

# Columns emitted, in this order, for every record of the generated header.
RECORD_COLUMNS = ['P', 'N', 'Q', 'Q_COMP', 'NB']

BANNER = "////////////////////////////////////////////////////////////////////////////////\n"


def set_nb_define(source, nb):
    """Return `source` with every CEED_AUTOTUNE_RTC_NB define set to `nb`.

    Any line containing ``#define CEED_AUTOTUNE_RTC_NB`` is replaced as a
    whole by ``#define CEED_AUTOTUNE_RTC_NB <nb>``.

    Raises:
        ValueError: if `source` contains no such define; rebuilding would then
            benchmark the same block size over and over.
    """
    updated, count = re.subn(
        '.*(#define CEED_AUTOTUNE_RTC_NB).*',
        lambda match: f"{match.group(1)} {nb}",
        source)
    if count == 0:
        raise ValueError(
            "no '#define CEED_AUTOTUNE_RTC_NB' line found to update")
    return updated


def build(nb, build_cmd):
    """Patch the GEMM selector source for block size `nb` and rebuild.

    Args:
        nb: Block size written into the CEED_AUTOTUNE_RTC_NB define.
        build_cmd: Command run from the source root directory, either as a
            single string (split with shell-like rules, so arguments such as
            "make -j8" work) or as an argument sequence.

    Raises:
        ValueError: if the selector source has no define to patch.
        subprocess.CalledProcessError: if either build step fails, so that a
            stale binary is never benchmarked under the wrong block size.
    """
    selector = f"{script_dir}/../ceed-magma-gemm-selector.cpp"
    with open(selector, 'r') as f:
        data = f.read()
    data = set_nb_define(data, nb)
    with open(selector, 'w') as f:
        f.write(data)

    if isinstance(build_cmd, str):
        command = shlex.split(build_cmd)
    else:
        command = list(build_cmd)
    subprocess.run(command, cwd=f"{script_dir}/../../..", check=True)
    subprocess.run(["make", "tuning"], cwd=f"{script_dir}", check=True)


def parse_benchmark_output(text):
    """Parse whitespace-separated benchmark records into a DataFrame.

    The input has no header row; columns are named after BENCHMARK_COLUMNS.
    """
    return pd.read_csv(io.StringIO(text), header=None, sep=r'\s+',
                       names=BENCHMARK_COLUMNS)


def benchmark(backend):
    """Run the tuning executable for `backend` and return its parsed output.

    The executable is located in (and run from) the script directory, which
    is where ``make tuning`` is invoked, independent of the caller's working
    directory.

    Raises:
        subprocess.CalledProcessError: if the executable exits with an error.
    """
    result = subprocess.run(
        [os.path.join(script_dir, "tuning"), f"{backend}"],
        cwd=script_dir, capture_output=True, check=True)
    return parse_benchmark_output(result.stdout.decode('utf-8'))


def update_best(best, data_nb, nb):
    """Fold the results for block size `nb` into the running best table.

    Args:
        best: Table returned by a previous call, or None for the first run.
        data_nb: Benchmark results for block size `nb`.
        nb: Block size that produced `data_nb`.

    Returns:
        The best table: `data_nb`'s columns plus an 'NB' column holding, per
        row, the block size with the highest MFLOPS seen so far. `data_nb`
        itself is never modified.
    """
    if best is None:
        best = data_nb.copy()
        best['NB'] = nb
        return best
    faster = data_nb['MFLOPS'] > best['MFLOPS']
    best.loc[faster, 'NB'] = nb
    best.loc[faster, 'MFLOPS'] = data_nb.loc[faster, 'MFLOPS']
    return best


def format_table_rows(data, trans):
    """Return the brace-initializer lines for rows with TRANS == `trans`.

    Each line has the form ``    {P, N, Q, Q_COMP, NB}`` without a trailing
    comma or newline. An empty selection yields an empty list.
    """
    subset = data.loc[data['TRANS'] == trans]
    if subset.empty:
        # to_string() would otherwise return an "Empty DataFrame" message.
        return []
    lines = subset.to_string(
        header=False, index=False, columns=RECORD_COLUMNS).split('\n')
    # Only whitespace that follows a value separates two fields; leading
    # alignment padding must not receive a comma.
    return ["    {" + re.sub(r'(?<=\S)(\s+)', r',\1', line) + "}"
            for line in lines]


def render_header(data, arch):
    """Return the text of the generated C++ tuning header for `arch`.

    The header holds two tables: ``drtc_t_<arch>`` built from the rows with
    TRANS == 1, followed by ``drtc_n_<arch>`` from the rows with TRANS == 0.
    """
    parts = [BANNER, f"// auto-generated from data on {arch}\n\n"]
    tables = ((1, 't'), (0, 'n'))
    for position, (trans, tag) in enumerate(tables):
        parts.append(BANNER)
        parts.append(
            f"std::vector<std::array<int, RECORD_LENGTH_RTC> > drtc_{tag}_{arch}" +
            " = {\n")
        rows = format_table_rows(data, trans)
        if rows:
            parts.append(",\n".join(rows) + "\n")
        # A blank line separates the tables; the last one ends the file.
        parts.append("};\n\n" if position < len(tables) - 1 else "};\n")
    return "".join(parts)


def main():
    """Benchmark every block size up to -max-nb and write the best per case."""
    # Command line arguments
    parser = argparse.ArgumentParser("MAGMA RTC autotuning")
    parser.add_argument(
        "-arch",
        help="Device architecture name for tuning data",
        required=True)
    parser.add_argument(
        "-max-nb",
        help="Maximum block size NB to consider for autotuning",
        default=32,
        type=int)
    parser.add_argument(
        "-ceed",
        help="Ceed resource specifier",
        default="/cpu/self")
    parser.add_argument(
        "-build-cmd",
        help="Command used to build libCEED from the source root directory",
        default="make")
    args = parser.parse_args()
    if args.max_nb < 1:
        parser.error("-max-nb must be at least 1")

    data = None
    for nb in range(1, args.max_nb + 1):
        # Rebuild the code for the given value of NB
        build(nb, args.build_cmd)

        # Run the benchmarks
        start = time.perf_counter()
        data_nb = benchmark(args.ceed)
        print(
            f"Finished benchmarks for NB = {nb}, backend = {args.ceed} ({time.perf_counter() - start} s)")

        # Save the data for the highest performing NB
        data = update_best(data, data_nb, nb)

    # Print the results
    with open(f"{script_dir}/{args.arch}_rtc.h", 'w') as f:
        f.write(render_header(data, args.arch))


if __name__ == "__main__":
    main()