Overhaul Benchmarking pipeline to use complete sample data, not summaries #61559

Merged 18 commits on Nov 9, 2022
Changes from all commits

128 changes: 66 additions & 62 deletions benchmark/scripts/Benchmark_Driver
@@ -88,9 +88,10 @@ class BenchmarkDriver(object):
def test_harness(self):
"""Full path to test harness binary."""
suffix = self.args.optimization if hasattr(self.args, "optimization") else "O"
suffix += "-"
if hasattr(self.args, "architecture") and self.args.architecture:
suffix += "-" + self.args.architecture + "*"
pattern = os.path.join(self.args.tests, "Benchmark_" + suffix)
suffix += self.args.architecture
pattern = os.path.join(self.args.tests, "Benchmark_" + suffix + "*")
executables = []
if hasattr(self._subprocess, "test_mode") and self._subprocess.test_mode:
executables = [pattern]
@@ -134,48 +135,52 @@ class BenchmarkDriver(object):

@property
def _cmd_list_benchmarks(self):
- # Use tab delimiter for easier parsing to override the default comma.
- # (The third 'column' is always comma-separated list of tags in square
- # brackets -- currently unused here.)
- return [self.test_harness, "--list", "--delim=\t"] + (
+ # TODO: Switch to JSON format: add "--json" here
+ return [self.test_harness, "--list"] + (
["--skip-tags="] if (self.args.benchmarks or self.args.filters) else []
)

def _get_tests(self):
"""Return a list of performance tests to run."""
- number_name_pairs = [
- line.split("\t")[:2]
- for line in self._invoke(self._cmd_list_benchmarks).split("\n")[1:-1]
- ]
- # unzip list of pairs into 2 lists
- test_numbers, self.all_tests = map(list, zip(*number_name_pairs))
- self.test_number = dict(zip(self.all_tests, test_numbers))
+ lines = self._invoke(self._cmd_list_benchmarks).split("\n")
+ json_tests = []
+ for line in lines:
+ columns = re.split(r'[ ,]+', line.strip())
+ try:
+ number = int(columns[0])
+ name = columns[1]
+ json_descr = {"number": number, "name": name}
+ json_tests.append(json_descr)
+ except Exception:
+ continue
+ # TODO: Replace the above with the following to
+ # use the JSON output from the benchmark driver
+ # directly
+ # if line.strip() != "":
+ #     json_tests.append(json.loads(line))
+ self.all_tests = [json["name"] for json in json_tests]
+ test_numbers = [json["number"] for json in json_tests]
+ self.test_number = dict([(json["name"], json["number"]) for json in json_tests])
if self.args.filters:
return self._tests_matching_patterns()
if self.args.benchmarks:
return self._tests_by_name_or_number(test_numbers)
return self.all_tests

def _tests_matching_patterns(self):
- regexes = [re.compile(pattern) for pattern in self.args.filters]
- return sorted(
- list(
- set(
- [
- name
- for pattern in regexes
- for name in self.all_tests
- if pattern.match(name)
- ]
- )
- )
- )
+ matches = set()
+ for fil in self.args.filters:
+ pattern = re.compile(fil)
+ new_matches = filter(pattern.match, self.all_tests)
+ matches = matches.union(new_matches)
+ return sorted(list(matches))

def _tests_by_name_or_number(self, test_numbers):
benchmarks = set(self.args.benchmarks)
- number_to_name = dict(zip(test_numbers, self.all_tests))
+ numbers = list(map(str, test_numbers))
+ number_to_name = dict(zip(numbers, self.all_tests))
tests_by_number = [
- number_to_name[i] for i in benchmarks.intersection(set(test_numbers))
+ number_to_name[i] for i in benchmarks.intersection(numbers)
]
return sorted(
list(benchmarks.intersection(set(self.all_tests)).union(tests_by_number))
@@ -188,20 +193,22 @@ class BenchmarkDriver(object):
num_iters=None,
sample_time=None,
verbose=None,
- measure_memory=False,
- quantile=None,
+ measure_memory=False
):
"""Execute benchmark and gather results."""
num_samples = num_samples or 0
num_iters = num_iters or 0 # automatically determine N to run for 1s
sample_time = sample_time or 0 # default is 1s

cmd = self._cmd_run(
- test, num_samples, num_iters, sample_time, verbose, measure_memory, quantile
+ test, num_samples, num_iters, sample_time, verbose, measure_memory
)
output = self._invoke(cmd)
results = self.parser.results_from_string(output)
- return list(results.items())[0][1] if test else results
+ if test:
+ return list(results.items())[0][1]
+ else:
+ return results

def _cmd_run(
self,
@@ -210,14 +217,13 @@
num_iters,
sample_time,
verbose,
- measure_memory,
- quantile,
+ measure_memory
):
cmd = [self.test_harness]
if test:
cmd.append(test)
else:
- cmd.extend([self.test_number.get(name, name) for name in self.tests])
+ cmd.extend([str(self.test_number.get(name, name)) for name in self.tests])
if num_samples > 0:
cmd.append("--num-samples={0}".format(num_samples))
if num_iters > 0:
@@ -228,9 +234,8 @@
cmd.append("--verbose")
if measure_memory:
cmd.append("--memory")
- if quantile:
- cmd.append("--quantile={0}".format(quantile))
- cmd.append("--delta")
+ # TODO: Uncomment this as soon as the new Benchmark Swift logic is available everywhere
+ # cmd.append("--json")
return cmd

def run_independent_samples(self, test):
@@ -246,12 +251,12 @@
return functools.reduce(
merge_results,
[
- self.run(test, measure_memory=True, num_iters=1, quantile=20)
+ self.run(test, measure_memory=True, num_iters=1)
for _ in range(self.args.independent_samples)
],
)

- def log_results(self, output, log_file=None):
+ def log_results(self, results, log_file=None):
"""Log output to `log_file`.

Creates `args.output_dir` if it doesn't exist yet.
@@ -262,7 +267,8 @@
os.makedirs(dir)
print("Logging results to: %s" % log_file)
with open(log_file, "w") as f:
- f.write(output)
+ for r in results:
+ print(r, file=f)

RESULT = "{:>3} {:<40} {:>7} {:>7} {:>6} {:>10} {:>6} {:>7} {:>10}"

@@ -284,25 +290,25 @@
def console_log(values):
print(format(values))

- def result_values(r):
+ def summary(r):
return list(
map(
str,
[
r.test_num,
r.name,
r.num_samples,
- r.min,
- r.samples.q1,
+ r.min_value,
+ r.q1,
r.median,
- r.samples.q3,
- r.max,
+ r.q3,
+ r.max_value,
r.max_rss,
],
)
)

- header = [
+ summary_header = [
"#",
"TEST",
"SAMPLES",
@@ -313,25 +319,23 @@ class BenchmarkDriver(object):
"MAX(μs)",
"MAX_RSS(B)",
]
- console_log(header)
- results = [header]
+ console_log(summary_header)
+ results = []
for test in self.tests:
- result = result_values(self.run_independent_samples(test))
- console_log(result)
+ result = self.run_independent_samples(test)
+ console_log(summary(result))
results.append(result)

print("\nTotal performance tests executed: {0}".format(len(self.tests)))
- return (
- None if csv_console else ("\n".join([",".join(r) for r in results]) + "\n")
- ) # csv_log
+ return results

@staticmethod
def run_benchmarks(args):
"""Run benchmarks and log results."""
driver = BenchmarkDriver(args)
- csv_log = driver.run_and_log(csv_console=(args.output_dir is None))
- if csv_log:
- driver.log_results(csv_log)
+ results = driver.run_and_log(csv_console=(args.output_dir is None))
+ if args.output_dir:
+ driver.log_results([r.json for r in results])
return 0
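
As a side note (not part of this diff): `run_and_log` now returns result objects and `run_benchmarks` logs their `json` serializations one per line instead of a CSV summary. A minimal sketch of reading such a log back, assuming each line is one self-contained JSON object; the field names below are assumptions for illustration, since the result type's `json` property is defined outside this excerpt:

    import json

    def load_results(log_path):
        """Read a JSON-lines benchmark log: one serialized result per line."""
        results = []
        with open(log_path) as f:
            for line in f:
                line = line.strip()
                if line:
                    results.append(json.loads(line))
        return results

    # Example usage; "name" and "samples" are assumed field names.
    for r in load_results("Benchmark_O-results.log"):
        print(r.get("name"), len(r.get("samples", [])))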


@@ -445,7 +449,6 @@ class BenchmarkDoctor(object):
Optional `driver` parameter for injecting dependency; used for testing.
"""
super(BenchmarkDoctor, self).__init__()
- self.driver = driver or BenchmarkDriver(args)
self.results = {}

if hasattr(args, "markdown") and args.markdown:
@@ -458,6 +461,7 @@
self.console_handler.setLevel(
logging.DEBUG if args.verbose else logging.INFO
)
+ self.driver = driver or BenchmarkDriver(args)
self.log.addHandler(self.console_handler)
self.log.debug("Checking tests: %s", ", ".join(self.driver.tests))
self.requirements = [
@@ -532,7 +536,7 @@ class BenchmarkDoctor(object):
correction = setup / i
i_series = BenchmarkDoctor._select(measurements, num_iters=i)
for result in i_series:
- runtimes.append(result.samples.min - correction)
+ runtimes.append(result.min_value - correction)
runtime = min(runtimes)

threshold = 1000
@@ -584,7 +588,7 @@ class BenchmarkDoctor(object):
ti1, ti2 = [
float(min(mins))
for mins in [
- [result.samples.min for result in i_series]
+ [result.min_value for result in i_series]
for i_series in [select(measurements, num_iters=i) for i in [1, 2]]
]
]
@@ -679,7 +683,7 @@ class BenchmarkDoctor(object):
r = self.driver.run(
benchmark, num_samples=3, num_iters=1, verbose=True
) # calibrate
- num_samples = self._adjusted_1s_samples(r.samples.min)
+ num_samples = self._adjusted_1s_samples(r.min_value)

def capped(s):
return min(s, 200)
@@ -689,7 +693,7 @@ class BenchmarkDoctor(object):
opts = opts if isinstance(opts, list) else [opts]
self.log.debug(
"Runtime {0} μs yields {1} adjusted samples per second.".format(
- r.samples.min, num_samples
+ r.min_value, num_samples
)
)
self.log.debug(
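
For reference (not part of this diff): the reworked `_get_tests` earlier in the diff splits each `--list` line with `re.split(r'[ ,]+', ...)` and keeps `{"number", "name"}` records until the `--json` listing mentioned in the TODO comments is available everywhere. A standalone sketch of that interim parsing; the sample listing is an assumption about the harness output shape, not taken from this diff:

    import re

    # Assumed shape of `Benchmark_O --list` output: a header row followed by
    # "<number> <name> [tags...]" lines.
    listing = """#,Test,[Tags]
    1 Ackermann [algorithm]
    2 AngryPhonebook [String, api]
    """

    tests = []
    for line in listing.split("\n"):
        columns = re.split(r"[ ,]+", line.strip())
        try:
            tests.append({"number": int(columns[0]), "name": columns[1]})
        except (ValueError, IndexError):
            # Header, blank, and unparsable lines are skipped, mirroring the
            # broad `except Exception: continue` in the driver.
            continue

    print(tests)
    # [{'number': 1, 'name': 'Ackermann'}, {'number': 2, 'name': 'AngryPhonebook'}]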