Support Python 3 in the benchmark suite #30085

Merged: 1 commit, merged on Apr 3, 2020

5 changes: 4 additions & 1 deletion benchmark/scripts/Benchmark_DTrace.in
@@ -103,6 +103,7 @@ class DTraceBenchmarkDriver(perf_test_driver.BenchmarkDriver):
stdout=subprocess.PIPE,
stderr=open("/dev/null", "w"),
env=e,
universal_newlines=True,
)
results = [x for x in p.communicate()[0].split("\n") if len(x) > 0]
return [
@@ -136,7 +137,9 @@ class DTraceBenchmarkDriver(perf_test_driver.BenchmarkDriver):
results.append(result_3)
results.append(single_iter)

return DTraceResult(test_name, int(not foundInstability), results)
return DTraceResult(
test_name, int(not foundInstability), results, self.csv_output
)
Comment on lines +140 to +142 (Contributor, PR author):

DTraceResult's initializer takes exactly 4 arguments. I don't know why it worked before.
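
As a quick illustration of the mismatch described above (using a hypothetical stand-in class, since the real DTraceResult is defined elsewhere in Benchmark_DTrace.in and may differ in detail): calling an initializer that declares four parameters with only three arguments fails with a TypeError.

# Hypothetical stand-in for DTraceResult, for illustration only.
class FakeDTraceResult(object):
    def __init__(self, name, status, results, csv_output):
        self.name = name
        self.status = status
        self.results = results
        self.csv_output = csv_output

try:
    FakeDTraceResult("test", 1, [])  # three arguments where four are expected
except TypeError as error:
    # Python 3 reports: __init__() missing 1 required positional argument: 'csv_output'
    print(error)

FakeDTraceResult("test", 1, [], csv_output=False)  # the fixed call site passes all four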



SWIFT_BIN_DIR = os.path.dirname(os.path.abspath(__file__))
67 changes: 39 additions & 28 deletions benchmark/scripts/Benchmark_Driver
@@ -26,6 +26,7 @@ class `BenchmarkDoctor` analyzes performance tests, implements `check` COMMAND.
"""

import argparse
import functools
import glob
import logging
import math
@@ -64,7 +65,9 @@ class BenchmarkDriver(object):
os.environ["SWIFT_DETERMINISTIC_HASHING"] = "1"

def _invoke(self, cmd):
return self._subprocess.check_output(cmd, stderr=self._subprocess.STDOUT)
return self._subprocess.check_output(
cmd, stderr=self._subprocess.STDOUT, universal_newlines=True
)

@property
def test_harness(self):
@@ -165,7 +168,7 @@ class BenchmarkDriver(object):
)
output = self._invoke(cmd)
results = self.parser.results_from_string(output)
return results.items()[0][1] if test else results
return list(results.items())[0][1] if test else results

def _cmd_run(
self,
@@ -207,7 +210,7 @@ class BenchmarkDriver(object):
a.merge(b)
return a

return reduce(
return functools.reduce(
merge_results,
[
self.run(test, measure_memory=True, num_iters=1, quantile=20)
@@ -249,19 +252,21 @@ class BenchmarkDriver(object):
print(format(values))

def result_values(r):
return map(
str,
[
r.test_num,
r.name,
r.num_samples,
r.min,
r.samples.q1,
r.median,
r.samples.q3,
r.max,
r.max_rss,
],
return list(
map(
str,
[
r.test_num,
r.name,
r.num_samples,
r.min,
r.samples.q1,
r.median,
r.samples.q3,
r.max,
r.max_rss,
],
)
)

header = [
@@ -370,7 +375,12 @@ class MarkdownReportHandler(logging.StreamHandler):
msg = self.format(record)
stream = self.stream
try:
if isinstance(msg, unicode) and getattr(stream, "encoding", None):
# In Python 2 Unicode strings have a special type
unicode_type = unicode
except NameError:
unicode_type = str
try:
if isinstance(msg, unicode_type) and getattr(stream, "encoding", None):
stream.write(msg.encode(stream.encoding))
else:
stream.write(msg)
@@ -487,16 +497,14 @@ class BenchmarkDoctor(object):
name = measurements["name"]
setup, ratio = BenchmarkDoctor._setup_overhead(measurements)
setup = 0 if ratio < 0.05 else setup
runtime = min(
[
(result.samples.min - correction)
for i_series in [
BenchmarkDoctor._select(measurements, num_iters=i)
for correction in [(setup / i) for i in [1, 2]]
]
for result in i_series
]
)

runtimes = []
for i in range(1, 3):
correction = setup / i
i_series = BenchmarkDoctor._select(measurements, num_iters=i)
for result in i_series:
runtimes.append(result.samples.min - correction)
runtime = min(runtimes)
Comment on lines -490 to +507 (@broadwaylamb, Contributor, PR author), Feb 27, 2020:

I'd like whoever reviews this to pay attention to this particular change.

The old version relied on a Python 2-only feature where the variables defined in a list comprehension leak into the enclosing function scope. But I believe this was erroneous: since in Python 2 list comprehensions are eagerly evaluated, the value of i was always 2 when the BenchmarkDoctor._select(measurements, num_iters=i) expression was evaluated. Similarly, when evaluating (result.samples.min - correction), the value of correction was always the same, namely the last value it took while evaluating this expression:

[
    BenchmarkDoctor._select(measurements, num_iters=i)
    for correction in [(setup / i) for i in [1, 2]]
]

Here, I've rewritten it as a plain old for loop, which IMHO reads way better and also seems to behave correctly.
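
For reference, here is a minimal standalone sketch (not taken from the benchmark suite) of the scoping difference being discussed; whether the old nested comprehension actually produced wrong values is debated in the follow-up comments, but the leak itself is easy to demonstrate:

squares = [n * n for n in [1, 2, 3]]

try:
    print(n)          # Python 2: prints 3, because the comprehension variable leaks into this scope
except NameError:
    print("no leak")  # Python 3: comprehensions get their own scope, so n is undefined here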

Comment (Contributor):

Hmmm… I’ll need to check this.

Comment (Contributor):

I’ve corrected the Python3 incompatibility by explicitly propagating the variables from the nested scopes. I don’t see any issue of incorrect evaluation. Here’s standalone simplified test case base on the original source code before the automatic formatter made mess of it in #29719:

$ python
Python 2.7.10 (default, Feb 22 2019, 21:55:15) 
[GCC 4.2.1 Compatible Apple LLVM 10.0.1 (clang-1001.0.37.14)] on darwin
Type "help", "copyright", "credits" or "license" for more information.
>>> setup = 134
>>> measurements = {1: [[324], [326], [325]], 2: [[257], [259], [258]]}
>>> 
>>> 
>>> def select(measurements, num_iters):
...     return measurements[num_iters]
... 
>>> 
>>> runtimesP3 = [
...     (result[0] - correction) for correction, i_series in
...     [(correction, select(measurements, num_iters=i))
...      for i, correction in [(i, setup // i) for i in [1, 2]]
...      ] for result in i_series]
>>> 
>>> runtimes = [
...     (result[0] - correction) for i_series in
...     [select(measurements, num_iters=i)
...      for correction in [(setup / i) for i in [1, 2]]
...      ] for result in i_series]
>>> 
>>> runtimes == runtimesP3
True
>>> runtimesP3
[190, 192, 191, 190, 192, 191]

It works correctly in Python 3, too:

$ python3
Python 3.6.5 (default, May 24 2018, 10:57:11) 
[GCC 4.2.1 Compatible Apple LLVM 9.1.0 (clang-902.0.39.1)] on darwin
Type "help", "copyright", "credits" or "license" for more information.
>>> setup = 134
>>> measurements = {1: [[324], [326], [325]], 2: [[257], [259], [258]]}
>>> 
>>> 
>>> def select(measurements, num_iters):
...     return measurements[num_iters]
... 
>>> 
>>> runtimesP3 = [
...     (result[0] - correction) for correction, i_series in
...     [(correction, select(measurements, num_iters=i))
...      for i, correction in [(i, setup // i) for i in [1, 2]]
...      ] for result in i_series]
>>> 
>>> runtimesP3
[190, 192, 191, 190, 192, 191]

I’ll try to incorporate the Python 3 changes in #26462 (it already had fix for the assertEqualtats typo...


threshold = 1000
if threshold < runtime:
@@ -572,7 +580,9 @@ class BenchmarkDoctor(object):

@staticmethod
def _reasonable_setup_time(measurements):
setup = min([result.setup for result in BenchmarkDoctor._select(measurements)])
setup = min(
[result.setup or 0 for result in BenchmarkDoctor._select(measurements)]
)
Comment on lines -575 to +585 (Contributor, PR author):

None compares less than anything in Python 2. In Python 3, None is not comparable at all, so I needed to provide a reasonable default value. 0 seems okay, since this function only has an effect if setup is greater than 200000.
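
A small sketch (illustrative only, not part of the patch) of the comparison difference described above, together with the `or 0` fallback used in the hunk:

setups = [None, 250000, 180000]

# Python 2: None sorts below every number, so min(setups) silently returns None.
# Python 3: comparing None with an int raises TypeError instead.
try:
    smallest = min(setups)
except TypeError:
    smallest = None

# The fallback substitutes 0 for a missing setup before taking the minimum:
smallest = min(value or 0 for value in setups)
print(smallest)  # 0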

if 200000 < setup: # 200 ms
BenchmarkDoctor.log_runtime.error(
"'%s' setup took at least %d μs.", measurements["name"], setup
@@ -857,6 +867,7 @@ def parse_args(args):
help="See COMMAND -h for additional arguments",
metavar="COMMAND",
)
subparsers.required = True

shared_benchmarks_parser = argparse.ArgumentParser(add_help=False)
benchmarks_group = shared_benchmarks_parser.add_mutually_exclusive_group()
3 changes: 2 additions & 1 deletion benchmark/scripts/Benchmark_QuickCheck.in
@@ -63,6 +63,7 @@ class QuickCheckBenchmarkDriver(perf_test_driver.BenchmarkDriver):
],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
universal_newlines=True,
)
error_out = p.communicate()[1].split("\n")
result = p.returncode
@@ -76,7 +77,7 @@ class QuickCheckBenchmarkDriver(perf_test_driver.BenchmarkDriver):
try:
args = [data, num_iters]
perf_test_driver.run_with_timeout(self.run_test_inner, args)
except Exception, e:
except Exception as e:
sys.stderr.write(
"Child Process Failed! (%s,%s). Error: %s\n"
% (data["path"], data["test_name"], e)
3 changes: 2 additions & 1 deletion benchmark/scripts/Benchmark_RuntimeLeaksRunner.in
@@ -89,6 +89,7 @@ class LeaksRunnerBenchmarkDriver(perf_test_driver.BenchmarkDriver):
],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
universal_newlines=True,
)
error_out = p.communicate()[1].split("\n")
result = p.returncode
@@ -102,7 +103,7 @@ class LeaksRunnerBenchmarkDriver(perf_test_driver.BenchmarkDriver):
try:
args = [data, num_iters]
result = perf_test_driver.run_with_timeout(self.run_test_inner, args)
except Exception, e:
except Exception as e:
sys.stderr.write(
"Child Process Failed! (%s,%s). Error: %s\n"
% (data["path"], data["test_name"], e)
23 changes: 15 additions & 8 deletions benchmark/scripts/compare_perf_tests.py
@@ -30,6 +30,7 @@ class `ReportFormatter` creates the test comparison report in specified format.
from __future__ import print_function

import argparse
import functools
import re
import sys
from bisect import bisect, bisect_left, bisect_right
@@ -142,7 +143,7 @@ def num_samples(self):
@property
def all_samples(self):
"""List of all samples in ascending order."""
return sorted(self.samples + self.outliers, key=lambda s: s.i)
return sorted(self.samples + self.outliers, key=lambda s: s.i or -1)

@property
def min(self):
@@ -189,13 +190,16 @@ def sd(self):
return 0 if self.count < 2 else sqrt(self.S_runtime / (self.count - 1))

@staticmethod
def running_mean_variance((k, M_, S_), x):
def running_mean_variance(stats, x):
"""Compute running variance, B. P. Welford's method.

See Knuth TAOCP vol 2, 3rd edition, page 232, or
https://www.johndcook.com/blog/standard_deviation/
M is mean, Standard Deviation is defined as sqrt(S/k-1)
"""

(k, M_, S_) = stats

k = float(k + 1)
M = M_ + (x - M_) / k
S = S_ + (x - M_) * (x - M)
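
For reference, a tiny self-contained version of the recurrence described in the docstring above (illustrative only, independent of the PerformanceTestSamples class); the streaming result matches a direct computation of the mean and sample standard deviation:

from math import sqrt

def welford_update(stats, x):
    # One Welford step: stats is (k, M, S); the mean is M and the
    # sample standard deviation is sqrt(S / (k - 1)).
    k, m, s = stats
    k = k + 1
    new_m = m + (x - m) / float(k)
    new_s = s + (x - m) * (x - new_m)
    return (k, new_m, new_s)

samples = [318.0, 322.0, 319.0, 325.0]
k, m, s = 0, 0.0, 0.0
for x in samples:
    k, m, s = welford_update((k, m, s), x)

print(m)                  # 321.0, equal to sum(samples) / len(samples)
print(sqrt(s / (k - 1)))  # ~3.162, the sample standard deviation
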
@@ -247,7 +251,7 @@ def __init__(self, csv_row, quantiles=False, memory=False, delta=False, meta=Fal
runtimes = csv_row[3:mem_index] if memory or meta else csv_row[3:]
if delta:
runtimes = [int(x) if x else 0 for x in runtimes]
runtimes = reduce(
runtimes = functools.reduce(
lambda l, x: l.append(l[-1] + x) or l if l else [x], # runnin
runtimes,
None,
@@ -315,7 +319,8 @@ def merge(self, r):
"""
# Statistics
if self.samples and r.samples:
map(self.samples.add, r.samples.samples)
for sample in r.samples.samples:
self.samples.add(sample)
sams = self.samples
self.num_samples = sams.num_samples
self.min, self.max, self.median, self.mean, self.sd = (
@@ -490,7 +495,7 @@ def add_or_merge(names, r):
names[r.name].merge(r)
return names

return reduce(add_or_merge, tests, dict())
return functools.reduce(add_or_merge, tests, dict())

@staticmethod
def results_from_string(log_contents):
@@ -544,10 +549,12 @@ def __init__(self, old_results, new_results, delta_threshold):
def compare(name):
return ResultComparison(old_results[name], new_results[name])

comparisons = map(compare, comparable_tests)
comparisons = list(map(compare, comparable_tests))

def partition(l, p):
return reduce(lambda x, y: x[not p(y)].append(y) or x, l, ([], []))
return functools.reduce(
lambda x, y: x[not p(y)].append(y) or x, l, ([], [])
)

decreased, not_decreased = partition(
comparisons, lambda c: c.ratio < (1 - delta_threshold)
@@ -668,7 +675,7 @@ def _column_widths(self):
def max_widths(maximum, widths):
return map(max, zip(maximum, widths))

return reduce(max_widths, widths, [0] * 5)
return list(functools.reduce(max_widths, widths, [0] * 5))

def _formatted_text(
self, label_formatter, COLUMN_SEPARATOR, DELIMITER_ROW, SEPARATOR, SECTION
3 changes: 2 additions & 1 deletion benchmark/scripts/perf_test_driver/perf_test_driver.py
@@ -111,7 +111,8 @@ def process_input(self, data):
def run_for_opt_level(self, binary, opt_level, test_filter):
print("testing driver at path: %s" % binary)
names = []
for l in subprocess.check_output([binary, "--list"]).split("\n")[1:]:
output = subprocess.check_output([binary, "--list"], universal_newlines=True)
for l in output.split("\n")[1:]:
m = BENCHMARK_OUTPUT_RE.match(l)
if m is None:
continue
57 changes: 48 additions & 9 deletions benchmark/scripts/test_Benchmark_Driver.py
@@ -15,15 +15,33 @@

import logging
import os
import sys
import time
import unittest
from StringIO import StringIO
from imp import load_source

try:
# for Python 2
from StringIO import StringIO
except ImportError:
# for Python 3
from io import StringIO

from compare_perf_tests import PerformanceTestResult

from test_utils import Mock, MockLoggingHandler, Stub, captured_output


# imp.load_source is deprecated in Python 3.4
if sys.version_info < (3, 4):
from imp import load_source
else:

def load_source(name, path):
from importlib.machinery import SourceFileLoader

return SourceFileLoader(name, path).load_module()


# import Benchmark_Driver # doesn't work because it misses '.py' extension
Benchmark_Driver = load_source(
"Benchmark_Driver",
@@ -46,7 +64,17 @@ def assert_contains(self, texts, output):
def test_requires_command_argument(self):
with captured_output() as (_, err):
self.assertRaises(SystemExit, parse_args, [])
self.assert_contains(["usage:", "COMMAND", "too few arguments"], err.getvalue())

if sys.version_info < (3, 3):
self.assert_contains(
["usage:", "COMMAND", "too few arguments"], err.getvalue()
)
else:
# The error message has changed in Python 3.3
self.assert_contains(
["usage:", "COMMAND", "the following arguments are required"],
err.getvalue(),
)

def test_command_help_lists_commands(self):
with captured_output() as (out, _):
@@ -151,7 +179,14 @@ class SubprocessMock(Mock):
def __init__(self, responses=None):
super(SubprocessMock, self).__init__(responses)

def _check_output(args, stdin=None, stdout=None, stderr=None, shell=False):
def _check_output(
args,
stdin=None,
stdout=None,
stderr=None,
shell=False,
universal_newlines=False,
):
return self.record_and_respond(args, stdin, stdout, stderr, shell)

self.check_output = _check_output
@@ -190,8 +225,8 @@ def test_gets_list_of_precommit_benchmarks(self):
self.subprocess_mock.assert_called_all_expected()
self.assertEqual(driver.tests, ["Benchmark1", "Benchmark2"])
self.assertEqual(driver.all_tests, ["Benchmark1", "Benchmark2"])
self.assertEquals(driver.test_number["Benchmark1"], "1")
self.assertEquals(driver.test_number["Benchmark2"], "2")
self.assertEqual(driver.test_number["Benchmark1"], "1")
self.assertEqual(driver.test_number["Benchmark2"], "2")

list_all_tests = (
"/benchmarks/Benchmark_O --list --delim=\t --skip-tags=".split(" "),
@@ -330,10 +365,10 @@ def test_parse_results_from_running_benchmarks(self):
"""
r = self.driver.run("b")
self.assertTrue(self.parser_stub.results_from_string_called)
self.assertEquals(r.name, "b1") # non-matching name, just 1st result
self.assertEqual(r.name, "b1") # non-matching name, just 1st result
r = self.driver.run()
self.assertTrue(isinstance(r, dict))
self.assertEquals(r["b1"].name, "b1")
self.assertEqual(r["b1"].name, "b1")

def test_measure_memory(self):
self.driver.run("b", measure_memory=True)
@@ -412,7 +447,11 @@ def test_log_results(self):

def assert_log_written(out, log_file, content):
self.assertEqual(out.getvalue(), "Logging results to: " + log_file + "\n")
with open(log_file, "rU") as f:
if sys.version_info < (3, 0):
openmode = "rU"
else:
openmode = "r" # 'U' mode is deprecated in Python 3
with open(log_file, openmode) as f:
text = f.read()
self.assertEqual(text, "formatted output")
