Commit 8fcf884

Merge pull request #21684 from palimondo/cells-interlinked
[benchmark] run_smoke_bench with BenchmarkDriver
2 parents 65abb49 + d854f0f commit 8fcf884

File tree: 3 files changed (+135, -122 lines)


benchmark/scripts/Benchmark_Driver

Lines changed: 32 additions & 20 deletions
@@ -55,6 +55,7 @@ class BenchmarkDriver(object):
         self.args = args
         self._subprocess = _subprocess or subprocess
         self.all_tests = []
+        self.test_number = {}
         self.tests = tests or self._get_tests()
         self.parser = parser or LogParser()
         self.results = {}
@@ -106,16 +107,17 @@ class BenchmarkDriver(object):
 
     def _get_tests(self):
         """Return a list of performance tests to run."""
-        index_name_pairs = [
+        number_name_pairs = [
            line.split('\t')[:2] for line in
            self._invoke(self._cmd_list_benchmarks).split('\n')[1:-1]
        ]
         # unzip list of pairs into 2 lists
-        indices, self.all_tests = map(list, zip(*index_name_pairs))
+        test_numbers, self.all_tests = map(list, zip(*number_name_pairs))
+        self.test_number = dict(zip(self.all_tests, test_numbers))
         if self.args.filters:
             return self._tests_matching_patterns()
         if self.args.benchmarks:
-            return self._tests_by_name_or_index(indices)
+            return self._tests_by_name_or_number(test_numbers)
         return self.all_tests
 
     def _tests_matching_patterns(self):
@@ -124,33 +126,44 @@ class BenchmarkDriver(object):
                 for name in self.all_tests
                 if pattern.match(name)])))
 
-    def _tests_by_name_or_index(self, indices):
+    def _tests_by_name_or_number(self, test_numbers):
         benchmarks = set(self.args.benchmarks)
-        index_to_name = dict(zip(indices, self.all_tests))
-        indexed_names = [index_to_name[i]
-                         for i in benchmarks.intersection(set(indices))]
-        return sorted(list(
-            benchmarks.intersection(set(self.all_tests)).union(indexed_names)))
-
-    def run(self, test, num_samples=None, num_iters=None,
-            verbose=None, measure_memory=False, quantile=None):
+        number_to_name = dict(zip(test_numbers, self.all_tests))
+        tests_by_number = [number_to_name[i]
+                           for i in benchmarks.intersection(set(test_numbers))]
+        return sorted(list(benchmarks
+                           .intersection(set(self.all_tests))
+                           .union(tests_by_number)))
+
+    def run(self, test=None, num_samples=None, num_iters=None,
+            sample_time=None, verbose=None, measure_memory=False,
+            quantile=None):
         """Execute benchmark and gather results."""
         num_samples = num_samples or 0
         num_iters = num_iters or 0  # automatically determine N to run for 1s
+        sample_time = sample_time or 0  # default is 1s
 
         cmd = self._cmd_run(
-            test, num_samples, num_iters, verbose, measure_memory, quantile)
+            test, num_samples, num_iters, sample_time,
+            verbose, measure_memory, quantile)
         output = self._invoke(cmd)
-        result = self.parser.results_from_string(output).items()[0][1]
-        return result
-
-    def _cmd_run(self, test, num_samples, num_iters, verbose, measure_memory,
-                 quantile):
-        cmd = [self.test_harness, test]
+        results = self.parser.results_from_string(output)
+        return results.items()[0][1] if test else results
+
+    def _cmd_run(self, test, num_samples, num_iters, sample_time,
+                 verbose, measure_memory, quantile):
+        cmd = [self.test_harness]
+        if test:
+            cmd.append(test)
+        else:
+            cmd.extend([self.test_number.get(name, name)
+                        for name in self.tests])
         if num_samples > 0:
             cmd.append('--num-samples={0}'.format(num_samples))
         if num_iters > 0:
             cmd.append('--num-iters={0}'.format(num_iters))
+        if sample_time > 0:
+            cmd.append('--sample-time={0}'.format(sample_time))
         if verbose:
             cmd.append('--verbose')
         if measure_memory:
@@ -198,7 +211,6 @@ class BenchmarkDriver(object):
         from this method. When `csv_console` is False, the console output
         format is justified columns.
         """
-
         format = (
             (lambda values: ','.join(values)) if csv_console else
             (lambda values: self.RESULT.format(*values)))  # justified columns
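
For orientation, here is a minimal usage sketch of the reworked run() API (not part of the diff). It assumes a driver built via the DriverArgs helper added to run_smoke_bench below; the build-dir path and the 'Ackermann' benchmark name are illustrative placeholders.

# Sketch only: single-test vs. batch runs with the new run() signature.
# '/path/to/build-dir' and 'Ackermann' are placeholders.
driver = BenchmarkDriver(DriverArgs('/path/to/build-dir', optimization='O'))

# Single test: one PerformanceTestResult is returned.
result = driver.run('Ackermann', num_samples=3, sample_time=0.0025)

# Batch mode (no test given): every test in driver.tests runs in a single
# Benchmark_O invocation, identified by test number where known, and a
# dictionary of {test name: PerformanceTestResult} is returned.
results = driver.run(num_samples=3, sample_time=0.0025)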

benchmark/scripts/run_smoke_bench

Lines changed: 74 additions & 100 deletions
@@ -26,15 +26,34 @@ from __future__ import print_function
 import argparse
 import glob
 import os
-import re
 import subprocess
 import sys
 
-from compare_perf_tests import LogParser, create_report
+from compare_perf_tests import LogParser, TestComparator, create_report
+
+from imp import load_source
+# import Benchmark_Driver  # doesn't work because it misses '.py' extension
+Benchmark_Driver = load_source(
+    'Benchmark_Driver', os.path.join(os.path.dirname(
+        os.path.abspath(__file__)), 'Benchmark_Driver'))
+# from Benchmark_Driver import BenchmarkDriver, BenchmarkDoctor, ...
+BenchmarkDriver = Benchmark_Driver.BenchmarkDriver
+BenchmarkDoctor = Benchmark_Driver.BenchmarkDoctor
+MarkdownReportHandler = Benchmark_Driver.MarkdownReportHandler
 
 VERBOSE = False
 
 
+class DriverArgs(object):
+    """Arguments for BenchmarkDriver."""
+    def __init__(self, tests, optimization='O'):
+        """Initialize with path to the build-dir and optimization level."""
+        self.benchmarks = None
+        self.filters = None
+        self.tests = os.path.join(tests, 'bin')
+        self.optimization = optimization
+
+
 def log(msg):
     print(msg)
     sys.stdout.flush()
@@ -129,89 +148,61 @@ def test_opt_levels(args):
     return 0
 
 
-def test_performance(opt_level, old_dir, new_dir, threshold, num_samples,
-                     output_file):
-    num_results_dont_differ = 0
-    iter = 1
-    to_test = None
-    prev_num_tests = None
+def measure(driver, tests, i):
+    """Log and measure samples of the tests with the given driver.
 
-    old_lines = ""
-    new_lines = ""
+    Collect increasing number of samples, depending on the iteration.
+    """
+    num_samples = min(i + 3, 10)
+    msg = '  Iteration {0} for {1}: num samples = {2}, '.format(
        i, driver.args.tests, num_samples)
+    msg += ('running all tests' if driver.all_tests == tests else
+            're-testing {0} tests'.format(len(tests)))
+    log(msg)
+    driver.tests = tests
+    return driver.run(num_samples=num_samples, sample_time=0.0025)
 
-    # #,TEST,SAMPLES,MIN(μs),MAX(μs),MEAN(μs),SD(μs),MEDIAN(μs),PEAK_MEMORY(B)
-    score_re = re.compile(r"(\d+),([\w.\-]+),\d+,(\d+)")
-
-    while to_test is None or len(to_test) > 0:
-        tested_benchmarks = set()
-
-        # (benchmark_name, benchmark_directory) -> (min_value, result_line)
-        values = {}
-
-        # Run the benchmarks and store the results in 'values'.
-        for bench_dir in (old_dir, new_dir):
-            log('  Iteration ' + str(iter) + ' for ' + bench_dir +
                ': num samples = ' + str(num_samples) +
                (', running all tests' if to_test is None
                 else ', re-testing ' + str(len(to_test)) + ' tests'))
-
-            result = get_results(bench_dir, opt_level, num_samples, to_test)
-            for line in result.splitlines():
-                m = score_re.match(line)
-                if m:
-                    testname = m.group(2)
-                    val = int(m.group(3))
-                    values[(testname, bench_dir)] = (val, line)
-                    tested_benchmarks.add(testname)
-
-        # Some local utility functions
-
-        def bench_in(bench, bench_dir):
-            return (bench, bench_dir) in values
-
-        def within_threshold(bench):
-            old_val = values[(bench, old_dir)][0]
-            new_val = values[(bench, new_dir)][0]
-            if not new_val:
-                return True
-            f = float(old_val) / float(new_val)
-            return f >= 1.0 - threshold and f <= 1.0 + threshold
-
-        def result_line(bench, bench_dir):
-            result_line = values[(bench, bench_dir)][1]
-            return result_line + '\n'
-
-        # Check which benchmarks are added/removed and which need to be re-run
-        to_test = []
-        for bench in sorted(tested_benchmarks):
-            if bench_in(bench, old_dir) and not bench_in(bench, new_dir):
-                old_lines += result_line(bench, old_dir)
-            elif bench_in(bench, new_dir) and not bench_in(bench, old_dir):
-                new_lines += result_line(bench, new_dir)
-            elif within_threshold(bench) or num_results_dont_differ >= 4:
-                old_lines += result_line(bench, old_dir)
-                new_lines += result_line(bench, new_dir)
-            else:
-                to_test.append(bench)
-                if VERBOSE:
-                    log('  test again ' + bench)
-
-        # Track how many times we could not reduce the number of benchmarks
-        if prev_num_tests == len(to_test):
-            num_results_dont_differ += 1
-        else:
-            num_results_dont_differ = 0
-        prev_num_tests = len(to_test)
 
-        # Increase the number of samples for benchmarks which re-run
-        if num_samples < 10:
-            num_samples += 1
+def merge(results, other_results):
+    """Merge the other PerformanceTestResults into the first dictionary."""
+    for test, result in other_results.items():
+        results[test].merge(result)
+    return results
 
-        iter += 1
+
+def test_performance(opt_level, old_dir, new_dir, threshold, num_samples,
+                     output_file):
+    """Detect performance changes in benchmarks.
+
+    Start fast with few samples per benchmark and gradually spend more time
+    gathering more precise measurements of the change candidates.
+    """
+
+    i, unchanged_length_count = 0, 0
+    old, new = [BenchmarkDriver(DriverArgs(dir, optimization=opt_level))
                for dir in [old_dir, new_dir]]
+    results = [measure(driver, driver.tests, i) for driver in [old, new]]
+    tests = TestComparator(results[0], results[1], threshold)
+    changed = tests.decreased + tests.increased
+
+    while len(changed) > 0 and unchanged_length_count < 5:
+        i += 1
+        if VERBOSE:
+            log('  test again: ' + str([test.name for test in changed]))
+        results = [merge(the_results,
                         measure(driver, [test.name for test in changed], i))
                   for the_results, driver in zip(results, [old, new])]
+        tests = TestComparator(results[0], results[1], threshold)
+        changed = tests.decreased + tests.increased
+
+        if len(old.tests) == len(changed):
+            unchanged_length_count += 1
+        else:
+            unchanged_length_count = 0
 
     log('')
-    return report_results("Performance: -" + opt_level,
                          old_lines, new_lines, threshold * 1.4, output_file)
+    return report_results("Performance: -" + opt_level, None, None,
                          threshold * 1.4, output_file, *results)
 
 
 def get_results(bench_dir, opt_level, num_samples, to_test):
@@ -274,9 +265,10 @@ def get_codesize(filename):
     return int(data_line.split('\t')[0])
 
 
-def report_results(title, old_lines, new_lines, threshold, output_file):
-    old_results = LogParser.results_from_string(old_lines)
-    new_results = LogParser.results_from_string(new_lines)
+def report_results(title, old_lines, new_lines, threshold, output_file,
                   old_results=None, new_results=None):
+    old_results = old_results or LogParser.results_from_string(old_lines)
+    new_results = new_results or LogParser.results_from_string(new_lines)
 
     print("------- " + title + " -------")
     print(create_report(old_results, new_results, threshold, 'git'))
@@ -332,25 +324,7 @@ performance team (@eeckstein).
     return text
 
 
-class DriverArgs(object):
-    def __init__(self, tests):
-        self.benchmarks = None
-        self.filters = None
-        self.tests = os.path.join(tests, 'bin')
-        self.optimization = 'O'
-
-
 def check_added(args, output_file=None):
-    from imp import load_source
-    # import Benchmark_Driver  # doesn't work because it misses '.py' extension
-    Benchmark_Driver = load_source(
-        'Benchmark_Driver', os.path.join(os.path.dirname(
-            os.path.abspath(__file__)), 'Benchmark_Driver'))
-    # from Benchmark_Driver import BenchmarkDriver, BenchmarkDoctor
-    BenchmarkDriver = Benchmark_Driver.BenchmarkDriver
-    BenchmarkDoctor = Benchmark_Driver.BenchmarkDoctor
-    MarkdownReportHandler = Benchmark_Driver.MarkdownReportHandler
-
     old = BenchmarkDriver(DriverArgs(args.oldbuilddir[0]))
     new = BenchmarkDriver(DriverArgs(args.newbuilddir[0]))
     added = set(new.tests).difference(set(old.tests))
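
Taken together, measure, merge, and test_performance form an adaptive loop: both builds are first measured with a few samples per test, TestComparator selects the change candidates, and only those candidates are re-measured with progressively more samples and merged into the accumulated results until nothing changes any more. A simplified sketch of that loop follows (not part of the commit; the guard against a non-shrinking changed set and the final report step are elided, the paths are placeholders, and the 0.05 threshold is illustrative).

# Simplified sketch of the adaptive re-measurement loop described above.
old = BenchmarkDriver(DriverArgs('/path/to/old-build', optimization='O'))
new = BenchmarkDriver(DriverArgs('/path/to/new-build', optimization='O'))

results = [measure(d, d.tests, 0) for d in (old, new)]  # few samples, all tests
comparison = TestComparator(results[0], results[1], 0.05)
changed = comparison.decreased + comparison.increased

i = 0
while changed:  # re-measure only the change candidates
    i += 1
    names = [t.name for t in changed]
    results = [merge(r, measure(d, names, i))  # more samples on each pass
               for r, d in zip(results, (old, new))]
    comparison = TestComparator(results[0], results[1], 0.05)
    changed = comparison.decreased + comparison.increased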

benchmark/scripts/test_Benchmark_Driver.py

Lines changed: 29 additions & 2 deletions
@@ -181,14 +181,16 @@ def test_test_harness(self):
     def test_gets_list_of_precommit_benchmarks(self):
         self.subprocess_mock.expect(
             '/benchmarks/Benchmark_O --list --delim=\t'.split(' '),
-            '#\tTest\t[Tags]\n1\tBenchmark1\t[t1, t2]\n1\tBenchmark2\t[t3]\n')
+            '#\tTest\t[Tags]\n1\tBenchmark1\t[t1, t2]\n2\tBenchmark2\t[t3]\n')
         driver = BenchmarkDriver(
             self.args, _subprocess=self.subprocess_mock)
         self.subprocess_mock.assert_called_all_expected()
         self.assertEquals(driver.tests,
                           ['Benchmark1', 'Benchmark2'])
         self.assertEquals(driver.all_tests,
                           ['Benchmark1', 'Benchmark2'])
+        self.assertEquals(driver.test_number['Benchmark1'], "1")
+        self.assertEquals(driver.test_number['Benchmark2'], "2")
 
     list_all_tests = (
         '/benchmarks/Benchmark_O --list --delim=\t --skip-tags='.split(' '),
@@ -281,14 +283,39 @@ def test_run_benchmark_with_specified_number_of_iterations(self):
         self.subprocess_mock.assert_called_with(
             ('/benchmarks/Benchmark_O', 'b', '--num-iters=1'))
 
+    def test_run_benchmark_for_specified_time(self):
+        self.driver.run('b', sample_time=0.5)
+        self.subprocess_mock.assert_called_with(
+            ('/benchmarks/Benchmark_O', 'b', '--sample-time=0.5'))
+
     def test_run_benchmark_in_verbose_mode(self):
         self.driver.run('b', verbose=True)
         self.subprocess_mock.assert_called_with(
             ('/benchmarks/Benchmark_O', 'b', '--verbose'))
 
+    def test_run_batch(self):
+        """Run all active tests in a single execution of the Benchmark_X.
+
+        Known test names are passed to the harness in a compressed form as
+        test numbers.
+        """
+        self.driver.tests = ['b1', 'bx']
+        self.driver.run()
+        self.subprocess_mock.assert_called_with(
+            ('/benchmarks/Benchmark_O', '1', 'bx'))
+
     def test_parse_results_from_running_benchmarks(self):
-        self.driver.run('b')
+        """Parse measurement results using LogParser.
+
+        Individual test run returns the first PerformanceTestResult directly.
+        Batch run returns the dictionary of PerformanceTestResults.
+        """
+        r = self.driver.run('b')
         self.assertTrue(self.parser_stub.results_from_string_called)
+        self.assertEquals(r.name, 'b1')  # non-matching name, just 1st result
+        r = self.driver.run()
+        self.assertTrue(isinstance(r, dict))
+        self.assertEquals(r['b1'].name, 'b1')
 
     def test_measure_memory(self):
         self.driver.run('b', measure_memory=True)
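
For reference (not from the commit), the name-to-number compression that test_run_batch exercises boils down to the test_number dictionary built from the harness's --list output: known names are replaced by their numbers on the command line, and unknown names pass through unchanged. A small illustration, assuming the listing from the first test above:

# --list paired numbers with names: 1 Benchmark1, 2 Benchmark2.
driver.test_number == {'Benchmark1': '1', 'Benchmark2': '2'}

# In batch mode the command line uses numbers for known tests and the raw
# name otherwise ('bx' stands for an unknown test, as in test_run_batch):
[driver.test_number.get(name, name) for name in ['Benchmark1', 'bx']]
# -> ['1', 'bx']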
