
Commit 664a3ae

change markdown output in benchmark PR comments
- add an option for limiting markdown content size
- calculate relative performance with different baselines
- calculate relative performance using only already saved data
- group results according to suite names and explicit groups
- add multiple data columns if multiple --compare specified
1 parent 94b2189 commit 664a3ae

6 files changed (+334, -142 lines)

.github/workflows/benchmarks-reusable.yml

Lines changed: 2 additions & 1 deletion

```diff
@@ -220,11 +220,12 @@ jobs:
           --compute-runtime ${{ inputs.compute_runtime_commit }}
           --build-igc
           ${{ inputs.upload_report && '--output-html' || '' }}
+          ${{ inputs.pr_no != 0 && '--output-markdown' || '' }}
           ${{ inputs.bench_script_params }}

       - name: Print benchmark results
         run: |
-          cat ${{ github.workspace }}/ur-repo/benchmark_results.md
+          cat ${{ github.workspace }}/ur-repo/benchmark_results.md || true

       - name: Add comment to PR
         uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
```

.github/workflows/benchmarks.yml

Lines changed: 1 addition & 1 deletion

```diff
@@ -24,7 +24,7 @@ on:
         type: number
         required: true
       bench_script_params:
-        description: Parameters passed to script executing benchmark
+        description: Parameters passed to the script executing benchmark (recommended `--compare baseline`)
         type: string
         required: false
         default: ''
```

scripts/benchmarks/README.md

Lines changed: 5 additions & 4 deletions

```diff
@@ -27,21 +27,22 @@ You can also include additional benchmark parameters, such as environment variab

 Once all the required information is entered, click the "Run workflow" button to initiate a new workflow run. This will execute the benchmarks and then post the results as a comment on the specified Pull Request.

-By default, all benchmark runs are compared against `baseline`, which is a well-established set of the latest data.
+It is recommended that all benchmark runs be compared against `baseline` by passing `--compare baseline` in the benchmark parameters. `baseline` is a well-established set of the latest data.

 You must be a member of the `oneapi-src` organization to access these features.

 ## Comparing results

 By default, the benchmark results are not stored. To store them, use the option `--save <name>`. This will make the results available for comparison during the next benchmark runs.

-To compare a benchmark run with a previously stored result, use the option `--compare <name>`. You can compare with more than one result.
-
-If no `--compare` option is specified, the benchmark run is compared against a previously stored `baseline`.
+You can compare benchmark results using the `--compare` option. The comparison is presented in a markdown output file (see below). To calculate the relative performance of new results against previously saved data, use `--compare <previously_saved_data>` (e.g., `--compare baseline`). To compare only stored data, without generating new results, use `--dry-run --compare <name1> --compare <name2> --relative-perf <name1>`, where `<name1>` is the baseline for the relative performance calculation and `--dry-run` prevents the script from running benchmarks. Listing more than two `--compare` options results in displaying only execution time, without statistical analysis.

 Baseline, as well as baseline-v2 (for the level-zero adapter v2), is updated automatically during a nightly job. The results
 are stored [here](https://oneapi-src.github.io/unified-runtime/benchmark_results.html).

+## Output formats
+You can display the results as an HTML file by using `--output-html` and as a markdown file by using `--output-markdown`. Due to character limits for posting PR comments, the final content of the markdown file might be reduced. To obtain the full markdown output, use `--output-markdown full`.
+
 ## Requirements

 ### Python
```
scripts/benchmarks/main.py

Lines changed: 12 additions & 7 deletions

```diff
@@ -189,25 +189,28 @@ def main(directory, additional_env_vars, save_name, compare_names, filter):
         benchmark.teardown()
     print("complete.")

-    this_name = "This PR"
-
-    chart_data = {this_name : results}
+    this_name = options.current_run_name
+    chart_data = {}
+
+    if not options.dry_run:
+        chart_data = {this_name : results}

     history = BenchmarkHistory(directory)
     # limit how many files we load.
     # should this be configurable?
     history.load(1000)

     # remove duplicates. this can happen if e.g., --compare baseline is specified manually.
-    compare_names = list(dict.fromkeys(compare_names))
+    compare_names = list(dict.fromkeys(compare_names)) if compare_names is not None else []

     for name in compare_names:
         compare_result = history.get_compare(name)
         if compare_result:
             chart_data[name] = compare_result.results

     if options.output_markdown:
-        markdown_content = generate_markdown(this_name, chart_data)
+        markdown_content = generate_markdown(this_name, chart_data, options.output_markdown)

         with open('benchmark_results.md', 'w') as file:
             file.write(markdown_content)
@@ -251,7 +254,7 @@ def validate_and_parse_env_args(env_args):
     parser.add_argument("--no-rebuild", help='Do not rebuild the benchmarks from scratch.', action="store_true")
     parser.add_argument("--env", type=str, help='Use env variable for a benchmark run.', action="append", default=[])
     parser.add_argument("--save", type=str, help='Save the results for comparison under a specified name.')
-    parser.add_argument("--compare", type=str, help='Compare results against previously saved data.', action="append", default=["baseline"])
+    parser.add_argument("--compare", type=str, help='Compare results against previously saved data.', action="append")
     parser.add_argument("--iterations", type=int, help='Number of times to run each benchmark to select a median value.', default=options.iterations)
     parser.add_argument("--stddev-threshold", type=float, help='If stddev pct is above this threshold, rerun all iterations', default=options.stddev_threshold)
     parser.add_argument("--timeout", type=int, help='Timeout for individual benchmarks in seconds.', default=options.timeout)
@@ -261,12 +264,13 @@ def validate_and_parse_env_args(env_args):
     parser.add_argument("--exit-on-failure", help='Exit on first failure.', action="store_true")
     parser.add_argument("--compare-type", type=str, choices=[e.value for e in Compare], help='Compare results against previously saved data.', default=Compare.LATEST.value)
     parser.add_argument("--compare-max", type=int, help='How many results to read for comparisions', default=options.compare_max)
+    parser.add_argument("--output-markdown", nargs='?', const=options.output_markdown, help='Specify whether markdown output should fit the content size limit for request validation')
     parser.add_argument("--output-html", help='Create HTML output', action="store_true", default=False)
-    parser.add_argument("--output-markdown", help='Create Markdown output', action="store_true", default=True)
     parser.add_argument("--dry-run", help='Do not run any actual benchmarks', action="store_true", default=False)
     parser.add_argument("--compute-runtime", nargs='?', const=options.compute_runtime_tag, help="Fetch and build compute runtime")
     parser.add_argument("--iterations-stddev", type=int, help="Max number of iterations of the loop calculating stddev after completed benchmark runs", default=options.iterations_stddev)
     parser.add_argument("--build-igc", help="Build IGC from source instead of using the OS-installed version", action="store_true", default=options.build_igc)
+    parser.add_argument("--relative-perf", type=str, help="The name of the results which should be used as a baseline for metrics calculation", default=options.current_run_name)

     args = parser.parse_args()
     additional_env_vars = validate_and_parse_env_args(args.env)
@@ -283,12 +287,13 @@ def validate_and_parse_env_args(env_args):
     options.exit_on_failure = args.exit_on_failure
     options.compare = Compare(args.compare_type)
     options.compare_max = args.compare_max
-    options.output_html = args.output_html
     options.output_markdown = args.output_markdown
+    options.output_html = args.output_html
     options.dry_run = args.dry_run
     options.umf = args.umf
     options.iterations_stddev = args.iterations_stddev
     options.build_igc = args.build_igc
+    options.current_run_name = args.relative_perf

     if args.build_igc and args.compute_runtime is None:
         parser.error("--build-igc requires --compute-runtime to be set")
```
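The main.py hunks above are easier to follow with toy data. This self-contained sketch (plain dicts stand in for the real `BenchmarkHistory` and result types) reproduces the new behavior: a dry run contributes no fresh results, `--compare` names are de-duplicated while preserving order, and a missing `--compare` is now tolerated:

```python
# Toy reproduction of the chart_data logic from the diff above.
def build_chart_data(results, compare_names, saved, dry_run, current_run_name):
    chart_data = {}
    if not dry_run:
        # only a real run contributes fresh results
        chart_data = {current_run_name: results}
    # de-duplicate while preserving order; tolerate no --compare at all
    compare_names = list(dict.fromkeys(compare_names)) if compare_names is not None else []
    for name in compare_names:
        if name in saved:
            chart_data[name] = saved[name]
    return chart_data

saved = {"baseline": [1.0, 2.0], "baseline-v2": [1.1, 2.1]}

# normal PR run compared against baseline (the duplicate name is dropped)
print(build_chart_data([0.9, 1.8], ["baseline", "baseline"], saved, False, "This PR"))
# dry run comparing only stored data, as with --dry-run --compare ... --relative-perf
print(build_chart_data([], ["baseline", "baseline-v2"], saved, True, "baseline"))
```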

scripts/benchmarks/options.py

Lines changed: 6 additions & 1 deletion

```diff
@@ -6,6 +6,10 @@ class Compare(Enum):
     AVERAGE = 'average'
     MEDIAN = 'median'

+class MarkdownSize(Enum):
+    SHORT = 'short'
+    FULL = 'full'
+
 @dataclass
 class Options:
     workdir: str = None
@@ -20,8 +24,8 @@ class Options:
     verbose: bool = False
     compare: Compare = Compare.LATEST
     compare_max: int = 10 # average/median over how many results
+    output_markdown: MarkdownSize = MarkdownSize.SHORT
     output_html: bool = False
-    output_markdown: bool = True
     dry_run: bool = False
     # these two should probably be merged into one setting
     stddev_threshold: float = 0.02
@@ -32,6 +36,7 @@ class Options:
     extra_env_vars: dict = field(default_factory=dict)
     compute_runtime_tag: str = '24.52.32224.10'
     build_igc: bool = False
+    current_run_name: str = "This PR"

 options = Options()
```
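A detail worth noting about the reworked `--output-markdown` flag: with `nargs='?'`, it has three states. Absent, it takes the parser's default (`None`); bare, it takes the `const`, which the diff sets from `options.output_markdown` (i.e. `MarkdownSize.SHORT`); given an explicit value such as `full`, that value arrives as a plain string. A standalone demonstration (normalizing the string to the enum is assumed to happen elsewhere in the script):

```python
import argparse
from enum import Enum

class MarkdownSize(Enum):
    SHORT = 'short'
    FULL = 'full'

parser = argparse.ArgumentParser()
# Mirrors the diff: a bare flag falls back to const; an explicit value is a raw string.
parser.add_argument("--output-markdown", nargs='?', const=MarkdownSize.SHORT)

print(parser.parse_args([]).output_markdown)                             # None (flag absent)
print(parser.parse_args(["--output-markdown"]).output_markdown)          # MarkdownSize.SHORT
print(parser.parse_args(["--output-markdown", "full"]).output_markdown)  # 'full' (assumed normalized to MarkdownSize.FULL later)
```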
