
Commit 0d080f8

Merge pull request #2693 from EuphoricThinking/benchmark_markdown
change markdown output in benchmark PR comments
2 parents 601f60c + 664a3ae commit 0d080f8


6 files changed: +334 -142 lines changed


.github/workflows/benchmarks-reusable.yml

Lines changed: 2 additions & 1 deletion

@@ -220,11 +220,12 @@ jobs:
           --compute-runtime ${{ inputs.compute_runtime_commit }}
           --build-igc
           ${{ inputs.upload_report && '--output-html' || '' }}
+          ${{ inputs.pr_no != 0 && '--output-markdown' || '' }}
           ${{ inputs.bench_script_params }}

       - name: Print benchmark results
         run: |
-          cat ${{ github.workspace }}/ur-repo/benchmark_results.md
+          cat ${{ github.workspace }}/ur-repo/benchmark_results.md || true

       - name: Add comment to PR
         uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1

.github/workflows/benchmarks.yml

Lines changed: 1 addition & 1 deletion

@@ -24,7 +24,7 @@ on:
         type: number
         required: true
       bench_script_params:
-        description: Parameters passed to script executing benchmark
+        description: Parameters passed to the script executing benchmark (recommended `--compare baseline`)
         type: string
         required: false
         default: ''

scripts/benchmarks/README.md

Lines changed: 5 additions & 4 deletions

@@ -27,21 +27,22 @@ You can also include additional benchmark parameters, such as environment variab

 Once all the required information is entered, click the "Run workflow" button to initiate a new workflow run. This will execute the benchmarks and then post the results as a comment on the specified Pull Request.

-By default, all benchmark runs are compared against `baseline`, which is a well-established set of the latest data.
+It is recommended to compare all benchmark runs against `baseline` by passing `--compare baseline` in the benchmark parameters. `baseline` is a well-established set of the latest data.

 You must be a member of the `oneapi-src` organization to access these features.

 ## Comparing results

 By default, the benchmark results are not stored. To store them, use the option `--save <name>`. This will make the results available for comparison during the next benchmark runs.

-To compare a benchmark run with a previously stored result, use the option `--compare <name>`. You can compare with more than one result.
-
-If no `--compare` option is specified, the benchmark run is compared against a previously stored `baseline`.
+You can compare benchmark results using the `--compare` option. The comparison is presented in a markdown output file (see below). To calculate the relative performance of new results against previously saved data, use `--compare <previously_saved_data>` (e.g. `--compare baseline`). To compare only stored data, without generating new results, use `--dry-run --compare <name1> --compare <name2> --relative-perf <name1>`, where `name1` indicates the baseline for the relative performance calculation and `--dry-run` prevents the script from running benchmarks. Listing more than two `--compare` options displays only execution times, without statistical analysis.

 Baseline, as well as baseline-v2 (for the level-zero adapter v2) is updated automatically during a nightly job. The results
 are stored [here](https://oneapi-src.github.io/unified-runtime/benchmark_results.html).

+## Output formats
+You can output the results as an HTML file with `--output-html` and as a markdown file with `--output-markdown`. Due to the character limit for PR comments, the content of the markdown file might be truncated. To obtain the full markdown output, use `--output-markdown full`.
+
 ## Requirements

 ### Python
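
For context on the `--relative-perf` flag referenced above, a minimal sketch of the underlying idea: express each measurement as a ratio against whichever saved run is chosen as the baseline. The function name and the "100.0 means identical" convention below are illustrative assumptions, not the script's actual implementation.

# Hedged sketch of the idea behind `--relative-perf` (not the repository's code).
def relative_perf(value: float, baseline_value: float) -> float:
    """Return `value` as a percentage of `baseline_value` (100.0 == identical)."""
    return 100.0 * value / baseline_value

# Example: a 9.5 ms run measured against a 10.0 ms baseline.
print(relative_perf(9.5, 10.0))  # 95.0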

scripts/benchmarks/main.py

Lines changed: 12 additions & 7 deletions

@@ -189,25 +189,28 @@ def main(directory, additional_env_vars, save_name, compare_names, filter):
     benchmark.teardown()
     print("complete.")

-    this_name = "This PR"

-    chart_data = {this_name : results}
+    this_name = options.current_run_name
+    chart_data = {}
+
+    if not options.dry_run:
+        chart_data = {this_name : results}

     history = BenchmarkHistory(directory)
     # limit how many files we load.
     # should this be configurable?
     history.load(1000)

     # remove duplicates. this can happen if e.g., --compare baseline is specified manually.
-    compare_names = list(dict.fromkeys(compare_names))
+    compare_names = list(dict.fromkeys(compare_names)) if compare_names is not None else []

     for name in compare_names:
         compare_result = history.get_compare(name)
         if compare_result:
             chart_data[name] = compare_result.results

     if options.output_markdown:
-        markdown_content = generate_markdown(this_name, chart_data)
+        markdown_content = generate_markdown(this_name, chart_data, options.output_markdown)

         with open('benchmark_results.md', 'w') as file:
             file.write(markdown_content)

@@ -251,7 +254,7 @@ def validate_and_parse_env_args(env_args):
     parser.add_argument("--no-rebuild", help='Do not rebuild the benchmarks from scratch.', action="store_true")
     parser.add_argument("--env", type=str, help='Use env variable for a benchmark run.', action="append", default=[])
     parser.add_argument("--save", type=str, help='Save the results for comparison under a specified name.')
-    parser.add_argument("--compare", type=str, help='Compare results against previously saved data.', action="append", default=["baseline"])
+    parser.add_argument("--compare", type=str, help='Compare results against previously saved data.', action="append")
     parser.add_argument("--iterations", type=int, help='Number of times to run each benchmark to select a median value.', default=options.iterations)
     parser.add_argument("--stddev-threshold", type=float, help='If stddev pct is above this threshold, rerun all iterations', default=options.stddev_threshold)
     parser.add_argument("--timeout", type=int, help='Timeout for individual benchmarks in seconds.', default=options.timeout)

@@ -261,12 +264,13 @@ def validate_and_parse_env_args(env_args):
     parser.add_argument("--exit-on-failure", help='Exit on first failure.', action="store_true")
     parser.add_argument("--compare-type", type=str, choices=[e.value for e in Compare], help='Compare results against previously saved data.', default=Compare.LATEST.value)
     parser.add_argument("--compare-max", type=int, help='How many results to read for comparisions', default=options.compare_max)
+    parser.add_argument("--output-markdown", nargs='?', const=options.output_markdown, help='Specify whether markdown output should fit the content size limit for request validation')
     parser.add_argument("--output-html", help='Create HTML output', action="store_true", default=False)
-    parser.add_argument("--output-markdown", help='Create Markdown output', action="store_true", default=True)
     parser.add_argument("--dry-run", help='Do not run any actual benchmarks', action="store_true", default=False)
     parser.add_argument("--compute-runtime", nargs='?', const=options.compute_runtime_tag, help="Fetch and build compute runtime")
     parser.add_argument("--iterations-stddev", type=int, help="Max number of iterations of the loop calculating stddev after completed benchmark runs", default=options.iterations_stddev)
     parser.add_argument("--build-igc", help="Build IGC from source instead of using the OS-installed version", action="store_true", default=options.build_igc)
+    parser.add_argument("--relative-perf", type=str, help="The name of the results which should be used as a baseline for metrics calculation", default=options.current_run_name)

     args = parser.parse_args()
     additional_env_vars = validate_and_parse_env_args(args.env)

@@ -283,12 +287,13 @@ def validate_and_parse_env_args(env_args):
     options.exit_on_failure = args.exit_on_failure
     options.compare = Compare(args.compare_type)
     options.compare_max = args.compare_max
-    options.output_html = args.output_html
     options.output_markdown = args.output_markdown
+    options.output_html = args.output_html
     options.dry_run = args.dry_run
     options.umf = args.umf
     options.iterations_stddev = args.iterations_stddev
     options.build_igc = args.build_igc
+    options.current_run_name = args.relative_perf

     if args.build_igc and args.compute_runtime is None:
         parser.error("--build-igc requires --compute-runtime to be set")
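
The switch from a boolean `--output-markdown` flag to `nargs='?'` with a `const` is the key argparse change in this file. A standalone sketch of that pattern (not the repository's code; the placeholder values 'short' and None are assumptions) shows the three resulting behaviours:

import argparse

# Standalone sketch of the nargs='?' + const pattern used above.
parser = argparse.ArgumentParser()
parser.add_argument("--output-markdown", nargs='?', const='short', default=None)

print(parser.parse_args([]).output_markdown)                             # None: flag omitted
print(parser.parse_args(["--output-markdown"]).output_markdown)          # 'short': bare flag falls back to const
print(parser.parse_args(["--output-markdown", "full"]).output_markdown)  # 'full': explicit value is kept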

scripts/benchmarks/options.py

Lines changed: 6 additions & 1 deletion

@@ -6,6 +6,10 @@ class Compare(Enum):
     AVERAGE = 'average'
     MEDIAN = 'median'

+class MarkdownSize(Enum):
+    SHORT = 'short'
+    FULL = 'full'
+
 @dataclass
 class Options:
     workdir: str = None

@@ -20,8 +24,8 @@ class Options:
     verbose: bool = False
     compare: Compare = Compare.LATEST
     compare_max: int = 10 # average/median over how many results
+    output_markdown: MarkdownSize = MarkdownSize.SHORT
     output_html: bool = False
-    output_markdown: bool = True
     dry_run: bool = False
     # these two should probably be merged into one setting
     stddev_threshold: float = 0.02

@@ -32,6 +36,7 @@ class Options:
     extra_env_vars: dict = field(default_factory=dict)
     compute_runtime_tag: str = '24.52.32224.10'
     build_igc: bool = False
+    current_run_name: str = "This PR"

 options = Options()
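
A rough sketch of how a consumer of the new `MarkdownSize` setting might gate truncation of the generated report. The helper name and the character limit are assumptions for illustration only, not the repository's implementation:

from enum import Enum

class MarkdownSize(Enum):
    SHORT = 'short'
    FULL = 'full'

# GitHub caps issue/PR comment bodies at roughly 65536 characters; treated
# here as an assumed constant for illustration.
COMMENT_CHAR_LIMIT = 65536

def fit_for_pr_comment(markdown: str, size: MarkdownSize) -> str:
    """Return the markdown unchanged for FULL output, otherwise trim it to the limit."""
    if size == MarkdownSize.FULL or len(markdown) <= COMMENT_CHAR_LIMIT:
        return markdown
    return markdown[:COMMENT_CHAR_LIMIT]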
