Skip to content

Commit 067b4cc

Browse files
authored
Upstream libc++ buildbot restarter. (#93582)
I've been running a cronjob on my local machine to restart preempted libc++ CI runs. This is bad and brittle. This upstreams a much better version of the restarter. It works by matching on check run annotations looking for mention of the machine being shut down. If there are both preempted jobs and failing jobs, we don't restart the workflow. Maybe we should change that?
1 parent 765206e commit 067b4cc

File tree

1 file changed

+109
-0
lines changed

1 file changed

+109
-0
lines changed
Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
name: Restart Preempted Libc++ Workflow

# The libc++ builders run on preemptable VMs, which can be shut down at any time.
# This workflow identifies when a workflow run was canceled due to the VM being preempted,
# and restarts the workflow run.

# We identify a canceled workflow run by checking the annotations of the check runs in the check suite,
# which should contain the message "The runner has received a shutdown signal."

# Note: If a job is both preempted and also contains a non-preemption failure, we do not restart the workflow.

on:
  workflow_run:
    # Single-quoted so the backslashes stay literal; "\+" is not a valid escape
    # inside a double-quoted YAML scalar.
    workflows:
      - 'Build and Test libc\+\+'
    # `workflow_run` only supports the activity types `completed`, `requested`
    # and `in_progress`; the conclusion is filtered in the job's `if:` below.
    types:
      - completed

permissions:
  contents: read

jobs:
  restart:
    # Only react to runs that ended in failure or cancellation, and only in the llvm organization.
    if: github.repository_owner == 'llvm' && (github.event.workflow_run.conclusion == 'failure' || github.event.workflow_run.conclusion == 'cancelled')
    name: "Restart Job"
    permissions:
      statuses: read
      checks: read
      actions: write
    runs-on: ubuntu-latest
    steps:
      - name: "Restart Job"
        uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea #v7.0.1
        with:
          script: |
            // Annotation text that marks a genuine (non-preemption) step failure.
            // NOTE(review): the trailing '.' is unescaped, so this matches "exit code 1"
            // followed by any character (e.g. "exit code 127") and does not match codes
            // that do not start with 1 (e.g. "exit code 2") - confirm this is intended.
            const failureRegex = /Process completed with exit code 1./;
            // Annotation text that marks a job whose runner was shut down underneath it.
            const preemptionRegex = /The runner has received a shutdown signal/;

            const owner = context.repo.owner;
            const repo = context.repo.repo;
            const workflowRun = context.payload.workflow_run;

            console.log('Listing check runs for suite');
            // Paginate: the API returns 30 items per page by default, and check
            // runs past the first page would otherwise be silently ignored.
            const checkRuns = await github.paginate(github.rest.checks.listForSuite, {
              owner,
              repo,
              check_suite_id: workflowRun.check_suite_id,
              per_page: 100,
            });

            // Collect the check runs that finished with a failure or cancellation;
            // only those can carry preemption or failure annotations of interest.
            const candidateCheckRunIds = [];
            for (const checkRun of checkRuns) {
              console.log(`Checking check run: ${checkRun.id}`);
              console.log(checkRun);
              if (checkRun.status !== 'completed') {
                console.log('Check run was not completed. Skipping.');
                continue;
              }
              if (checkRun.conclusion !== 'failure' && checkRun.conclusion !== 'cancelled') {
                console.log(`Check run had conclusion: ${checkRun.conclusion}. Skipping.`);
                continue;
              }
              candidateCheckRunIds.push(checkRun.id);
            }

            let hasPreemptedJob = false;

            for (const checkRunId of candidateCheckRunIds) {
              console.log(`Listing annotations for check run: ${checkRunId}`);

              const annotations = await github.paginate(github.rest.checks.listAnnotations, {
                owner,
                repo,
                check_run_id: checkRunId,
                per_page: 100,
              });

              console.log(annotations);
              for (const annotation of annotations) {
                if (annotation.annotation_level !== 'failure') {
                  continue;
                }

                // The API allows a null message; treat it as empty instead of throwing.
                const message = annotation.message ?? '';

                if (preemptionRegex.test(message)) {
                  console.log(`Found preemption message: ${message}`);
                  hasPreemptedJob = true;
                }

                if (failureRegex.test(message)) {
                  // We only want to restart the workflow if all of the failures were due to preemption.
                  // We don't want to restart the workflow if there were other failures.
                  console.log('Choosing not to rerun workflow because we found a non-preemption failure');
                  console.log(`Failure message: ${message}`);
                  return;
                }
              }
            }

            if (!hasPreemptedJob) {
              console.log('No preempted jobs found. Not restarting workflow.');
              return;
            }

            console.log(`Restarting workflow: ${workflowRun.id}`);
            await github.rest.actions.reRunWorkflowFailedJobs({
              owner,
              repo,
              run_id: workflowRun.id,
            });
            // Logged only after the request succeeds, so the log never claims a restart that did not happen.
            console.log(`Restarted workflow: ${workflowRun.id}`);

0 commit comments

Comments
 (0)