Skip to content

Commit 842d563

Browse files
committed
[libc++][CI] Also restart failed jobs when they fail for a spurious reason
Since we moved to a Docker-in-Docker setup, CI jobs sometimes fail due to the Docker VM dying with 'context cancelled' errors. This is currently not recognized as a spurious failure, which leads to the job not being automatically restarted. This patch fixes that. This commit only adds a test job with the new logic since this workflow triggers on workflow_run, which means that the changes need to be on `main` before they can be tested. Once this is tested to work properly, I'll make it the default definition for the workflow.
1 parent e51a0b2 commit 842d563

File tree

1 file changed

+88
-0
lines changed

1 file changed

+88
-0
lines changed

.github/workflows/libcxx-restart-preempted-jobs.yaml

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,3 +130,91 @@ jobs:
130130
run_id: context.payload.workflow_run.id
131131
})
132132
await create_check_run('success', 'Restarted workflow run due to preempted job')
133+
134+
restart-test:
135+
if: github.repository_owner == 'llvm' && (github.event.workflow_run.conclusion == 'failure' || github.event.workflow_run.conclusion == 'cancelled') && github.event.actor.login == 'ldionne' # TESTING ONLY
136+
name: "Restart Job"
137+
permissions:
138+
statuses: read
139+
checks: write
140+
actions: write
141+
runs-on: ubuntu-latest
142+
steps:
143+
- name: "Restart Job"
144+
uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea #v7.0.1
145+
with:
146+
script: |
147+
const FAILURE_REGEX = /Process completed with exit code 1./
148+
const PREEMPTION_REGEX = /The runner has received a shutdown signal|The operation was canceled/
149+
150+
function log(msg) {
151+
core.notice(msg)
152+
}
153+
154+
const wf_run = context.payload.workflow_run
155+
log(`Running on "${wf_run.display_title}" by @${wf_run.actor.login} (event: ${wf_run.event})\nWorkflow run URL: ${wf_run.html_url}`)
156+
157+
log('Listing check runs for suite')
158+
const check_suites = await github.rest.checks.listForSuite({
159+
owner: context.repo.owner,
160+
repo: context.repo.repo,
161+
check_suite_id: context.payload.workflow_run.check_suite_id,
162+
per_page: 100 // FIXME: We don't have 100 check runs yet, but we should handle this better.
163+
})
164+
165+
preemptions = [];
166+
legitimate_failures = [];
167+
for (check_run of check_suites.data.check_runs) {
168+
log(`Checking check run: ${check_run.id}`);
169+
if (check_run.status != 'completed') {
170+
log('Check run was not completed. Skipping.');
171+
continue;
172+
}
173+
174+
if (check_run.conclusion != 'failure' && check_run.conclusion != 'cancelled') {
175+
log(`Check run had conclusion: ${check_run.conclusion}. Skipping.`);
176+
continue;
177+
}
178+
179+
annotations = await github.rest.checks.listAnnotations({
180+
owner: context.repo.owner,
181+
repo: context.repo.repo,
182+
check_run_id: check_run.id
183+
})
184+
185+
preemption_annotation = annotations.data.find(function(annotation) {
186+
return annotation.annotation_level == 'failure' &&
187+
annotation.message.match(PREEMPTION_REGEX) != null;
188+
});
189+
if (preemption_annotation != null) {
190+
log(`Found preemption message: ${preemption_annotation.message}`);
191+
preemptions.push(check_run);
192+
break;
193+
}
194+
195+
failure_annotation = annotations.data.find(function(annotation) {
196+
return annotation.annotation_level == 'failure' &&
197+
annotation.message.match(FAILURE_REGEX) != null;
198+
});
199+
if (failure_annotation != null) {
200+
log(`Found legitimate failure annotation: ${failure_annotation.message}`);
201+
legitimate_failures.push(check_run);
202+
break;
203+
}
204+
}
205+
206+
if (preemptions) {
207+
log('Found some preempted jobs');
208+
if (legitimate_failures) {
209+
log('Also found some legitimate failures, so not restarting the workflow.');
210+
} else {
211+
log('Did not find any legitimate failures. Restarting workflow.');
212+
await github.rest.actions.reRunWorkflowFailedJobs({
213+
owner: context.repo.owner,
214+
repo: context.repo.repo,
215+
run_id: context.payload.workflow_run.id
216+
})
217+
}
218+
} else {
219+
log('Did not find any preempted jobs. Not restarting the workflow.');
220+
}

0 commit comments

Comments
 (0)