Skip to content

Commit 26e3233

Browse files
authored
Retry test failures on Unity (#1028)
* Add retry workflow.
* Add script for retrying tests under any conditions.
* Add to integration test workflow (currently unconditional; will change to scheduled trigger only when merging).
* Temporarily enable retry on cancelled workflow.
* Duplicate the C++ trigger workflow script.
* Test with a forced-failing test.
* Put the workflow back the way it should be for merging.
* Remove unused branch sha check.
1 parent c0e9f70 commit 26e3233

File tree

5 files changed

+252
-9
lines changed

5 files changed

+252
-9
lines changed

.github/workflows/integration_tests.yml

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -644,3 +644,35 @@ jobs:
644644
--run_id ${{github.run_id}}
645645
- name: Summarize results into GitHub log
646646
run: python scripts/gha/summarize_test_results.py --dir test_results --github_log
647+
648+
649+
attempt_retry:
  # Dispatches the retry-test-failures workflow for this run. Only runs when
  # failure() is true and the run was started by the scheduled trigger.
  name: "attempt-retry"
  needs: [check_and_prepare, summarize_results]
  runs-on: ubuntu-20.04
  if: ${{ failure() && needs.check_and_prepare.outputs.trigger == 'scheduled_trigger' }}
  steps:
    - uses: actions/checkout@v3
      with:
        ref: ${{needs.check_and_prepare.outputs.github_ref}}
    - name: Setup python
      uses: actions/setup-python@v4
      with:
        python-version: ${{ env.pythonVersion }}
    - name: Install python deps
      run: pip install -r scripts/gha/requirements.txt
    # The default token can't run workflows, so get an alternate token.
    - name: Generate token for GitHub API
      uses: tibdex/github-app-token@v1
      id: generate-token
      with:
        app_id: ${{ secrets.WORKFLOW_TRIGGER_APP_ID }}
        private_key: ${{ secrets.WORKFLOW_TRIGGER_APP_PRIVATE_KEY }}
    - name: Retry failed tests
      # Hand the token to the script through the environment rather than
      # expanding it into the shell script text.
      env:
        WORKFLOW_TRIGGER_TOKEN: ${{ steps.generate-token.outputs.token }}
      run: |
        echo "::warning ::Attempting to retry failed tests"
        python scripts/gha/trigger_workflow.py -t "${WORKFLOW_TRIGGER_TOKEN}" \
          -w retry-test-failures.yml \
          -p run_id ${{ github.run_id }} \
          -s 10 \
          -A

.github/workflows/retry-test-failures.yml

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,31 @@ jobs:
1010
check_results_and_retry_if_needed:
  # Inspects the workflow run given by the `run_id` dispatch input and re-runs
  # its failed jobs when scripts/gha/retry_test_failures.py decides to.
  name: check-results-and-retry-if-needed
  runs-on: ubuntu-20.04
  steps:
    - name: Get token for firebase-workflow-trigger
      uses: tibdex/github-app-token@v1
      id: generate-token
      with:
        app_id: ${{ secrets.WORKFLOW_TRIGGER_APP_ID }}
        private_key: ${{ secrets.WORKFLOW_TRIGGER_APP_PRIVATE_KEY }}

    - name: Setup python
      uses: actions/setup-python@v4
      with:
        python-version: 3.8

    - uses: actions/checkout@v3
      with:
        fetch-depth: 0
        submodules: false

    - name: Wait 3 minutes for run to finish
      run: |
        sleep 180

    - name: Install python deps
      run: pip install -r scripts/gha/requirements.txt

    - name: Run test failure retry script
      # The dispatch input and the token are passed via environment variables
      # so their contents are never spliced into the shell script text.
      env:
        WORKFLOW_TRIGGER_TOKEN: ${{ steps.generate-token.outputs.token }}
        RETRY_RUN_ID: ${{ github.event.inputs.run_id }}
      run: |
        python scripts/gha/retry_test_failures.py --token "${WORKFLOW_TRIGGER_TOKEN}" --run_id "${RETRY_RUN_ID}"

scripts/gha/firebase_github.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -310,3 +310,61 @@ def list_workflow_runs(token, workflow_id, branch=None, event=None, limit=200):
310310
keep_going = False
311311
results = results[:limit]
312312
return results
313+
314+
315+
def list_jobs_for_workflow_run(token, run_id, attempt=None, limit=200):
  """List the jobs of a workflow run, following pagination.

  https://docs.github.com/en/rest/actions/workflow-jobs#list-jobs-for-a-workflow-run
  https://docs.github.com/en/rest/actions/workflow-jobs#list-jobs-for-a-workflow-run-attempt

  Args:
    token: GitHub token sent in the Authorization header.
    run_id: ID of the workflow run whose jobs are listed.
    attempt: Which attempt to fetch. Should be a number >0, 'latest', or 'all'.
      'latest' and 'all' are forwarded as the `filter` query parameter of the
      run-level endpoint. If unspecified, the run-level endpoint is queried
      without a filter (documented by GitHub as equivalent to 'latest').
    limit: Maximum number of jobs to return. A value <= 0 disables the cap.

  Returns:
    A list with the entries of the 'jobs' field of each response page. If a
    response has no 'jobs' field, the jobs gathered so far are returned.
  """
  use_filter = attempt in ('latest', 'all')
  if use_filter or attempt is None:
    url = f'{GITHUB_API_URL}/actions/runs/{run_id}/jobs'
  else:
    url = f'{GITHUB_API_URL}/actions/runs/{run_id}/attempts/{attempt}/jobs'
  headers = {'Accept': 'application/vnd.github.v3+json', 'Authorization': f'token {token}'}
  per_page = 100
  page = 1
  results = []
  while True:
    params = {'per_page': per_page, 'page': page}
    if use_filter:
      params['filter'] = attempt
    page += 1
    with requests_retry_session().get(url, headers=headers, params=params,
                                      stream=True, timeout=TIMEOUT) as response:
      logging.info("list_jobs_for_workflow_run: %s page %d, response: %s",
                   url, params['page'], response)
      # Parse the body once; it is needed for both the check and the data.
      payload = response.json()
    if 'jobs' not in payload:
      break
    job_results = payload['jobs']
    results.extend(job_results)
    # Only a completely full page suggests that another page may follow.
    if len(job_results) != per_page:
      break
    if limit > 0 and len(results) >= limit:
      break
  if limit > 0:
    results = results[:limit]
  return results
352+
353+
354+
def download_job_logs(token, job_id):
  """Fetch the log of one workflow job and return it as text.

  https://docs.github.com/en/rest/actions/workflow-jobs#download-job-logs-for-a-workflow-run

  Args:
    token: GitHub token sent in the Authorization header.
    job_id: ID of the job whose log is downloaded.

  Returns:
    The response body decoded as UTF-8.
  """
  request_headers = {'Accept': 'application/vnd.github.v3+json', 'Authorization': f'token {token}'}
  logs_url = f'{GITHUB_API_URL}/actions/jobs/{job_id}/logs'
  session = requests_retry_session()
  with session.get(logs_url, headers=request_headers, stream=True, timeout=TIMEOUT) as response:
    logging.info("download_job_logs: %s response: %s", logs_url, response)
    raw_log = response.content
    return raw_log.decode('utf-8')
361+
362+
363+
def rerun_failed_jobs_for_workflow_run(token, run_id):
  """Ask GitHub to re-run the failed jobs of a workflow run.

  https://docs.github.com/en/rest/actions/workflow-runs#re-run-failed-jobs-from-a-workflow-run

  Args:
    token: GitHub token sent in the Authorization header.
    run_id: ID of the workflow run whose failed jobs should be re-run.

  Returns:
    True if the API answered with HTTP status 201, False otherwise.
  """
  auth_headers = {'Accept': 'application/vnd.github.v3+json', 'Authorization': f'token {token}'}
  endpoint = f'{GITHUB_API_URL}/actions/runs/{run_id}/rerun-failed-jobs'
  with requests.post(endpoint, headers=auth_headers,
                     stream=True, timeout=TIMEOUT) as response:
    logging.info("rerun_failed_jobs_for_workflow_run: %s response: %s", endpoint, response)
    accepted = response.status_code == 201
  return accepted

scripts/gha/retry_test_failures.py

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
# Copyright 2023 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
"""A utility to retry failed jobs in a workflow run.
16+
17+
USAGE:
18+
python3 scripts/gha/retry_test_failures.py \
19+
--token ${{github.token}} \
20+
--run_id <github_workflow_run_id>
21+
"""
22+
23+
import datetime
24+
import json
25+
import re
26+
import shutil
27+
28+
from absl import app
29+
from absl import flags
30+
from absl import logging
31+
32+
import firebase_github
33+
34+
FLAGS = flags.FLAGS

# Jobs whose run_attempt is greater than this value are not retried again.
MAX_RETRIES = 2

flags.DEFINE_string(
    name="token",
    default=None,
    help="github.token: A token to authenticate on your repository.")

flags.DEFINE_string(
    name="run_id",
    default=None,
    help="Github's workflow run ID.")
44+
45+
46+
def get_log_group(log_text, group_name):
  """Extract the lines of one named group from a job log.

  The first 29 characters of every line are dropped before matching
  (presumably the timestamp prefix of GitHub Actions logs -- confirm against
  real log output). Collection starts at the first '##[group]' line that
  contains `group_name` and ends with the first subsequent '##[error]' line,
  which is included in the result.

  Args:
    log_text: Full log text, lines separated by '\\n'.
    group_name: Text that must appear in the '##[group]' line to start
      collecting.

  Returns:
    A list of the collected lines with the 29-character prefix removed; an
    empty list if no matching group is found. If no '##[error]' line follows,
    everything up to the end of the log is returned.
  """
  group_log = []
  in_group = False
  for line in log_text.split("\n"):
    line_no_ts = line[29:]
    if line_no_ts.startswith('##[group]'):
      if group_name in line_no_ts:
        print("got group %s" % group_name)
        in_group = True
    if in_group:
      group_log.append(line_no_ts)
      # The original literal was '##[error])' (stray parenthesis), which never
      # matched an error marker, so the capture never terminated early.
      if line_no_ts.startswith('##[error]'):
        print("end group %s" % group_name)
        in_group = False
        break
  return group_log
62+
63+
def main(argv):
  """Re-run the failed jobs of a workflow run when a retry looks worthwhile.

  Lists every job of every attempt of the run given by --run_id, keeps the
  latest attempt of each job whose conclusion is neither 'success' nor
  'skipped', and then decides whether to re-run:
    * no retry if any such job is not yet 'completed';
    * no retry if any such job has run_attempt > MAX_RETRIES;
    * a failed 'build-' job triggers a retry only if its log mentions a
      timeout / network error;
    * a failed 'test-' job always triggers a retry.

  Args:
    argv: Positional command-line arguments left over after flag parsing.

  Raises:
    app.UsageError: If unexpected positional arguments are given.

  Exits with status 1 if no jobs are found or the re-run request fails.
  """
  # Local import so this block does not depend on the file-level import list.
  import sys

  if len(argv) > 1:
    raise app.UsageError("Too many command-line arguments.")
  # Get list of workflow jobs.
  workflow_jobs = firebase_github.list_jobs_for_workflow_run(
      FLAGS.token, FLAGS.run_id, attempt='all')
  if not workflow_jobs:
    logging.error("No jobs found for workflow run %s", FLAGS.run_id)
    sys.exit(1)

  # For each job name, remember the latest attempt that did not succeed.
  failed_jobs = {}
  for job in workflow_jobs:
    if job['conclusion'] in ('success', 'skipped'):
      continue
    previous = failed_jobs.get(job['name'])
    if previous is None or job['run_attempt'] > previous['run_attempt']:
      failed_jobs[job['name']] = job

  should_rerun_jobs = False
  for job in failed_jobs.values():
    logging.info('Considering job %s attempt %d: %s (%s)',
                 job['conclusion'] or job['status'],
                 job['run_attempt'], job['name'], job['id'])
    if job['status'] != 'completed':
      # Don't retry a job that is already in progress or queued
      logging.info("Not retrying, as %s is already %s",
                   job['name'], job['status'].replace("_", " "))
      should_rerun_jobs = False
      break
    if job['run_attempt'] > MAX_RETRIES:
      # Don't retry a job more than MAX_RETRIES times.
      logging.info("Not retrying, as %s has already been attempted %d times",
                   job['name'], job['run_attempt'])
      should_rerun_jobs = False
      break
    if job['conclusion'] != 'failure':
      continue
    if job['name'].startswith('build-'):
      # Retry build jobs that timed out. Logs are only fetched here because
      # this is the only branch that inspects them.
      job_logs = firebase_github.download_job_logs(FLAGS.token, job['id'])
      if re.search(r'timed? ?out|network error|maximum execution time',
                   job_logs, re.IGNORECASE):
        should_rerun_jobs = True
    elif job['name'].startswith('test-'):
      # Tests should always be retried (for now).
      should_rerun_jobs = True

  if not should_rerun_jobs:
    logging.info("Not re-running jobs.")
    return

  logging.info("Re-running failed jobs in workflow run %s", FLAGS.run_id)
  if not firebase_github.rerun_failed_jobs_for_workflow_run(
      FLAGS.token, FLAGS.run_id):
    logging.error("Error submitting GitHub API request")
    sys.exit(1)
123+
124+
125+
if __name__ == "__main__":
  # Both flags must be supplied on the command line before main() runs.
  for required_flag in ("token", "run_id"):
    flags.mark_flag_as_required(required_flag)
  app.run(main)

scripts/gha/trigger_workflow.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
https://docs.github.com/en/rest/reference/actions#create-a-workflow-dispatch-event
2121
2222
Usage:
23-
python trigger_workflow.py -w workflow_filename -t github_token [-b branch_name]
23+
python3 trigger_workflow.py -w workflow_filename -t github_token [-b branch_name]
2424
[-r git_repo_url] [-p <input1> <value1> -p <input2> <value2> ...]'
2525
[-C curl_command]
2626
@@ -38,6 +38,7 @@ def main():
3838
args = parse_cmdline_args()
3939
if args.branch is None:
4040
args.branch=subprocess.check_output(['git', 'rev-parse', '--abbrev-ref', 'HEAD']).decode('utf-8').rstrip('\n')
41+
if args.branch == 'HEAD': args.branch = 'main'
4142
print('autodetected branch: %s' % args.branch)
4243
if args.repo: # else use default firebase/firebase-unity-sdk repo
4344
if not firebase_github.set_repo_url(args.repo):
@@ -52,13 +53,13 @@ def main():
5253
print(f'request_url: {firebase_github.GITHUB_API_URL}/actions/workflows/{args.workflow}/dispatches')
5354
print(f'request_body: ref: {args.branch}, inputs: {json_params}')
5455
if args.dryrun:
55-
return(0)
56+
exit(0)
5657

5758
print('Sending request to GitHub API...')
5859
if not firebase_github.create_workflow_dispatch(args.token, args.workflow, args.branch, json_params):
5960
print('%sFailed to trigger workflow %s' % (
6061
'::error ::' if args.in_github_action else '', args.workflow))
61-
return(-1)
62+
exit(1)
6263

6364
print('Success!')
6465
time.sleep(args.sleep) # Give a few seconds for the job to become queued.
@@ -69,7 +70,7 @@ def main():
6970
if "workflow_runs" in workflows:
7071
for workflow in workflows['workflow_runs']:
7172
# Use a heuristic to get the new workflow's run ID.
72-
# Must match the branch name, and be queued/in progress.
73+
# Must match the branch name and be queued/in progress.
7374
if (workflow['status'] in ('queued', 'in_progress') and
7475
workflow['head_branch'] == args.branch):
7576
run_id = workflow['id']
@@ -79,8 +80,8 @@ def main():
7980
workflow_url = 'https://github.com/firebase/firebase-unity-sdk/actions/runs/%s' % (run_id)
8081
else:
8182
# Couldn't get a run ID, use a generic URL.
82-
workflow_url = '/%s/actions/workflows/%s?query=%s+%s' % (
83-
firebase_github.GITHUB_API_URL, args.workflow,
83+
workflow_url = '%s/actions/workflows/%s?query=%s+%s' % (
84+
'https://github.com/firebase/firebase-unity-sdk', args.workflow,
8485
urllib.parse.quote('event:workflow_dispatch', safe=''),
8586
urllib.parse.quote('branch:'+args.branch, safe=''))
8687
print('%sStarted workflow %s: %s' % ('::warning ::' if args.in_github_action else '',

0 commit comments

Comments
 (0)