
Commit 60f48db

Add script for retrying tests under any conditions.
1 parent 35bb3f9 commit 60f48db

2 files changed: +186, -0 lines

scripts/gha/firebase_github.py

Lines changed: 58 additions & 0 deletions
@@ -310,3 +310,61 @@ def list_workflow_runs(token, workflow_id, branch=None, event=None, limit=200):
        keep_going = False
        results = results[:limit]
  return results


def list_jobs_for_workflow_run(token, run_id, attempt=None, limit=200):
  """https://docs.github.com/en/rest/actions/workflow-jobs#list-jobs-for-a-workflow-run
  https://docs.github.com/en/rest/actions/workflow-jobs#list-jobs-for-a-workflow-run-attempt

  Args:
    attempt: Which attempt to fetch. Should be a number >0, 'latest', or 'all'.
      If unspecified, returns 'latest'.
  """
  if attempt == 'latest' or attempt == 'all' or attempt is None:
    url = f'{GITHUB_API_URL}/actions/runs/{run_id}/jobs'
  else:
    url = f'{GITHUB_API_URL}/actions/runs/{run_id}/attempts/{attempt}/jobs'
  headers = {'Accept': 'application/vnd.github.v3+json', 'Authorization': f'token {token}'}
  page = 1
  per_page = 100
  results = []
  keep_going = True
  while keep_going:
    params = {'per_page': per_page, 'page': page}
    if attempt == 'latest' or attempt == 'all':
      params.update({'filter': attempt})
    page = page + 1
    keep_going = False
    with requests_retry_session().get(url, headers=headers, params=params,
                                      stream=True, timeout=TIMEOUT) as response:
      logging.info("list_jobs_for_workflow_run: %s page %d, response: %s",
                   url, params['page'], response)
      if 'jobs' not in response.json():
        break
      job_results = response.json()['jobs']
      results = results + job_results
      # If exactly per_page results were retrieved, read the next page.
      keep_going = (len(job_results) == per_page)
      if limit > 0 and len(results) >= limit:
        keep_going = False
        results = results[:limit]
  return results


def download_job_logs(token, job_id):
  """https://docs.github.com/en/rest/actions/workflow-jobs#download-job-logs-for-a-workflow-run"""
  url = f'{GITHUB_API_URL}/actions/jobs/{job_id}/logs'
  headers = {'Accept': 'application/vnd.github.v3+json', 'Authorization': f'token {token}'}
  with requests_retry_session().get(url, headers=headers, stream=True, timeout=TIMEOUT) as response:
    logging.info("download_job_logs: %s response: %s", url, response)
    return response.content.decode('utf-8')


def rerun_failed_jobs_for_workflow_run(token, run_id):
  """https://docs.github.com/en/rest/actions/workflow-runs#re-run-failed-jobs-from-a-workflow-run"""
  url = f'{GITHUB_API_URL}/actions/runs/{run_id}/rerun-failed-jobs'
  headers = {'Accept': 'application/vnd.github.v3+json', 'Authorization': f'token {token}'}
  with requests.post(url, headers=headers,
                     stream=True, timeout=TIMEOUT) as response:
    logging.info("rerun_failed_jobs_for_workflow_run: %s response: %s", url, response)
    return response.status_code == 201
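
Taken together, the three new helpers support a list → inspect → rerun flow for a single workflow run. Below is a minimal sketch of that flow, not part of the commit: the token and run ID are placeholders, and it assumes firebase_github (from scripts/gha) is on the import path with its module-level GITHUB_API_URL pointing at the intended repository.

  import firebase_github

  TOKEN = "ghp_example"   # placeholder: a GitHub token with access to the repo
  RUN_ID = 1234567890     # placeholder: a workflow run ID from that repo

  # List every job across all attempts of the run.
  jobs = firebase_github.list_jobs_for_workflow_run(TOKEN, RUN_ID, attempt='all')

  # Fetch logs for jobs that finished unsuccessfully.
  for job in jobs:
    if job['conclusion'] == 'failure':
      logs = firebase_github.download_job_logs(TOKEN, job['id'])
      print(job['name'], len(logs), "log characters")

  # Queue a re-run of only the failed jobs.
  if any(job['conclusion'] == 'failure' for job in jobs):
    firebase_github.rerun_failed_jobs_for_workflow_run(TOKEN, RUN_ID)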

scripts/gha/retry_test_failures.py

Lines changed: 128 additions & 0 deletions
@@ -0,0 +1,128 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""A utility to retry failed jobs in a workflow run.

USAGE:
  python3 scripts/gha/retry_test_failures.py \
    --token ${{github.token}} \
    --run_id <github_workflow_run_id>
"""

import datetime
import json
import re
import shutil

from absl import app
from absl import flags
from absl import logging

import firebase_github

FLAGS = flags.FLAGS
MAX_RETRIES = 2

flags.DEFINE_string(
    "token", None,
    "github.token: A token to authenticate on your repository.")

flags.DEFINE_string(
    "run_id", None,
    "GitHub's workflow run ID.")


def get_log_group(log_text, group_name):
  """Extract the lines of a named ##[group] section from a job's log text."""
  group_log = []
  in_group = False
  for line in log_text.split("\n"):
    # Strip the fixed-width timestamp GitHub prepends to each log line.
    line_no_ts = line[29:]
    if line_no_ts.startswith('##[group]'):
      if group_name in line_no_ts:
        print("got group %s" % group_name)
        in_group = True
    if in_group:
      group_log.append(line_no_ts)
      if line_no_ts.startswith('##[error]'):
        print("end group %s" % group_name)
        in_group = False
        break
  return group_log


def main(argv):
  if len(argv) > 1:
    raise app.UsageError("Too many command-line arguments.")
  # Get list of workflow jobs.
  workflow_jobs = firebase_github.list_jobs_for_workflow_run(
      FLAGS.token, FLAGS.run_id, attempt='all')
  if not workflow_jobs or len(workflow_jobs) == 0:
    logging.error("No jobs found for workflow run %s", FLAGS.run_id)
    exit(1)

  failed_jobs = {}
  all_jobs = {}
  for job in workflow_jobs:
    all_jobs[job['id']] = job
    if job['conclusion'] != 'success' and job['conclusion'] != 'skipped':
      if job['name'] in failed_jobs:
        other_run = failed_jobs[job['name']]
        if job['run_attempt'] > other_run['run_attempt']:
          # This is a later run than the one that's already there.
          failed_jobs[job['name']] = job
      else:
        failed_jobs[job['name']] = job

  should_rerun_jobs = False
  for job_name in failed_jobs:
    job = failed_jobs[job_name]
    logging.info('Considering job %s attempt %d: %s (%s)',
                 job['conclusion'] if job['conclusion'] else job['status'],
                 job['run_attempt'], job['name'], job['id'])
    if job['status'] != 'completed':
      # Don't retry a job that is already in progress or queued.
      logging.info("Not retrying, as %s is already %s",
                   job['name'], job['status'].replace("_", " "))
      should_rerun_jobs = False
      break
    if job['run_attempt'] > MAX_RETRIES:
      # Don't retry a job more than MAX_RETRIES times.
      logging.info("Not retrying, as %s has already been attempted %d times",
                   job['name'], job['run_attempt'])
      should_rerun_jobs = False
      break
    if job['conclusion'] == 'failure':
      job_logs = firebase_github.download_job_logs(FLAGS.token, job['id'])
      if job['name'].startswith('build-'):
        # Retry build jobs only if they hit a transient issue
        # (timeout, network error, maximum execution time).
        if re.search(r'timed? ?out|network error|maximum execution time',
                     job_logs, re.IGNORECASE):
          should_rerun_jobs = True
      elif job['name'].startswith('test-'):
        # Tests should always be retried (for now).
        should_rerun_jobs = True

  if should_rerun_jobs:
    logging.info("Re-running failed jobs in workflow run %s", FLAGS.run_id)
    if not firebase_github.rerun_failed_jobs_for_workflow_run(
        FLAGS.token, FLAGS.run_id):
      logging.error("Error submitting GitHub API request")
      exit(1)
  else:
    logging.info("Not re-running jobs.")


if __name__ == "__main__":
  flags.mark_flag_as_required("token")
  flags.mark_flag_as_required("run_id")
  app.run(main)
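
For reference, the build-job retry heuristic above can be exercised on its own. A small standalone check follows; the sample log lines are made up for illustration and are not from a real run.

  import re

  # The same pattern retry_test_failures.py applies to logs of failed build-* jobs.
  TRANSIENT_PATTERN = re.compile(
      r'timed? ?out|network error|maximum execution time', re.IGNORECASE)

  samples = [
      "##[error]The operation was canceled because the job timed out.",
      "curl: (6) Could not resolve host -- network error",
      "FAILED: integration test assertion in foo_test.cc",
  ]

  for line in samples:
    print("retry" if TRANSIENT_PATTERN.search(line) else "skip", "-", line)

Only the first two sample lines match, so only transient-looking build failures would trigger a re-run; plain test assertions in build jobs would not.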

0 commit comments