Skip to content

Commit 92754a3

Browse files
committed
[CI] Extend metrics container to log BuildKite metrics
The current container focuses on GitHub metrics. Before deprecating BuildKite, we want to make sure the new infra quality is better, or at least the same. Being able to compare BuildKite metrics with GitHub metrics on Grafana will allow us to easily present the comparison. The BuildKite API allows filtering, but doesn't allow changing the result ordering, meaning we are left with builds ordered by ID. This means a completed job can appear before a running job in the list. There are 2 solutions from there: (1) keep the cursor on the oldest running workflow; (2) keep a list of running workflows to compare. Because there are no guarantees in workflow ordering, waiting for the oldest build to complete before reporting any newer build could mean delaying the more recent build completion reporting by a few hours. And because Grafana cannot ingest metrics older than 2 hours, this is not an option. Thus we are left with the second solution: remember what jobs were running during the last iteration, and record them as soon as they are completed. BuildKite has at most ~100 pending jobs, so keeping all those IDs should be OK.
1 parent d22d143 commit 92754a3

File tree

1 file changed

+189
-1
lines changed

1 file changed

+189
-1
lines changed

.ci/metrics/metrics.py

Lines changed: 189 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import collections
22
import datetime
3+
import dateutil
34
import github
45
import logging
56
import os
@@ -53,6 +54,18 @@
5354
# by trial and error).
5455
GRAFANA_METRIC_MAX_AGE_MN = 120
5556

57+
# BuildKite jobs whose metrics are recorded, keyed by the job name as reported
# by BuildKite. Each value is the metric name used in Grafana: keeping the two
# decoupled preserves the metric history if a job gets renamed in BuildKite.
BUILDKITE_WORKFLOW_TO_TRACK = {
    ":linux: Linux x64": "buildkite_linux",
    ":windows: Windows x64": "buildkite_windows",
}

# Page size used when listing builds through the BuildKite GraphQL API. The
# scraper runs regularly, so a small page is enough.
BUILDKITE_GRAPHQL_BUILDS_PER_PAGE = 50
68+
5669
@dataclass
5770
class JobMetrics:
5871
job_name: str
@@ -70,6 +83,170 @@ class GaugeMetric:
7083
time_ns: int
7184

7285

86+
def buildkite_fetch_page_build_list(
    buildkite_token: str, after_cursor: str = None
) -> list[dict[str, str]]:
    """Fetches a page of the build list using the GraphQL BuildKite API.

    Returns the BUILDKITE_GRAPHQL_BUILDS_PER_PAGE last running/queued builds,
    or the BUILDKITE_GRAPHQL_BUILDS_PER_PAGE running/queued builds older than
    the one pointed to by |after_cursor| if provided.
    The |after_cursor| value is taken from the previous page returned by the
    API.

    Any failure (network error, timeout, non-JSON answer, GraphQL error, or
    pipeline not found) is logged and reported as an empty page, so callers
    paginating with this function simply stop.

    Args:
      buildkite_token: the secret token to authenticate GraphQL requests.
      after_cursor: cursor after which to start the page fetch, or None to
        fetch the first page.

    Returns:
      The most recent builds after cursor (if set) with the following format:
      [
        {
            "cursor": <value>,
            "number": <build-number>,
        }
      ]
    """

    # Upper bound on the HTTP round-trip so a stalled connection cannot block
    # the scraping loop forever.
    REQUEST_TIMEOUT_S = 30

    # Doubled braces are literal braces: this template goes through
    # str.format() below.
    BUILDKITE_GRAPHQL_QUERY = """
    query OrganizationShowQuery {{
      organization(slug: "llvm-project") {{
        pipelines(search: "Github pull requests", first: 1) {{
          edges {{
            node {{
              builds (state: [RUNNING, SCHEDULED, CREATING], first: {PAGE_SIZE}, after: {AFTER}) {{
                edges {{
                  cursor
                  node {{
                    number
                  }}
                }}
              }}
            }}
          }}
        }}
      }}
    }}
    """
    query = BUILDKITE_GRAPHQL_QUERY.format(
        PAGE_SIZE=BUILDKITE_GRAPHQL_BUILDS_PER_PAGE,
        AFTER="null" if after_cursor is None else '"{}"'.format(after_cursor),
    )
    url = "https://graphql.buildkite.com/v1"
    headers = {
        "Authorization": "Bearer " + buildkite_token,
        "Content-Type": "application/json",
    }
    try:
        # Let requests serialize the payload: this escapes quotes, newlines
        # and backslashes correctly instead of patching the string by hand.
        response = requests.post(
            url, json={"query": query}, headers=headers, timeout=REQUEST_TIMEOUT_S
        )
        data = response.json()
    except (requests.RequestException, ValueError) as error:
        logging.error("Failed to fetch BuildKite jobs: {}".format(error))
        return []

    if "errors" in data:
        logging.error("Failed to fetch BuildKite jobs: {}".format(data["errors"]))
        return []

    # De-nest the build list.
    pipelines = data["data"]["organization"]["pipelines"]["edges"]
    if not pipelines:
        logging.error("Failed to fetch BuildKite jobs: no matching pipeline")
        return []
    builds = pipelines[0]["node"]["builds"]["edges"]

    # Fold cursor info into the node dictionary.
    return [{**edge["node"], "cursor": edge["cursor"]} for edge in builds]
150+
151+
152+
def buildkite_get_build_info(build_number: int) -> dict:
    """Returns all the info associated with the provided build number.

    Note: for unknown reasons, graphql returns no jobs for a given build,
    while this endpoint does, hence why this uses this API instead of graphql.

    Args:
      build_number: which build number to fetch info for. It is only
        interpolated into the URL, so an int or its string form both work.

    Returns:
      The info for the target build, a JSON dictionary.

    Raises:
      requests.RequestException: if the request times out, fails, or the
        server answers with an HTTP error status.
    """

    URL = "https://buildkite.com/llvm-project/github-pull-requests/builds/{}.json"
    # Upper bound on the HTTP round-trip so a stalled connection cannot block
    # the scraping loop forever.
    REQUEST_TIMEOUT_S = 30

    response = requests.get(URL.format(build_number), timeout=REQUEST_TIMEOUT_S)
    # Fail with an explicit HTTP error rather than trying to decode an error
    # page as build info.
    response.raise_for_status()
    return response.json()
164+
165+
166+
def buildkite_get_incomplete_tasks(buildkite_token: str) -> list:
    """Collects every running/pending BuildKite build, across all pages.

    Pages are requested one after the other, each starting after the cursor
    of the last build of the previous page, until an empty page is returned.

    Args:
      buildkite_token: the secret token to authenticate GraphQL requests.

    Returns:
      The concatenation of all fetched pages, in the order they were returned.
    """
    builds = []
    next_cursor = None
    while page := buildkite_fetch_page_build_list(buildkite_token, next_cursor):
        builds.extend(page)
        # Resume right after the last build seen so far.
        next_cursor = page[-1]["cursor"]
    return builds
181+
182+
183+
def buildkite_get_metrics(
    buildkite_token: str, previously_incomplete: set[int]
) -> tuple[list[JobMetrics], set[int]]:
    """Collects metrics for the BuildKite builds completed since the last call.

    A build is considered newly completed when it was in
    |previously_incomplete| but is no longer listed as running/pending.

    Args:
      buildkite_token: the secret token to authenticate GraphQL requests.
      previously_incomplete: the set of running workflows the last time this
        function was called.

    Returns:
      A tuple with:
        - the metrics recorded for newly completed workflow jobs.
        - the set of workflows still running now.
    """
    # A bare `import dateutil` does not guarantee that the `parser` submodule
    # is loaded, so import it explicitly before using it.
    import dateutil.parser

    running_builds = buildkite_get_incomplete_tasks(buildkite_token)
    incomplete_now = {build["number"] for build in running_builds}
    output = []

    for build_id in previously_incomplete:
        if build_id in incomplete_now:
            continue

        info = buildkite_get_build_info(build_id)
        # A build can leave the running/pending list without being finished
        # (or the listing may have been truncated by an API error): there is
        # no completion time to report in that case.
        if info.get("finished_at") is None:
            logging.info(
                f"BuildKite build {build_id} has no completion time, skipping."
            )
            continue

        metric_timestamp = dateutil.parser.isoparse(info["finished_at"])
        # Grafana will refuse to ingest metrics older than ~2 hours, so we
        # should avoid sending historical data. The age only depends on the
        # build, so compute it once for all its jobs.
        metric_age_mn = (
            datetime.datetime.now(datetime.timezone.utc) - metric_timestamp
        ).total_seconds() / 60
        metric_timestamp_ns = int(metric_timestamp.timestamp()) * 10**9

        for job in info["jobs"]:
            # Skip this job.
            if job.get("name") not in BUILDKITE_WORKFLOW_TO_TRACK:
                continue

            if metric_age_mn > GRAFANA_METRIC_MAX_AGE_MN:
                logging.info(
                    f"Job {job['name']} from workflow {build_id} dropped due"
                    + f" to staleness: {metric_age_mn}mn old."
                )
                continue

            # Jobs that never ran (e.g. canceled before starting) have null
            # timestamps: no queue/run time can be computed for them.
            timestamps = [
                job.get(key) for key in ("scheduled_at", "started_at", "finished_at")
            ]
            if any(value is None for value in timestamps):
                logging.info(
                    f"Job {job['name']} from workflow {build_id} skipped:"
                    " incomplete timestamps."
                )
                continue
            scheduled_at, started_at, finished_at = (
                dateutil.parser.isoparse(value) for value in timestamps
            )

            job_name = BUILDKITE_WORKFLOW_TO_TRACK[job["name"]]
            # timedelta.seconds is only the sub-day component of the duration;
            # total_seconds() is the actual elapsed time.
            queue_time = int((started_at - scheduled_at).total_seconds())
            run_time = int((finished_at - started_at).total_seconds())
            status = bool(job["passed"])

            workflow_id = build_id
            workflow_name = "Github pull requests"
            output.append(
                JobMetrics(
                    job_name,
                    queue_time,
                    run_time,
                    status,
                    metric_timestamp_ns,
                    workflow_id,
                    workflow_name,
                )
            )

    return output, incomplete_now
249+
73250
def github_get_metrics(
74251
github_repo: github.Repository, last_workflows_seen_as_completed: set[int]
75252
) -> tuple[list[JobMetrics], int]:
@@ -276,23 +453,34 @@ def upload_metrics(workflow_metrics, metrics_userid, api_key):
276453
def main():
277454
# Authenticate with Github
278455
github_auth = Auth.Token(os.environ["GITHUB_TOKEN"])
456+
buildkite_token = os.environ["BUILDKITE_TOKEN"]
279457
grafana_api_key = os.environ["GRAFANA_API_KEY"]
280458
grafana_metrics_userid = os.environ["GRAFANA_METRICS_USERID"]
281459

282460
# The last workflow this script processed.
283461
# Because the Github queries are broken, we'll simply log a 'processed'
284462
# bit for the last COUNT_TO_PROCESS workflows.
285463
gh_last_workflows_seen_as_completed = set()
464+
# Stores the list of pending/running builds in BuildKite we need to check
465+
# at the next iteration.
466+
bk_incomplete = set()
286467

287468
# Enter the main loop. Every five minutes we wake up and dump metrics for
288469
# the relevant jobs.
289470
while True:
290471
github_object = Github(auth=github_auth)
291472
github_repo = github_object.get_repo("llvm/llvm-project")
292473

293-
metrics, gh_last_workflows_seen_as_completed = github_get_metrics(
474+
gh_metrics, gh_last_workflows_seen_as_completed = github_get_metrics(
294475
github_repo, gh_last_workflows_seen_as_completed
295476
)
477+
gh_metrics = []
478+
479+
bk_metrics, bk_incomplete = buildkite_get_metrics(
480+
buildkite_token, bk_incomplete
481+
)
482+
483+
metrics = gh_metrics + bk_metrics
296484
upload_metrics(metrics, grafana_metrics_userid, grafana_api_key)
297485
logging.info(f"Uploaded {len(metrics)} metrics")
298486

0 commit comments

Comments
 (0)