1
1
import collections
2
2
import datetime
3
+ import dateutil
3
4
import github
4
5
import logging
5
6
import os
53
54
# by trial and error).
54
55
GRAFANA_METRIC_MAX_AGE_MN = 120
55
56
57
# BuildKite jobs whose metrics are collected. Each key is the job name as
# reported by BuildKite; each value is the metric name sent to Grafana.
# Keeping the Grafana name fixed avoids losing the metric history if the
# BuildKite job is ever renamed.
BUILDKITE_WORKFLOW_TO_TRACK = {
    ":linux: Linux x64": "buildkite_linux",
    ":windows: Windows x64": "buildkite_windows",
}

# Page size used when listing builds through the GraphQL API. The scraper
# runs regularly, so a small page is sufficient.
BUILDKITE_GRAPHQL_BUILDS_PER_PAGE = 50
68
+
56
69
@dataclass
57
70
class JobMetrics :
58
71
job_name : str
@@ -70,6 +83,170 @@ class GaugeMetric:
70
83
time_ns : int
71
84
72
85
86
def buildkite_fetch_page_build_list(
    buildkite_token: str, after_cursor: str = None
) -> list[dict[str, str]]:
    """Fetches a page of the build list using the GraphQL BuildKite API.

    Returns the BUILDKITE_GRAPHQL_BUILDS_PER_PAGE last running/queued builds,
    or the BUILDKITE_GRAPHQL_BUILDS_PER_PAGE running/queued builds older than
    the one pointed to by |after_cursor| if provided.
    The |after_cursor| value is taken from the previous page returned by the
    API.

    Args:
      buildkite_token: the secret token to authenticate GraphQL requests.
      after_cursor: cursor after which to start the page fetch. None fetches
        the first page.

    Returns:
      The most recent builds after cursor (if set) with the following format:
      [
        {
            "cursor": <value>,
            "number": <build-number>,
        }
      ]
      An empty list is returned when the API reports errors or when the
      pipeline search matches nothing.

    Raises:
      requests.exceptions.RequestException: if the HTTP request fails or
        does not complete within the timeout.
    """

    # Upper bound on a single API round-trip, so that a stalled connection
    # cannot block the scraper forever.
    REQUEST_TIMEOUT_SECONDS = 60

    # Doubled braces are literal braces; PAGE_SIZE and AFTER are substituted
    # by str.format() below.
    BUILDKITE_GRAPHQL_QUERY = """
    query OrganizationShowQuery {{
      organization(slug: "llvm-project") {{
        pipelines(search: "Github pull requests", first: 1) {{
          edges {{
            node {{
              builds (state: [RUNNING, SCHEDULED, CREATING], first: {PAGE_SIZE}, after: {AFTER}) {{
                edges {{
                  cursor
                  node {{
                    number
                  }}
                }}
              }}
            }}
          }}
        }}
      }}
    }}
    """
    query = BUILDKITE_GRAPHQL_QUERY.format(
        PAGE_SIZE=BUILDKITE_GRAPHQL_BUILDS_PER_PAGE,
        AFTER="null" if after_cursor is None else '"{}"'.format(after_cursor),
    )

    url = "https://graphql.buildkite.com/v1"
    headers = {
        "Authorization": "Bearer " + buildkite_token,
        "Content-Type": "application/json",
    }
    # Let requests serialise the payload: this escapes quotes, newlines and
    # backslashes correctly instead of relying on manual string replacement.
    response = requests.post(
        url,
        json={"query": query},
        headers=headers,
        timeout=REQUEST_TIMEOUT_SECONDS,
    )
    data = response.json()

    if "errors" in data:
        logging.error("Failed to fetch BuildKite jobs: {}".format(data["errors"]))
        return []

    # De-nest the build list. The pipeline search may match nothing, in which
    # case there is no first edge to read.
    pipelines = data["data"]["organization"]["pipelines"]["edges"]
    if not pipelines:
        logging.error("Failed to fetch BuildKite jobs: no matching pipeline.")
        return []
    builds = pipelines[0]["node"]["builds"]["edges"]

    # Fold cursor info into the node dictionary.
    return [{**edge["node"], "cursor": edge["cursor"]} for edge in builds]
150
+
151
+
152
def buildkite_get_build_info(build_number: str) -> dict:
    """Returns all the info associated with the provided build number.

    Note: for unknown reasons, graphql returns no jobs for a given build,
    while this endpoint does, hence why this uses this API instead of graphql.

    Args:
      build_number: which build number to fetch info for.

    Returns:
      The info for the target build, a JSON dictionary.

    Raises:
      requests.exceptions.RequestException: if the HTTP request fails or
        does not complete within the timeout.
    """

    # Upper bound on the request so that a stalled connection cannot block
    # the scraper forever.
    REQUEST_TIMEOUT_SECONDS = 60

    URL = "https://buildkite.com/llvm-project/github-pull-requests/builds/{}.json"
    response = requests.get(
        URL.format(build_number), timeout=REQUEST_TIMEOUT_SECONDS
    )
    return response.json()
164
+
165
+
166
def buildkite_get_incomplete_tasks(buildkite_token: str) -> list:
    """Collects every running/pending BuildKite build, across all pages.

    Pages are requested one after the other, each one starting after the
    cursor of the last build of the previous page, until an empty page is
    returned.

    Args:
      buildkite_token: the secret token to authenticate GraphQL requests.

    Returns:
      The concatenation of all fetched pages, in the order they were
      returned by buildkite_fetch_page_build_list.
    """
    builds = []
    next_cursor = None
    # An empty page marks the end of the listing.
    while page := buildkite_fetch_page_build_list(buildkite_token, next_cursor):
        builds.extend(page)
        next_cursor = page[-1]["cursor"]
    return builds
181
+
182
+
183
def buildkite_get_metrics(
    buildkite_token: str, previously_incomplete: set[int]
) -> tuple[list[JobMetrics], set[int]]:
    """Returns a tuple with:
    - the metrics recorded for newly completed workflow jobs.
    - the set of workflow still running now.

    A build is considered newly completed when it was in
    |previously_incomplete| but is no longer listed as running/queued.
    Builds that left the listing but have no finish time yet are kept in
    the returned set so that they are checked again on the next call.

    Args:
      buildkite_token: the secret token to authenticate GraphQL requests.
      previously_incomplete: the set of running workflows the last time this
        function was called.
    """

    # `import dateutil` alone does not reliably load the `parser` submodule;
    # import it explicitly so that dateutil.parser.isoparse is available.
    import dateutil.parser

    running_builds = buildkite_get_incomplete_tasks(buildkite_token)
    incomplete_now = {x["number"] for x in running_builds}
    output = []

    for build_id in previously_incomplete:
        if build_id in incomplete_now:
            continue

        info = buildkite_get_build_info(build_id)

        # The listing only covers some build states, and may be truncated if
        # a page fetch failed: a build missing from it is not necessarily
        # finished. Keep tracking it instead of failing on a null timestamp.
        if not info.get("finished_at"):
            logging.info(
                f"Build {build_id} has no finish time yet, will check again."
            )
            incomplete_now.add(build_id)
            continue

        # These values only depend on the build, compute them once.
        metric_timestamp = dateutil.parser.isoparse(info["finished_at"])
        metric_timestamp_ns = int(metric_timestamp.timestamp()) * 10**9
        metric_age_mn = (
            datetime.datetime.now(datetime.timezone.utc) - metric_timestamp
        ).total_seconds() / 60

        for job in info["jobs"]:
            # Skip this job.
            if job.get("name") not in BUILDKITE_WORKFLOW_TO_TRACK:
                continue

            # Grafana will refuse to ingest metrics older than ~2 hours, so we
            # should avoid sending historical data.
            if metric_age_mn > GRAFANA_METRIC_MAX_AGE_MN:
                logging.info(
                    f"Job {job['name']} from workflow {build_id} dropped due"
                    + f" to staleness: {metric_age_mn}mn old."
                )
                continue

            # A job lacking one of these timestamps has no meaningful queue
            # or run time to report (presumably it never ran).
            if not (
                job.get("scheduled_at")
                and job.get("started_at")
                and job.get("finished_at")
            ):
                logging.info(
                    f"Job {job['name']} from workflow {build_id} skipped:"
                    + " missing timestamps."
                )
                continue

            scheduled_at = dateutil.parser.isoparse(job["scheduled_at"])
            started_at = dateutil.parser.isoparse(job["started_at"])
            finished_at = dateutil.parser.isoparse(job["finished_at"])

            job_name = BUILDKITE_WORKFLOW_TO_TRACK[job["name"]]
            # total_seconds() accounts for the days component, which the
            # `seconds` attribute of a timedelta silently drops.
            queue_time = int((started_at - scheduled_at).total_seconds())
            run_time = int((finished_at - started_at).total_seconds())
            status = bool(job["passed"])

            workflow_id = build_id
            workflow_name = "Github pull requests"
            output.append(
                JobMetrics(
                    job_name,
                    queue_time,
                    run_time,
                    status,
                    metric_timestamp_ns,
                    workflow_id,
                    workflow_name,
                )
            )

    return output, incomplete_now
249
+
73
250
def github_get_metrics (
74
251
github_repo : github .Repository , last_workflows_seen_as_completed : set [int ]
75
252
) -> tuple [list [JobMetrics ], int ]:
@@ -276,23 +453,34 @@ def upload_metrics(workflow_metrics, metrics_userid, api_key):
276
453
def main ():
277
454
# Authenticate with Github
278
455
github_auth = Auth .Token (os .environ ["GITHUB_TOKEN" ])
456
+ buildkite_token = os .environ ["BUILDKITE_TOKEN" ]
279
457
grafana_api_key = os .environ ["GRAFANA_API_KEY" ]
280
458
grafana_metrics_userid = os .environ ["GRAFANA_METRICS_USERID" ]
281
459
282
460
# The last workflow this script processed.
283
461
# Because the Github queries are broken, we'll simply log a 'processed'
284
462
# bit for the last COUNT_TO_PROCESS workflows.
285
463
gh_last_workflows_seen_as_completed = set ()
464
+ # Stores the list of pending/running builds in BuildKite we need to check
465
+ # at the next iteration.
466
+ bk_incomplete = set ()
286
467
287
468
# Enter the main loop. Every five minutes we wake up and dump metrics for
288
469
# the relevant jobs.
289
470
while True :
290
471
github_object = Github (auth = github_auth )
291
472
github_repo = github_object .get_repo ("llvm/llvm-project" )
292
473
293
- metrics , gh_last_workflows_seen_as_completed = github_get_metrics (
474
+ gh_metrics , gh_last_workflows_seen_as_completed = github_get_metrics (
294
475
github_repo , gh_last_workflows_seen_as_completed
295
476
)
477
+ gh_metrics = []
478
+
479
+ bk_metrics , bk_incomplete = buildkite_get_metrics (
480
+ buildkite_token , bk_incomplete
481
+ )
482
+
483
+ metrics = gh_metrics + bk_metrics
296
484
upload_metrics (metrics , grafana_metrics_userid , grafana_api_key )
297
485
logging .info (f"Uploaded { len (metrics )} metrics" )
298
486
0 commit comments