45
45
import re
46
46
import subprocess
47
47
import sys
48
- from typing import Generator , List , NamedTuple , Iterable
48
+ from typing import Dict , Generator , Iterable , List , NamedTuple , Optional , Tuple
49
49
50
50
assert sys .version_info >= (3 , 6 ), "Only Python 3.6+ is supported."
51
51
52
52
# People are creative with their reverts, and heuristics are a bit difficult.
53
- # Like 90% of of reverts have "This reverts commit ${full_sha}".
54
- # Some lack that entirely, while others have many of them specified in ad-hoc
55
- # ways, while others use short SHAs and whatever.
53
+ # At a glance, most reverts have "This reverts commit ${full_sha}". Many others
54
+ # have `Reverts llvm/llvm-project#${PR_NUMBER}`.
56
55
#
57
- # The 90% case is trivial to handle (and 100% free + automatic). The extra 10%
58
- # starts involving human intervention, which is probably not worth it for now.
56
+ # By their powers combined, we should be able to automatically catch something
57
+ # like 80% of reverts with reasonable confidence. At some point, human
58
+ # intervention will always be required (e.g., I saw
59
+ # ```
60
+ # This reverts commit ${commit_sha_1} and
61
+ # also ${commit_sha_2_shorthand}
62
+ # ```
63
+ # during my sample)
64
+
65
+ _CommitMessageReverts = NamedTuple (
66
+ "_CommitMessageReverts" ,
67
+ [
68
+ ("potential_shas" , List [str ]),
69
+ ("potential_pr_numbers" , List [int ]),
70
+ ],
71
+ )
72
+
59
73
74
+ def _try_parse_reverts_from_commit_message (
75
+ commit_message : str ,
76
+ ) -> _CommitMessageReverts :
77
+ """Tries to parse revert SHAs and LLVM PR numbers form the commit message.
60
78
61
- def _try_parse_reverts_from_commit_message (commit_message : str ) -> List [str ]:
79
+ Returns:
80
+ A namedtuple containing:
81
+ - A list of potentially reverted SHAs
82
+ - A list of potentially reverted LLVM PR numbers
83
+ """
62
84
if not commit_message :
63
- return []
85
+ return _CommitMessageReverts ([], [])
64
86
65
- results = re .findall (r"This reverts commit ([a-f0-9]{40})\b" , commit_message )
87
+ sha_reverts = re .findall (
88
+ r"This reverts commit ([a-f0-9]{40})\b" ,
89
+ commit_message ,
90
+ )
66
91
67
92
first_line = commit_message .splitlines ()[0 ]
68
93
initial_revert = re .match (r'Revert ([a-f0-9]{6,}) "' , first_line )
69
94
if initial_revert :
70
- results .append (initial_revert .group (1 ))
71
- return results
95
+ sha_reverts .append (initial_revert .group (1 ))
72
96
97
+ pr_numbers = [
98
+ int (x )
99
+ for x in re .findall (
100
+ r"Reverts llvm/llvm-project#(\d+)" ,
101
+ commit_message ,
102
+ )
103
+ ]
104
+
105
+ return _CommitMessageReverts (
106
+ potential_shas = sha_reverts ,
107
+ potential_pr_numbers = pr_numbers ,
108
+ )
73
109
74
- def _stream_stdout (command : List [str ]) -> Generator [str , None , None ]:
110
+
111
+ def _stream_stdout (
112
+ command : List [str ], cwd : Optional [str ] = None
113
+ ) -> Generator [str , None , None ]:
75
114
with subprocess .Popen (
76
- command , stdout = subprocess .PIPE , encoding = "utf-8" , errors = "replace"
115
+ command ,
116
+ cwd = cwd ,
117
+ stdout = subprocess .PIPE ,
118
+ encoding = "utf-8" ,
119
+ errors = "replace" ,
77
120
) as p :
78
121
assert p .stdout is not None # for mypy's happiness.
79
122
yield from p .stdout
@@ -175,10 +218,43 @@ def _find_common_parent_commit(git_dir: str, ref_a: str, ref_b: str) -> str:
175
218
).strip ()
176
219
177
220
178
- def find_reverts (git_dir : str , across_ref : str , root : str ) -> List [Revert ]:
221
+ def _load_pr_commit_mappings (
222
+ git_dir : str , root : str , min_ref : str
223
+ ) -> Dict [int , List [str ]]:
224
+ git_log = ["git" , "log" , "--format=%H %s" , f"{ min_ref } ..{ root } " ]
225
+ results = collections .defaultdict (list )
226
+ pr_regex = re .compile (r"\s\(#(\d+)\)$" )
227
+ for line in _stream_stdout (git_log , cwd = git_dir ):
228
+ m = pr_regex .search (line )
229
+ if not m :
230
+ continue
231
+
232
+ pr_number = int (m .group (1 ))
233
+ sha = line .split (None , 1 )[0 ]
234
+ # N.B., these are kept in log (read: reverse chronological) order,
235
+ # which is what's expected by `find_reverts`.
236
+ results [pr_number ].append (sha )
237
+ return results
238
+
239
+
240
+ # N.B., max_pr_lookback's default of 20K commits is arbitrary, but should be
241
+ # enough for the 99% case of reverts: rarely should someone land a cleanish
242
+ # revert of a >6 month old change...
243
+ def find_reverts (
244
+ git_dir : str , across_ref : str , root : str , max_pr_lookback : int = 20000
245
+ ) -> List [Revert ]:
179
246
"""Finds reverts across `across_ref` in `git_dir`, starting from `root`.
180
247
181
248
These reverts are returned in order of oldest reverts first.
249
+
250
+ Args:
251
+ git_dir: git directory to find reverts in.
252
+ across_ref: the ref to find reverts across.
253
+ root: the 'main' ref to look for reverts on.
254
+ max_pr_lookback: this function uses heuristics to map PR numbers to
255
+ SHAs. These heuristics require that commit history from `root` to
256
+ `some_parent_of_root` is loaded in memory. `max_pr_lookback` is how
257
+ many commits behind `across_ref` should be loaded in memory.
182
258
"""
183
259
across_sha = _rev_parse (git_dir , across_ref )
184
260
root_sha = _rev_parse (git_dir , root )
@@ -201,8 +277,41 @@ def find_reverts(git_dir: str, across_ref: str, root: str) -> List[Revert]:
201
277
)
202
278
203
279
all_reverts = []
280
+ # Lazily load PR <-> commit mappings, since it can be expensive.
281
+ pr_commit_mappings = None
204
282
for sha , commit_message in _log_stream (git_dir , root_sha , across_sha ):
205
- reverts = _try_parse_reverts_from_commit_message (commit_message )
283
+ reverts , pr_reverts = _try_parse_reverts_from_commit_message (
284
+ commit_message ,
285
+ )
286
+ if pr_reverts :
287
+ if pr_commit_mappings is None :
288
+ logging .info (
289
+ "Loading PR <-> commit mappings. This may take a moment..."
290
+ )
291
+ pr_commit_mappings = _load_pr_commit_mappings (
292
+ git_dir , root_sha , f"{ across_sha } ~{ max_pr_lookback } "
293
+ )
294
+ logging .info (
295
+ "Loaded %d PR <-> commit mappings" , len (pr_commit_mappings )
296
+ )
297
+
298
+ for reverted_pr_number in pr_reverts :
299
+ reverted_shas = pr_commit_mappings .get (reverted_pr_number )
300
+ if not reverted_shas :
301
+ logging .warning (
302
+ "No SHAs for reverted PR %d (commit %s)" ,
303
+ reverted_pr_number ,
304
+ sha ,
305
+ )
306
+ continue
307
+ logging .debug (
308
+ "Inferred SHAs %s for reverted PR %d (commit %s)" ,
309
+ reverted_shas ,
310
+ reverted_pr_number ,
311
+ sha ,
312
+ )
313
+ reverts .extend (reverted_shas )
314
+
206
315
if not reverts :
207
316
continue
208
317
0 commit comments