Skip to content

Commit d502ba8

Browse files
committed
Don't reparse symbol pages (speedup up to 1.5x)
1 parent 09a4217 commit d502ba8

File tree

1 file changed

+30
-23
lines changed

1 file changed

+30
-23
lines changed

clang/tools/include-mapping/cppreference_parser.py

Lines changed: 30 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ def _HasClass(tag, *classes):
4040
return False
4141

4242

43-
def _ParseSymbolPage(symbol_page_html, symbol_name, qual_name):
43+
def _ParseSymbolPage(symbol_page_html, symbols):
4444
"""Parse symbol page and retrieve the include header defined in this page.
4545
The symbol page provides header for the symbol, specifically in
4646
"Defined in header <header>" section. An example:
@@ -51,8 +51,12 @@ def _ParseSymbolPage(symbol_page_html, symbol_name, qual_name):
5151
5252
Returns a list of (symbol_name, headers) pairs, one for each entry in symbols.
5353
"""
54-
headers = set()
54+
headers = collections.defaultdict(set)
5555
all_headers = set()
56+
symbol_names = {}
57+
for symbol_name, qualified_symbol_name in symbols:
58+
symbol_names[symbol_name] = symbol_name
59+
symbol_names[qualified_symbol_name] = symbol_name
5660

5761
soup = BeautifulSoup(symbol_page_html, "html.parser")
5862
# Rows in table are like:
@@ -69,9 +73,10 @@ def _ParseSymbolPage(symbol_page_html, symbol_name, qual_name):
6973
was_decl = True
7074
# Symbols are in the first cell.
7175
found_symbols = row.find("td").stripped_strings
72-
if not (symbol_name in found_symbols or qual_name in found_symbols):
73-
continue
74-
headers.update(current_headers)
76+
for found_symbol in found_symbols:
77+
symbol_name = symbol_names.get(found_symbol)
78+
if symbol_name:
79+
headers[symbol_name].update(current_headers)
7580
elif _HasClass(row, "t-dsc-header"):
7681
# If we saw a decl since the last header, this is a new block of headers
7782
# for a new block of decls.
@@ -86,26 +91,28 @@ def _ParseSymbolPage(symbol_page_html, symbol_name, qual_name):
8691
current_headers.append(header_code.text)
8792
all_headers.add(header_code.text)
8893
# If the symbol was never named, consider all named headers.
89-
return headers or all_headers
94+
return [
95+
(symbol_name, headers.get(symbol_name) or all_headers)
96+
for symbol_name, _ in symbols
97+
]
9098

9199

92100
def _ParseSymbolVariant(caption):
93101
if not (isinstance(caption, NavigableString) and "(" in caption):
94102
return None
95103

96-
if ')' in caption.text: # (locale), (algorithm), etc.
104+
if ")" in caption.text: # (locale), (algorithm), etc.
97105
return caption.text.strip(" ()")
98106

99107
second_part = caption.next_sibling
100108
if isinstance(second_part, Tag) and second_part.name == "code":
101109
# (<code>std::complex</code>), etc.
102110
third_part = second_part.next_sibling
103-
if isinstance(third_part, NavigableString) and third_part.text.startswith(')'):
111+
if isinstance(third_part, NavigableString) and third_part.text.startswith(")"):
104112
return second_part.text
105113
return None
106114

107115

108-
109116
def _ParseIndexPage(index_page_html):
110117
"""Parse index page.
111118
The index page lists all std symbols and hrefs to their detailed pages
@@ -137,9 +144,9 @@ def _ParseIndexPage(index_page_html):
137144
return symbols
138145

139146

140-
def _ReadSymbolPage(path, name, qual_name):
147+
def _ReadSymbolPage(path, symbols):
141148
with open(path) as f:
142-
return _ParseSymbolPage(f.read(), name, qual_name)
149+
return _ParseSymbolPage(f.read(), symbols)
143150

144151

145152
def _GetSymbols(pool, root_dir, index_page_name, namespace, variants_to_accept):
@@ -158,33 +165,33 @@ def _GetSymbols(pool, root_dir, index_page_name, namespace, variants_to_accept):
158165
with open(index_page_path, "r") as f:
159166
# Read each symbol page in parallel.
160167
results = [] # promises, one per page, of [(symbol_name, headers), ...]
168+
symbols_by_page = collections.defaultdict(list)
161169
for symbol_name, symbol_page_path, variant in _ParseIndexPage(f.read()):
162170
# Variant symbols (e.g. the std::locale version of isalpha) add ambiguity.
163171
# FIXME: use these as a fallback rather than ignoring entirely.
164172
qualified_symbol_name = (namespace or "") + symbol_name
165-
variants_for_symbol = variants_to_accept.get(
166-
qualified_symbol_name, ()
167-
)
173+
variants_for_symbol = variants_to_accept.get(qualified_symbol_name, ())
168174
if variant and variant not in variants_for_symbol:
169175
continue
170176
path = os.path.join(root_dir, symbol_page_path)
171-
if os.path.isfile(path):
172-
results.append(
173-
(
174-
symbol_name,
175-
pool.apply_async(_ReadSymbolPage, (path, symbol_name, qualified_symbol_name)),
176-
)
177-
)
177+
if path in symbols_by_page or os.path.isfile(path):
178+
symbols_by_page[path].append((symbol_name, qualified_symbol_name))
178179
else:
179180
sys.stderr.write(
180181
"Discarding information for symbol: %s. Page %s does not exist.\n"
181182
% (symbol_name, path)
182183
)
183184

185+
for path, symbols in symbols_by_page.items():
186+
results.append(
187+
pool.apply_async(_ReadSymbolPage, (path, symbols)),
188+
)
189+
184190
# Build map from symbol name to a set of headers.
185191
symbol_headers = collections.defaultdict(set)
186-
for symbol_name, lazy_headers in results:
187-
symbol_headers[symbol_name].update(lazy_headers.get())
192+
for lazy_mapping in results:
193+
for symbol_name, headers in lazy_mapping.get():
194+
symbol_headers[symbol_name].update(headers)
188195

189196
symbols = []
190197
for name, headers in sorted(symbol_headers.items(), key=lambda t: t[0]):

0 commit comments

Comments
 (0)