Skip to content

Commit 8934539

Browse files
committed
Add script to reduce example html
1 parent d87079b commit 8934539

File tree

3 files changed

+545
-387
lines changed

3 files changed

+545
-387
lines changed
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
import argparse
import re
from pathlib import Path

from bs4 import BeautifulSoup, Comment, NavigableString, Tag
# ---------- Constants ----------
# Matches attribute values that are inline data: URIs (e.g. base64 embeds).
DATA_URI_RE = re.compile(r"^\s*data:", re.I)


# ---------- Helper functions ----------
def redact_binary_attrs(tag: Tag) -> None:
    """Replace base64-encoded ``src`` / ``srcset`` values with a placeholder.

    Only ``<img src>`` and ``<source srcset>`` are inspected; the attribute is
    rewritten in place when its value looks like a ``data:`` URI.
    """
    # Map of tag name -> the attribute that may carry an inline payload.
    payload_attrs = {"img": "src", "source": "srcset"}
    attr = payload_attrs.get(tag.name)
    if attr is None:
        return
    if tag.has_attr(attr) and DATA_URI_RE.match(tag[attr]):
        tag[attr] = "redacted"
def is_leaf(tag: Tag) -> bool:
    """Return True when *tag* has no child *elements* (text children are fine).

    NOTE(review): not referenced anywhere else in this script as far as I can
    see — possibly kept for future use; confirm before deleting.
    """
    for child in tag.contents:
        if isinstance(child, Tag):
            return False
    return True
def clean_node(tag: Tag) -> None:
    """Recursively clean a BeautifulSoup element in-place.

    Strips inline ``style`` attributes, redacts binary ``data:`` payloads,
    collapses ``<svg>`` bodies to a comment, then recurses into children.
    """
    # Text nodes carry no attributes or children — nothing to clean.
    if isinstance(tag, NavigableString):
        return

    # 1. strip style attributes
    tag.attrs.pop("style", None)

    # 2. redact binary embeds (<img src="data:...">, <source srcset="data:...">)
    redact_binary_attrs(tag)

    # 3. special-case SVG: drop the (often huge) path data but keep the
    #    wrapper element so the document structure stays visible.
    if tag.name == "svg":
        tag.clear()
        # Must be a real Comment node: appending the raw string
        # "<!-- redacted -->" would be HTML-escaped on output
        # (rendered as "&lt;!-- redacted --&gt;"), not as a comment.
        tag.append(Comment(" redacted "))

    # 4. recurse — iterate a snapshot, since cleaning can mutate the tree
    for child in list(tag.children):
        clean_node(child)
def prune_top_level(soup: BeautifulSoup) -> None:
    """Remove high-volume, low-value subtrees from the whole document.

    Deletes every <script>, <style> and <noscript> element, plus any <link>
    whose ``rel`` marks it as a stylesheet reference.
    """
    for victim in soup.find_all(["script", "style", "noscript"]):
        victim.decompose()

    def _is_stylesheet(rel) -> bool:
        # rel may be absent (None/empty) or a list of rel tokens.
        return bool(rel) and "stylesheet" in rel

    for victim in soup.find_all("link", rel=_is_stylesheet):
        victim.decompose()
def clean_html(raw_html: str) -> str:
    """Run the full cleaning pipeline over *raw_html* and return the result."""
    document = BeautifulSoup(raw_html, "lxml")
    # First drop whole low-value subtrees, then scrub the remaining nodes.
    prune_top_level(document)
    clean_node(document)
    # Pretty formatting keeps the tree readable yet compact
    return document.prettify(formatter="minimal")
# ---------- Execute cleaning ----------
def main():
    """CLI entry point: read an HTML file, clean it, report and save the result.

    Usage: prog INPUT_PATH OUTPUT_PATH
    """
    parser = argparse.ArgumentParser(
        description="Clean and reduce HTML files by removing unnecessary content."
    )
    parser.add_argument("input_path", help="Path to the input HTML file")
    parser.add_argument("output_path", help="Path where the cleaned HTML will be saved")
    args = parser.parse_args()

    # errors="ignore": scraped pages often contain broken byte sequences;
    # dropping them beats aborting the whole run.
    raw_html = Path(args.input_path).read_text(encoding="utf-8", errors="ignore")

    # (removed stale comment about MAX_LEAF_CHARS — no such constant exists)
    cleaned_html = clean_html(raw_html)

    # Save to disk
    Path(args.output_path).write_text(cleaned_html, encoding="utf-8")

    # Show the size reduction to the user
    print(f"Saved cleaned HTML to {args.output_path}")
    print(f"Original size: {len(raw_html):,} characters")
    print(f"Cleaned size : {len(cleaned_html):,} characters")


if __name__ == "__main__":
    main()

python/pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ convert_jsondoc = "jsondoc.bin.convert_jsondoc:main"
dev = [
    "datamodel-code-generator>=0.25.9,<0.26",
    "ipdb>=0.13.13",
    "lxml>=5.4.0",          # added by this commit (parser backend for BeautifulSoup)
    "pre-commit>=4.1.0",
    "pytest>=8.3.5",
    "python-dotenv>=1.0.1",

0 commit comments

Comments
 (0)