|
import argparse
import re
from pathlib import Path

from bs4 import BeautifulSoup, Comment, NavigableString, Tag
| 6 | + |
| 7 | +# ---------- Argument Parsing ---------- |
| 8 | +DATA_URI_RE = re.compile(r"^\s*data:", re.I) |
| 9 | + |
| 10 | + |
| 11 | +# ---------- Helper functions ---------- |
def redact_binary_attrs(tag: Tag) -> None:
    """Blank out inline data-URI payloads carried by this element.

    Mutates *tag* in place: an ``<img src>`` or ``<source srcset>`` whose
    value is a ``data:`` URI is replaced with the placeholder string
    ``"redacted"`` so large base64 blobs never reach the output.
    """
    # (element name, attribute) pairs that may carry a data: URI payload.
    for element, attr in (("img", "src"), ("source", "srcset")):
        if tag.name == element and tag.has_attr(attr) and DATA_URI_RE.match(tag[attr]):
            tag[attr] = "redacted"
| 22 | + |
| 23 | + |
def is_leaf(tag: Tag) -> bool:
    """Return True when *tag* has no child *elements* (text children are fine)."""
    for child in tag.contents:
        if isinstance(child, Tag):
            return False
    return True
| 27 | + |
| 28 | + |
def clean_node(tag: Tag) -> None:
    """Recursively strip styling and binary payloads from *tag*, in place.

    Text nodes are left untouched. For elements this removes the inline
    ``style`` attribute, redacts data-URI embeds, hollows out ``<svg>``
    subtrees (keeping the wrapper tag), then recurses into the children.
    """
    if isinstance(tag, NavigableString):
        return

    # 1. strip inline style attributes
    tag.attrs.pop("style", None)

    # 2. redact binary embeds (data: URIs in src / srcset)
    redact_binary_attrs(tag)

    # 3. special-case SVG: drop the (often huge) subtree but keep the
    #    <svg> wrapper so the document structure stays visible.
    if tag.name == "svg":
        tag.clear()
        # Must be a real Comment node: appending the raw string
        # "<!-- redacted -->" would be escaped to "&lt;!-- redacted --&gt;"
        # when the tree is serialized with prettify(formatter="minimal").
        tag.append(Comment(" redacted "))
        return  # nothing left inside to recurse into

    # 4. recurse over a snapshot, since children may be mutated during the walk
    for child in list(tag.children):
        clean_node(child)
| 48 | + |
| 49 | + |
def prune_top_level(soup: BeautifulSoup) -> None:
    """Remove high-volume, low-value nodes from the tree entirely."""
    # Collect everything to drop first, then decompose in one pass.
    doomed = soup.find_all(["script", "style", "noscript"])
    doomed += soup.find_all("link", rel=lambda rel: bool(rel) and "stylesheet" in rel)
    for node in doomed:
        node.decompose()
| 56 | + |
| 57 | + |
def clean_html(raw_html: str) -> str:
    """Return a cleaned, prettified version of *raw_html*.

    Parses with lxml, removes scripts/styles/stylesheet links wholesale,
    then walks the tree stripping styling and binary payloads before
    re-serializing the result.
    """
    tree = BeautifulSoup(raw_html, "lxml")
    prune_top_level(tree)
    clean_node(tree)
    # Pretty-printed output stays readable without reintroducing bulk.
    return tree.prettify(formatter="minimal")
| 64 | + |
| 65 | + |
| 66 | +# ---------- Execute cleaning ---------- |
def main():
    """CLI entry point: read an HTML file, clean it, and write the result."""
    parser = argparse.ArgumentParser(
        description="Clean and reduce HTML files by removing unnecessary content."
    )
    parser.add_argument("input_path", help="Path to the input HTML file")
    parser.add_argument("output_path", help="Path where the cleaned HTML will be saved")
    args = parser.parse_args()

    # errors="ignore" keeps going past undecodable bytes in messy real-world pages
    raw_html = Path(args.input_path).read_text(encoding="utf-8", errors="ignore")

    cleaned_html = clean_html(raw_html)

    # Save to disk
    Path(args.output_path).write_text(cleaned_html, encoding="utf-8")

    # Report the size reduction to the user
    print(f"Saved cleaned HTML to {args.output_path}")
    print(f"Original size: {len(raw_html):,} characters")
    print(f"Cleaned size : {len(cleaned_html):,} characters")
| 87 | + |
| 88 | + |
# Run the CLI only when executed as a script, not on import.
if __name__ == "__main__":
    main()