Skip to content

Commit 6647e02

Browse files
authored
New Heuristics-based minified file detection (#620)
# Motivation
<!-- Why is this change necessary? -->

# Content
<!-- Please include a summary of the change -->

# Testing
<!-- How was the change tested? -->

# Please check the following before marking your PR as ready for review
- [ ] I have added tests for my changes
- [ ] I have updated the documentation or added new documentation as needed
1 parent 5d94f61 commit 6647e02

File tree

5 files changed

+73
-5
lines changed

5 files changed

+73
-5
lines changed

src/codegen/sdk/codebase/codebase_context.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,8 +66,10 @@
6666
".*/tests/static/chunk-.*.js",
6767
".*/ace/.*.js",
6868
"src/vs/platform/contextview/browser/contextMenuService.ts",
69+
"*/semver.js",
6970
"*/compiled/*",
7071
"*.min.js",
72+
"*@*.js",
7173
]
7274

7375

src/codegen/sdk/core/file.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
from codegen.sdk.topological_sort import pseudo_topological_sort
3535
from codegen.sdk.tree_sitter_parser import get_parser_by_filepath_or_extension, parse_file
3636
from codegen.sdk.typescript.function import TSFunction
37+
from codegen.sdk.utils import is_minified_js
3738
from codegen.shared.decorators.docs import apidoc, noapidoc
3839
from codegen.visualizations.enums import VizNode
3940

@@ -45,8 +46,6 @@
4546

4647
logger = logging.getLogger(__name__)
4748

48-
MINIFIED_FILE_THRESHOLD = 500
49-
5049

5150
@apidoc
5251
class File(Editable[None]):
@@ -581,8 +580,8 @@ def from_content(cls, filepath: str | PathLike | Path, content: str, ctx: Codeba
581580
path = ctx.to_absolute(filepath)
582581

583582
# Sanity check to ensure file is not a minified file
584-
if any(len(line) >= MINIFIED_FILE_THRESHOLD for line in content.split("\n")):
585-
logger.info(f"File {filepath} is a minified file (Line length < {MINIFIED_FILE_THRESHOLD}). Skipping...", extra={"filepath": filepath})
583+
if is_minified_js(content):
584+
logger.info(f"File {filepath} is a minified file. Skipping...", extra={"filepath": filepath})
586585
return None
587586

588587
ts_node = parse_file(path, content)

src/codegen/sdk/utils.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import os
22
import re
33
import shutil
4+
import statistics
45
from collections.abc import Iterable
56
from contextlib import contextmanager
67
from xml.dom.minidom import parseString
@@ -245,3 +246,59 @@ def truncate_line(input: str, max_chars: int) -> str:
245246
if len(input) > max_chars:
246247
return input[:max_chars] + f"...(truncated from {len(input)} characters)."
247248
return input
249+
250+
251+
def is_minified_js(content: str) -> bool:
    """Heuristically decide whether ``content`` looks like minified JavaScript.

    Five signals are computed over the whole text and combined:

    1. average length of the non-blank lines,
    2. ratio of semicolons to newlines,
    3. share of whitespace characters,
    4. presence of condensed ``){a:b}``-style brace groups,
    5. average length of identifiers declared with ``var``.

    Args:
        content: JavaScript source text to analyze.

    Returns:
        True if the content appears to be minified JavaScript, False otherwise.
        Empty/whitespace-only text and non-string input (e.g. ``None`` or
        ``bytes``) are reported as not minified.
    """
    # Function-scope import keeps this block self-contained.
    import logging

    # Wrong-type input used to be masked by a blanket ``except Exception`` that
    # printed to stdout. Keep the best-effort ``False`` result, but make the
    # condition explicit and report it through logging instead.
    if not isinstance(content, str):
        logging.getLogger(__name__).warning(
            "Cannot analyze content of type %s for minification; treating it as not minified.",
            type(content).__name__,
        )
        return False

    # Nothing to analyze in empty or whitespace-only text.
    if not content.strip():
        return False

    lines = content.split("\n")

    # 1. Average line length (minified files have very long lines).
    # At least one line is non-blank here because content.strip() is non-empty,
    # so the list passed to mean() is never empty.
    line_lengths = [len(line) for line in lines if line.strip()]
    avg_line_length = statistics.mean(line_lengths)

    # 2. Semicolon-to-newline ratio (minified code uses ';' instead of newlines).
    semicolons = content.count(";")
    newlines = len(lines) - 1
    semicolon_ratio = semicolons / max(newlines, 1)  # Single-line input: divide by 1

    # 3. Whitespace ratio (minified code has little whitespace).
    # len(content) is non-zero because the text is not empty.
    whitespace_chars = len(re.findall(r"\s", content))
    whitespace_ratio = whitespace_chars / len(content)

    # 4. Common minification pattern: condensed object/brace notation.
    has_common_patterns = bool(re.search(r"[\w\)]\{[\w:]+\}", content))

    # 5. Short variable names (common in minified code).
    variable_names = re.findall(r"var\s+(\w+)", content)
    avg_var_length = statistics.mean([len(name) for name in variable_names]) if variable_names else 0

    # Decision logic - tuned threshold values; all four conditions must hold.
    return bool(
        avg_line_length > 250  # Very long average line length
        and (semicolon_ratio > 0.8 or has_common_patterns)  # High semicolon ratio or minification patterns
        and whitespace_ratio < 0.08  # Very low whitespace ratio
        and (avg_var_length < 3 or not variable_names)  # Extremely short variable names or no vars
    )

tests/unit/codegen/sdk/codebase/file/example.min.js

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tests/unit/codegen/sdk/codebase/file/test_file.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import os
12
import sys
23

34
import pytest
@@ -214,7 +215,14 @@ def test_files_in_subdirectories_case_sensitivity(tmpdir) -> None:
214215

215216

216217
def test_minified_file(tmpdir) -> None:
217-
with get_codebase_session(tmpdir=tmpdir, files={"file1.min.js": "console.log(123)", "file2.js": f"console.log(1{'0' * 1000})"}) as codebase:
218+
with get_codebase_session(
219+
tmpdir=tmpdir,
220+
files={
221+
"file1.min.js": "console.log(123)",
222+
"file2.js": open(f"{os.path.dirname(__file__)}/example.min.js").read(),
223+
},
224+
programming_language=ProgrammingLanguage.TYPESCRIPT,
225+
) as codebase:
218226
# This should match the `*.min.js` pattern
219227
file1 = codebase.ctx.get_file("file1.min.js")
220228
assert file1 is None

0 commit comments

Comments
 (0)