Skip to content

New Heuristics-based minified file detection #620

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 15 commits into from
Feb 24, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/codegen/sdk/codebase/codebase_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,10 @@
".*/tests/static/chunk-.*.js",
".*/ace/.*.js",
"src/vs/platform/contextview/browser/contextMenuService.ts",
"*/semver.js",
"*/compiled/*",
"*.min.js",
"*@*.js",
]


Expand Down Expand Up @@ -213,7 +215,7 @@
syncs[SyncType.ADD].append(self.to_absolute(filepath))
logger.info(f"> Parsing {len(syncs[SyncType.ADD])} files in {self.projects[0].subdirectories or 'ALL'} subdirectories with {self.extensions} extensions")
self._process_diff_files(syncs, incremental=False)
files: list[SourceFile] = self.get_nodes(NodeType.FILE)

Check failure on line 218 in src/codegen/sdk/codebase/codebase_context.py

View workflow job for this annotation

GitHub Actions / mypy

error: Incompatible types in assignment (expression has type "list[Importable[Any]]", variable has type "list[SourceFile[Any, Any, Any, Any, Any, Any]]") [assignment]
logger.info(f"> Found {len(files)} files")
logger.info(f"> Found {len(self.nodes)} nodes and {len(self.edges)} edges")
if self.config.track_graph:
Expand Down Expand Up @@ -243,8 +245,8 @@
elif diff.change_type == ChangeType.Modified:
files_to_sync[filepath] = SyncType.REPARSE
elif diff.change_type == ChangeType.Renamed:
files_to_sync[diff.rename_from] = SyncType.DELETE

Check failure on line 248 in src/codegen/sdk/codebase/codebase_context.py

View workflow job for this annotation

GitHub Actions / mypy

error: Invalid index type "Path | None" for "dict[Path, SyncType]"; expected type "Path" [index]
files_to_sync[diff.rename_to] = SyncType.ADD

Check failure on line 249 in src/codegen/sdk/codebase/codebase_context.py

View workflow job for this annotation

GitHub Actions / mypy

error: Invalid index type "Path | None" for "dict[Path, SyncType]"; expected type "Path" [index]
elif diff.change_type == ChangeType.Removed:
files_to_sync[filepath] = SyncType.DELETE
else:
Expand Down Expand Up @@ -281,10 +283,10 @@
files_to_write.append((sync.path, sync.old_content))
modified_files.add(sync.path)
elif sync.change_type == ChangeType.Renamed:
files_to_write.append((sync.rename_from, sync.old_content))

Check failure on line 286 in src/codegen/sdk/codebase/codebase_context.py

View workflow job for this annotation

GitHub Actions / mypy

error: Argument 1 to "append" of "list" has incompatible type "tuple[Path | None, bytes | None]"; expected "tuple[Path, bytes | None]" [arg-type]
files_to_remove.append(sync.rename_to)
modified_files.add(sync.rename_from)

Check failure on line 288 in src/codegen/sdk/codebase/codebase_context.py

View workflow job for this annotation

GitHub Actions / mypy

error: Argument 1 to "add" of "set" has incompatible type "Path | None"; expected "Path" [arg-type]
modified_files.add(sync.rename_to)

Check failure on line 289 in src/codegen/sdk/codebase/codebase_context.py

View workflow job for this annotation

GitHub Actions / mypy

error: Argument 1 to "add" of "set" has incompatible type "Path | None"; expected "Path" [arg-type]
elif sync.change_type == ChangeType.Added:
files_to_remove.append(sync.path)
modified_files.add(sync.path)
Expand Down
7 changes: 3 additions & 4 deletions src/codegen/sdk/core/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
from codegen.sdk.topological_sort import pseudo_topological_sort
from codegen.sdk.tree_sitter_parser import get_parser_by_filepath_or_extension, parse_file
from codegen.sdk.typescript.function import TSFunction
from codegen.sdk.utils import is_minified_js
from codegen.shared.decorators.docs import apidoc, noapidoc
from codegen.visualizations.enums import VizNode

Expand All @@ -45,8 +46,6 @@

logger = logging.getLogger(__name__)

MINIFIED_FILE_THRESHOLD = 500


@apidoc
class File(Editable[None]):
Expand Down Expand Up @@ -581,8 +580,8 @@ def from_content(cls, filepath: str | PathLike | Path, content: str, ctx: Codeba
path = ctx.to_absolute(filepath)

# Sanity check to ensure file is not a minified file
if any(len(line) >= MINIFIED_FILE_THRESHOLD for line in content.split("\n")):
logger.info(f"File {filepath} is a minified file (Line length < {MINIFIED_FILE_THRESHOLD}). Skipping...", extra={"filepath": filepath})
if is_minified_js(content):
logger.info(f"File {filepath} is a minified file. Skipping...", extra={"filepath": filepath})
return None

ts_node = parse_file(path, content)
Expand Down
57 changes: 57 additions & 0 deletions src/codegen/sdk/utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os
import re
import shutil
import statistics
from collections.abc import Iterable
from contextlib import contextmanager
from xml.dom.minidom import parseString
Expand Down Expand Up @@ -83,7 +84,7 @@

def find_first_function_descendant(node: TSNode) -> TSNode | None:
    """Return the first function-typed descendant of ``node``, if there is one.

    Delegates to ``find_first_descendant`` using the values of
    ``TSFunctionTypeNames`` as the accepted node types and ``max_depth=2``.

    Args:
        node: The tree-sitter node whose descendants are searched.

    Returns:
        Whatever ``find_first_descendant`` returns. The type checker reports
        that result as ``Node | None``, so callers must handle ``None``.
    """
    type_names = [function_type.value for function_type in TSFunctionTypeNames]
    return find_first_descendant(node=node, type_names=type_names, max_depth=2)

Check failure on line 87 in src/codegen/sdk/utils.py

View workflow job for this annotation

GitHub Actions / mypy

error: Incompatible return value type (got "Node | None", expected "Node") [return-value]


def find_index(target: TSNode, siblings: list[TSNode]) -> int:
Expand All @@ -103,7 +104,7 @@
while node is not None and (max_depth is None or depth <= max_depth):
if node.type in type_names:
return node
node = node.parent

Check failure on line 107 in src/codegen/sdk/utils.py

View workflow job for this annotation

GitHub Actions / mypy

error: Incompatible types in assignment (expression has type "Node | None", variable has type "Node") [assignment]
depth += 1
return None

Expand Down Expand Up @@ -149,9 +150,9 @@

# Want to prevent it from matching with part of the match within a comment
else:
if not ts_match.children:

Check failure on line 153 in src/codegen/sdk/utils.py

View workflow job for this annotation

GitHub Actions / mypy

error: Item "None" of "Node | None" has no attribute "children" [union-attr]
return ts_match
comments = find_all_descendants(ts_match, "comment")

Check failure on line 155 in src/codegen/sdk/utils.py

View workflow job for this annotation

GitHub Actions / mypy

error: Argument 1 to "find_all_descendants" has incompatible type "Node | None"; expected "Node" [arg-type]
# see if any of these comments partially overlaps with the match
if any(comment.start_byte < start_byte < comment.end_byte or comment.start_byte < end_byte < comment.end_byte for comment in comments):
return None
Expand Down Expand Up @@ -245,3 +246,59 @@
if len(input) > max_chars:
return input[:max_chars] + f"...(truncated from {len(input)} characters)."
return input


# Thresholds used by `is_minified_js`. All four signals must agree before a
# file is treated as minified.
_MINIFIED_MIN_AVG_LINE_LENGTH = 250  # average non-blank line must be longer than this
_MINIFIED_MIN_SEMICOLON_RATIO = 0.8  # semicolons per newline must exceed this...
_MINIFIED_MAX_WHITESPACE_RATIO = 0.08  # whitespace share must stay below this
_MINIFIED_MAX_AVG_VAR_LENGTH = 3  # average `var` name length must stay below this

_WHITESPACE_RE = re.compile(r"\s")
# ...or the content must contain condensed `x{a:b}`-style blocks.
_CONDENSED_BLOCK_RE = re.compile(r"[\w\)]\{[\w:]+\}")
# `\b` keeps identifiers that merely end in "var" (e.g. `isvar instanceof`)
# from being read as declarations.
_VAR_DECLARATION_RE = re.compile(r"\bvar\s+(\w+)")


def is_minified_js(content: str) -> bool:
    """Heuristically decide whether ``content`` is minified JavaScript.

    The content is reported as minified only when ALL of these hold:

    1. the average length of its non-blank lines exceeds 250 characters;
    2. it has more than 0.8 semicolons per newline, or it contains a
       condensed ``x{a:b}``-style block;
    3. less than 8% of its characters are whitespace;
    4. it declares no ``var`` names, or their average length is below 3.

    The checks are evaluated cheapest-first and stop at the first one that
    fails, so ordinary source files never reach the regex passes.

    Args:
        content: JavaScript source text to analyze.

    Returns:
        True if the content appears to be minified JavaScript, False
        otherwise. Empty, whitespace-only, and non-string input yield False.
    """
    if not isinstance(content, str):
        # Best-effort contract: never raise on bad input, but report it through
        # logging instead of printing to stdout.
        import logging

        logging.getLogger(__name__).warning("Cannot analyze content of type %s for minification", type(content).__name__)
        return False

    # 1. Average length of non-blank lines (also rejects empty/blank content).
    line_lengths = [len(line) for line in content.split("\n") if line.strip()]
    if not line_lengths:
        return False
    if sum(line_lengths) / len(line_lengths) <= _MINIFIED_MIN_AVG_LINE_LENGTH:
        return False

    # 3. Whitespace ratio; content is non-empty here, so the division is safe.
    whitespace_chars = len(_WHITESPACE_RE.findall(content))
    if whitespace_chars / len(content) >= _MINIFIED_MAX_WHITESPACE_RATIO:
        return False

    # 2. Semicolons standing in for newlines, or condensed block notation.
    newlines = content.count("\n")
    semicolon_ratio = content.count(";") / max(newlines, 1)  # avoid division by zero
    if semicolon_ratio <= _MINIFIED_MIN_SEMICOLON_RATIO and not _CONDENSED_BLOCK_RE.search(content):
        return False

    # 4. Short `var` names; having no `var` declarations at all also passes.
    variable_names = _VAR_DECLARATION_RE.findall(content)
    if not variable_names:
        return True
    avg_var_length = sum(len(name) for name in variable_names) / len(variable_names)
    return avg_var_length < _MINIFIED_MAX_AVG_VAR_LENGTH
2 changes: 2 additions & 0 deletions tests/unit/codegen/sdk/codebase/file/example.min.js

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 9 additions & 1 deletion tests/unit/codegen/sdk/codebase/file/test_file.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
import sys

import pytest
Expand Down Expand Up @@ -214,7 +215,14 @@ def test_files_in_subdirectories_case_sensitivity(tmpdir) -> None:


def test_minified_file(tmpdir) -> None:
with get_codebase_session(tmpdir=tmpdir, files={"file1.min.js": "console.log(123)", "file2.js": f"console.log(1{'0' * 1000})"}) as codebase:
with get_codebase_session(
tmpdir=tmpdir,
files={
"file1.min.js": "console.log(123)",
"file2.js": open(f"{os.path.dirname(__file__)}/example.min.js").read(),
},
programming_language=ProgrammingLanguage.TYPESCRIPT,
) as codebase:
# This should match the `*.min.js` pattern
file1 = codebase.ctx.get_file("file1.min.js")
assert file1 is None
Expand Down