Skip to content

Add heuristics-based minified file detection #619

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Feb 24, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/codegen/sdk/codebase/codebase_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@
".*/ace/.*.js",
"src/vs/platform/contextview/browser/contextMenuService.ts",
"*/compiled/*",
"*/*.min.js",
"*.min.js",
]


Expand Down Expand Up @@ -213,7 +213,7 @@
syncs[SyncType.ADD].append(self.to_absolute(filepath))
logger.info(f"> Parsing {len(syncs[SyncType.ADD])} files in {self.projects[0].subdirectories or 'ALL'} subdirectories with {self.extensions} extensions")
self._process_diff_files(syncs, incremental=False)
files: list[SourceFile] = self.get_nodes(NodeType.FILE)

Check failure on line 216 in src/codegen/sdk/codebase/codebase_context.py

View workflow job for this annotation

GitHub Actions / mypy

error: Incompatible types in assignment (expression has type "list[Importable[Any]]", variable has type "list[SourceFile[Any, Any, Any, Any, Any, Any]]") [assignment]
logger.info(f"> Found {len(files)} files")
logger.info(f"> Found {len(self.nodes)} nodes and {len(self.edges)} edges")
if self.config.track_graph:
Expand Down Expand Up @@ -243,8 +243,8 @@
elif diff.change_type == ChangeType.Modified:
files_to_sync[filepath] = SyncType.REPARSE
elif diff.change_type == ChangeType.Renamed:
files_to_sync[diff.rename_from] = SyncType.DELETE

Check failure on line 246 in src/codegen/sdk/codebase/codebase_context.py

View workflow job for this annotation

GitHub Actions / mypy

error: Invalid index type "Path | None" for "dict[Path, SyncType]"; expected type "Path" [index]
files_to_sync[diff.rename_to] = SyncType.ADD

Check failure on line 247 in src/codegen/sdk/codebase/codebase_context.py

View workflow job for this annotation

GitHub Actions / mypy

error: Invalid index type "Path | None" for "dict[Path, SyncType]"; expected type "Path" [index]
elif diff.change_type == ChangeType.Removed:
files_to_sync[filepath] = SyncType.DELETE
else:
Expand Down Expand Up @@ -281,16 +281,16 @@
files_to_write.append((sync.path, sync.old_content))
modified_files.add(sync.path)
elif sync.change_type == ChangeType.Renamed:
files_to_write.append((sync.rename_from, sync.old_content))

Check failure on line 284 in src/codegen/sdk/codebase/codebase_context.py

View workflow job for this annotation

GitHub Actions / mypy

error: Argument 1 to "append" of "list" has incompatible type "tuple[Path | None, bytes | None]"; expected "tuple[Path, bytes | None]" [arg-type]
files_to_remove.append(sync.rename_to)
modified_files.add(sync.rename_from)

Check failure on line 286 in src/codegen/sdk/codebase/codebase_context.py

View workflow job for this annotation

GitHub Actions / mypy

error: Argument 1 to "add" of "set" has incompatible type "Path | None"; expected "Path" [arg-type]
modified_files.add(sync.rename_to)

Check failure on line 287 in src/codegen/sdk/codebase/codebase_context.py

View workflow job for this annotation

GitHub Actions / mypy

error: Argument 1 to "add" of "set" has incompatible type "Path | None"; expected "Path" [arg-type]
elif sync.change_type == ChangeType.Added:
files_to_remove.append(sync.path)
modified_files.add(sync.path)
logger.info(f"Writing {len(files_to_write)} files to disk and removing {len(files_to_remove)} files")
for file in files_to_remove:
self.io.delete_file(file)

Check failure on line 293 in src/codegen/sdk/codebase/codebase_context.py

View workflow job for this annotation

GitHub Actions / mypy

error: Argument 1 to "delete_file" of "IO" has incompatible type "Path | None"; expected "Path" [arg-type]
to_save = set()
for file, content in files_to_write:
self.io.write_file(file, content)
Expand Down Expand Up @@ -341,7 +341,7 @@
for module in external_modules:
if not any(self.predecessors(module.node_id)):
self.remove_node(module.node_id)
self._ext_module_idx.pop(module._idx_key, None)

Check failure on line 344 in src/codegen/sdk/codebase/codebase_context.py

View workflow job for this annotation

GitHub Actions / mypy

error: "Importable[Any]" has no attribute "_idx_key" [attr-defined]

def build_directory_tree(self, files: list[SourceFile]) -> None:
"""Builds the directory tree for the codebase"""
Expand All @@ -350,7 +350,7 @@
created_dirs = set()
for file in files:
directory = self.get_directory(file.path.parent, create_on_missing=True)
directory.add_file(file)

Check failure on line 353 in src/codegen/sdk/codebase/codebase_context.py

View workflow job for this annotation

GitHub Actions / mypy

error: Item "None" of "Directory[Any, Any, Any, Any, Any, Any, Any] | None" has no attribute "add_file" [union-attr]
file._set_directory(directory)
created_dirs.add(file.path.parent)

Expand All @@ -377,7 +377,7 @@
If create_on_missing is set, use a recursive strategy to create the directory object and all subdirectories.
"""
# If not part of repo path, return None
absolute_path = self.to_absolute(directory_path)

Check failure on line 380 in src/codegen/sdk/codebase/codebase_context.py

View workflow job for this annotation

GitHub Actions / mypy

error: Argument 1 to "__call__" of "_lru_cache_wrapper" has incompatible type "PathLike[Any]"; expected "Hashable" [arg-type]
if not self.is_subdir(absolute_path):
assert False, f"Directory {absolute_path} is not part of repo path {self.repo_path}"
return None
Expand Down
8 changes: 8 additions & 0 deletions src/codegen/sdk/core/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@

logger = logging.getLogger(__name__)

# Line-length cutoff (in characters) for the minified-file heuristic: content with
# any single line at least this long is treated as minified by `from_content`.
MINIFIED_FILE_THRESHOLD = 500


@apidoc
class File(Editable[None]):
Expand Down Expand Up @@ -577,6 +579,12 @@ def invalidate(self):
def from_content(cls, filepath: str | PathLike | Path, content: str, ctx: CodebaseContext, sync: bool = True, verify_syntax: bool = True) -> Self | None:
"""Creates a new file from content and adds it to the graph."""
path = ctx.to_absolute(filepath)

# Heuristic minified-file check: skip the file if any line is MINIFIED_FILE_THRESHOLD characters or longer
if any(len(line) >= MINIFIED_FILE_THRESHOLD for line in content.split("\n")):
logger.info(f"File {filepath} is a minified file (Line length < {MINIFIED_FILE_THRESHOLD}). Skipping...", extra={"filepath": filepath})
return None

ts_node = parse_file(path, content)
if ts_node.has_error and verify_syntax:
logger.info("Failed to parse file %s", filepath)
Expand Down
11 changes: 11 additions & 0 deletions tests/unit/codegen/sdk/codebase/file/test_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,3 +211,14 @@ def test_files_in_subdirectories_case_sensitivity(tmpdir) -> None:
assert codebase.has_file("SubDir3/File3.py", ignore_case=False)
assert not codebase.has_file("SUBDIR3/FILE3.py", ignore_case=False)
assert not codebase.has_file("subdir3/file3.py", ignore_case=False)


def test_minified_file(tmpdir) -> None:
    """Check that minified JavaScript files are left out of the codebase.

    Two files are written: one whose name ends in ``.min.js`` and one with an
    ordinary name whose only line is more than 1000 characters long. Neither
    is expected to be retrievable through ``codebase.ctx.get_file``.
    """
    oversized_line = f"console.log(1{'0' * 1000})"
    sources = {"file1.min.js": "console.log(123)", "file2.js": oversized_line}

    with get_codebase_session(tmpdir=tmpdir, files=sources) as codebase:
        # Excluded by name: expected to match the `*.min.js` ignore pattern.
        assert codebase.ctx.get_file("file1.min.js") is None

        # Excluded by content: expected to trip the maximum line length threshold.
        assert codebase.ctx.get_file("file2.js") is None