Skip to content

[CG-10860] fix: UnicodeDecodeError when instantiating codebase from repo #565

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion src/codegen/sdk/codebase/codebase_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,7 @@
syncs[SyncType.ADD].append(self.to_absolute(filepath))
logger.info(f"> Parsing {len(syncs[SyncType.ADD])} files in {self.projects[0].subdirectories or 'ALL'} subdirectories with {self.extensions} extensions")
self._process_diff_files(syncs, incremental=False)
files: list[SourceFile] = self.get_nodes(NodeType.FILE)

Check failure on line 200 in src/codegen/sdk/codebase/codebase_context.py

View workflow job for this annotation

GitHub Actions / mypy

error: Incompatible types in assignment (expression has type "list[Importable[Any]]", variable has type "list[SourceFile[Any, Any, Any, Any, Any, Any]]") [assignment]
logger.info(f"> Found {len(files)} files")
logger.info(f"> Found {len(self.nodes)} nodes and {len(self.edges)} edges")
if self.config.feature_flags.track_graph:
Expand Down Expand Up @@ -227,8 +227,8 @@
elif diff.change_type == ChangeType.Modified:
files_to_sync[filepath] = SyncType.REPARSE
elif diff.change_type == ChangeType.Renamed:
files_to_sync[diff.rename_from] = SyncType.DELETE

Check failure on line 230 in src/codegen/sdk/codebase/codebase_context.py

View workflow job for this annotation

GitHub Actions / mypy

error: Invalid index type "Path | None" for "dict[Path, SyncType]"; expected type "Path" [index]
files_to_sync[diff.rename_to] = SyncType.ADD

Check failure on line 231 in src/codegen/sdk/codebase/codebase_context.py

View workflow job for this annotation

GitHub Actions / mypy

error: Invalid index type "Path | None" for "dict[Path, SyncType]"; expected type "Path" [index]
elif diff.change_type == ChangeType.Removed:
files_to_sync[filepath] = SyncType.DELETE
else:
Expand Down Expand Up @@ -265,16 +265,16 @@
files_to_write.append((sync.path, sync.old_content))
modified_files.add(sync.path)
elif sync.change_type == ChangeType.Renamed:
files_to_write.append((sync.rename_from, sync.old_content))

Check failure on line 268 in src/codegen/sdk/codebase/codebase_context.py

View workflow job for this annotation

GitHub Actions / mypy

error: Argument 1 to "append" of "list" has incompatible type "tuple[Path | None, bytes | None]"; expected "tuple[Path, bytes | None]" [arg-type]
files_to_remove.append(sync.rename_to)
modified_files.add(sync.rename_from)

Check failure on line 270 in src/codegen/sdk/codebase/codebase_context.py

View workflow job for this annotation

GitHub Actions / mypy

error: Argument 1 to "add" of "set" has incompatible type "Path | None"; expected "Path" [arg-type]
modified_files.add(sync.rename_to)

Check failure on line 271 in src/codegen/sdk/codebase/codebase_context.py

View workflow job for this annotation

GitHub Actions / mypy

error: Argument 1 to "add" of "set" has incompatible type "Path | None"; expected "Path" [arg-type]
elif sync.change_type == ChangeType.Added:
files_to_remove.append(sync.path)
modified_files.add(sync.path)
logger.info(f"Writing {len(files_to_write)} files to disk and removing {len(files_to_remove)} files")
for file in files_to_remove:
self.io.delete_file(file)

Check failure on line 277 in src/codegen/sdk/codebase/codebase_context.py

View workflow job for this annotation

GitHub Actions / mypy

error: Argument 1 to "delete_file" of "IO" has incompatible type "Path | None"; expected "Path" [arg-type]
to_save = set()
for file, content in files_to_write:
self.io.write_file(file, content)
Expand Down Expand Up @@ -325,7 +325,7 @@
for module in external_modules:
if not any(self.predecessors(module.node_id)):
self.remove_node(module.node_id)
self._ext_module_idx.pop(module._idx_key, None)

Check failure on line 328 in src/codegen/sdk/codebase/codebase_context.py

View workflow job for this annotation

GitHub Actions / mypy

error: "Importable[Any]" has no attribute "_idx_key" [attr-defined]

def build_directory_tree(self, files: list[SourceFile]) -> None:
"""Builds the directory tree for the codebase"""
Expand All @@ -334,7 +334,7 @@
created_dirs = set()
for file in files:
directory = self.get_directory(file.path.parent, create_on_missing=True)
directory.add_file(file)

Check failure on line 337 in src/codegen/sdk/codebase/codebase_context.py

View workflow job for this annotation

GitHub Actions / mypy

error: Item "None" of "Directory[Any, Any, Any, Any, Any, Any, Any] | None" has no attribute "add_file" [union-attr]
file._set_directory(directory)
created_dirs.add(file.path.parent)

Expand All @@ -361,7 +361,7 @@
If create_on_missing is set, use a recursive strategy to create the directory object and all subdirectories.
"""
# If not part of repo path, return None
absolute_path = self.to_absolute(directory_path)

Check failure on line 364 in src/codegen/sdk/codebase/codebase_context.py

View workflow job for this annotation

GitHub Actions / mypy

error: Argument 1 to "__call__" of "_lru_cache_wrapper" has incompatible type "PathLike[Any]"; expected "Hashable" [arg-type]
if not self.is_subdir(absolute_path):
assert False, f"Directory {absolute_path} is not part of repo path {self.repo_path}"
return None
Expand Down Expand Up @@ -474,7 +474,11 @@
task = self.progress.begin("Adding new files", count=len(files_to_sync[SyncType.ADD]))
for idx, filepath in enumerate(files_to_sync[SyncType.ADD]):
task.update(f"Adding {self.to_relative(filepath)}", count=idx)
content = self.io.read_text(filepath)
try:
content = self.io.read_text(filepath)
except UnicodeDecodeError as e:
logger.warning(f"Can't read file at:{filepath} since it contains non-unicode characters. File will be ignored!")
continue
# TODO: this is wrong with context changes
if filepath.suffix in self.extensions:
file_cls = self.node_classes.file_cls
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,3 +58,40 @@ def __init__(self):
assert len(import_resolution_edges) == 4
assert len(file_contains_node_edges) == 14
assert len(symbol_usage_edges) == 6


def test_codebase_broken_file(tmpdir) -> None:
    """Open a codebase session containing one regular file and one Big5-HKSCS file.

    ``test.py`` carries the sample module below; ``test2.py`` carries the bytes of
    "你好" encoded as Big5-HKSCS. The session must still be created, and the graph
    must expose the expected number of import-resolution edges, file-contained
    nodes and symbol-usage edges.
    """
    # NOTE(review): the nested bodies inside this fixture show no indentation in
    # the text under review -- confirm against the checked-in file before merging.
    # language=python
    sample_source = """
from some_file import x, y, z
import numpy as np

global_var_1 = 1
global_var_2 = 2

def foo():
return bar()

def bar():
return 42

class MyClass:
def __init__(self):
pass

class MySubClass(MyClass):
def __init__(self):
super().__init__()
pass
"""
    undecodable_source = "你好".encode("big5hkscs")
    session_files = {"test.py": sample_source, "test2.py": undecodable_source}

    with get_codebase_session(tmpdir=tmpdir, files=session_files) as codebase:
        assert codebase is not None
        assert isinstance(codebase.ctx, CodebaseContext)

        def edges_of(kind):
            # Edges are indexed positionally; element 2 carries the edge metadata.
            return [candidate for candidate in codebase.ctx.edges if candidate[2].type == kind]

        import_resolution_edges = edges_of(EdgeType.IMPORT_SYMBOL_RESOLUTION)
        file_contains_node_edges = [
            node
            for source_file in codebase.files
            for node in source_file.get_nodes()
        ]
        symbol_usage_edges = edges_of(EdgeType.SYMBOL_USAGE)

        assert len(import_resolution_edges) == 4
        assert len(file_contains_node_edges) == 14
        assert len(symbol_usage_edges) == 6