Skip to content

Commit 039f6a0

Browse files
tomcodgen and tkfoss authored
[CG-10860] fix: UnicodeDecodeError when instantiating codebase from repo (#565)
# Motivation
<!-- Why is this change necessary? -->

# Content
<!-- Please include a summary of the change -->

# Testing
<!-- How was the change tested? -->

# Please check the following before marking your PR as ready for review
- [ ] I have added tests for my changes
- [ ] I have updated the documentation or added new documentation as needed

---------

Co-authored-by: tomcodgen <[email protected]>
1 parent a38cac3 commit 039f6a0

File tree

2 files changed

+42
-1
lines changed

2 files changed

+42
-1
lines changed

src/codegen/sdk/codebase/codebase_context.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -474,7 +474,11 @@ def _process_diff_files(self, files_to_sync: Mapping[SyncType, list[Path]], incr
474474
task = self.progress.begin("Adding new files", count=len(files_to_sync[SyncType.ADD]))
475475
for idx, filepath in enumerate(files_to_sync[SyncType.ADD]):
476476
task.update(f"Adding {self.to_relative(filepath)}", count=idx)
477-
content = self.io.read_text(filepath)
477+
try:
478+
content = self.io.read_text(filepath)
479+
except UnicodeDecodeError as e:
480+
logger.warning(f"Can't read file at:{filepath} since it contains non-unicode characters. File will be ignored!")
481+
continue
478482
# TODO: this is wrong with context changes
479483
if filepath.suffix in self.extensions:
480484
file_cls = self.node_classes.file_cls

tests/unit/codegen/sdk/codebase/codebase_graph/test_codebase_graph.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,3 +58,40 @@ def __init__(self):
5858
assert len(import_resolution_edges) == 4
5959
assert len(file_contains_node_edges) == 14
6060
assert len(symbol_usage_edges) == 6
61+
62+
63+
def test_codebase_broken_file(tmpdir) -> None:
64+
# language=python
65+
content = """
66+
from some_file import x, y, z
67+
import numpy as np
68+
69+
global_var_1 = 1
70+
global_var_2 = 2
71+
72+
def foo():
73+
return bar()
74+
75+
def bar():
76+
return 42
77+
78+
class MyClass:
79+
def __init__(self):
80+
pass
81+
82+
class MySubClass(MyClass):
83+
def __init__(self):
84+
super().__init__()
85+
pass
86+
"""
87+
content_broken = bytes("你好", "big5hkscs")
88+
with get_codebase_session(tmpdir=tmpdir, files={"test.py": content, "test2.py": content_broken}) as codebase:
89+
assert codebase is not None
90+
assert isinstance(codebase.ctx, CodebaseContext)
91+
import_resolution_edges = [edge for edge in codebase.ctx.edges if edge[2].type == EdgeType.IMPORT_SYMBOL_RESOLUTION]
92+
file_contains_node_edges = list(itertools.chain.from_iterable(file.get_nodes() for file in codebase.files))
93+
symbol_usage_edges = [edge for edge in codebase.ctx.edges if edge[2].type == EdgeType.SYMBOL_USAGE]
94+
95+
assert len(import_resolution_edges) == 4
96+
assert len(file_contains_node_edges) == 14
97+
assert len(symbol_usage_edges) == 6

0 commit comments

Comments
 (0)