Skip to content

Commit 114397e

Browse files
authored
Fix codebase parsing for unicode filenames (#560)
Fixes codebase parsing with unicode filenames.
1 parent 8706b68 commit 114397e

File tree

2 files changed

+34
-0
lines changed

2 files changed

+34
-0
lines changed

src/codegen/git/repo_operator/repo_operator.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import codecs
12
import fnmatch
23
import glob
34
import logging
@@ -577,6 +578,25 @@ def get_filepaths_for_repo(self, ignore_list):
577578
if ignore_list:
578579
filepaths = [f for f in filepaths if not any(fnmatch.fnmatch(f, pattern) or f.startswith(pattern) for pattern in ignore_list)]
579580

581+
# Fix bug where unicode characters are not handled correctly
582+
for i, filepath in enumerate(filepaths):
583+
# Check if it is one of the broken cases
584+
if filepath.startswith('"'):
585+
# Step 1: Strip the quotes
586+
filepath = filepath.strip('"').strip("'")
587+
588+
# Step 2: Encode the string as ASCII bytes; each backslash stays a literal 0x5C byte, so escapes such as \346 or \\ remain unprocessed for the next step.
589+
raw_filepath = filepath.encode("ascii")
590+
591+
# Step 3: Use escape_decode to process backslash escapes like \346 -> 0xE6
592+
decoded_filepath, _ = codecs.escape_decode(raw_filepath)
593+
594+
# Step 4: Decode those bytes as UTF-8 to get the actual Unicode text
595+
filepath = decoded_filepath.decode("utf-8")
596+
597+
# Step 5: Replace the original filepath with the decoded filepath
598+
filepaths[i] = filepath
599+
580600
return filepaths
581601

582602
# TODO: unify param naming i.e. subdirectories vs subdirs probably use subdirectories since that's in the DB

tests/unit/codegen/sdk/core/test_directory.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,10 @@
77

88
from codegen.sdk.codebase.codebase_context import CodebaseContext
99
from codegen.sdk.codebase.config import CodebaseConfig
10+
from codegen.sdk.codebase.factory.get_session import get_codebase_session
1011
from codegen.sdk.core.directory import Directory
1112
from codegen.sdk.core.file import File
13+
from codegen.shared.enums.programming_language import ProgrammingLanguage
1214

1315

1416
@pytest.fixture
@@ -220,3 +222,15 @@ def test_get_set_delete_item(mock_directory):
220222

221223
with pytest.raises(KeyError, match="subdir_2"):
222224
del mock_directory["subdir_2"]
225+
226+
227+
def test_unicode_in_filename(tmpdir) -> None:
    """Check that a file whose path contains non-ASCII characters is parsed.

    The session is seeded with one plain ASCII file and one file whose path
    mixes CJK characters, a space, a single quote and an emoji. The latter
    must be retrievable by its path with its content intact.
    """
    unicode_path = "test/我很喜欢冰激淋/test-file 12'3_🍦.py"
    source = "print('Hello, world!')"
    session_files = {"ascii.py": source, unicode_path: source}

    with get_codebase_session(
        tmpdir=tmpdir,
        files=session_files,
        programming_language=ProgrammingLanguage.PYTHON,
        verify_output=True,
    ) as codebase:
        unicode_file = codebase.get_file(unicode_path)
        assert unicode_file is not None
        assert unicode_file.content == source

0 commit comments

Comments
 (0)