Skip to content

Fix codebase parsing for unicode filenames #560

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions src/codegen/git/repo_operator/repo_operator.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import codecs
import fnmatch
import glob
import logging
Expand Down Expand Up @@ -70,7 +71,7 @@
else:
os.makedirs(self.repo_path, exist_ok=True)
GitCLI.init(self.repo_path)
self._local_git_repo = LocalGitRepo(repo_path=repo_config.repo_path)

Check failure on line 74 in src/codegen/git/repo_operator/repo_operator.py

View workflow job for this annotation

GitHub Actions / mypy

error: Argument "repo_path" to "LocalGitRepo" has incompatible type "str"; expected "Path" [arg-type]

if repo_config.full_name is None:
repo_config.full_name = self._local_git_repo.full_name
Expand Down Expand Up @@ -142,7 +143,7 @@
email_level = None
levels = ["system", "global", "user", "repository"]
for level in levels:
with git_cli.config_reader(level) as reader:

Check failure on line 146 in src/codegen/git/repo_operator/repo_operator.py

View workflow job for this annotation

GitHub Actions / mypy

error: Argument 1 to "config_reader" of "Repo" has incompatible type "str"; expected "Literal['system', 'global', 'user', 'repository'] | None" [arg-type]
if reader.has_option("user", "name") and not username:
username = reader.get("user", "name")
user_level = level
Expand Down Expand Up @@ -554,7 +555,7 @@
return content
except UnicodeDecodeError:
print(f"Warning: Unable to decode file {file_path}. Skipping.")
return None

Check failure on line 558 in src/codegen/git/repo_operator/repo_operator.py

View workflow job for this annotation

GitHub Actions / mypy

error: Incompatible return value type (got "None", expected "str") [return-value]

def write_file(self, relpath: str, content: str) -> None:
"""Writes file content to disk"""
Expand All @@ -577,6 +578,25 @@
if ignore_list:
filepaths = [f for f in filepaths if not any(fnmatch.fnmatch(f, pattern) or f.startswith(pattern) for pattern in ignore_list)]

# Fix bug where unicode characters are not handled correctly
for i, filepath in enumerate(filepaths):
# Check if it is one of the broken cases
if filepath.startswith('"'):
# Step 1: Strip the quotes
filepath = filepath.strip('"').strip("'")

# Step 2: Convert the Python string to raw ASCII bytes (so \\ stays as two 0x5C).
raw_filepath = filepath.encode("ascii")

# Step 3: Use escape_decode to process backslash escapes like \346 -> 0xE6
decoded_filepath, _ = codecs.escape_decode(raw_filepath)

# Step 4: Decode those bytes as UTF-8 to get the actual Unicode text
filepath = decoded_filepath.decode("utf-8")

# Step 5: Replace the original filepath with the decoded filepath
filepaths[i] = filepath

return filepaths

# TODO: unify param naming i.e. subdirectories vs subdirs probably use subdirectories since that's in the DB
Expand All @@ -602,7 +622,7 @@
filepaths = self.get_filepaths_for_repo(ignore_list)
# Iterate through files and yield contents
for rel_filepath in filepaths:
rel_filepath: str

Check failure on line 625 in src/codegen/git/repo_operator/repo_operator.py

View workflow job for this annotation

GitHub Actions / mypy

error: Name "rel_filepath" already defined on line 624 [no-redef]
filepath = os.path.join(self.repo_path, rel_filepath)

# Filter by subdirectory (includes full filenames)
Expand Down Expand Up @@ -633,7 +653,7 @@
list_files = []

for rel_filepath in self.git_cli.git.ls_files().split("\n"):
rel_filepath: str

Check failure on line 656 in src/codegen/git/repo_operator/repo_operator.py

View workflow job for this annotation

GitHub Actions / mypy

error: Name "rel_filepath" already defined on line 655 [no-redef]
if subdirs and not any(d in rel_filepath for d in subdirs):
continue
if extensions is None or any(rel_filepath.endswith(e) for e in extensions):
Expand All @@ -657,7 +677,7 @@

def get_modified_files_in_last_n_days(self, days: int = 1) -> tuple[list[str], list[str]]:
"""Returns a list of files modified and deleted in the last n days"""
modified_files = []

Check failure on line 680 in src/codegen/git/repo_operator/repo_operator.py

View workflow job for this annotation

GitHub Actions / mypy

error: Need type annotation for "modified_files" (hint: "modified_files: list[<type>] = ...") [var-annotated]
deleted_files = []
allowed_extensions = [".py"]

Expand All @@ -673,9 +693,9 @@
if file in modified_files:
modified_files.remove(file)
else:
if file not in modified_files and file[-3:] in allowed_extensions:

Check failure on line 696 in src/codegen/git/repo_operator/repo_operator.py

View workflow job for this annotation

GitHub Actions / mypy

error: Value of type "str | PathLike[str]" is not indexable [index]
modified_files.append(file)
return modified_files, deleted_files

Check failure on line 698 in src/codegen/git/repo_operator/repo_operator.py

View workflow job for this annotation

GitHub Actions / mypy

error: Incompatible return value type (got "tuple[list[str | PathLike[str]], list[str | PathLike[str]]]", expected "tuple[list[str], list[str]]") [return-value]

@cached_property
def base_url(self) -> str | None:
Expand All @@ -696,7 +716,7 @@

def get_pr_data(self, pr_number: int) -> dict:
"""Returns the data associated with a PR"""
return self.remote_git_repo.get_pr_data(pr_number)

Check failure on line 719 in src/codegen/git/repo_operator/repo_operator.py

View workflow job for this annotation

GitHub Actions / mypy

error: "GitRepoClient" has no attribute "get_pr_data" [attr-defined]

def create_pr_comment(self, pr_number: int, body: str) -> None:
"""Create a general comment on a pull request.
Expand Down Expand Up @@ -739,7 +759,7 @@
body=body,
commit=commit,
path=path,
line=line,

Check failure on line 762 in src/codegen/git/repo_operator/repo_operator.py

View workflow job for this annotation

GitHub Actions / mypy

error: Argument "line" to "create_review_comment" of "GitRepoClient" has incompatible type "int | None"; expected "int | _NotSetType" [arg-type]
side=side,
start_line=start_line,
)
Expand Down
14 changes: 14 additions & 0 deletions tests/unit/codegen/sdk/core/test_directory.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,10 @@

from codegen.sdk.codebase.codebase_context import CodebaseContext
from codegen.sdk.codebase.config import CodebaseConfig
from codegen.sdk.codebase.factory.get_session import get_codebase_session
from codegen.sdk.core.directory import Directory
from codegen.sdk.core.file import File
from codegen.shared.enums.programming_language import ProgrammingLanguage


@pytest.fixture
Expand Down Expand Up @@ -220,3 +222,15 @@ def test_get_set_delete_item(mock_directory):

with pytest.raises(KeyError, match="subdir_2"):
del mock_directory["subdir_2"]


def test_unicode_in_filename(tmpdir) -> None:
    """Check that a file with a non-ASCII path can be fetched from the codebase.

    The path mixes CJK characters, a space, a single quote and an emoji; the
    session is also given a plain ASCII file alongside it.
    """
    unicode_path = "test/我很喜欢冰激淋/test-file 12'3_🍦.py"
    source = "print('Hello, world!')"
    repo_files = {"ascii.py": source, unicode_path: source}

    with get_codebase_session(
        tmpdir=tmpdir,
        files=repo_files,
        programming_language=ProgrammingLanguage.PYTHON,
        verify_output=True,
    ) as codebase:
        parsed = codebase.get_file(unicode_path)
        # The file must be found, and its content must round-trip unchanged.
        assert parsed is not None
        assert parsed.content == source