Skip to content

Major Refactor to Codebase Directory #615

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 17 commits into from
Feb 25, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/codegen/extensions/tools/list_directory.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,13 +140,13 @@ def get_directory_info(dir_obj: Directory, current_depth: int) -> DirectoryInfo:
"""Helper function to get directory info recursively."""
# Get direct files (always include files unless at max depth)
all_files = []
for file in dir_obj.files:
for file in dir_obj.files(recursive=True):
if file.directory == dir_obj:
all_files.append(file.filepath.split("/")[-1])

# Get direct subdirectories
subdirs = []
for subdir in dir_obj.subdirectories:
for subdir in dir_obj.subdirectories(recursive=True):
# Only include direct descendants
if subdir.parent == dir_obj:
if current_depth > 1 or current_depth == -1:
Expand Down
6 changes: 5 additions & 1 deletion src/codegen/git/repo_operator/repo_operator.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@
email_level = None
levels = ["system", "global", "user", "repository"]
for level in levels:
with git_cli.config_reader(level) as reader:

Check failure on line 146 in src/codegen/git/repo_operator/repo_operator.py

View workflow job for this annotation

GitHub Actions / mypy

error: Argument 1 to "config_reader" of "Repo" has incompatible type "str"; expected "Literal['system', 'global', 'user', 'repository'] | None" [arg-type]
if reader.has_option("user", "name") and not username:
username = reader.get("user", "name")
user_level = level
Expand Down Expand Up @@ -555,7 +555,7 @@
return content
except UnicodeDecodeError:
print(f"Warning: Unable to decode file {file_path}. Skipping.")
return None

Check failure on line 558 in src/codegen/git/repo_operator/repo_operator.py

View workflow job for this annotation

GitHub Actions / mypy

error: Incompatible return value type (got "None", expected "str") [return-value]

def write_file(self, relpath: str, content: str) -> None:
"""Writes file content to disk"""
Expand All @@ -571,7 +571,11 @@
def get_filepaths_for_repo(self, ignore_list):
# Get list of files to iterate over based on gitignore setting
if self.repo_config.respect_gitignore:
filepaths = self.git_cli.git.ls_files().split("\n")
# ls-files flags:
# -c: show cached (tracked) files
# -o: show other / untracked files
# --exclude-standard: skip files matched by the standard git ignore rules (.gitignore, .git/info/exclude, global excludes file)
filepaths = self.git_cli.git.ls_files("-co", "--exclude-standard").split("\n")
else:
filepaths = glob.glob("**", root_dir=self.repo_path, recursive=True, include_hidden=True)
# Filter filepaths by ignore list.
Expand Down Expand Up @@ -622,7 +626,7 @@
filepaths = self.get_filepaths_for_repo(ignore_list)
# Iterate through files and yield contents
for rel_filepath in filepaths:
rel_filepath: str

Check failure on line 629 in src/codegen/git/repo_operator/repo_operator.py

View workflow job for this annotation

GitHub Actions / mypy

error: Name "rel_filepath" already defined on line 628 [no-redef]
filepath = os.path.join(self.repo_path, rel_filepath)

# Filter by subdirectory (includes full filenames)
Expand Down Expand Up @@ -653,7 +657,7 @@
list_files = []

for rel_filepath in self.git_cli.git.ls_files().split("\n"):
rel_filepath: str

Check failure on line 660 in src/codegen/git/repo_operator/repo_operator.py

View workflow job for this annotation

GitHub Actions / mypy

error: Name "rel_filepath" already defined on line 659 [no-redef]
if subdirs and not any(d in rel_filepath for d in subdirs):
continue
if extensions is None or any(rel_filepath.endswith(e) for e in extensions):
Expand All @@ -677,7 +681,7 @@

def get_modified_files_in_last_n_days(self, days: int = 1) -> tuple[list[str], list[str]]:
"""Returns a list of files modified and deleted in the last n days"""
modified_files = []

Check failure on line 684 in src/codegen/git/repo_operator/repo_operator.py

View workflow job for this annotation

GitHub Actions / mypy

error: Need type annotation for "modified_files" (hint: "modified_files: list[<type>] = ...") [var-annotated]
deleted_files = []
allowed_extensions = [".py"]

Expand All @@ -693,9 +697,9 @@
if file in modified_files:
modified_files.remove(file)
else:
if file not in modified_files and file[-3:] in allowed_extensions:

Check failure on line 700 in src/codegen/git/repo_operator/repo_operator.py

View workflow job for this annotation

GitHub Actions / mypy

error: Value of type "str | PathLike[str]" is not indexable [index]
modified_files.append(file)
return modified_files, deleted_files

Check failure on line 702 in src/codegen/git/repo_operator/repo_operator.py

View workflow job for this annotation

GitHub Actions / mypy

error: Incompatible return value type (got "tuple[list[str | PathLike[str]], list[str | PathLike[str]]]", expected "tuple[list[str], list[str]]") [return-value]

@cached_property
def base_url(self) -> str | None:
Expand All @@ -716,9 +720,9 @@

def get_pr_data(self, pr_number: int) -> dict:
"""Returns the data associated with a PR"""
return self.remote_git_repo.get_pr_data(pr_number)

Check failure on line 723 in src/codegen/git/repo_operator/repo_operator.py

View workflow job for this annotation

GitHub Actions / mypy

error: "GitRepoClient" has no attribute "get_pr_data" [attr-defined]

def create_pr_comment(self, pr_number: int, body: str) -> IssueComment:

Check failure on line 725 in src/codegen/git/repo_operator/repo_operator.py

View workflow job for this annotation

GitHub Actions / mypy

error: Missing return statement [return]
"""Create a general comment on a pull request.

Args:
Expand Down Expand Up @@ -760,7 +764,7 @@
body=body,
commit=commit,
path=path,
line=line,

Check failure on line 767 in src/codegen/git/repo_operator/repo_operator.py

View workflow job for this annotation

GitHub Actions / mypy

error: Argument "line" to "create_review_comment" of "GitRepoClient" has incompatible type "int | None"; expected "int | _NotSetType" [arg-type]
side=side,
)

Expand Down
61 changes: 25 additions & 36 deletions src/codegen/sdk/codebase/codebase_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@
from codegen.sdk.core.dataclasses.usage import Usage
from codegen.sdk.core.expressions import Expression
from codegen.sdk.core.external_module import ExternalModule
from codegen.sdk.core.file import SourceFile
from codegen.sdk.core.file import File, SourceFile
from codegen.sdk.core.interfaces.importable import Importable
from codegen.sdk.core.node_id_factory import NodeId
from codegen.sdk.core.parser import Parser
Expand Down Expand Up @@ -343,33 +343,15 @@ def prune_graph(self) -> None:
self.remove_node(module.node_id)
self._ext_module_idx.pop(module._idx_key, None)

def build_directory_tree(self, files: list[SourceFile]) -> None:
def build_directory_tree(self) -> None:
"""Builds the directory tree for the codebase"""
# Reset and rebuild the directory tree
self.directories = dict()
created_dirs = set()
for file in files:
directory = self.get_directory(file.path.parent, create_on_missing=True)
directory.add_file(file)
file._set_directory(directory)
created_dirs.add(file.path.parent)

def _dir_has_file(filepath):
gen = os.scandir(filepath)
while entry := next(gen, None):
if entry.is_file():
return True
return False

for ctx in self.projects:
for rel_filepath in ctx.repo_operator.get_filepaths_for_repo(GLOBAL_FILE_IGNORE_LIST):
abs_filepath = self.to_absolute(rel_filepath)
if not abs_filepath.is_dir():
abs_filepath = abs_filepath.parent

if abs_filepath not in created_dirs and self.is_subdir(abs_filepath) and _dir_has_file(abs_filepath):
directory = self.get_directory(abs_filepath, create_on_missing=True)
created_dirs.add(abs_filepath)

for file_path, _ in self.projects[0].repo_operator.iter_files(subdirs=self.projects[0].subdirectories, ignore_list=GLOBAL_FILE_IGNORE_LIST):
file_path = Path(file_path)
directory = self.get_directory(file_path.parent, create_on_missing=True)
directory._add_file(file_path.name)

def get_directory(self, directory_path: PathLike, create_on_missing: bool = False, ignore_case: bool = False) -> Directory | None:
"""Returns the directory object for the given path, or None if the directory does not exist.
Expand Down Expand Up @@ -397,16 +379,16 @@ def get_directory(self, directory_path: PathLike, create_on_missing: bool = Fals

# Base Case
if str(absolute_path) == str(self.repo_path) or str(absolute_path) == str(parent_path):
root_directory = Directory(path=absolute_path, dirpath="", parent=None)
root_directory = Directory(ctx=self, path=absolute_path, dirpath="")
self.directories[absolute_path] = root_directory
return root_directory

# Recursively create the parent directory
parent = self.get_directory(parent_path, create_on_missing=True)
# Create the directory
directory = Directory(path=absolute_path, dirpath=str(self.to_relative(absolute_path)), parent=parent)
directory = Directory(ctx=self, path=absolute_path, dirpath=str(self.to_relative(absolute_path)))
# Add the directory to the parent
parent.add_subdirectory(directory)
parent._add_subdirectory(directory.name)
# Add the directory to the tree
self.directories[absolute_path] = directory
return directory
Expand Down Expand Up @@ -512,7 +494,7 @@ def _process_diff_files(self, files_to_sync: Mapping[SyncType, list[Path]], incr
# Step 6: Build directory tree
logger.info("> Building directory tree")
files = [f for f in sort_editables(self.get_nodes(NodeType.FILE), alphabetical=True, dedupe=False)]
self.build_directory_tree(files)
self.build_directory_tree()

# Step 7: Build configs
if self.config_parser is not None:
Expand Down Expand Up @@ -611,13 +593,20 @@ def get_file(self, file_path: os.PathLike, ignore_case: bool = False) -> SourceF
if node_id is not None:
return self.get_node(node_id)
if ignore_case:
parent = self.to_absolute(file_path).parent
if parent == Path(self.repo_path):
for file in self.to_absolute(self.repo_path).iterdir():
if str(file_path).lower() == str(self.to_absolute(file)).lower():
return self.get_file(file, ignore_case=False)
if directory := self.get_directory(parent, ignore_case=ignore_case):
return directory.get_file(os.path.basename(file_path), ignore_case=ignore_case)
# Using `get_directory` so that the case insensitive lookup works
parent = self.get_directory(self.to_absolute(file_path).parent, ignore_case=ignore_case).path
for file in parent.iterdir():
if str(file_path).lower() == str(self.to_relative(file)).lower():
return self.get_file(file, ignore_case=False)

def _get_raw_file_from_path(self, path: Path) -> File | None:
from codegen.sdk.core.file import File

try:
return File.from_content(path, self.io.read_text(path), self, sync=False)
except UnicodeDecodeError:
# Handle when file is a binary file
return File.from_content(path, self.io.read_bytes(path), self, sync=False, binary=True)

def get_external_module(self, module: str, import_name: str) -> ExternalModule | None:
node_id = self._ext_module_idx.get(module + "::" + import_name, None)
Expand Down
37 changes: 12 additions & 25 deletions src/codegen/sdk/core/codebase.py
Original file line number Diff line number Diff line change
Expand Up @@ -522,37 +522,24 @@ def get_file(self, filepath: str, *, optional: bool = False, ignore_case: bool =
Raises:
ValueError: If file not found and optional=False.
"""

def get_file_from_path(path: Path) -> File | None:
try:
return File.from_content(path, self.ctx.io.read_text(path), self.ctx, sync=False)
except UnicodeDecodeError:
# Handle when file is a binary file
return File.from_content(path, self.ctx.io.read_bytes(path), self.ctx, sync=False, binary=True)

# Try to get the file from the graph first
file = self.ctx.get_file(filepath, ignore_case=ignore_case)
if file is not None:
return file
# If the file is not in the graph, check the filesystem
absolute_path = self.ctx.to_absolute(filepath)
if absolute_path.suffix in self.ctx.extensions and not self.ctx.io.file_exists(absolute_path):
return None
if self.ctx.io.file_exists(absolute_path):
return get_file_from_path(absolute_path)
elif ignore_case:
parent = absolute_path.parent
if parent == Path(self.ctx.repo_path):
for file in self.ctx.to_absolute(self.ctx.repo_path).iterdir():
if str(absolute_path).lower() == str(file).lower():
return get_file_from_path(file)
else:
dir = self.ctx.get_directory(parent, ignore_case=ignore_case)
if dir is None:
return None
for file in dir.path.iterdir():
if str(absolute_path).lower() == str(file).lower():
return get_file_from_path(file)
elif not optional:
return self.ctx._get_raw_file_from_path(absolute_path)
# Otherwise, scan the parent directory for an exact (or, with ignore_case, case-insensitive) path match
if absolute_path.parent.exists():
for file in absolute_path.parent.iterdir():
if ignore_case and str(absolute_path).lower() == str(file).lower():
return self.ctx._get_raw_file_from_path(file)
elif not ignore_case and str(absolute_path) == str(file):
return self.ctx._get_raw_file_from_path(file)

# If we get here, the file is not found
if not optional:
msg = f"File {filepath} not found in codebase. Use optional=True to return None instead."
raise ValueError(msg)
return None
Expand Down
Loading