Skip to content

Commit 3f33909

Browse files
authored
Major Refactor to Codebase Directory (#615)
**Existing Implementation:**
- Mirrors how `codebase.files` operates
- `dir.files` returns a recursive list of all source files
- `dir.subdirectories` returns a recursive list of all directories
- A reference between the file and directory is built during parse time (files hold a reference to dir, and dirs hold references to files)

**New Implementation:**
- More closely mirrors the conceptual idea of a directory
- `dir.files` returns all top-level source files
- `dir.files` now has an `extensions` argument, just like codebase
- `dir.subdirectories` returns all top-level directories
- Both `dir.files` and `dir.subdirectories` have a `recursive=True` flag to get everything (similar to the old behaviour)
- Introduces a `dir.tree` API to recursively list everything
- `dir.items` is now a list
- Directories and files are grabbed on the fly instead of being linked during parse time (files grab directories using `ctx.get_directory` on the fly, and vice versa)
1 parent 3bb79bf commit 3f33909

File tree

10 files changed

+410
-484
lines changed

10 files changed

+410
-484
lines changed

src/codegen/extensions/tools/list_directory.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -140,13 +140,13 @@ def get_directory_info(dir_obj: Directory, current_depth: int) -> DirectoryInfo:
140140
"""Helper function to get directory info recursively."""
141141
# Get direct files (always include files unless at max depth)
142142
all_files = []
143-
for file in dir_obj.files:
143+
for file in dir_obj.files(recursive=True):
144144
if file.directory == dir_obj:
145145
all_files.append(file.filepath.split("/")[-1])
146146

147147
# Get direct subdirectories
148148
subdirs = []
149-
for subdir in dir_obj.subdirectories:
149+
for subdir in dir_obj.subdirectories(recursive=True):
150150
# Only include direct descendants
151151
if subdir.parent == dir_obj:
152152
if current_depth > 1 or current_depth == -1:

src/codegen/git/repo_operator/repo_operator.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -571,7 +571,11 @@ def delete_file(self, path: str) -> None:
571571
def get_filepaths_for_repo(self, ignore_list):
572572
# Get list of files to iterate over based on gitignore setting
573573
if self.repo_config.respect_gitignore:
574-
filepaths = self.git_cli.git.ls_files().split("\n")
574+
# ls-files flags:
575+
# -c: show cached files
576+
# -o: show other / untracked files
577+
# --exclude-standard: apply the standard git ignore rules (.gitignore, .git/info/exclude, global excludes file)
578+
filepaths = self.git_cli.git.ls_files("-co", "--exclude-standard").split("\n")
575579
else:
576580
filepaths = glob.glob("**", root_dir=self.repo_path, recursive=True, include_hidden=True)
577581
# Filter filepaths by ignore list.

src/codegen/sdk/codebase/codebase_context.py

Lines changed: 25 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@
4646
from codegen.sdk.core.dataclasses.usage import Usage
4747
from codegen.sdk.core.expressions import Expression
4848
from codegen.sdk.core.external_module import ExternalModule
49-
from codegen.sdk.core.file import SourceFile
49+
from codegen.sdk.core.file import File, SourceFile
5050
from codegen.sdk.core.interfaces.importable import Importable
5151
from codegen.sdk.core.node_id_factory import NodeId
5252
from codegen.sdk.core.parser import Parser
@@ -345,33 +345,15 @@ def prune_graph(self) -> None:
345345
self.remove_node(module.node_id)
346346
self._ext_module_idx.pop(module._idx_key, None)
347347

348-
def build_directory_tree(self, files: list[SourceFile]) -> None:
348+
def build_directory_tree(self) -> None:
349349
"""Builds the directory tree for the codebase"""
350350
# Reset and rebuild the directory tree
351351
self.directories = dict()
352-
created_dirs = set()
353-
for file in files:
354-
directory = self.get_directory(file.path.parent, create_on_missing=True)
355-
directory.add_file(file)
356-
file._set_directory(directory)
357-
created_dirs.add(file.path.parent)
358-
359-
def _dir_has_file(filepath):
360-
gen = os.scandir(filepath)
361-
while entry := next(gen, None):
362-
if entry.is_file():
363-
return True
364-
return False
365-
366-
for ctx in self.projects:
367-
for rel_filepath in ctx.repo_operator.get_filepaths_for_repo(GLOBAL_FILE_IGNORE_LIST):
368-
abs_filepath = self.to_absolute(rel_filepath)
369-
if not abs_filepath.is_dir():
370-
abs_filepath = abs_filepath.parent
371-
372-
if abs_filepath not in created_dirs and self.is_subdir(abs_filepath) and _dir_has_file(abs_filepath):
373-
directory = self.get_directory(abs_filepath, create_on_missing=True)
374-
created_dirs.add(abs_filepath)
352+
353+
for file_path, _ in self.projects[0].repo_operator.iter_files(subdirs=self.projects[0].subdirectories, ignore_list=GLOBAL_FILE_IGNORE_LIST):
354+
file_path = Path(file_path)
355+
directory = self.get_directory(file_path.parent, create_on_missing=True)
356+
directory._add_file(file_path.name)
375357

376358
def get_directory(self, directory_path: PathLike, create_on_missing: bool = False, ignore_case: bool = False) -> Directory | None:
377359
"""Returns the directory object for the given path, or None if the directory does not exist.
@@ -399,16 +381,16 @@ def get_directory(self, directory_path: PathLike, create_on_missing: bool = Fals
399381

400382
# Base Case
401383
if str(absolute_path) == str(self.repo_path) or str(absolute_path) == str(parent_path):
402-
root_directory = Directory(path=absolute_path, dirpath="", parent=None)
384+
root_directory = Directory(ctx=self, path=absolute_path, dirpath="")
403385
self.directories[absolute_path] = root_directory
404386
return root_directory
405387

406388
# Recursively create the parent directory
407389
parent = self.get_directory(parent_path, create_on_missing=True)
408390
# Create the directory
409-
directory = Directory(path=absolute_path, dirpath=str(self.to_relative(absolute_path)), parent=parent)
391+
directory = Directory(ctx=self, path=absolute_path, dirpath=str(self.to_relative(absolute_path)))
410392
# Add the directory to the parent
411-
parent.add_subdirectory(directory)
393+
parent._add_subdirectory(directory.name)
412394
# Add the directory to the tree
413395
self.directories[absolute_path] = directory
414396
return directory
@@ -514,7 +496,7 @@ def _process_diff_files(self, files_to_sync: Mapping[SyncType, list[Path]], incr
514496
# Step 6: Build directory tree
515497
logger.info("> Building directory tree")
516498
files = [f for f in sort_editables(self.get_nodes(NodeType.FILE), alphabetical=True, dedupe=False)]
517-
self.build_directory_tree(files)
499+
self.build_directory_tree()
518500

519501
# Step 7: Build configs
520502
if self.config_parser is not None:
@@ -613,13 +595,20 @@ def get_file(self, file_path: os.PathLike, ignore_case: bool = False) -> SourceF
613595
if node_id is not None:
614596
return self.get_node(node_id)
615597
if ignore_case:
616-
parent = self.to_absolute(file_path).parent
617-
if parent == Path(self.repo_path):
618-
for file in self.to_absolute(self.repo_path).iterdir():
619-
if str(file_path).lower() == str(self.to_absolute(file)).lower():
620-
return self.get_file(file, ignore_case=False)
621-
if directory := self.get_directory(parent, ignore_case=ignore_case):
622-
return directory.get_file(os.path.basename(file_path), ignore_case=ignore_case)
598+
# Use `get_directory` so that the case-insensitive lookup also applies to the parent directory
599+
parent = self.get_directory(self.to_absolute(file_path).parent, ignore_case=ignore_case).path
600+
for file in parent.iterdir():
601+
if str(file_path).lower() == str(self.to_relative(file)).lower():
602+
return self.get_file(file, ignore_case=False)
603+
604+
def _get_raw_file_from_path(self, path: Path) -> File | None:
605+
from codegen.sdk.core.file import File
606+
607+
try:
608+
return File.from_content(path, self.io.read_text(path), self, sync=False)
609+
except UnicodeDecodeError:
610+
# Handle when file is a binary file
611+
return File.from_content(path, self.io.read_bytes(path), self, sync=False, binary=True)
623612

624613
def get_external_module(self, module: str, import_name: str) -> ExternalModule | None:
625614
node_id = self._ext_module_idx.get(module + "::" + import_name, None)

src/codegen/sdk/core/codebase.py

Lines changed: 12 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -522,37 +522,24 @@ def get_file(self, filepath: str, *, optional: bool = False, ignore_case: bool =
522522
Raises:
523523
ValueError: If file not found and optional=False.
524524
"""
525-
526-
def get_file_from_path(path: Path) -> File | None:
527-
try:
528-
return File.from_content(path, self.ctx.io.read_text(path), self.ctx, sync=False)
529-
except UnicodeDecodeError:
530-
# Handle when file is a binary file
531-
return File.from_content(path, self.ctx.io.read_bytes(path), self.ctx, sync=False, binary=True)
532-
533525
# Try to get the file from the graph first
534526
file = self.ctx.get_file(filepath, ignore_case=ignore_case)
535527
if file is not None:
536528
return file
529+
# If the file is not in the graph, check the filesystem
537530
absolute_path = self.ctx.to_absolute(filepath)
538-
if absolute_path.suffix in self.ctx.extensions and not self.ctx.io.file_exists(absolute_path):
539-
return None
540531
if self.ctx.io.file_exists(absolute_path):
541-
return get_file_from_path(absolute_path)
542-
elif ignore_case:
543-
parent = absolute_path.parent
544-
if parent == Path(self.ctx.repo_path):
545-
for file in self.ctx.to_absolute(self.ctx.repo_path).iterdir():
546-
if str(absolute_path).lower() == str(file).lower():
547-
return get_file_from_path(file)
548-
else:
549-
dir = self.ctx.get_directory(parent, ignore_case=ignore_case)
550-
if dir is None:
551-
return None
552-
for file in dir.path.iterdir():
553-
if str(absolute_path).lower() == str(file).lower():
554-
return get_file_from_path(file)
555-
elif not optional:
532+
return self.ctx._get_raw_file_from_path(absolute_path)
533+
# Exact path was not found; scan the parent directory for a matching entry (case-insensitively when ignore_case is set)
534+
if absolute_path.parent.exists():
535+
for file in absolute_path.parent.iterdir():
536+
if ignore_case and str(absolute_path).lower() == str(file).lower():
537+
return self.ctx._get_raw_file_from_path(file)
538+
elif not ignore_case and str(absolute_path) == str(file):
539+
return self.ctx._get_raw_file_from_path(file)
540+
541+
# If we get here, the file is not found
542+
if not optional:
556543
msg = f"File {filepath} not found in codebase. Use optional=True to return None instead."
557544
raise ValueError(msg)
558545
return None

0 commit comments

Comments
 (0)