Skip to content

Allow primitive file creation, edit, and deletion for non-supported codebase languages. #542

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 40 additions & 8 deletions src/codegen/git/utils/language.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,16 @@
import logging
from collections import Counter
from pathlib import Path
from typing import Literal

from codegen.git.utils.file_utils import split_git_path
from codegen.shared.enums.programming_language import ProgrammingLanguage

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Minimum fraction of all counted files that must belong to the most common
# language. When the most common language falls below this fraction, language
# detection returns ProgrammingLanguage.OTHER instead of that language.
MIN_LANGUAGE_RATIO: float = 0.1


def determine_project_language(folder_path: str, strategy: Literal["most_common", "git_most_common", "package_json"] = "git_most_common") -> ProgrammingLanguage:
"""Determines the primary programming language of a project.
Expand Down Expand Up @@ -37,7 +43,8 @@
folder_path (str): Path to the folder to analyze

Returns:
ProgrammingLanguage: The dominant programming language, or UNSUPPORTED if no matching files found
ProgrammingLanguage: The dominant programming language, or OTHER if no matching files found
or if less than MIN_LANGUAGE_RATIO of files match the dominant language
"""
from codegen.sdk.python import PyFile
from codegen.sdk.typescript.file import TSFile
Expand All @@ -53,7 +60,8 @@
raise ValueError(msg)

# Initialize counters for each language
language_counts = Counter()

Check failure on line 63 in src/codegen/git/utils/language.py

View workflow job for this annotation

GitHub Actions / mypy

error: Need type annotation for "language_counts" [var-annotated]
total_files = 0

# Walk through the directory
for file_path in folder.rglob("*"):
Expand All @@ -65,17 +73,27 @@
if any(ignore in str(file_path) for ignore in [".git", "node_modules", "__pycache__", "venv", ".env"]):
continue

total_files += 1

# Count files for each language based on extensions
for language, exts in EXTENSIONS.items():
if file_path.suffix in exts:
language_counts[language] += 1

# If no file matched a known language extension, return the fallback language
if not language_counts:
return ProgrammingLanguage.UNSUPPORTED
return ProgrammingLanguage.OTHER

# Get the most common language and its count
most_common_language, count = language_counts.most_common(1)[0]

logger.debug(f"Most common language: {most_common_language}, count: {count}, total files: {total_files}")

# Return the language with the highest count
return language_counts.most_common(1)[0][0]
# Check if the most common language makes up at least MIN_LANGUAGE_RATIO of all files
if total_files > 0 and (count / total_files) < MIN_LANGUAGE_RATIO:
return ProgrammingLanguage.OTHER

return most_common_language


def _determine_language_by_git_file_count(folder_path: str) -> ProgrammingLanguage:
Expand All @@ -86,7 +104,8 @@
folder_path (str): Path to the git repo to analyze

Returns:
ProgrammingLanguage: The dominant programming language, or UNSUPPORTED if no matching files found
ProgrammingLanguage: The dominant programming language, or OTHER if no matching files found
or if less than MIN_LANGUAGE_RATIO of files match the dominant language
"""
from codegen.git.repo_operator.repo_operator import RepoOperator
from codegen.git.schemas.repo_config import RepoConfig
Expand All @@ -104,7 +123,8 @@
raise ValueError(msg)

# Initialize counters for each language
language_counts = Counter()

Check failure on line 126 in src/codegen/git/utils/language.py

View workflow job for this annotation

GitHub Actions / mypy

error: Need type annotation for "language_counts" [var-annotated]
total_files = 0

# Initiate RepoOperator
git_root, base_path = split_git_path(folder_path)
Expand All @@ -120,17 +140,27 @@
if file_path.is_dir() or file_path.name.startswith("."):
continue

total_files += 1

# Count files for each language based on extensions
for language, exts in EXTENSIONS.items():
if file_path.suffix in exts:
language_counts[language] += 1

# If no file matched a known language extension, return the fallback language
if not language_counts:
return ProgrammingLanguage.UNSUPPORTED
return ProgrammingLanguage.OTHER

# Get the most common language and its count
most_common_language, count = language_counts.most_common(1)[0]

logger.debug(f"Most common language: {most_common_language}, count: {count}, total files: {total_files}")

# Check if the most common language makes up at least MIN_LANGUAGE_RATIO of all files
if total_files > 0 and (count / total_files) < MIN_LANGUAGE_RATIO:
return ProgrammingLanguage.OTHER

# Return the language with the highest count
return language_counts.most_common(1)[0][0]
return most_common_language


def _determine_language_by_package_json(folder_path: str) -> ProgrammingLanguage:
    """Choose the project language from the presence of a ``package.json`` file.

    Args:
        folder_path (str): Path to the folder to inspect

    Returns:
        ProgrammingLanguage: TYPESCRIPT if ``package.json`` exists directly inside
            ``folder_path``, otherwise PYTHON
    """
    package_json_path = Path(folder_path) / "package.json"

    # Guard clause: without a package.json the project is reported as Python.
    if not package_json_path.exists():
        logger.debug(f"No package.json found at {package_json_path}")
        return ProgrammingLanguage.PYTHON

    logger.debug(f"Found package.json at {package_json_path}")
    return ProgrammingLanguage.TYPESCRIPT
11 changes: 9 additions & 2 deletions src/codegen/sdk/codebase/codebase_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,9 @@

return TSNodeClasses
else:
msg = f"Unsupported programming language: {programming_language}!"
raise ValueError(msg)
from codegen.sdk.codebase.node_classes.generic_node_classes import GenericNodeClasses

return GenericNodeClasses


class CodebaseContext:
Expand Down Expand Up @@ -147,6 +148,7 @@
self.config = config
self.repo_name = context.repo_operator.repo_name
self.repo_path = str(Path(context.repo_operator.repo_path).resolve())
self.full_path = os.path.join(self.repo_path, context.base_path) if context.base_path else self.repo_path
self.codeowners_parser = context.repo_operator.codeowners_parser
self.base_url = context.repo_operator.base_url
# =====[ computed attributes ]=====
Expand All @@ -163,6 +165,11 @@
self.language_engine = get_language_engine(context.programming_language, self)
self.programming_language = context.programming_language

# Raise warning if language is not supported
if self.programming_language is ProgrammingLanguage.UNSUPPORTED or self.programming_language is ProgrammingLanguage.OTHER:
logger.warning("WARNING: The codebase is using an unsupported language!")
logger.warning("Some features may not work as expected. Advanced static analysis will be disabled but simple file IO will still work.")

# Build the graph
self.build_graph(context.repo_operator)
try:
Expand Down Expand Up @@ -190,7 +197,7 @@
syncs[SyncType.ADD].append(self.to_absolute(filepath))
logger.info(f"> Parsing {len(syncs[SyncType.ADD])} files in {self.projects[0].subdirectories or 'ALL'} subdirectories with {self.extensions} extensions")
self._process_diff_files(syncs, incremental=False)
files: list[SourceFile] = self.get_nodes(NodeType.FILE)

Check failure on line 200 in src/codegen/sdk/codebase/codebase_context.py

View workflow job for this annotation

GitHub Actions / mypy

error: Incompatible types in assignment (expression has type "list[Importable[Any]]", variable has type "list[SourceFile[Any, Any, Any, Any, Any, Any]]") [assignment]
logger.info(f"> Found {len(files)} files")
logger.info(f"> Found {len(self.nodes)} nodes and {len(self.edges)} edges")
if self.config.feature_flags.track_graph:
Expand Down Expand Up @@ -220,8 +227,8 @@
elif diff.change_type == ChangeType.Modified:
files_to_sync[filepath] = SyncType.REPARSE
elif diff.change_type == ChangeType.Renamed:
files_to_sync[diff.rename_from] = SyncType.DELETE

Check failure on line 230 in src/codegen/sdk/codebase/codebase_context.py

View workflow job for this annotation

GitHub Actions / mypy

error: Invalid index type "Path | None" for "dict[Path, SyncType]"; expected type "Path" [index]
files_to_sync[diff.rename_to] = SyncType.ADD

Check failure on line 231 in src/codegen/sdk/codebase/codebase_context.py

View workflow job for this annotation

GitHub Actions / mypy

error: Invalid index type "Path | None" for "dict[Path, SyncType]"; expected type "Path" [index]
elif diff.change_type == ChangeType.Removed:
files_to_sync[filepath] = SyncType.DELETE
else:
Expand Down Expand Up @@ -258,16 +265,16 @@
files_to_write.append((sync.path, sync.old_content))
modified_files.add(sync.path)
elif sync.change_type == ChangeType.Renamed:
files_to_write.append((sync.rename_from, sync.old_content))

Check failure on line 268 in src/codegen/sdk/codebase/codebase_context.py

View workflow job for this annotation

GitHub Actions / mypy

error: Argument 1 to "append" of "list" has incompatible type "tuple[Path | None, bytes | None]"; expected "tuple[Path, bytes | None]" [arg-type]
files_to_remove.append(sync.rename_to)
modified_files.add(sync.rename_from)

Check failure on line 270 in src/codegen/sdk/codebase/codebase_context.py

View workflow job for this annotation

GitHub Actions / mypy

error: Argument 1 to "add" of "set" has incompatible type "Path | None"; expected "Path" [arg-type]
modified_files.add(sync.rename_to)

Check failure on line 271 in src/codegen/sdk/codebase/codebase_context.py

View workflow job for this annotation

GitHub Actions / mypy

error: Argument 1 to "add" of "set" has incompatible type "Path | None"; expected "Path" [arg-type]
elif sync.change_type == ChangeType.Added:
files_to_remove.append(sync.path)
modified_files.add(sync.path)
logger.info(f"Writing {len(files_to_write)} files to disk and removing {len(files_to_remove)} files")
for file in files_to_remove:
self.io.delete_file(file)

Check failure on line 277 in src/codegen/sdk/codebase/codebase_context.py

View workflow job for this annotation

GitHub Actions / mypy

error: Argument 1 to "delete_file" of "IO" has incompatible type "Path | None"; expected "Path" [arg-type]
to_save = set()
for file, content in files_to_write:
self.io.write_file(file, content)
Expand Down Expand Up @@ -318,7 +325,7 @@
for module in external_modules:
if not any(self.predecessors(module.node_id)):
self.remove_node(module.node_id)
self._ext_module_idx.pop(module._idx_key, None)

Check failure on line 328 in src/codegen/sdk/codebase/codebase_context.py

View workflow job for this annotation

GitHub Actions / mypy

error: "Importable[Any]" has no attribute "_idx_key" [attr-defined]

def build_directory_tree(self, files: list[SourceFile]) -> None:
"""Builds the directory tree for the codebase"""
Expand Down
22 changes: 22 additions & 0 deletions src/codegen/sdk/codebase/node_classes/generic_node_classes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from codegen.sdk.codebase.node_classes.node_classes import NodeClasses
from codegen.sdk.core.class_definition import Class
from codegen.sdk.core.detached_symbols.code_block import CodeBlock
from codegen.sdk.core.detached_symbols.function_call import FunctionCall
from codegen.sdk.core.detached_symbols.parameter import Parameter
from codegen.sdk.core.file import File
from codegen.sdk.core.function import Function
from codegen.sdk.core.import_resolution import Import
from codegen.sdk.core.statements.comment import Comment

# Node classes assembled from the language-agnostic core types. Used as the
# fallback when the programming language has no dedicated node-class set
# (see the `else` branch in codebase_context.py).
GenericNodeClasses = NodeClasses(
    file_cls=File,
    class_cls=Class,
    function_cls=Function,
    import_cls=Import,
    parameter_cls=Parameter,
    comment_cls=Comment,
    code_block_cls=CodeBlock,
    function_call_cls=FunctionCall,
    # NOTE(review): both mappings are left empty for the generic case —
    # presumably there is nothing language-specific to register; confirm that
    # an empty dict literal is the container type NodeClasses expects here.
    bool_conversion={},
    dynamic_import_parent_types={},
)
9 changes: 7 additions & 2 deletions src/codegen/sdk/core/codebase.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,15 +257,20 @@ def files(self, *, extensions: list[str] | Literal["*"] | None = None) -> list[T
By default, this only returns source files. Setting `extensions='*'` will return all files in the codebase, and
`extensions=[...]` will return all files with the specified extensions.

`extensions='*'` is REQUIRED for listing all non source code files. Or else, codebase.files will ONLY return source files (e.g. .py, .ts).
For Python and Typescript repos WITH file parsing enabled,
`extensions='*'` is REQUIRED for listing all non source code files.
Or else, codebase.files will ONLY return source files (e.g. .py, .ts).

For repos with file parsing disabled or repos with other languages, this will return all files in the codebase.

Returns all Files in the codebase, sorted alphabetically. For Python codebases, returns PyFiles (python files).
For Typescript codebases, returns TSFiles (typescript files).

Returns:
list[TSourceFile]: A sorted list of source files in the codebase.
"""
if extensions is None:
if extensions is None and len(self.ctx.get_nodes(NodeType.FILE)) > 0:
# If extensions is None AND there is at least one file in the codebase (This checks for unsupported languages or parse-off repos),
# Return all source files
files = self.ctx.get_nodes(NodeType.FILE)
elif isinstance(extensions, str) and extensions != "*":
Expand Down
6 changes: 6 additions & 0 deletions src/codegen/sdk/core/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -398,6 +398,12 @@ def replace(self, old: str, new: str, count: int = -1, is_regex: bool = False, p
else:
return super().replace(old, new, count, is_regex, priority)

@staticmethod
@noapidoc
def get_extensions() -> list[str]:
    """Return the file extensions this file class treats as supported.

    Returns:
        list[str]: Always an empty list here; generic files declare no supported extensions.
    """
    # A fresh list on every call, so callers can mutate the result safely.
    supported_extensions: list[str] = []
    return supported_extensions


TImport = TypeVar("TImport", bound="Import")
TFunction = TypeVar("TFunction", bound="Function")
Expand Down
1 change: 1 addition & 0 deletions src/codegen/shared/enums/programming_language.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@
class ProgrammingLanguage(StrEnum):
    """Languages a codebase can be classified as; each member's value equals its name."""

    TYPESCRIPT = "TYPESCRIPT"
    PYTHON = "PYTHON"
    # Fallback result of project language detection: no file matched a known
    # language, or the dominant language was below the minimum file ratio.
    OTHER = "OTHER"
    # NOTE(review): kept alongside OTHER; the visible callers treat both the same
    # (they log the unsupported-language warning) — confirm the intended difference.
    UNSUPPORTED = "UNSUPPORTED"
73 changes: 73 additions & 0 deletions tests/unit/codegen/git/utils/test_language_detection.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
from codegen.git.utils.language import determine_project_language
from codegen.sdk.codebase.factory.get_session import get_codebase_session
from codegen.shared.enums.programming_language import ProgrammingLanguage


def test_determine_language_python(tmpdir) -> None:
    """A repo holding only ``.py`` files is reported as PYTHON by every strategy."""
    python_only = {"file1.py": "", "file2.py": "", "file3.py": ""}
    with get_codebase_session(tmpdir=tmpdir, files=python_only, programming_language=ProgrammingLanguage.PYTHON):
        # package_json: no package.json is present, so the answer is PYTHON
        detected = determine_project_language(tmpdir, strategy="package_json")
        assert detected == ProgrammingLanguage.PYTHON

        # git_most_common: all tracked source files are Python
        detected = determine_project_language(tmpdir, strategy="git_most_common")
        assert detected == ProgrammingLanguage.PYTHON

        # most_common: all files on disk are Python
        detected = determine_project_language(tmpdir, strategy="most_common")
        assert detected == ProgrammingLanguage.PYTHON


def test_determine_language_typescript(tmpdir) -> None:
    """A repo holding only ``.ts`` files is TYPESCRIPT for the counting strategies."""
    typescript_only = {"file1.ts": "", "file2.ts": "", "file3.ts": ""}
    with get_codebase_session(tmpdir=tmpdir, files=typescript_only, programming_language=ProgrammingLanguage.TYPESCRIPT):
        # package_json: there is no package.json, so this strategy answers PYTHON.
        # That is the expected outcome even though the project is TypeScript.
        detected = determine_project_language(tmpdir, strategy="package_json")
        assert detected == ProgrammingLanguage.PYTHON

        # git_most_common: the tracked files are all TypeScript
        detected = determine_project_language(tmpdir, strategy="git_most_common")
        assert detected == ProgrammingLanguage.TYPESCRIPT

        # most_common: the files on disk are all TypeScript
        detected = determine_project_language(tmpdir, strategy="most_common")
        assert detected == ProgrammingLanguage.TYPESCRIPT


def test_determine_language_other(tmpdir) -> None:
    """A repo holding only ``.txt`` files is OTHER for the counting strategies."""
    text_only = {"file1.txt": "", "file2.txt": "", "file3.txt": ""}
    with get_codebase_session(tmpdir=tmpdir, files=text_only, programming_language=ProgrammingLanguage.OTHER):
        # package_json: there is no package.json, so this strategy answers PYTHON (expected)
        detected = determine_project_language(tmpdir, strategy="package_json")
        assert detected == ProgrammingLanguage.PYTHON

        # git_most_common: no tracked file belongs to a known language
        detected = determine_project_language(tmpdir, strategy="git_most_common")
        assert detected == ProgrammingLanguage.OTHER

        # most_common: no file on disk belongs to a known language
        detected = determine_project_language(tmpdir, strategy="most_common")
        assert detected == ProgrammingLanguage.OTHER


def test_determine_language_package_json(tmpdir) -> None:
    """A lone ``package.json`` yields TYPESCRIPT via package_json and OTHER via counting."""
    manifest_only = {"package.json": ""}
    with get_codebase_session(tmpdir=tmpdir, files=manifest_only, programming_language=ProgrammingLanguage.TYPESCRIPT):
        # package_json: the manifest exists, so the answer is TYPESCRIPT
        detected = determine_project_language(tmpdir, strategy="package_json")
        assert detected == ProgrammingLanguage.TYPESCRIPT

        # git_most_common: the manifest alone is not counted towards any language
        detected = determine_project_language(tmpdir, strategy="git_most_common")
        assert detected == ProgrammingLanguage.OTHER

        # most_common: same outcome when scanning the files on disk
        detected = determine_project_language(tmpdir, strategy="most_common")
        assert detected == ProgrammingLanguage.OTHER


def test_determine_language_mixed(tmpdir) -> None:
    """One ``.py``, one ``.ts`` and one ``.txt`` file are expected to resolve to PYTHON."""
    one_of_each = {"file1.py": "", "file2.ts": "", "file3.txt": ""}
    with get_codebase_session(tmpdir=tmpdir, files=one_of_each, programming_language=ProgrammingLanguage.PYTHON):
        # package_json: no package.json is present, so the answer is PYTHON
        detected = determine_project_language(tmpdir, strategy="package_json")
        assert detected == ProgrammingLanguage.PYTHON

        # git_most_common: expected to pick PYTHON
        detected = determine_project_language(tmpdir, strategy="git_most_common")
        assert detected == ProgrammingLanguage.PYTHON

        # most_common: expected to pick PYTHON
        detected = determine_project_language(tmpdir, strategy="most_common")
        assert detected == ProgrammingLanguage.PYTHON


def test_determine_language_threshold(tmpdir) -> None:
    """One ``.py`` file among nineteen ``.txt`` files is too few to count as PYTHON."""
    # 1 Python file out of 20 files in total.
    mostly_text = {"file0.py": ""}
    mostly_text.update({f"file{i}.txt": "" for i in range(1, 20)})

    with get_codebase_session(tmpdir=tmpdir, files=mostly_text, programming_language=ProgrammingLanguage.PYTHON):
        # package_json: no package.json is present, so the answer is PYTHON
        detected = determine_project_language(tmpdir, strategy="package_json")
        assert detected == ProgrammingLanguage.PYTHON

        # git_most_common: Python is the most common language but falls under the ratio -> OTHER
        detected = determine_project_language(tmpdir, strategy="git_most_common")
        assert detected == ProgrammingLanguage.OTHER

        # most_common: same threshold applies -> OTHER
        detected = determine_project_language(tmpdir, strategy="most_common")
        assert detected == ProgrammingLanguage.OTHER


def test_determine_language_gitignore(tmpdir) -> None:
    """Git-ignored Python files are seen by ``most_common`` but not by ``git_most_common``."""
    ignored_sources = {
        "dir/file1.py": "",
        "dir/file2.py": "",
        "dir/file3.py": "",
        ".gitignore": "dir",
    }
    with get_codebase_session(tmpdir=tmpdir, files=ignored_sources, programming_language=ProgrammingLanguage.PYTHON):
        # package_json: no package.json is present, so the answer is PYTHON
        detected = determine_project_language(tmpdir, strategy="package_json")
        assert detected == ProgrammingLanguage.PYTHON

        # git_most_common: honours .gitignore, so it finds no files -> OTHER
        detected = determine_project_language(tmpdir, strategy="git_most_common")
        assert detected == ProgrammingLanguage.OTHER

        # most_common: does not consult .gitignore, so it sees the Python files
        detected = determine_project_language(tmpdir, strategy="most_common")
        assert detected == ProgrammingLanguage.PYTHON
23 changes: 23 additions & 0 deletions tests/unit/codegen/sdk/codebase/file/test_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from codegen.sdk.codebase.factory.get_session import get_codebase_session
from codegen.sdk.core.file import File, SourceFile
from codegen.shared.enums.programming_language import ProgrammingLanguage


def test_file(tmpdir) -> None:
Expand Down Expand Up @@ -59,6 +60,28 @@ def test_codebase_files(tmpdir) -> None:
assert {f for f in codebase.files(extensions=[".bin"])} == {file3}


def test_codebase_files_other_language(tmpdir) -> None:
    """With language OTHER, ``codebase.files`` lists every file and still honours extension filters."""
    contents = {"file1.py": "print(123)", "file2.py": "print(456)", "file3.bin": b"\x89PNG", "file4": "Hello world!"}
    with get_codebase_session(tmpdir=tmpdir, files=contents, programming_language=ProgrammingLanguage.OTHER) as codebase:
        py_first = codebase.get_file("file1.py")
        py_second = codebase.get_file("file2.py")
        binary = codebase.get_file("file3.bin")
        no_suffix = codebase.get_file("file4")

        # No filter: every file is returned because the language is OTHER.
        assert len(codebase.files) == 4
        assert set(codebase.files) == {py_first, py_second, binary, no_suffix}

        # Wildcard filter: again every file.
        assert len(codebase.files(extensions="*")) == 4
        assert set(codebase.files(extensions="*")) == {py_first, py_second, binary, no_suffix}

        # Explicit extension lists narrow the result.
        assert len(codebase.files(extensions=[".py"])) == 2
        assert set(codebase.files(extensions=[".py"])) == {py_first, py_second}

        assert len(codebase.files(extensions=[".bin"])) == 1
        assert set(codebase.files(extensions=[".bin"])) == {binary}


@pytest.mark.skipif(sys.platform == "darwin", reason="macOS is case-insensitive")
def test_file_extensions_ignore_case(tmpdir) -> None:
with get_codebase_session(tmpdir=tmpdir, files={"file1.py": "print(123)", "file2.py": "print(456)", "file3.bin": b"\x89PNG", "file4": "Hello world!"}) as codebase:
Expand Down
19 changes: 17 additions & 2 deletions tests/unit/codegen/sdk/python/codebase/test_codebase_reset.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,8 +129,23 @@ def test_codebase_reset_gitignore(tmpdir: str) -> None:
def square(x: a):
return x * x
"""
with get_codebase_session(tmpdir=tmpdir, files={"dir/file0.py": file0_content, ".gitignore": gitignore_content}, programming_language=ProgrammingLanguage.PYTHON) as codebase:
assert len(codebase.files) == 0
file1_content = """
from dir.file0 import square

class MyClass:
def foo(self, arg1, arg2):
return arg1 + square(arg2)
"""
with get_codebase_session(
tmpdir=tmpdir,
files={
"dir/file0.py": file0_content,
"dir/file1.py": file1_content,
".gitignore": gitignore_content,
},
programming_language=ProgrammingLanguage.PYTHON,
) as codebase:
assert len(codebase.files) == 1
codebase.reset()
codebase.checkout(branch="test-branch", create_if_missing=True)
codebase.commit(sync_graph=True)
Expand Down