Skip to content

Add git-based codebase language detection #539

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Feb 19, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 80 additions & 19 deletions src/codegen/git/utils/language.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,42 +2,31 @@
from pathlib import Path
from typing import Literal

from codegen.git.utils.file_utils import split_git_path
from codegen.shared.enums.programming_language import ProgrammingLanguage


def determine_project_language(folder_path: str, strategy: Literal["most_common", "package_json"] = "package_json") -> ProgrammingLanguage:
def determine_project_language(folder_path: str, strategy: Literal["most_common", "git_most_common", "package_json"] = "git_most_common") -> ProgrammingLanguage:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can this be an Enum please?

"""Determines the primary programming language of a project.

Args:
folder_path (str): Path to the folder to analyze
strategy (Literal["most_common", "package_json"]): Strategy to use for determining language.
"most_common" analyzes file extensions, "package_json" checks for package.json presence.
strategy (Literal["most_common", "git_most_common", "package_json"]): Strategy to use for determining language.
"most_common" analyzes file extensions, "git_most_common" analyzes files in the git repo, "package_json" checks for package.json presence.

Returns:
ProgrammingLanguage: The determined programming language
"""
# TODO: Create a new strategy that follows gitignore
if strategy == "most_common":
return _determine_language_by_file_count(folder_path)
elif strategy == "git_most_common":
return _determine_language_by_git_file_count(folder_path)
elif strategy == "package_json":
return _determine_language_by_package_json(folder_path)


def _determine_language_by_package_json(folder_path: str) -> ProgrammingLanguage:
"""Determines project language by checking for presence of package.json.
Faster but less accurate than file count strategy.

Args:
folder_path (str): Path to the folder to analyze

Returns:
ProgrammingLanguage: TYPESCRIPT if package.json exists, otherwise PYTHON
"""
package_json_path = Path(folder_path) / "package.json"
if package_json_path.exists():
return ProgrammingLanguage.TYPESCRIPT
else:
return ProgrammingLanguage.PYTHON
msg = f"Invalid strategy: {strategy}"
raise ValueError(msg)

Check warning on line 29 in src/codegen/git/utils/language.py

View check run for this annotation

Codecov / codecov/patch

src/codegen/git/utils/language.py#L28-L29

Added lines #L28 - L29 were not covered by tests


def _determine_language_by_file_count(folder_path: str) -> ProgrammingLanguage:
Expand All @@ -64,7 +53,7 @@
raise ValueError(msg)

# Initialize counters for each language
language_counts = Counter()

Check failure on line 56 in src/codegen/git/utils/language.py

View workflow job for this annotation

GitHub Actions / mypy

error: Need type annotation for "language_counts" [var-annotated]

# Walk through the directory
for file_path in folder.rglob("*"):
Expand All @@ -87,3 +76,75 @@

# Return the language with the highest count
return language_counts.most_common(1)[0][0]


def _determine_language_by_git_file_count(folder_path: str) -> ProgrammingLanguage:
    """Determine the primary language of a git repo by counting file extensions.

    Only the paths yielded by ``LocalRepoOperator.iter_files`` are considered
    (presumably the files git knows about -- confirm against the operator),
    restricted to the sub-directory of the repo that ``folder_path`` points at.

    Args:
        folder_path (str): Path to the git repo (or a folder inside it) to analyze

    Returns:
        ProgrammingLanguage: The language with the most matching files, or
            UNSUPPORTED if no file matches any known extension

    Raises:
        ValueError: If ``folder_path`` is not an existing directory
    """
    # Imported locally rather than at module level; presumably to avoid an
    # import cycle with the repo operator / SDK packages -- TODO confirm.
    from codegen.git.repo_operator.local_repo_operator import LocalRepoOperator
    from codegen.git.schemas.repo_config import RepoConfig
    from codegen.sdk.python import PyFile
    from codegen.sdk.typescript.file import TSFile

    EXTENSIONS = {
        ProgrammingLanguage.PYTHON: PyFile.get_extensions(),
        ProgrammingLanguage.TYPESCRIPT: TSFile.get_extensions(),
    }

    # Path.is_dir() is already False for a non-existent path, so a separate
    # exists() check is unnecessary.
    folder = Path(folder_path)
    if not folder.is_dir():
        msg = f"Invalid folder path: {folder_path}"
        raise ValueError(msg)

    # Explicit annotation: mypy cannot infer the key type of an empty Counter.
    language_counts: Counter[ProgrammingLanguage] = Counter()

    # Split the path into the repo root and the sub-path inside the repo,
    # then open the repo through LocalRepoOperator.
    git_root, base_path = split_git_path(folder_path)
    repo_config = RepoConfig.from_repo_path(repo_path=git_root)
    repo_operator = LocalRepoOperator(repo_config=repo_config)

    # An empty base_path means folder_path is the repo root: no sub-dir filter.
    subdirs = [base_path] if base_path else None
    for rel_path, _ in repo_operator.iter_files(subdirs=subdirs):
        # Paths from the operator are joined onto the repo root here.
        file_path = Path(git_root) / Path(rel_path)

        # Skip directories and hidden files
        if file_path.is_dir() or file_path.name.startswith("."):
            continue

        # Count files for each language based on extensions
        for language, exts in EXTENSIONS.items():
            if file_path.suffix in exts:
                language_counts[language] += 1

    # No file matched any known extension
    if not language_counts:
        return ProgrammingLanguage.UNSUPPORTED

    # Return the language with the highest count
    return language_counts.most_common(1)[0][0]


def _determine_language_by_package_json(folder_path: str) -> ProgrammingLanguage:
    """Pick the project language from the presence of a package.json file.

    Only checks whether ``<folder_path>/package.json`` exists; no source
    files are inspected.

    Args:
        folder_path (str): Path to the folder to analyze

    Returns:
        ProgrammingLanguage: TYPESCRIPT if package.json exists, otherwise PYTHON
    """
    has_package_json = (Path(folder_path) / "package.json").exists()
    return ProgrammingLanguage.TYPESCRIPT if has_package_json else ProgrammingLanguage.PYTHON

Check warning on line 150 in src/codegen/git/utils/language.py

View check run for this annotation

Codecov / codecov/patch

src/codegen/git/utils/language.py#L150

Added line #L150 was not covered by tests
Loading