Skip to content

Commit b91cdd8

Browse files
authored
Add git-based codebase language detection (#539)
# Motivation

The legacy `most_common` strategy is slow and unnecessarily scans through `node_modules`, and the current `package_json` strategy is flaky. The new `git_most_common` strategy aims to address both of these issues.

# Content

Adds the `git_most_common` strategy to `determine_project_language`.

![image](https://github.com/user-attachments/assets/39ba0bc9-43bf-4039-a660-75da2252aa72)

# Please check the following before marking your PR as ready for review

- [ ] I have added tests for my changes
- [ ] I have updated the documentation or added new documentation as needed
1 parent 61964af commit b91cdd8

File tree

1 file changed

+80
-19
lines changed

1 file changed

+80
-19
lines changed

src/codegen/git/utils/language.py

Lines changed: 80 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -2,42 +2,31 @@
22
from pathlib import Path
33
from typing import Literal
44

5+
from codegen.git.utils.file_utils import split_git_path
56
from codegen.shared.enums.programming_language import ProgrammingLanguage
67

78

8-
def determine_project_language(folder_path: str, strategy: Literal["most_common", "package_json"] = "package_json") -> ProgrammingLanguage:
9+
def determine_project_language(folder_path: str, strategy: Literal["most_common", "git_most_common", "package_json"] = "git_most_common") -> ProgrammingLanguage:
910
"""Determines the primary programming language of a project.
1011
1112
Args:
1213
folder_path (str): Path to the folder to analyze
13-
strategy (Literal["most_common", "package_json"]): Strategy to use for determining language.
14-
"most_common" analyzes file extensions, "package_json" checks for package.json presence.
14+
strategy (Literal["most_common", "git_most_common", "package_json"]): Strategy to use for determining language.
15+
"most_common" analyzes file extensions, "git_most_common" analyzes files in the git repo, "package_json" checks for package.json presence.
1516
1617
Returns:
1718
ProgrammingLanguage: The determined programming language
1819
"""
1920
# TODO: Create a new strategy that follows gitignore
2021
if strategy == "most_common":
2122
return _determine_language_by_file_count(folder_path)
23+
elif strategy == "git_most_common":
24+
return _determine_language_by_git_file_count(folder_path)
2225
elif strategy == "package_json":
2326
return _determine_language_by_package_json(folder_path)
24-
25-
26-
def _determine_language_by_package_json(folder_path: str) -> ProgrammingLanguage:
27-
"""Determines project language by checking for presence of package.json.
28-
Faster but less accurate than file count strategy.
29-
30-
Args:
31-
folder_path (str): Path to the folder to analyze
32-
33-
Returns:
34-
ProgrammingLanguage: TYPESCRIPT if package.json exists, otherwise PYTHON
35-
"""
36-
package_json_path = Path(folder_path) / "package.json"
37-
if package_json_path.exists():
38-
return ProgrammingLanguage.TYPESCRIPT
3927
else:
40-
return ProgrammingLanguage.PYTHON
28+
msg = f"Invalid strategy: {strategy}"
29+
raise ValueError(msg)
4130

4231

4332
def _determine_language_by_file_count(folder_path: str) -> ProgrammingLanguage:
@@ -87,3 +76,75 @@ def _determine_language_by_file_count(folder_path: str) -> ProgrammingLanguage:
8776

8877
# Return the language with the highest count
8978
return language_counts.most_common(1)[0][0]
79+
80+
81+
def _determine_language_by_git_file_count(folder_path: str) -> ProgrammingLanguage:
82+
"""Analyzes a git repo to determine the primary programming language based on file extensions.
83+
Returns the language with the most matching files.
84+
85+
Args:
86+
folder_path (str): Path to the git repo to analyze
87+
88+
Returns:
89+
ProgrammingLanguage: The dominant programming language, or UNSUPPORTED if no matching files found
90+
"""
91+
from codegen.git.repo_operator.local_repo_operator import LocalRepoOperator
92+
from codegen.git.schemas.repo_config import RepoConfig
93+
from codegen.sdk.python import PyFile
94+
from codegen.sdk.typescript.file import TSFile
95+
96+
EXTENSIONS = {
97+
ProgrammingLanguage.PYTHON: PyFile.get_extensions(),
98+
ProgrammingLanguage.TYPESCRIPT: TSFile.get_extensions(),
99+
}
100+
101+
folder = Path(folder_path)
102+
if not folder.exists() or not folder.is_dir():
103+
msg = f"Invalid folder path: {folder_path}"
104+
raise ValueError(msg)
105+
106+
# Initialize counters for each language
107+
language_counts = Counter()
108+
109+
# Initiate LocalRepoOperator
110+
git_root, base_path = split_git_path(folder_path)
111+
repo_config = RepoConfig.from_repo_path(repo_path=git_root)
112+
repo_operator = LocalRepoOperator(repo_config=repo_config)
113+
114+
# Walk through the directory
115+
for rel_path, _ in repo_operator.iter_files(subdirs=[base_path] if base_path else None):
116+
# Convert to Path object
117+
file_path = Path(git_root) / Path(rel_path)
118+
119+
# Skip directories and hidden files
120+
if file_path.is_dir() or file_path.name.startswith("."):
121+
continue
122+
123+
# Count files for each language based on extensions
124+
for language, exts in EXTENSIONS.items():
125+
if file_path.suffix in exts:
126+
language_counts[language] += 1
127+
128+
# If no matching files were found, return UNSUPPORTED
129+
if not language_counts:
130+
return ProgrammingLanguage.UNSUPPORTED
131+
132+
# Return the language with the highest count
133+
return language_counts.most_common(1)[0][0]
134+
135+
136+
def _determine_language_by_package_json(folder_path: str) -> ProgrammingLanguage:
137+
"""Determines project language by checking for presence of package.json.
138+
Faster but less accurate than file count strategy.
139+
140+
Args:
141+
folder_path (str): Path to the folder to analyze
142+
143+
Returns:
144+
ProgrammingLanguage: TYPESCRIPT if package.json exists, otherwise PYTHON
145+
"""
146+
package_json_path = Path(folder_path) / "package.json"
147+
if package_json_path.exists():
148+
return ProgrammingLanguage.TYPESCRIPT
149+
else:
150+
return ProgrammingLanguage.PYTHON

0 commit comments

Comments
 (0)