|
2 | 2 | from pathlib import Path
|
3 | 3 | from typing import Literal
|
4 | 4 |
|
| 5 | +from codegen.git.utils.file_utils import split_git_path |
5 | 6 | from codegen.shared.enums.programming_language import ProgrammingLanguage
|
6 | 7 |
|
7 | 8 |
|
8 |
| -def determine_project_language(folder_path: str, strategy: Literal["most_common", "package_json"] = "package_json") -> ProgrammingLanguage: |
def determine_project_language(folder_path: str, strategy: Literal["most_common", "git_most_common", "package_json"] = "git_most_common") -> ProgrammingLanguage:
    """Pick the primary programming language of a project.

    Args:
        folder_path (str): Path to the folder to analyze
        strategy (Literal["most_common", "git_most_common", "package_json"]): How the language is detected.
            "most_common" counts file extensions in the folder, "git_most_common" counts
            file extensions of files in the git repo, "package_json" checks for package.json presence.

    Returns:
        ProgrammingLanguage: The determined programming language

    Raises:
        ValueError: If ``strategy`` is not one of the supported values.
    """
    # TODO: Create a new strategy that follows gitignore
    # NOTE(review): "git_most_common" may already cover this - confirm and drop the TODO.

    # Guard clauses: every recognised strategy hands off to its dedicated helper.
    if strategy == "most_common":
        return _determine_language_by_file_count(folder_path)
    if strategy == "git_most_common":
        return _determine_language_by_git_file_count(folder_path)
    if strategy == "package_json":
        return _determine_language_by_package_json(folder_path)

    # Anything else is a caller error.
    msg = f"Invalid strategy: {strategy}"
    raise ValueError(msg)
41 | 30 |
|
42 | 31 |
|
43 | 32 | def _determine_language_by_file_count(folder_path: str) -> ProgrammingLanguage:
|
@@ -87,3 +76,75 @@ def _determine_language_by_file_count(folder_path: str) -> ProgrammingLanguage:
|
87 | 76 |
|
88 | 77 | # Return the language with the highest count
|
89 | 78 | return language_counts.most_common(1)[0][0]
|
| 79 | + |
| 80 | + |
def _determine_language_by_git_file_count(folder_path: str) -> ProgrammingLanguage:
    """Pick the dominant language of a git repo by tallying file extensions.

    Only paths yielded by ``LocalRepoOperator.iter_files`` are considered; each
    one whose suffix belongs to a known language adds one to that language's tally.

    Args:
        folder_path (str): Path to the git repo to analyze

    Returns:
        ProgrammingLanguage: The language with the most matching files, or UNSUPPORTED if no file matched

    Raises:
        ValueError: If ``folder_path`` does not exist or is not a directory.
    """
    # Kept as function-scope imports, exactly as the original block has them.
    from codegen.git.repo_operator.local_repo_operator import LocalRepoOperator
    from codegen.git.schemas.repo_config import RepoConfig
    from codegen.sdk.python import PyFile
    from codegen.sdk.typescript.file import TSFile

    extensions_by_language = {
        ProgrammingLanguage.PYTHON: PyFile.get_extensions(),
        ProgrammingLanguage.TYPESCRIPT: TSFile.get_extensions(),
    }

    target = Path(folder_path)
    if not (target.exists() and target.is_dir()):
        msg = f"Invalid folder path: {folder_path}"
        raise ValueError(msg)

    # presumably yields the repo root and the folder's path relative to it -
    # confirm against split_git_path
    git_root, base_path = split_git_path(folder_path)
    repo_op = LocalRepoOperator(repo_config=RepoConfig.from_repo_path(repo_path=git_root))

    # An empty base_path means "no sub-directory filter".
    subdirs = [base_path] if base_path else None

    tallies = Counter()
    for relative_path, _ in repo_op.iter_files(subdirs=subdirs):
        candidate = Path(git_root) / Path(relative_path)

        # Directories and dot-files never contribute to the tally.
        if not candidate.is_dir() and not candidate.name.startswith("."):
            tallies.update(
                language
                for language, suffixes in extensions_by_language.items()
                if candidate.suffix in suffixes
            )

    # Nothing matched any known extension.
    if not tallies:
        return ProgrammingLanguage.UNSUPPORTED

    # most_common(1) yields a single (language, count) pair.
    [(dominant_language, _)] = tallies.most_common(1)
    return dominant_language
| 134 | + |
| 135 | + |
def _determine_language_by_package_json(folder_path: str) -> ProgrammingLanguage:
    """Guess the project language from whether a package.json is present.

    Faster but less accurate than the file count strategies.

    Args:
        folder_path (str): Path to the folder to analyze

    Returns:
        ProgrammingLanguage: TYPESCRIPT if ``folder_path/package.json`` exists, otherwise PYTHON
    """
    has_package_json = (Path(folder_path) / "package.json").exists()
    return ProgrammingLanguage.TYPESCRIPT if has_package_json else ProgrammingLanguage.PYTHON
0 commit comments