Skip to content

Commit a674dc0

Browse files
authored
Allow primitive file creation, edit, and deletion for non-supported codebase languages. (#542)
# Motivation <!-- Why is this change necessary? --> # Content <!-- Please include a summary of the change --> # Testing <!-- How was the change tested? --> # Please check the following before marking your PR as ready for review - [ ] I have added tests for my changes - [ ] I have updated the documentation or added new documentation as needed
1 parent 834b600 commit a674dc0

File tree

9 files changed

+198
-14
lines changed

9 files changed

+198
-14
lines changed

src/codegen/git/utils/language.py

Lines changed: 40 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,16 @@
1+
import logging
12
from collections import Counter
23
from pathlib import Path
34
from typing import Literal
45

56
from codegen.git.utils.file_utils import split_git_path
67
from codegen.shared.enums.programming_language import ProgrammingLanguage
78

9+
logger = logging.getLogger(__name__)
10+
11+
# Minimum ratio of files that must match the dominant language
12+
MIN_LANGUAGE_RATIO = 0.1
13+
814

915
def determine_project_language(folder_path: str, strategy: Literal["most_common", "git_most_common", "package_json"] = "git_most_common") -> ProgrammingLanguage:
1016
"""Determines the primary programming language of a project.
@@ -37,7 +43,8 @@ def _determine_language_by_file_count(folder_path: str) -> ProgrammingLanguage:
3743
folder_path (str): Path to the folder to analyze
3844
3945
Returns:
40-
ProgrammingLanguage: The dominant programming language, or UNSUPPORTED if no matching files found
46+
ProgrammingLanguage: The dominant programming language, or OTHER if no matching files found
47+
or if less than MIN_LANGUAGE_RATIO of files match the dominant language
4148
"""
4249
from codegen.sdk.python import PyFile
4350
from codegen.sdk.typescript.file import TSFile
@@ -54,6 +61,7 @@ def _determine_language_by_file_count(folder_path: str) -> ProgrammingLanguage:
5461

5562
# Initialize counters for each language
5663
language_counts = Counter()
64+
total_files = 0
5765

5866
# Walk through the directory
5967
for file_path in folder.rglob("*"):
@@ -65,17 +73,27 @@ def _determine_language_by_file_count(folder_path: str) -> ProgrammingLanguage:
6573
if any(ignore in str(file_path) for ignore in [".git", "node_modules", "__pycache__", "venv", ".env"]):
6674
continue
6775

76+
total_files += 1
77+
6878
# Count files for each language based on extensions
6979
for language, exts in EXTENSIONS.items():
7080
if file_path.suffix in exts:
7181
language_counts[language] += 1
7282

7383
# If no files matched a known language extension, fall back to the non-supported language value
7484
if not language_counts:
75-
return ProgrammingLanguage.UNSUPPORTED
85+
return ProgrammingLanguage.OTHER
86+
87+
# Get the most common language and its count
88+
most_common_language, count = language_counts.most_common(1)[0]
89+
90+
logger.debug(f"Most common language: {most_common_language}, count: {count}, total files: {total_files}")
7691

77-
# Return the language with the highest count
78-
return language_counts.most_common(1)[0][0]
92+
# Check if the most common language makes up at least MIN_LANGUAGE_RATIO of all files
93+
if total_files > 0 and (count / total_files) < MIN_LANGUAGE_RATIO:
94+
return ProgrammingLanguage.OTHER
95+
96+
return most_common_language
7997

8098

8199
def _determine_language_by_git_file_count(folder_path: str) -> ProgrammingLanguage:
@@ -86,7 +104,8 @@ def _determine_language_by_git_file_count(folder_path: str) -> ProgrammingLangua
86104
folder_path (str): Path to the git repo to analyze
87105
88106
Returns:
89-
ProgrammingLanguage: The dominant programming language, or UNSUPPORTED if no matching files found
107+
ProgrammingLanguage: The dominant programming language, or OTHER if no matching files found
108+
or if less than MIN_LANGUAGE_RATIO of files match the dominant language
90109
"""
91110
from codegen.git.repo_operator.repo_operator import RepoOperator
92111
from codegen.git.schemas.repo_config import RepoConfig
@@ -105,6 +124,7 @@ def _determine_language_by_git_file_count(folder_path: str) -> ProgrammingLangua
105124

106125
# Initialize counters for each language
107126
language_counts = Counter()
127+
total_files = 0
108128

109129
# Initiate RepoOperator
110130
git_root, base_path = split_git_path(folder_path)
@@ -120,17 +140,27 @@ def _determine_language_by_git_file_count(folder_path: str) -> ProgrammingLangua
120140
if file_path.is_dir() or file_path.name.startswith("."):
121141
continue
122142

143+
total_files += 1
144+
123145
# Count files for each language based on extensions
124146
for language, exts in EXTENSIONS.items():
125147
if file_path.suffix in exts:
126148
language_counts[language] += 1
127149

128150
# If no files matched a known language extension, fall back to the non-supported language value
129151
if not language_counts:
130-
return ProgrammingLanguage.UNSUPPORTED
152+
return ProgrammingLanguage.OTHER
153+
154+
# Get the most common language and its count
155+
most_common_language, count = language_counts.most_common(1)[0]
156+
157+
logger.debug(f"Most common language: {most_common_language}, count: {count}, total files: {total_files}")
158+
159+
# Check if the most common language makes up at least MIN_LANGUAGE_RATIO of all files
160+
if total_files > 0 and (count / total_files) < MIN_LANGUAGE_RATIO:
161+
return ProgrammingLanguage.OTHER
131162

132-
# Return the language with the highest count
133-
return language_counts.most_common(1)[0][0]
163+
return most_common_language
134164

135165

136166
def _determine_language_by_package_json(folder_path: str) -> ProgrammingLanguage:
@@ -145,6 +175,8 @@ def _determine_language_by_package_json(folder_path: str) -> ProgrammingLanguage
145175
"""
146176
package_json_path = Path(folder_path) / "package.json"
147177
if package_json_path.exists():
178+
logger.debug(f"Found package.json at {package_json_path}")
148179
return ProgrammingLanguage.TYPESCRIPT
149180
else:
181+
logger.debug(f"No package.json found at {package_json_path}")
150182
return ProgrammingLanguage.PYTHON

src/codegen/sdk/codebase/codebase_context.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,8 +75,9 @@ def get_node_classes(programming_language: ProgrammingLanguage) -> NodeClasses:
7575

7676
return TSNodeClasses
7777
else:
78-
msg = f"Unsupported programming language: {programming_language}!"
79-
raise ValueError(msg)
78+
from codegen.sdk.codebase.node_classes.generic_node_classes import GenericNodeClasses
79+
80+
return GenericNodeClasses
8081

8182

8283
class CodebaseContext:
@@ -147,6 +148,7 @@ def __init__(
147148
self.config = config
148149
self.repo_name = context.repo_operator.repo_name
149150
self.repo_path = str(Path(context.repo_operator.repo_path).resolve())
151+
self.full_path = os.path.join(self.repo_path, context.base_path) if context.base_path else self.repo_path
150152
self.codeowners_parser = context.repo_operator.codeowners_parser
151153
self.base_url = context.repo_operator.base_url
152154
# =====[ computed attributes ]=====
@@ -163,6 +165,11 @@ def __init__(
163165
self.language_engine = get_language_engine(context.programming_language, self)
164166
self.programming_language = context.programming_language
165167

168+
# Log a warning (nothing is raised) if the language is not supported
169+
if self.programming_language is ProgrammingLanguage.UNSUPPORTED or self.programming_language is ProgrammingLanguage.OTHER:
170+
logger.warning("WARNING: The codebase is using an unsupported language!")
171+
logger.warning("Some features may not work as expected. Advanced static analysis will be disabled but simple file IO will still work.")
172+
166173
# Build the graph
167174
self.build_graph(context.repo_operator)
168175
try:
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
from codegen.sdk.codebase.node_classes.node_classes import NodeClasses
2+
from codegen.sdk.core.class_definition import Class
3+
from codegen.sdk.core.detached_symbols.code_block import CodeBlock
4+
from codegen.sdk.core.detached_symbols.function_call import FunctionCall
5+
from codegen.sdk.core.detached_symbols.parameter import Parameter
6+
from codegen.sdk.core.file import File
7+
from codegen.sdk.core.function import Function
8+
from codegen.sdk.core.import_resolution import Import
9+
from codegen.sdk.core.statements.comment import Comment
10+
11+
GenericNodeClasses = NodeClasses(
12+
file_cls=File,
13+
class_cls=Class,
14+
function_cls=Function,
15+
import_cls=Import,
16+
parameter_cls=Parameter,
17+
comment_cls=Comment,
18+
code_block_cls=CodeBlock,
19+
function_call_cls=FunctionCall,
20+
bool_conversion={},
21+
dynamic_import_parent_types={},
22+
)

src/codegen/sdk/core/codebase.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -258,15 +258,20 @@ def files(self, *, extensions: list[str] | Literal["*"] | None = None) -> list[T
258258
By default, this only returns source files. Setting `extensions='*'` will return all files in the codebase, and
259259
`extensions=[...]` will return all files with the specified extensions.
260260
261-
`extensions='*'` is REQUIRED for listing all non source code files. Or else, codebase.files will ONLY return source files (e.g. .py, .ts).
261+
For Python and Typescript repos WITH file parsing enabled,
262+
`extensions='*'` is REQUIRED for listing all non source code files.
263+
Or else, codebase.files will ONLY return source files (e.g. .py, .ts).
264+
265+
For repos with file parsing disabled or repos with other languages, this will return all files in the codebase.
262266
263267
Returns all Files in the codebase, sorted alphabetically. For Python codebases, returns PyFiles (python files).
264268
For Typescript codebases, returns TSFiles (typescript files).
265269
266270
Returns:
267271
list[TSourceFile]: A sorted list of source files in the codebase.
268272
"""
269-
if extensions is None:
273+
if extensions is None and len(self.ctx.get_nodes(NodeType.FILE)) > 0:
274+
# If extensions is None AND there is at least one file in the codebase (This checks for unsupported languages or parse-off repos),
270275
# Return all source files
271276
files = self.ctx.get_nodes(NodeType.FILE)
272277
elif isinstance(extensions, str) and extensions != "*":

src/codegen/sdk/core/file.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -398,6 +398,12 @@ def replace(self, old: str, new: str, count: int = -1, is_regex: bool = False, p
398398
else:
399399
return super().replace(old, new, count, is_regex, priority)
400400

401+
@staticmethod
402+
@noapidoc
403+
def get_extensions() -> list[str]:
404+
"""Returns a list of file extensions for the given programming language file."""
405+
return [] # By default, no extensions are "supported" for generic files
406+
401407

402408
TImport = TypeVar("TImport", bound="Import")
403409
TFunction = TypeVar("TFunction", bound="Function")

src/codegen/shared/enums/programming_language.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,5 @@
44
class ProgrammingLanguage(StrEnum):
55
TYPESCRIPT = "TYPESCRIPT"
66
PYTHON = "PYTHON"
7+
OTHER = "OTHER"
78
UNSUPPORTED = "UNSUPPORTED"
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
from codegen.git.utils.language import determine_project_language
2+
from codegen.sdk.codebase.factory.get_session import get_codebase_session
3+
from codegen.shared.enums.programming_language import ProgrammingLanguage
4+
5+
6+
def test_determine_language_python(tmpdir) -> None:
7+
with get_codebase_session(tmpdir=tmpdir, files={"file1.py": "", "file2.py": "", "file3.py": ""}, programming_language=ProgrammingLanguage.PYTHON) as codebase:
8+
# Check for package.json -> False, therefore return PYTHON
9+
assert determine_project_language(tmpdir, strategy="package_json") == ProgrammingLanguage.PYTHON
10+
# Check for git_most_common -> PYTHON
11+
assert determine_project_language(tmpdir, strategy="git_most_common") == ProgrammingLanguage.PYTHON
12+
# Check for most_common -> PYTHON
13+
assert determine_project_language(tmpdir, strategy="most_common") == ProgrammingLanguage.PYTHON
14+
15+
16+
def test_determine_language_typescript(tmpdir) -> None:
17+
with get_codebase_session(tmpdir=tmpdir, files={"file1.ts": "", "file2.ts": "", "file3.ts": ""}, programming_language=ProgrammingLanguage.TYPESCRIPT) as codebase:
18+
# Check for package.json -> False, therefore return PYTHON (THIS IS EXPECTED, even if it's a TS project)
19+
assert determine_project_language(tmpdir, strategy="package_json") == ProgrammingLanguage.PYTHON
20+
# Check for git_most_common -> TYPESCRIPT
21+
assert determine_project_language(tmpdir, strategy="git_most_common") == ProgrammingLanguage.TYPESCRIPT
22+
# Check for most_common -> TYPESCRIPT
23+
assert determine_project_language(tmpdir, strategy="most_common") == ProgrammingLanguage.TYPESCRIPT
24+
25+
26+
def test_determine_language_other(tmpdir) -> None:
27+
with get_codebase_session(tmpdir=tmpdir, files={"file1.txt": "", "file2.txt": "", "file3.txt": ""}, programming_language=ProgrammingLanguage.OTHER) as codebase:
28+
# Check for package.json -> False, therefore return PYTHON (THIS IS EXPECTED)
29+
assert determine_project_language(tmpdir, strategy="package_json") == ProgrammingLanguage.PYTHON
30+
# Check for git_most_common -> OTHER
31+
assert determine_project_language(tmpdir, strategy="git_most_common") == ProgrammingLanguage.OTHER
32+
# Check for most_common -> OTHER
33+
assert determine_project_language(tmpdir, strategy="most_common") == ProgrammingLanguage.OTHER
34+
35+
36+
def test_determine_language_package_json(tmpdir) -> None:
37+
with get_codebase_session(tmpdir=tmpdir, files={"package.json": ""}, programming_language=ProgrammingLanguage.TYPESCRIPT) as codebase:
38+
# Check for package.json -> True, therefore return Typescript
39+
assert determine_project_language(tmpdir, strategy="package_json") == ProgrammingLanguage.TYPESCRIPT
40+
# Check for git_most_common -> OTHER
41+
assert determine_project_language(tmpdir, strategy="git_most_common") == ProgrammingLanguage.OTHER
42+
# Check for most_common -> OTHER
43+
assert determine_project_language(tmpdir, strategy="most_common") == ProgrammingLanguage.OTHER
44+
45+
46+
def test_determine_language_mixed(tmpdir) -> None:
47+
with get_codebase_session(tmpdir=tmpdir, files={"file1.py": "", "file2.ts": "", "file3.txt": ""}, programming_language=ProgrammingLanguage.PYTHON) as codebase:
48+
# Check for package.json -> False, therefore return PYTHON
49+
assert determine_project_language(tmpdir, strategy="package_json") == ProgrammingLanguage.PYTHON
50+
# Check for git_most_common -> PYTHON
51+
assert determine_project_language(tmpdir, strategy="git_most_common") == ProgrammingLanguage.PYTHON
52+
# Check for most_common -> PYTHON
53+
assert determine_project_language(tmpdir, strategy="most_common") == ProgrammingLanguage.PYTHON
54+
55+
56+
def test_determine_language_threshold(tmpdir) -> None:
57+
with get_codebase_session(tmpdir=tmpdir, files={"file0.py": ""} | {f"file{i}.txt": "" for i in range(1, 20)}, programming_language=ProgrammingLanguage.PYTHON) as codebase:
58+
# Check for package.json -> False, therefore return PYTHON
59+
assert determine_project_language(tmpdir, strategy="package_json") == ProgrammingLanguage.PYTHON
60+
# Check for git_most_common -> OTHER
61+
assert determine_project_language(tmpdir, strategy="git_most_common") == ProgrammingLanguage.OTHER
62+
# Check for most_common -> OTHER
63+
assert determine_project_language(tmpdir, strategy="most_common") == ProgrammingLanguage.OTHER
64+
65+
66+
def test_determine_language_gitignore(tmpdir) -> None:
67+
with get_codebase_session(tmpdir=tmpdir, files={"dir/file1.py": "", "dir/file2.py": "", "dir/file3.py": "", ".gitignore": "dir"}, programming_language=ProgrammingLanguage.PYTHON) as codebase:
68+
# Check for package.json -> False, therefore return PYTHON
69+
assert determine_project_language(tmpdir, strategy="package_json") == ProgrammingLanguage.PYTHON
70+
# Check for git_most_common -> OTHER (follows gitignore, therefore finds no files)
71+
assert determine_project_language(tmpdir, strategy="git_most_common") == ProgrammingLanguage.OTHER
72+
# Check for most_common -> PYTHON (ignores gitignore)
73+
assert determine_project_language(tmpdir, strategy="most_common") == ProgrammingLanguage.PYTHON

tests/unit/codegen/sdk/codebase/file/test_file.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
from codegen.sdk.codebase.factory.get_session import get_codebase_session
66
from codegen.sdk.core.file import File, SourceFile
7+
from codegen.shared.enums.programming_language import ProgrammingLanguage
78

89

910
def test_file(tmpdir) -> None:
@@ -59,6 +60,28 @@ def test_codebase_files(tmpdir) -> None:
5960
assert {f for f in codebase.files(extensions=[".bin"])} == {file3}
6061

6162

63+
def test_codebase_files_other_language(tmpdir) -> None:
64+
with get_codebase_session(
65+
tmpdir=tmpdir, files={"file1.py": "print(123)", "file2.py": "print(456)", "file3.bin": b"\x89PNG", "file4": "Hello world!"}, programming_language=ProgrammingLanguage.OTHER
66+
) as codebase:
67+
file1 = codebase.get_file("file1.py")
68+
file2 = codebase.get_file("file2.py")
69+
file3 = codebase.get_file("file3.bin")
70+
file4 = codebase.get_file("file4")
71+
72+
assert len(codebase.files) == 4 # Match all files if the language is OTHER
73+
assert {f for f in codebase.files} == {file1, file2, file3, file4}
74+
75+
assert len(codebase.files(extensions="*")) == 4
76+
assert {f for f in codebase.files(extensions="*")} == {file1, file2, file3, file4}
77+
78+
assert len(codebase.files(extensions=[".py"])) == 2
79+
assert {f for f in codebase.files(extensions=[".py"])} == {file1, file2}
80+
81+
assert len(codebase.files(extensions=[".bin"])) == 1
82+
assert {f for f in codebase.files(extensions=[".bin"])} == {file3}
83+
84+
6285
@pytest.mark.skipif(sys.platform == "darwin", reason="macOS is case-insensitive")
6386
def test_file_extensions_ignore_case(tmpdir) -> None:
6487
with get_codebase_session(tmpdir=tmpdir, files={"file1.py": "print(123)", "file2.py": "print(456)", "file3.bin": b"\x89PNG", "file4": "Hello world!"}) as codebase:

tests/unit/codegen/sdk/python/codebase/test_codebase_reset.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -129,8 +129,23 @@ def test_codebase_reset_gitignore(tmpdir: str) -> None:
129129
def square(x: a):
130130
return x * x
131131
"""
132-
with get_codebase_session(tmpdir=tmpdir, files={"dir/file0.py": file0_content, ".gitignore": gitignore_content}, programming_language=ProgrammingLanguage.PYTHON) as codebase:
133-
assert len(codebase.files) == 0
132+
file1_content = """
133+
from dir.file0 import square
134+
135+
class MyClass:
136+
def foo(self, arg1, arg2):
137+
return arg1 + square(arg2)
138+
"""
139+
with get_codebase_session(
140+
tmpdir=tmpdir,
141+
files={
142+
"dir/file0.py": file0_content,
143+
"dir/file1.py": file1_content,
144+
".gitignore": gitignore_content,
145+
},
146+
programming_language=ProgrammingLanguage.PYTHON,
147+
) as codebase:
148+
assert len(codebase.files) == 1
134149
codebase.reset()
135150
codebase.checkout(branch="test-branch", create_if_missing=True)
136151
codebase.commit(sync_graph=True)

0 commit comments

Comments
 (0)