Skip to content

Commit 7e2c9ef

Browse files
authored
Support Codebase.from_string and Codebase.from_files (#630)
**Codebase.from_string** Parameters - `code: str` -> pass in the code you want to parse - `language: str | ProgrammingLanguage` -> pass in the programming language of the code. Raises an error if the programming language is not passed in. **Codebase.from_files** Parameters - `files: dict[str, str]` -> pass in a dict of {filename: code, filename: code} - `language: str | ProgrammingLanguage | None` -> pass in the programming language of the code - uses the .ts, .js, .tsx, .jsx or .py file extensions to infer the language if it is not passed in - will not allow mismatched extensions (e.g. test.py and test.ts) ---------
1 parent 33d8df5 commit 7e2c9ef

File tree

3 files changed

+273
-0
lines changed

3 files changed

+273
-0
lines changed

src/codegen/sdk/core/codebase.py

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import logging
66
import os
77
import re
8+
import tempfile
89
from collections.abc import Generator
910
from contextlib import contextmanager
1011
from functools import cached_property
@@ -1298,6 +1299,135 @@ def from_repo(
12981299
logger.exception(f"Failed to initialize codebase: {e}")
12991300
raise
13001301

1302+
@classmethod
def from_string(
    cls,
    code: str,
    *,
    language: Literal["python", "typescript"] | ProgrammingLanguage,
) -> "Codebase":
    """Build a single-file Codebase from an in-memory source string.

    The code is registered under a fixed filename ("test.ts" for TypeScript,
    "test.py" for every other language value) and handed to
    ``CodebaseFactory.get_codebase_from_files`` together with a freshly
    created temporary directory as the repo path.

    Args:
        code: Source text to load.
        language: Language of ``code``. Required, keyword-only. Either a
            ``ProgrammingLanguage`` member or a string, which is upper-cased
            and looked up in ``ProgrammingLanguage``.

    Returns:
        Codebase: A Codebase instance initialized with the provided code

    Raises:
        TypeError: If ``language`` is falsy (for example ``None`` or ``""``).

    Example:
        >>> # Python code
        >>> code = "def add(a, b): return a + b"
        >>> codebase = Codebase.from_string(code, language="python")

        >>> # TypeScript code
        >>> code = "function add(a: number, b: number): number { return a + b; }"
        >>> codebase = Codebase.from_string(code, language="typescript")
    """
    if not language:
        msg = "missing required argument language"
        raise TypeError(msg)

    logger.info("Creating codebase from string")

    # Normalise the language to an enum member, then pick the placeholder filename.
    if isinstance(language, str):
        prog_lang = ProgrammingLanguage(language.upper())
    else:
        prog_lang = language

    if prog_lang == ProgrammingLanguage.TYPESCRIPT:
        filename = "test.ts"
    else:
        filename = "test.py"

    # Imported here rather than at module level, as in the original.
    from codegen.sdk.codebase.factory.codebase_factory import CodebaseFactory

    # NOTE(review): the temporary directory is removed when the `with` block
    # exits, i.e. as this method returns - presumably the factory reads
    # everything it needs eagerly; confirm.
    with tempfile.TemporaryDirectory(prefix="codegen_") as tmp_dir:
        logger.info(f"Using directory: {tmp_dir}")
        codebase = CodebaseFactory.get_codebase_from_files(
            repo_path=tmp_dir,
            files={filename: code},
            programming_language=prog_lang,
        )
        logger.info("Codebase initialization complete")
        return codebase
1347+
1348+
@classmethod
def from_files(
    cls,
    files: dict[str, str],
    *,
    language: Literal["python", "typescript"] | ProgrammingLanguage | None = None,
) -> "Codebase":
    """Creates a Codebase instance from multiple files.

    The language is always inferred from the file extensions; an explicit
    ``language`` only acts as a cross-check against that inference.

    Args:
        files: Dictionary mapping filenames to their content, e.g. {"main.py": "print('hello')"}
        language: Optional language. If provided, it must match the language inferred
            from the file extensions. All files must have extensions matching the same language.

    Returns:
        Codebase: A Codebase instance initialized with the provided files

    Raises:
        ValueError: If ``files`` is empty, if file extensions don't match a single language,
            or if the explicitly provided language doesn't match the extensions
        subprocess.CalledProcessError: If ``git init`` in the temporary directory fails

    Example:
        >>> # Language inferred as Python
        >>> files = {"main.py": "print('hello')", "utils.py": "def add(a, b): return a + b"}
        >>> codebase = Codebase.from_files(files)

        >>> # Language inferred as TypeScript
        >>> files = {"index.ts": "console.log('hello')", "utils.tsx": "export const App = () => <div>Hello</div>"}
        >>> codebase = Codebase.from_files(files)
    """
    import subprocess

    # Create codebase using factory
    from codegen.sdk.codebase.factory.codebase_factory import CodebaseFactory

    if not files:
        msg = "No files provided"
        raise ValueError(msg)

    logger.info("Creating codebase from files")

    py_extensions = {".py"}
    ts_extensions = {".ts", ".tsx", ".js", ".jsx"}
    extensions = {os.path.splitext(f)[1].lower() for f in files}

    # Every file must belong to one language: this method is meant for small
    # testing purposes only. To parse an actual repo, use Codebase("path/to/repo").
    # `files` is non-empty here, so `extensions` is non-empty as well.
    if extensions <= py_extensions:
        prog_lang = ProgrammingLanguage.PYTHON
    elif extensions <= ts_extensions:
        prog_lang = ProgrammingLanguage.TYPESCRIPT
    else:
        msg = f"Cannot determine single language from extensions: {extensions}. Files must all be Python (.py) or TypeScript (.ts, .tsx, .js, .jsx)"
        raise ValueError(msg)

    # An explicit language never overrides the inference; it must agree with it.
    if language is not None:
        explicit_lang = ProgrammingLanguage(language.upper()) if isinstance(language, str) else language
        if explicit_lang != prog_lang:
            msg = f"Provided language {explicit_lang} doesn't match inferred language {prog_lang} from file extensions"
            raise ValueError(msg)

    logger.info(f"Using language: {prog_lang}")

    # NOTE(review): the temporary directory is removed when the `with` block
    # exits, i.e. as this method returns - presumably the factory reads
    # everything it needs eagerly; confirm.
    with tempfile.TemporaryDirectory(prefix="codegen_") as tmp_dir:
        logger.info(f"Using directory: {tmp_dir}")

        # Initialize git repo to avoid "not in a git repository" error
        subprocess.run(["git", "init"], cwd=tmp_dir, check=True, capture_output=True)

        codebase = CodebaseFactory.get_codebase_from_files(repo_path=tmp_dir, files=files, programming_language=prog_lang)
        logger.info("Codebase initialization complete")
        return codebase
1430+
13011431
def get_modified_symbols_in_pr(self, pr_id: int) -> tuple[str, dict[str, str], list[str]]:
13021432
"""Get all modified symbols in a pull request"""
13031433
pr = self._op.get_pull_request(pr_id)
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
import pytest
2+
3+
from codegen.sdk.core.codebase import Codebase
4+
5+
6+
def test_from_files_python():
    """Two .py files load into a two-file codebase with the language inferred."""
    sources = {
        "main.py": "from utils import add\nprint(add(1, 2))",
        "utils.py": "def add(a, b):\n return a + b",
    }
    # No `language` argument: it is inferred from the .py extensions.
    codebase = Codebase.from_files(sources)
    assert len(codebase.files) == 2
    for expected_name in ("main.py", "utils.py"):
        assert any(f.filepath.endswith(expected_name) for f in codebase.files)
    assert any("from utils import add" in f.source for f in codebase.files)
15+
16+
17+
def test_from_files_typescript():
    """Two .ts files load into a two-file codebase with the language inferred."""
    sources = {
        "index.ts": "import { add } from './utils';\nconsole.log(add(1, 2));",
        "utils.ts": "export function add(a: number, b: number): number {\n return a + b;\n}",
    }
    # No `language` argument: it is inferred from the .ts extensions.
    codebase = Codebase.from_files(sources)
    assert len(codebase.files) == 2
    for expected_name in ("index.ts", "utils.ts"):
        assert any(f.filepath.endswith(expected_name) for f in codebase.files)
    assert any("import { add }" in f.source for f in codebase.files)
26+
27+
28+
def test_from_files_empty():
    """An empty file mapping is rejected with a ValueError."""
    no_files = {}
    with pytest.raises(ValueError, match="No files provided"):
        Codebase.from_files(no_files)
32+
33+
34+
def test_from_files_mixed_extensions():
    """Mixing Python and TypeScript extensions in one call raises ValueError."""
    mixed_sources = {
        "main.py": "print('hello')",
        "test.ts": "console.log('world')",
    }
    expected_error = "Cannot determine single language from extensions"
    with pytest.raises(ValueError, match=expected_error):
        Codebase.from_files(mixed_sources)
39+
40+
41+
def test_from_files_typescript_multiple_extensions():
    """.ts, .tsx, .js and .jsx files are all accepted together in one codebase."""
    sources = dict(
        [
            ("index.ts", "console.log('hi')"),
            ("component.tsx", "export const App = () => <div>Hello</div>"),
            ("utils.js", "module.exports = { add: (a, b) => a + b }"),
            ("button.jsx", "export const Button = () => <button>Click</button>"),
        ]
    )
    # No `language` argument: the extensions are all in the TypeScript family.
    codebase = Codebase.from_files(sources)
    assert len(codebase.files) == 4
52+
53+
54+
def test_from_files_explicit_language_mismatch():
    """Passing language="typescript" for .py files raises ValueError."""
    python_sources = {
        "main.py": "print('hello')",
        "utils.py": "def add(a, b): return a + b",
    }
    expected_error = "Provided language.*doesn't match inferred language"
    with pytest.raises(ValueError, match=expected_error):
        Codebase.from_files(python_sources, language="typescript")
59+
60+
61+
def test_from_files_explicit_language_match():
    """Passing language="python" for .py files is accepted."""
    python_sources = {
        "main.py": "print('hello')",
        "utils.py": "def add(a, b): return a + b",
    }
    codebase = Codebase.from_files(python_sources, language="python")
    assert len(codebase.files) == 2
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
import pytest
2+
3+
from codegen.sdk.core.codebase import Codebase
4+
from codegen.shared.enums.programming_language import ProgrammingLanguage
5+
6+
7+
def test_from_string_python():
    """A Python snippet becomes a one-file codebase stored as test.py."""
    # NOTE(review): any indentation inside this fixture was not visible in the
    # reviewed source; the text is reproduced as seen - confirm against the file.
    snippet = (
        "\n"
        "def hello():\n"
        'return "world"\n'
    )
    codebase = Codebase.from_string(snippet, language="python")
    assert len(codebase.files) == 1
    only_file = codebase.files[0]
    assert only_file.filepath.endswith("test.py")
    assert "def hello" in only_file.source
17+
18+
19+
def test_from_string_typescript():
    """A TypeScript snippet becomes a one-file codebase stored as test.ts."""
    # NOTE(review): any indentation inside this fixture was not visible in the
    # reviewed source; the text is reproduced as seen - confirm against the file.
    snippet = (
        "\n"
        "function hello(): string {\n"
        'return "world";\n'
        "}\n"
    )
    codebase = Codebase.from_string(snippet, language="typescript")
    assert len(codebase.files) == 1
    only_file = codebase.files[0]
    assert only_file.filepath.endswith("test.ts")
    assert "function hello" in only_file.source
30+
31+
32+
def test_from_string_with_enum():
    """The language may be given as a ProgrammingLanguage member instead of a string."""
    snippet = "const x = 42;"
    codebase = Codebase.from_string(snippet, language=ProgrammingLanguage.TYPESCRIPT)
    assert len(codebase.files) == 1
    only_file = codebase.files[0]
    assert only_file.filepath.endswith("test.ts")
38+
39+
40+
def test_from_string_invalid_syntax():
    """Text that is not valid Python is still accepted and stored verbatim."""
    not_python = "this is not valid python"
    codebase = Codebase.from_string(not_python, language="python")
    assert len(codebase.files) == 1
    only_file = codebase.files[0]
    assert only_file.source == not_python
46+
47+
48+
def test_from_string_empty():
    """An empty string still produces a codebase containing one empty file."""
    empty_source = ""
    codebase = Codebase.from_string(empty_source, language="python")
    assert len(codebase.files) == 1
    only_file = codebase.files[0]
    assert only_file.source == ""
53+
54+
55+
def test_from_string_missing_language():
    """Omitting the keyword-only `language` argument raises TypeError."""
    expected_error = "missing.*required.*argument.*language"
    with pytest.raises(TypeError, match=expected_error):
        Codebase.from_string("print('hello')")
59+
60+
61+
def test_from_string_invalid_language():
    """An unrecognised language string raises ValueError."""
    unknown_language = "invalid"
    with pytest.raises(ValueError):
        Codebase.from_string("print('hello')", language=unknown_language)
65+
66+
67+
def test_from_string_multifile():
    """Text that looks like several files is not split; it stays one file."""
    # NOTE(review): any indentation inside this fixture was not visible in the
    # reviewed source; the text is reproduced as seen - confirm against the file.
    snippet = (
        "\n"
        "# file1.py\n"
        "def hello(): pass\n"
        "\n"
        "# file2.py\n"
        "def world(): pass\n"
    )
    # The "# fileN.py" markers are ordinary comments, so everything goes into one file.
    codebase = Codebase.from_string(snippet, language="python")
    assert len(codebase.files) == 1

0 commit comments

Comments
 (0)