Skip to content

Support Codebase.from_string and Codebase.from_files #630

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Feb 26, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
130 changes: 130 additions & 0 deletions src/codegen/sdk/core/codebase.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import logging
import os
import re
import tempfile
from collections.abc import Generator
from contextlib import contextmanager
from functools import cached_property
Expand Down Expand Up @@ -1298,6 +1299,135 @@ def from_repo(
logger.exception(f"Failed to initialize codebase: {e}")
raise

@classmethod
def from_string(
    cls,
    code: str,
    *,
    language: Literal["python", "typescript"] | ProgrammingLanguage,
) -> "Codebase":
    """Creates a Codebase instance from a string of code.

    The code is registered as a single file named ``test.py`` (or ``test.ts`` when the
    language is TypeScript) and handed to ``CodebaseFactory.get_codebase_from_files``
    together with a freshly created temporary directory as the repo path.

    Args:
        code: String containing code
        language: Language of the code. Required keyword-only argument (there is no
            default). A string is upper-cased and converted to a ProgrammingLanguage
            member; a ProgrammingLanguage member is used as-is.

    Returns:
        Codebase: A Codebase instance initialized with the provided code

    Raises:
        TypeError: If language is falsy (e.g. None or an empty string).
        ValueError: If language is a string that does not name a ProgrammingLanguage member.

    Example:
        >>> # Python code
        >>> code = "def add(a, b): return a + b"
        >>> codebase = Codebase.from_string(code, language="python")

        >>> # TypeScript code
        >>> code = "function add(a: number, b: number): number { return a + b; }"
        >>> codebase = Codebase.from_string(code, language="typescript")
    """
    # The signature already rejects an omitted keyword; this guard additionally
    # rejects explicit falsy values such as language=None or language="".
    if not language:
        msg = "missing required argument language"
        raise TypeError(msg)

    logger.info("Creating codebase from string")

    # Normalize to the enum, then pick the single synthetic filename from it.
    prog_lang = ProgrammingLanguage(language.upper()) if isinstance(language, str) else language
    filename = "test.ts" if prog_lang == ProgrammingLanguage.TYPESCRIPT else "test.py"

    # Imported at call time rather than at module top.
    # NOTE(review): presumably to avoid a circular import -- confirm.
    from codegen.sdk.codebase.factory.codebase_factory import CodebaseFactory

    files = {filename: code}

    # NOTE(review): the temporary directory is removed when the `with` block exits,
    # i.e. as this method returns -- confirm the returned Codebase does not need it on disk.
    with tempfile.TemporaryDirectory(prefix="codegen_") as tmp_dir:
        logger.info(f"Using directory: {tmp_dir}")
        codebase = CodebaseFactory.get_codebase_from_files(repo_path=tmp_dir, files=files, programming_language=prog_lang)
        logger.info("Codebase initialization complete")
        return codebase

@classmethod
def from_files(
    cls,
    files: dict[str, str],
    *,
    language: Literal["python", "typescript"] | ProgrammingLanguage | None = None,
) -> "Codebase":
    """Creates a Codebase instance from multiple files.

    The language is always inferred from the file extensions; an explicitly passed
    language is only used to cross-check that inference.

    Args:
        files: Dictionary mapping filenames to their content, e.g. {"main.py": "print('hello')"}
        language: Optional language override. If not provided, will be inferred from file extensions.
            All files must have extensions matching the same language.

    Returns:
        Codebase: A Codebase instance initialized with the provided files

    Raises:
        ValueError: If no files are provided, if file extensions don't match a single
            language, or if explicitly provided language doesn't match the extensions
        subprocess.CalledProcessError: If ``git init`` in the temporary directory fails

    Example:
        >>> # Language inferred as Python
        >>> files = {"main.py": "print('hello')", "utils.py": "def add(a, b): return a + b"}
        >>> codebase = Codebase.from_files(files)

        >>> # Language inferred as TypeScript
        >>> files = {"index.ts": "console.log('hello')", "utils.tsx": "export const App = () => <div>Hello</div>"}
        >>> codebase = Codebase.from_files(files)
    """
    # Imported at call time rather than at module top.
    # NOTE(review): presumably to avoid a circular import -- confirm.
    from codegen.sdk.codebase.factory.codebase_factory import CodebaseFactory

    if not files:
        msg = "No files provided"
        raise ValueError(msg)

    logger.info("Creating codebase from files")

    py_extensions = {".py"}
    ts_extensions = {".ts", ".tsx", ".js", ".jsx"}
    extensions = {os.path.splitext(f)[1].lower() for f in files}

    # Every file must belong to one language. This method is meant for small,
    # in-memory test inputs; to parse an actual repo use Codebase("path/to/repo").
    # `files` is non-empty here, so `extensions` is non-empty as well.
    if extensions.issubset(py_extensions):
        prog_lang = ProgrammingLanguage.PYTHON
    elif extensions.issubset(ts_extensions):
        prog_lang = ProgrammingLanguage.TYPESCRIPT
    else:
        msg = f"Cannot determine single language from extensions: {extensions}. Files must all be Python (.py) or TypeScript (.ts, .tsx, .js, .jsx)"
        raise ValueError(msg)

    # An explicit language never overrides the inference; it must agree with it.
    if language is not None:
        explicit_lang = ProgrammingLanguage(language.upper()) if isinstance(language, str) else language
        if explicit_lang != prog_lang:
            msg = f"Provided language {explicit_lang} doesn't match inferred language {prog_lang} from file extensions"
            raise ValueError(msg)

    logger.info(f"Using language: {prog_lang}")

    # NOTE(review): the temporary directory is removed when the `with` block exits,
    # i.e. as this method returns -- confirm the returned Codebase does not need it on disk.
    with tempfile.TemporaryDirectory(prefix="codegen_") as tmp_dir:
        logger.info(f"Using directory: {tmp_dir}")

        # Initialize git repo to avoid "not in a git repository" error
        import subprocess

        subprocess.run(["git", "init"], cwd=tmp_dir, check=True, capture_output=True)

        codebase = CodebaseFactory.get_codebase_from_files(repo_path=tmp_dir, files=files, programming_language=prog_lang)
        logger.info("Codebase initialization complete")
        return codebase

def get_modified_symbols_in_pr(self, pr_id: int) -> tuple[str, dict[str, str], list[str]]:
"""Get all modified symbols in a pull request"""
pr = self._op.get_pull_request(pr_id)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import pytest

from codegen.sdk.core.codebase import Codebase


def test_from_files_python():
    """A two-file Python project loads with its language inferred from the extensions."""
    sources = {
        "main.py": "from utils import add\nprint(add(1, 2))",
        "utils.py": "def add(a, b):\n return a + b",
    }
    # No language argument: inference from the .py extensions is what is exercised.
    codebase = Codebase.from_files(sources)
    loaded = codebase.files
    assert len(loaded) == 2
    paths = [f.filepath for f in loaded]
    for expected_name in ("main.py", "utils.py"):
        assert any(path.endswith(expected_name) for path in paths)
    assert any("from utils import add" in f.source for f in loaded)


def test_from_files_typescript():
    """A two-file TypeScript project loads with its language inferred from the extensions."""
    sources = {
        "index.ts": "import { add } from './utils';\nconsole.log(add(1, 2));",
        "utils.ts": "export function add(a: number, b: number): number {\n return a + b;\n}",
    }
    # No language argument: inference from the .ts extensions is what is exercised.
    codebase = Codebase.from_files(sources)
    loaded = codebase.files
    assert len(loaded) == 2
    paths = [f.filepath for f in loaded]
    for expected_name in ("index.ts", "utils.ts"):
        assert any(path.endswith(expected_name) for path in paths)
    assert any("import { add }" in f.source for f in loaded)


def test_from_files_empty():
    """An empty mapping is rejected with ValueError("No files provided")."""
    no_files = {}
    with pytest.raises(ValueError, match="No files provided"):
        Codebase.from_files(no_files)


def test_from_files_mixed_extensions():
    """Mixing .py and .ts files is rejected because no single language can be inferred."""
    mixed = {
        "main.py": "print('hello')",
        "test.ts": "console.log('world')",
    }
    expected_error = "Cannot determine single language from extensions"
    with pytest.raises(ValueError, match=expected_error):
        Codebase.from_files(mixed)


def test_from_files_typescript_multiple_extensions():
    """.ts, .tsx, .js and .jsx files are all accepted together as one TypeScript codebase."""
    sources = dict(
        [
            ("index.ts", "console.log('hi')"),
            ("component.tsx", "export const App = () => <div>Hello</div>"),
            ("utils.js", "module.exports = { add: (a, b) => a + b }"),
            ("button.jsx", "export const Button = () => <button>Click</button>"),
        ]
    )
    # No language argument: all four extensions should infer TypeScript.
    codebase = Codebase.from_files(sources)
    file_count = len(codebase.files)
    assert file_count == 4


def test_from_files_explicit_language_mismatch():
    """Passing language="typescript" for .py files raises a mismatch ValueError."""
    python_sources = {
        "main.py": "print('hello')",
        "utils.py": "def add(a, b): return a + b",
    }
    expected_error = "Provided language.*doesn't match inferred language"
    with pytest.raises(ValueError, match=expected_error):
        Codebase.from_files(python_sources, language="typescript")


def test_from_files_explicit_language_match():
    """An explicit language that agrees with the file extensions is accepted."""
    python_sources = {
        "main.py": "print('hello')",
        "utils.py": "def add(a, b): return a + b",
    }
    codebase = Codebase.from_files(python_sources, language="python")
    file_count = len(codebase.files)
    assert file_count == 2
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import pytest

from codegen.sdk.core.codebase import Codebase
from codegen.shared.enums.programming_language import ProgrammingLanguage


def test_from_string_python():
    """A Python snippet becomes a one-file codebase whose file is named test.py."""
    snippet = """
def hello():
return "world"
"""
    codebase = Codebase.from_string(snippet, language="python")
    assert len(codebase.files) == 1
    only_file = codebase.files[0]
    assert only_file.filepath.endswith("test.py")
    assert "def hello" in only_file.source


def test_from_string_typescript():
    """A TypeScript snippet becomes a one-file codebase whose file is named test.ts."""
    snippet = """
function hello(): string {
return "world";
}
"""
    codebase = Codebase.from_string(snippet, language="typescript")
    assert len(codebase.files) == 1
    only_file = codebase.files[0]
    assert only_file.filepath.endswith("test.ts")
    assert "function hello" in only_file.source


def test_from_string_with_enum():
    """The language may be given as a ProgrammingLanguage member instead of a string."""
    snippet = "const x = 42;"
    chosen_language = ProgrammingLanguage.TYPESCRIPT
    codebase = Codebase.from_string(snippet, language=chosen_language)
    assert len(codebase.files) == 1
    only_file = codebase.files[0]
    assert only_file.filepath.endswith("test.ts")


def test_from_string_invalid_syntax():
    """Source that is not valid Python is still accepted and stored verbatim."""
    snippet = "this is not valid python"
    codebase = Codebase.from_string(snippet, language="python")
    assert len(codebase.files) == 1
    stored_source = codebase.files[0].source
    assert stored_source == snippet


def test_from_string_empty():
    """An empty string still yields a codebase with one (empty) file."""
    empty_source = ""
    codebase = Codebase.from_string(empty_source, language="python")
    assert len(codebase.files) == 1
    stored_source = codebase.files[0].source
    assert stored_source == ""


def test_from_string_missing_language():
    """Omitting the keyword-only language argument raises TypeError."""
    expected_error = "missing.*required.*argument.*language"
    with pytest.raises(TypeError, match=expected_error):
        Codebase.from_string("print('hello')")


def test_from_string_invalid_language():
    """A language string that names no known language raises ValueError."""
    unknown_language = "invalid"
    with pytest.raises(ValueError):
        Codebase.from_string("print('hello')", language=unknown_language)


def test_from_string_multifile():
    """A string that merely looks like several files is kept as one file (no splitting)."""
    snippet = """
# file1.py
def hello(): pass

# file2.py
def world(): pass
"""
    # The "# fileN.py" markers are ordinary comments; everything lands in one file.
    codebase = Codebase.from_string(snippet, language="python")
    file_count = len(codebase.files)
    assert file_count == 1
Loading