Skip to content

Support Codebase.from_string and Codebase.from_files #630

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Feb 26, 2025
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
113 changes: 113 additions & 0 deletions src/codegen/sdk/core/codebase.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,8 +104,8 @@
TExport = TypeVar("TExport", bound="Export", default=Export)
TSGlobalVar = TypeVar("TSGlobalVar", bound="Assignment", default=Assignment)
PyGlobalVar = TypeVar("PyGlobalVar", bound="Assignment", default=Assignment)
TSDirectory = Directory[TSFile, TSSymbol, TSImportStatement, TSGlobalVar, TSClass, TSFunction, TSImport]

Check failure on line 107 in src/codegen/sdk/core/codebase.py

View workflow job for this annotation

GitHub Actions / mypy

error: Cannot resolve name "TSDirectory" (possible cyclic definition) [misc]
PyDirectory = Directory[PyFile, PySymbol, PyImportStatement, PyGlobalVar, PyClass, PyFunction, PyImport]

Check failure on line 108 in src/codegen/sdk/core/codebase.py

View workflow job for this annotation

GitHub Actions / mypy

error: Cannot resolve name "PyDirectory" (possible cyclic definition) [misc]


@apidoc
Expand Down Expand Up @@ -182,13 +182,13 @@
main_project = ProjectConfig.from_path(repo_path, programming_language=ProgrammingLanguage(language.upper()) if language else None)
projects = [main_project]
else:
main_project = projects[0]

Check failure on line 185 in src/codegen/sdk/core/codebase.py

View workflow job for this annotation

GitHub Actions / mypy

error: Value of type "list[ProjectConfig] | None" is not indexable [index]

# Initialize codebase
self._op = main_project.repo_operator
self.viz = VisualizationManager(op=self._op)
self.repo_path = Path(self._op.repo_path)
self.ctx = CodebaseContext(projects, config=config, secrets=secrets, io=io, progress=progress)

Check failure on line 191 in src/codegen/sdk/core/codebase.py

View workflow job for this annotation

GitHub Actions / mypy

error: Argument 1 to "CodebaseContext" has incompatible type "list[ProjectConfig] | None"; expected "list[ProjectConfig]" [arg-type]
self.console = Console(record=True, soft_wrap=True)

@noapidoc
Expand All @@ -204,7 +204,7 @@
yield "nodes", len(self.ctx.nodes)
yield "edges", len(self.ctx.edges)

__rich_repr__.angular = ANGULAR_STYLE

Check failure on line 207 in src/codegen/sdk/core/codebase.py

View workflow job for this annotation

GitHub Actions / mypy

error: "Callable[[Codebase[TSourceFile, TDirectory, TSymbol, TClass, TFunction, TImport, TGlobalVar, TInterface, TTypeAlias, TParameter, TCodeBlock]], Iterable[Any | tuple[Any] | tuple[str, Any] | tuple[str, Any, Any]]]" has no attribute "angular" [attr-defined]

@property
@deprecated("Please do not use the local repo operator directly")
Expand Down Expand Up @@ -246,8 +246,8 @@

@noapidoc
def _symbols(self, symbol_type: SymbolType | None = None) -> list[TSymbol | TClass | TFunction | TGlobalVar]:
    """Collect the top-level symbol nodes known to the codebase context.

    Args:
        symbol_type: When given, only symbols whose ``symbol_type`` equals this
            value are kept; ``None`` keeps every top-level symbol.

    Returns:
        The matching symbol nodes, in the order returned by ``self.ctx.get_nodes``.
    """
    # NOTE(review): mypy flags this assignment - `get_nodes` is typed as
    # returning list[Importable[Any]], not list[Symbol[Any, Any]] [assignment].
    candidates: list[Symbol] = self.ctx.get_nodes(NodeType.SYMBOL)
    selected = []
    for node in candidates:
        # Nested (non top-level) symbols are never reported.
        if not node.is_top_level:
            continue
        if symbol_type is None or node.symbol_type == symbol_type:
            selected.append(node)
    return selected

Check failure on line 250 in src/codegen/sdk/core/codebase.py

View workflow job for this annotation

GitHub Actions / mypy

error: List comprehension has incompatible type List[Symbol[Any, Any]]; expected List[TSymbol | TClass | TFunction | TGlobalVar] [misc]

# =====[ Node Types ]=====
@overload
Expand All @@ -256,7 +256,7 @@
def files(self, *, extensions: Literal["*"]) -> list[File]: ...
@overload
def files(self, *, extensions: None = ...) -> list[TSourceFile]: ...
@proxy_property

Check failure on line 259 in src/codegen/sdk/core/codebase.py

View workflow job for this annotation

GitHub Actions / mypy

error: "cached_property[ProxyProperty[[Codebase[TSourceFile, TDirectory, TSymbol, TClass, TFunction, TImport, TGlobalVar, TInterface, TTypeAlias, TParameter, TCodeBlock], DefaultNamedArg(list[str] | Literal['*'] | None, 'extensions')], list[TSourceFile] | list[File]]]" not callable [operator]
def files(self, *, extensions: list[str] | Literal["*"] | None = None) -> list[TSourceFile] | list[File]:
"""A list property that returns all files in the codebase.

Expand Down Expand Up @@ -291,13 +291,13 @@
return sort_editables(files, alphabetical=True, dedupe=False)

@cached_property
def codeowners(self) -> list["CodeOwner[TSourceFile]"]:
    """List all CodeOwners in the codebase.

    Returns:
        list[CodeOwner]: The CodeOwner objects produced by ``CodeOwner.from_parser``,
            or an empty list when the context has no codeowners parser.
    """
    # NOTE(review): mypy reports that "CodeOwner" expects 7 type arguments but
    # the return annotation supplies 1 [type-arg]; the annotation is left as-is
    # so the public signature does not change.

    # The codebase context lives on `self.ctx` (set in __init__); `self.G` is
    # not an attribute of Codebase and raised AttributeError (mypy: attr-defined).
    # Assumes the context exposes `codeowners_parser` - TODO confirm.
    parser = self.ctx.codeowners_parser
    if parser is None:
        return []
    # The lambda defers to `self.files` so the file list is resolved on demand.
    return CodeOwner.from_parser(parser, lambda *args, **kwargs: self.files(*args, **kwargs))

Expand Down Expand Up @@ -1311,6 +1311,119 @@
logger.exception(f"Failed to initialize codebase: {e}")
raise

@classmethod
def from_string(
    cls,
    code: str,
    *,
    language: Literal["python", "typescript"] | ProgrammingLanguage,
) -> "Codebase":
    """Build a single-file Codebase from a string of source code.

    The code is registered under the filename ``test.ts`` when the language is
    TypeScript and ``test.py`` otherwise, with a freshly created temporary
    directory passed to the factory as the repository path.

    Args:
        code (str): The source code string.
        language (Literal["python", "typescript"] | ProgrammingLanguage): The
            programming language of the code. A string is upper-cased and
            converted to a ``ProgrammingLanguage``.

    Returns:
        Codebase: The instance returned by ``CodebaseFactory.get_codebase_from_files``.
    """
    logger.info("Creating codebase from string")

    # Normalise the language argument to the enum, then pick the filename.
    if isinstance(language, str):
        prog_lang = ProgrammingLanguage(language.upper())
    else:
        prog_lang = language

    if prog_lang == ProgrammingLanguage.TYPESCRIPT:
        source_name = "test.ts"
    else:
        source_name = "test.py"

    # NOTE(review): a reviewer asked why `tempfile` is imported here rather
    # than at module top level.
    import tempfile

    workdir = tempfile.mkdtemp(prefix="codegen_")
    logger.info(f"Using directory: {workdir}")

    # Function-scope import of the factory, kept as in the original.
    from codegen.sdk.codebase.factory.codebase_factory import CodebaseFactory

    codebase = CodebaseFactory.get_codebase_from_files(
        repo_path=workdir,
        files={source_name: code},
        programming_language=prog_lang,
    )
    logger.info("Codebase initialization complete")
    return codebase

@classmethod
def from_files(
    cls,
    files: dict[str, str],
    *,
    language: Literal["python", "typescript"] | ProgrammingLanguage | None = None,
) -> "Codebase":
    """Build a Codebase from an in-memory mapping of filenames to contents.

    Args:
        files: Dictionary mapping filenames to their content, e.g. {"main.py": "print('hello')"}.
            An empty mapping is accepted; the language then defaults to Python
            unless ``language`` is given.
        language: Optional language. When files are provided the language is
            inferred from their extensions and, if this argument is also given,
            it must agree with the inferred one.

    Returns:
        Codebase: The instance returned by ``CodebaseFactory.get_codebase_from_files``.

    Raises:
        ValueError: If the file extensions do not all belong to a single language,
            or if the explicitly provided language differs from the inferred one.

    Example:
        >>> # Language inferred as Python
        >>> files = {"main.py": "print('hello')", "utils.py": "def add(a, b): return a + b"}
        >>> codebase = Codebase.from_files(files)

        >>> # Language inferred as TypeScript
        >>> files = {"index.ts": "console.log('hello')", "utils.tsx": "export const App = () => <div>Hello</div>"}
        >>> codebase = Codebase.from_files(files)
    """
    logger.info("Creating codebase from files")

    if files:
        # Extension sets recognised for each supported language.
        py_extensions = {".py"}
        ts_extensions = {".ts", ".tsx", ".js", ".jsx"}

        # Distinct, lower-cased extensions across all provided filenames.
        extensions = {os.path.splitext(name)[1].lower() for name in files}

        # Every extension must fall inside exactly one language's set.
        if extensions.issubset(py_extensions):
            inferred_lang = ProgrammingLanguage.PYTHON
        elif extensions.issubset(ts_extensions):
            inferred_lang = ProgrammingLanguage.TYPESCRIPT
        else:
            raise ValueError(f"Cannot determine single language from extensions: {extensions}. Files must all be Python (.py) or TypeScript (.ts, .tsx, .js, .jsx)")

        # An explicit language is only a cross-check here, never an override.
        if language is not None:
            if isinstance(language, str):
                explicit_lang = ProgrammingLanguage(language.upper())
            else:
                explicit_lang = language
            if explicit_lang != inferred_lang:
                raise ValueError(f"Provided language {explicit_lang} doesn't match inferred language {inferred_lang} from file extensions")

        prog_lang = inferred_lang
        logger.info(f"Using language: {prog_lang} ({'inferred' if language is None else 'explicit'})")
    else:
        # Nothing to infer from: honour the explicit language, else use Python.
        if language is None:
            prog_lang = ProgrammingLanguage.PYTHON
        elif isinstance(language, str):
            prog_lang = ProgrammingLanguage(language.upper())
        else:
            prog_lang = language
        logger.info(f"No files provided, using {prog_lang}")

    # Fresh temporary directory handed to the factory as the repo path.
    import tempfile

    workdir = tempfile.mkdtemp(prefix="codegen_")
    logger.info(f"Using directory: {workdir}")

    # Function-scope import of the factory, kept as in the original.
    from codegen.sdk.codebase.factory.codebase_factory import CodebaseFactory

    codebase = CodebaseFactory.get_codebase_from_files(
        repo_path=workdir,
        files=files,
        programming_language=prog_lang,
    )
    logger.info("Codebase initialization complete")
    return codebase

def get_modified_symbols_in_pr(self, pr_id: int) -> tuple[str, dict[str, str], list[str]]:
"""Get all modified symbols in a pull request"""
pr = self._op.get_pull_request(pr_id)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import pytest

from codegen.sdk.core.codebase import Codebase


def test_from_files_python():
    """A two-file Python mapping produces a codebase containing both files."""
    sources = {
        "main.py": "from utils import add\nprint(add(1, 2))",
        "utils.py": "def add(a, b):\n return a + b",
    }
    # No language argument: it is inferred from the .py extensions
    codebase = Codebase.from_files(sources)
    assert len(codebase.files) == 2
    for expected_name in ("main.py", "utils.py"):
        assert any(f.filepath.endswith(expected_name) for f in codebase.files)
    assert any("from utils import add" in f.content for f in codebase.files)


def test_from_files_typescript():
    """A two-file TypeScript mapping produces a codebase containing both files."""
    sources = {
        "index.ts": "import { add } from './utils';\nconsole.log(add(1, 2));",
        "utils.ts": "export function add(a: number, b: number): number {\n return a + b;\n}",
    }
    # No language argument: it is inferred from the .ts extensions
    codebase = Codebase.from_files(sources)
    assert len(codebase.files) == 2
    for expected_name in ("index.ts", "utils.ts"):
        assert any(f.filepath.endswith(expected_name) for f in codebase.files)
    assert any("import { add }" in f.content for f in codebase.files)


def test_from_files_empty():
    """An empty mapping yields a codebase that contains no files."""
    # With nothing to infer from, from_files falls back to Python
    empty_codebase = Codebase.from_files({})
    assert len(empty_codebase.files) == 0


def test_from_files_mixed_extensions():
    """Mixing Python and TypeScript extensions is rejected with ValueError."""
    mixed_sources = {
        "main.py": "print('hello')",
        "test.ts": "console.log('world')",
    }
    expected_message = "Cannot determine single language from extensions"
    with pytest.raises(ValueError, match=expected_message):
        Codebase.from_files(mixed_sources)


def test_from_files_typescript_multiple_extensions():
    """All of .ts, .tsx, .js and .jsx are accepted together as TypeScript."""
    sources = {}
    sources["index.ts"] = "console.log('hi')"
    sources["component.tsx"] = "export const App = () => <div>Hello</div>"
    sources["utils.js"] = "module.exports = { add: (a, b) => a + b }"
    sources["button.jsx"] = "export const Button = () => <button>Click</button>"
    # No language argument: the extensions all map to TypeScript
    codebase = Codebase.from_files(sources)
    assert len(codebase.files) == 4


def test_from_files_explicit_language_mismatch():
    """Passing a language that contradicts the file extensions raises ValueError."""
    python_sources = {
        "main.py": "print('hello')",
        "utils.py": "def add(a, b): return a + b",
    }
    expected_message = "Provided language.*doesn't match inferred language"
    with pytest.raises(ValueError, match=expected_message):
        Codebase.from_files(python_sources, language="typescript")


def test_from_files_explicit_language_match():
    """Passing a language that agrees with the file extensions succeeds."""
    python_sources = {
        "main.py": "print('hello')",
        "utils.py": "def add(a, b): return a + b",
    }
    codebase = Codebase.from_files(python_sources, language="python")
    assert len(codebase.files) == 2
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import pytest

from codegen.sdk.core.codebase import Codebase
from codegen.shared.enums.programming_language import ProgrammingLanguage


def test_from_string_python():
    """A Python source string becomes a one-file codebase named test.py."""
    source = """
def hello():
return "world"
"""
    codebase = Codebase.from_string(source, language="python")
    assert len(codebase.files) == 1
    only_file = codebase.files[0]
    assert only_file.filepath.endswith("test.py")
    assert "def hello" in only_file.content


def test_from_string_typescript():
    """A TypeScript source string becomes a one-file codebase named test.ts."""
    source = """
function hello(): string {
return "world";
}
"""
    codebase = Codebase.from_string(source, language="typescript")
    assert len(codebase.files) == 1
    only_file = codebase.files[0]
    assert only_file.filepath.endswith("test.ts")
    assert "function hello" in only_file.content


def test_from_string_with_enum():
    """The language may be given as a ProgrammingLanguage member instead of a string."""
    source = "const x = 42;"
    codebase = Codebase.from_string(source, language=ProgrammingLanguage.TYPESCRIPT)
    assert len(codebase.files) == 1
    only_file = codebase.files[0]
    assert only_file.filepath.endswith("test.ts")


def test_from_string_invalid_syntax():
    """Source that is not valid Python is still stored verbatim in the codebase."""
    source = "this is not valid python"
    codebase = Codebase.from_string(source, language="python")
    assert len(codebase.files) == 1
    only_file = codebase.files[0]
    assert only_file.content == source


def test_from_string_empty():
    """An empty source string still produces a single, empty file."""
    empty_source = ""
    codebase = Codebase.from_string(empty_source, language="python")
    assert len(codebase.files) == 1
    only_file = codebase.files[0]
    assert only_file.content == ""


def test_from_string_missing_language():
    """Omitting the keyword-only `language` argument raises TypeError."""
    expected_message = "missing.*required.*argument.*language"
    with pytest.raises(TypeError, match=expected_message):
        Codebase.from_string("print('hello')")


def test_from_string_invalid_language():
    """An unrecognised language name raises ValueError."""
    source = "print('hello')"
    with pytest.raises(ValueError):
        Codebase.from_string(source, language="invalid")


def test_from_string_multifile():
    """A string that looks like several files is not split: it stays one file."""
    source = """
# file1.py
def hello(): pass

# file2.py
def world(): pass
"""
    # from_string does not interpret the "# fileN.py" markers; everything
    # lands in a single file
    codebase = Codebase.from_string(source, language="python")
    file_count = len(codebase.files)
    assert file_count == 1