Skip to content

Commit 00a9b3b

Browse files
authored
feat: Use Pink in codegen (#836)
# Motivation
- More supported languages
- Faster parse time
- Lower memory usage

# Content
- Add a feature flag to enable the Rust version (Pink) of the SDK; it is disabled by default
- Add basic tests that run with the flag enabled

# TODO
- Add more basic APIs to support more read operations (e.g. filepath)
- Support edit operations
- Synchronize database state
- Expose and implement references
1 parent 796593a commit 00a9b3b

File tree

6 files changed

+253
-14
lines changed

6 files changed

+253
-14
lines changed

pyproject.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@ dependencies = [
8080
"colorlog>=6.9.0",
8181
"langsmith",
8282
"langchain-xai>=0.2.1",
83+
"codegen-sdk-pink>=0.1.0",
8384
]
8485

8586
license = { text = "Apache-2.0" }
@@ -173,6 +174,7 @@ dev-dependencies = [
173174
[tool.uv.workspace]
174175
exclude = ["codegen-examples"]
175176

177+
176178
[tool.cython-lint]
177179
max-line-length = 200
178180

src/codegen/configs/models/codebase.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,19 @@
1+
from enum import IntEnum, auto
2+
13
from pydantic import Field
24

35
from codegen.configs.models.base_config import BaseConfig
46

57

8+
class PinkMode(IntEnum):
    """Selects which files are handled by the Rust SDK (Pink) instead of the Python SDK.

    Members:
        OFF: Use the Python SDK for all files.
        ALL_FILES: Use the Rust SDK for all files.
        NON_SOURCE_FILES: Use the Rust SDK only for files the Python SDK
            can't parse (non-source files).
    """

    OFF = auto()
    ALL_FILES = auto()
    NON_SOURCE_FILES = auto()
15+
16+
617
class CodebaseConfig(BaseConfig):
718
def __init__(self, prefix: str = "CODEBASE", *args, **kwargs) -> None:
819
super().__init__(prefix=prefix, *args, **kwargs)
@@ -26,6 +37,7 @@ def __init__(self, prefix: str = "CODEBASE", *args, **kwargs) -> None:
2637
ts_language_engine: bool = False
2738
v8_ts_engine: bool = False
2839
unpacking_assignment_partial_removal: bool = True
40+
use_pink: PinkMode = PinkMode.OFF
2941

3042

3143
DefaultCodebaseConfig = CodebaseConfig()

src/codegen/sdk/codebase/codebase_context.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212
from rustworkx import PyDiGraph, WeightedEdgeList
1313

14-
from codegen.configs.models.codebase import CodebaseConfig
14+
from codegen.configs.models.codebase import CodebaseConfig, PinkMode
1515
from codegen.configs.models.secrets import SecretsConfig
1616
from codegen.sdk.codebase.config import ProjectConfig, SessionOptions
1717
from codegen.sdk.codebase.config_parser import ConfigParser, get_config_parser_for_language
@@ -189,7 +189,7 @@ def __init__(
189189
logger.warning("Some features may not work as expected. Advanced static analysis will be disabled but simple file IO will still work.")
190190

191191
# Build the graph
192-
if not self.config.exp_lazy_graph:
192+
if not self.config.exp_lazy_graph and self.config.use_pink != PinkMode.ALL_FILES:
193193
self.build_graph(context.repo_operator)
194194
try:
195195
self.synced_commit = context.repo_operator.head_commit

src/codegen/sdk/core/codebase.py

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
from rich.console import Console
2323
from typing_extensions import TypeVar, deprecated
2424

25-
from codegen.configs.models.codebase import CodebaseConfig
25+
from codegen.configs.models.codebase import CodebaseConfig, PinkMode
2626
from codegen.configs.models.secrets import SecretsConfig
2727
from codegen.git.repo_operator.repo_operator import RepoOperator
2828
from codegen.git.schemas.enums import CheckoutResult, SetupOption
@@ -212,6 +212,10 @@ def __init__(
212212
self.repo_path = Path(self._op.repo_path)
213213
self.ctx = CodebaseContext(projects, config=config, secrets=secrets, io=io, progress=progress)
214214
self.console = Console(record=True, soft_wrap=True)
215+
if self.ctx.config.use_pink != PinkMode.OFF:
216+
import codegen_sdk_pink
217+
218+
self._pink_codebase = codegen_sdk_pink.Codebase(self.repo_path)
215219

216220
# Assert config assertions
217221
# External import resolution must be enabled if syspath is enabled
@@ -304,6 +308,8 @@ def files(self, *, extensions: list[str] | Literal["*"] | None = None) -> list[T
304308
Returns:
305309
list[TSourceFile]: A sorted list of source files in the codebase.
306310
"""
311+
if self.ctx.config.use_pink == PinkMode.ALL_FILES:
312+
return self._pink_codebase.files
307313
if extensions is None and len(self.ctx.get_nodes(NodeType.FILE)) > 0:
308314
# If extensions is None AND there is at least one file in the codebase (This checks for unsupported languages or parse-off repos),
309315
# Return all source files
@@ -537,6 +543,12 @@ def has_file(self, filepath: str, ignore_case: bool = False) -> bool:
537543
Returns:
538544
bool: True if the file exists in the codebase, False otherwise.
539545
"""
546+
if self.ctx.config.use_pink == PinkMode.ALL_FILES:
547+
absolute_path = self.ctx.to_absolute(filepath)
548+
return self._pink_codebase.has_file(absolute_path)
549+
if self.ctx.config.use_pink == PinkMode.NON_SOURCE_FILES:
550+
if self._pink_codebase.has_file(filepath):
551+
return True
540552
return self.get_file(filepath, optional=True, ignore_case=ignore_case) is not None
541553

542554
@overload
@@ -559,13 +571,20 @@ def get_file(self, filepath: str, *, optional: bool = False, ignore_case: bool =
559571
Raises:
560572
ValueError: If file not found and optional=False.
561573
"""
574+
if self.ctx.config.use_pink == PinkMode.ALL_FILES:
575+
absolute_path = self.ctx.to_absolute(filepath)
576+
return self._pink_codebase.get_file(absolute_path)
562577
# Try to get the file from the graph first
563578
file = self.ctx.get_file(filepath, ignore_case=ignore_case)
564579
if file is not None:
565580
return file
581+
566582
# If the file is not in the graph, check the filesystem
567583
absolute_path = self.ctx.to_absolute(filepath)
568584
if self.ctx.io.file_exists(absolute_path):
585+
if self.ctx.config.use_pink != PinkMode.OFF:
586+
if file := self._pink_codebase.get_file(absolute_path):
587+
return file
569588
return self.ctx._get_raw_file_from_path(absolute_path)
570589
# If the file is not in the graph, check the filesystem
571590
if absolute_path.parent.exists():
Lines changed: 191 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,191 @@
1+
import os
2+
import sys
3+
4+
import pytest
5+
6+
from codegen.configs.models.codebase import PinkMode
7+
from codegen.sdk.codebase.config import TestFlags
8+
from codegen.sdk.codebase.factory.get_session import get_codebase_session
9+
from codegen.sdk.core.file import File, SourceFile
10+
from codegen.shared.enums.programming_language import ProgrammingLanguage
11+
12+
# Shared configuration for every test in this module: a copy of TestFlags
# with use_pink set to PinkMode.ALL_FILES.
Config = TestFlags.model_copy(update={"use_pink": PinkMode.ALL_FILES})
13+
14+
15+
@pytest.mark.xfail(reason="Blocked on CG-11949")
def test_file(tmpdir) -> None:
    """Look up a text file, a Python file and a binary file and check what comes back.

    Also checks that looking up a missing file or a missing directory
    raises ``ValueError``.
    """
    text_body = "Hello world!"
    python_body = "print(123)"
    binary_body = b"\x89PNG"
    sources = {"file1.txt": text_body, "file2.py": python_body, "file3.bin": binary_body}
    with get_codebase_session(tmpdir=tmpdir, files=sources, config=Config) as codebase:
        # Text file: expected to be a File but not a SourceFile.
        text_file = codebase.get_file("file1.txt")
        assert isinstance(text_file, File)
        assert not isinstance(text_file, SourceFile)
        assert text_file is not None
        assert text_file.filepath == "file1.txt"
        assert text_file.content == text_body
        assert text_file.is_binary is False

        # Python file: expected to be a SourceFile.
        python_file = codebase.get_file("file2.py")
        assert isinstance(python_file, SourceFile)
        assert python_file is not None
        assert python_file.filepath == "file2.py"
        assert python_file.content == python_body
        assert python_file.is_binary is False

        # Binary file: expected to be a File but not a SourceFile, exposing raw bytes.
        binary_file = codebase.get_file("file3.bin")
        assert isinstance(binary_file, File)
        assert not isinstance(binary_file, SourceFile)
        assert binary_file is not None
        assert binary_file.filepath == "file3.bin"
        assert binary_file.is_binary is True
        assert binary_file.content_bytes == binary_body

        # Missing paths must raise rather than return a placeholder.
        with pytest.raises(ValueError):
            codebase.get_file("file4.txt")
        with pytest.raises(ValueError):
            codebase.get_directory("file4/")
47+
48+
49+
@pytest.mark.xfail(reason="Blocked on CG-11949")
def test_codebase_files(tmpdir) -> None:
    """Check ``codebase.files`` with no filter, with ``"*"`` and with explicit extension lists."""
    sources = {"file1.py": "print(123)", "file2.py": "print(456)", "file3.bin": b"\x89PNG", "file4": "Hello world!"}
    with get_codebase_session(tmpdir=tmpdir, files=sources, config=Config) as codebase:
        py_one = codebase.get_file("file1.py")
        py_two = codebase.get_file("file2.py")
        binary = codebase.get_file("file3.bin")
        no_extension = codebase.get_file("file4")

        # Without a filter only the two Python files are expected.
        assert len(codebase.files) == 2
        assert set(codebase.files) == {py_one, py_two}

        # "*" is expected to match every file.
        assert len(codebase.files(extensions="*")) == 4
        assert set(codebase.files(extensions="*")) == {py_one, py_two, binary, no_extension}

        assert len(codebase.files(extensions=[".py"])) == 2
        assert set(codebase.files(extensions=[".py"])) == {py_one, py_two}

        assert len(codebase.files(extensions=[".bin"])) == 1
        assert set(codebase.files(extensions=[".bin"])) == {binary}
68+
69+
70+
@pytest.mark.xfail(reason="Blocked on CG-11949")
def test_codebase_files_other_language(tmpdir) -> None:
    """Check ``codebase.files`` filtering when the session language is ``ProgrammingLanguage.OTHER``."""
    sources = {"file1.py": "print(123)", "file2.py": "print(456)", "file3.bin": b"\x89PNG", "file4": "Hello world!"}
    with get_codebase_session(
        tmpdir=tmpdir,
        files=sources,
        programming_language=ProgrammingLanguage.OTHER,
        config=Config,
    ) as codebase:
        py_one = codebase.get_file("file1.py")
        py_two = codebase.get_file("file2.py")
        binary = codebase.get_file("file3.bin")
        no_extension = codebase.get_file("file4")
        everything = {py_one, py_two, binary, no_extension}

        assert len(codebase.files) == 4  # Match all files if the language is OTHER
        assert set(codebase.files) == everything

        assert len(codebase.files(extensions="*")) == 4
        assert set(codebase.files(extensions="*")) == everything

        assert len(codebase.files(extensions=[".py"])) == 2
        assert set(codebase.files(extensions=[".py"])) == {py_one, py_two}

        assert len(codebase.files(extensions=[".bin"])) == 1
        assert set(codebase.files(extensions=[".bin"])) == {binary}
91+
92+
93+
@pytest.mark.skipif(sys.platform == "darwin", reason="macOS is case-insensitive")
@pytest.mark.xfail(reason="Blocked on CG-11949")
def test_file_extensions_ignore_case(tmpdir) -> None:
    """Check an unmatched extension filter and ``get_file`` lookups with and without ``ignore_case``."""
    sources = {"file1.py": "print(123)", "file2.py": "print(456)", "file3.bin": b"\x89PNG", "file4": "Hello world!"}
    with get_codebase_session(tmpdir=tmpdir, files=sources, config=Config) as codebase:
        py_one = codebase.get_file("file1.py")
        py_two = codebase.get_file("file2.py")
        binary = codebase.get_file("file3.bin")
        # Looked up only to confirm it resolves; the result is not compared below.
        no_extension = codebase.get_file("file4")

        # No file carries the .pyi extension.
        assert len(codebase.files(extensions=[".pyi"])) == 0
        assert set(codebase.files(extensions=[".pyi"])) == set()

        # Test ignore_case
        upper_py_one = codebase.get_file("FILE1.PY", ignore_case=True)
        assert upper_py_one is not None
        assert upper_py_one == py_one

        mixed_py_two = codebase.get_file("FiLe2.Py", ignore_case=True)
        assert mixed_py_two is not None
        assert mixed_py_two == py_two

        upper_binary = codebase.get_file("FILE3.BIN", ignore_case=True)
        assert upper_binary is not None
        assert upper_binary == binary

        # Test ignore_case=False (default)
        assert codebase.get_file("FILE1.PY", ignore_case=False, optional=True) is None
        assert codebase.get_file("FiLe2.Py", ignore_case=False, optional=True) is None
        assert codebase.get_file("FILE3.BIN", ignore_case=False, optional=True) is None
121+
122+
123+
@pytest.mark.skipif(sys.platform == "darwin", reason="macOS is case-insensitive")
@pytest.mark.xfail(reason="Blocked on CG-11949")
def test_file_case_sensitivity_has_file(tmpdir) -> None:
    """Check ``has_file`` for exact and re-cased names, with and without ``ignore_case``."""
    # Exact on-disk name -> re-cased spellings of the same name.
    recased = {
        "file1.py": ("FILE1.PY", "FiLe1.Py"),
        "file2.py": ("FILE2.PY", "FiLe2.Py"),
        "file3.bin": ("FILE3.BIN", "FiLe3.BiN"),
    }
    sources = {"file1.py": "print(123)", "file2.py": "print(456)", "file3.bin": b"\x89PNG"}
    with get_codebase_session(tmpdir=tmpdir, files=sources, config=Config) as codebase:
        # Test has_file with ignore_case=True
        for exact_name, spellings in recased.items():
            for candidate in (exact_name, *spellings):
                assert codebase.has_file(candidate, ignore_case=True)

        # Test has_file with ignore_case=False (default)
        for exact_name, spellings in recased.items():
            assert codebase.has_file(exact_name, ignore_case=False)
            for candidate in spellings:
                assert not codebase.has_file(candidate, ignore_case=False)
148+
149+
150+
@pytest.mark.skipif(sys.platform == "darwin", reason="macOS is case-insensitive")
@pytest.mark.xfail(reason="Blocked on CG-11949")
def test_file_case_sensitivity_get_file(tmpdir) -> None:
    """Check ``get_file`` for re-cased names, with and without ``ignore_case``."""
    # Exact on-disk name -> re-cased spellings of the same name.
    recased = {
        "file1.py": ("FILE1.PY", "FiLe1.Py"),
        "file2.py": ("FILE2.PY", "FiLe2.Py"),
        "file3.bin": ("FILE3.BIN", "FiLe3.BiN"),
    }
    sources = {"file1.py": "print(123)", "file2.py": "print(456)", "file3.bin": b"\x89PNG"}
    with get_codebase_session(tmpdir=tmpdir, files=sources, config=Config) as codebase:
        # Resolve each file once under its exact name, in declaration order.
        expected = {exact_name: codebase.get_file(exact_name) for exact_name in recased}

        # Test get_file with ignore_case=True
        for exact_name, spellings in recased.items():
            for candidate in spellings:
                assert codebase.get_file(candidate, ignore_case=True) == expected[exact_name]

        # Test get_file with ignore_case=False (default)
        for spellings in recased.values():
            for candidate in spellings:
                assert codebase.get_file(candidate, ignore_case=False, optional=True) is None
173+
174+
175+
def test_minified_file(tmpdir) -> None:
    """Check that minified JavaScript files are not returned by ``codebase.ctx.get_file``.

    Two files are created: one named ``*.min.js`` and one whose content is
    read from the ``example.min.js`` fixture located next to this test module.
    ``codebase.ctx.get_file`` is expected to return ``None`` for both.
    """
    # Read the fixture inside a context manager so the handle is closed
    # deterministically, and decode it explicitly instead of relying on the
    # platform's default encoding.
    fixture_path = os.path.join(os.path.dirname(__file__), "example.min.js")
    with open(fixture_path, encoding="utf-8") as fixture:
        minified_source = fixture.read()

    with get_codebase_session(
        tmpdir=tmpdir,
        files={
            "file1.min.js": "console.log(123)",
            "file2.js": minified_source,
        },
        programming_language=ProgrammingLanguage.TYPESCRIPT,
        config=Config,
    ) as codebase:
        # This should match the `*.min.js` pattern
        file1 = codebase.ctx.get_file("file1.min.js")
        assert file1 is None

        # This should match the maximum line length threshold
        file2 = codebase.ctx.get_file("file2.js")
        assert file2 is None

0 commit comments

Comments
 (0)