Skip to content

Commit 9c7c6e9

Browse files
authored
feat: ripgrep in search tool (#663)
# Motivation
<!-- Why is this change necessary? -->

# Content
<!-- Please include a summary of the change -->

# Testing
<!-- How was the change tested? -->

# Please check the following before marking your PR as ready for review
- [x] I have added tests for my changes
- [x] I have updated the documentation or added new documentation as needed

---------

Co-authored-by: kopekC <[email protected]>
1 parent 99ad2ee commit 9c7c6e9

File tree

6 files changed

+431
-27
lines changed

6 files changed

+431
-27
lines changed

codegen-examples/examples/swebench_agent_run/.env.template

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,3 +2,5 @@ OPENAI_API_KEY= # Your OpenAI API key
22
ANTHROPIC_API_KEY= # Your Anthropic API key
33
LANGSMITH_API_KEY= # Your Langsmith API key
44
LANGCHAIN_TRACING_V2= # `true` for tracing, `false` for no tracing
5+
LANGCHAIN_PROJECT= # Your Langchain project
6+
RELACE_API= # Your Relace API key

codegen-examples/examples/swebench_agent_run/entry_point.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
image = (
66
modal.Image.debian_slim(python_version="3.13")
7-
.apt_install("git")
7+
.apt_install(["git", "ripgrep"])
88
.pip_install("fastapi[standard]")
99
.copy_local_dir("../../../", "/root/codegen", ignore=[".venv", "**/.venv", "tests", "**/tests"])
1010
.run_commands("pip install -e /root/codegen")

src/codegen/extensions/langchain/tools.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -111,23 +111,30 @@ def _run(self, dirpath: str = "./", depth: int = 1) -> str:
111111
class SearchInput(BaseModel):
112112
"""Input for searching the codebase."""
113113

114-
query: str = Field(..., description="The search query, passed into python's re.match()")
114+
query: str = Field(
115+
...,
116+
description="The search query to find in the codebase. When ripgrep is available, this will be passed as a ripgrep pattern. For regex searches, set use_regex=True. Ripgrep is the preferred method.",
117+
)
115118
target_directories: Optional[list[str]] = Field(default=None, description="Optional list of directories to search in")
119+
file_extensions: Optional[list[str]] = Field(default=None, description="Optional list of file extensions to search (e.g. ['.py', '.ts'])")
120+
page: int = Field(default=1, description="Page number to return (1-based, default: 1)")
121+
files_per_page: int = Field(default=10, description="Number of files to return per page (default: 10)")
122+
use_regex: bool = Field(default=False, description="Whether to treat query as a regex pattern (default: False)")
116123

117124

118125
class SearchTool(BaseTool):
119126
"""Tool for searching the codebase."""
120127

121128
name: ClassVar[str] = "search"
122-
description: ClassVar[str] = "Search the codebase using text search"
129+
description: ClassVar[str] = "Search the codebase using text search or regex pattern matching"
123130
args_schema: ClassVar[type[BaseModel]] = SearchInput
124131
codebase: Codebase = Field(exclude=True)
125132

126133
def __init__(self, codebase: Codebase) -> None:
127134
super().__init__(codebase=codebase)
128135

129-
def _run(self, query: str, target_directories: Optional[list[str]] = None) -> str:
130-
result = search(self.codebase, query, target_directories)
136+
def _run(self, query: str, target_directories: Optional[list[str]] = None, file_extensions: Optional[list[str]] = None, page: int = 1, files_per_page: int = 10, use_regex: bool = False) -> str:
137+
result = search(self.codebase, query, target_directories=target_directories, file_extensions=file_extensions, page=page, files_per_page=files_per_page, use_regex=use_regex)
131138
return result.render()
132139

133140

src/codegen/extensions/mcp/codebase_tools.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -37,16 +37,19 @@ def reveal_symbol_tool(
3737
return json.dumps(result, indent=2)
3838

3939

40-
@mcp.tool(name="search_codebase", description="Search the codebase using text search or regex pattern matching")
40+
@mcp.tool(name="search_codebase", description="The search query to find in the codebase. When ripgrep is available, this will be passed as a ripgrep pattern. For regex searches, set use_regex=True")
4141
def search_codebase_tool(
42-
query: str,
43-
target_directories: Annotated[Optional[list[str]], "list of directories to search within"],
42+
query: Annotated[str, "The search query to find in the codebase. When ripgrep is available, this will be passed as a ripgrep pattern. For regex searches, set use_regex=True."],
4443
codebase_dir: Annotated[str, "The root directory of your codebase"],
4544
codebase_language: Annotated[ProgrammingLanguage, "The language the codebase is written in"],
46-
use_regex: Annotated[bool, "use regex for the search query"],
45+
target_directories: Annotated[Optional[list[str]], "list of directories to search within"] = None,
46+
file_extensions: Annotated[Optional[list[str]], "list of file extensions to search (e.g. ['.py', '.ts'])"] = None,
47+
page: Annotated[int, "page number to return (1-based)"] = 1,
48+
files_per_page: Annotated[int, "number of files to return per page"] = 10,
49+
use_regex: Annotated[bool, "use regex for the search query"] = False,
4750
):
4851
codebase = Codebase(repo_path=codebase_dir, language=codebase_language)
49-
result = search(codebase, query, target_directories, use_regex=use_regex)
52+
result = search(codebase, query, target_directories=target_directories, file_extensions=file_extensions, page=page, files_per_page=files_per_page, use_regex=use_regex)
5053
return json.dumps(result, indent=2)
5154

5255

src/codegen/extensions/tools/search.py

Lines changed: 191 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,9 @@
55
Results are paginated with a default of 10 files per page.
66
"""
77

8+
import os
89
import re
10+
import subprocess
911
from typing import ClassVar, Optional
1012

1113
from pydantic import Field
@@ -109,7 +111,7 @@ def render(self) -> str:
109111
return "\n".join(lines)
110112

111113

112-
def search(
114+
def _search_with_ripgrep(
113115
codebase: Codebase,
114116
query: str,
115117
target_directories: Optional[list[str]] = None,
@@ -118,25 +120,159 @@ def search(
118120
files_per_page: int = 10,
119121
use_regex: bool = False,
120122
) -> SearchObservation:
121-
"""Search the codebase using text search or regex pattern matching.
123+
"""Search the codebase using ripgrep.
122124
123-
If use_regex is True, performs a regex pattern match on each line.
124-
Otherwise, performs a case-insensitive text search.
125-
Returns matching lines with their line numbers, grouped by file.
126-
Results are paginated by files, with a default of 10 files per page.
125+
This is faster than the Python implementation, especially for large codebases.
126+
"""
127+
# Build ripgrep command
128+
cmd = ["rg", "--line-number"]
129+
130+
# Plain-text queries: match the query literally (--fixed-strings) and case-insensitively
131+
if not use_regex:
132+
cmd.append("--fixed-strings")
133+
cmd.append("--ignore-case")
134+
135+
# Add file extensions if specified
136+
if file_extensions:
137+
for ext in file_extensions:
138+
# Remove leading dot if present
139+
ext = ext[1:] if ext.startswith(".") else ext
140+
cmd.extend(["--type-add", f"custom:{ext}", "--type", "custom"])
141+
142+
# Add target directories if specified
143+
search_path = codebase.repo_path
144+
if target_directories:
145+
# We'll handle target directories by filtering results later
146+
pass
147+
148+
# Add the query and path
149+
cmd.append(query)
150+
cmd.append(search_path)
151+
152+
# Run ripgrep
153+
try:
154+
# Use text mode and UTF-8 encoding
155+
result = subprocess.run(
156+
cmd,
157+
capture_output=True,
158+
text=True,
159+
encoding="utf-8",
160+
check=False, # Don't raise exception on non-zero exit code (no matches)
161+
)
162+
163+
# Parse the output
164+
all_results: dict[str, list[SearchMatch]] = {}
165+
166+
# ripgrep exits with 1 when no matches are found; any other non-zero exit code is a real error
167+
if result.returncode != 0 and result.returncode != 1:
168+
# Real error occurred
169+
return SearchObservation(
170+
status="error",
171+
error=f"ripgrep error: {result.stderr}",
172+
query=query,
173+
page=page,
174+
total_pages=0,
175+
total_files=0,
176+
files_per_page=files_per_page,
177+
results=[],
178+
)
127179

128-
Args:
129-
codebase: The codebase to operate on
130-
query: The text to search for or regex pattern to match
131-
target_directories: Optional list of directories to search in
132-
file_extensions: Optional list of file extensions to search (e.g. ['.py', '.ts']).
133-
If None, searches all files ('*')
134-
page: Page number to return (1-based, default: 1)
135-
files_per_page: Number of files to return per page (default: 10)
136-
use_regex: Whether to treat query as a regex pattern (default: False)
180+
# Parse output lines
181+
for line in result.stdout.splitlines():
182+
# ripgrep output format: file:line:content
183+
parts = line.split(":", 2)
184+
if len(parts) < 3:
185+
continue
186+
187+
filepath, line_number_str, content = parts
188+
189+
# Convert to relative path within the codebase
190+
rel_path = os.path.relpath(filepath, codebase.repo_path)
191+
192+
# Skip if not in target directories
193+
if target_directories and not any(rel_path.startswith(d) for d in target_directories):
194+
continue
195+
196+
try:
197+
line_number = int(line_number_str)
198+
199+
# Find the actual match text
200+
match_text = query
201+
if use_regex:
202+
# For regex, we need to find what actually matched
203+
# This is a simplification - ideally we'd use ripgrep's --json option
204+
# to get the exact match positions
205+
pattern = re.compile(query)
206+
match_obj = pattern.search(content)
207+
if match_obj:
208+
match_text = match_obj.group(0)
209+
210+
# Create or append to file results
211+
if rel_path not in all_results:
212+
all_results[rel_path] = []
213+
214+
all_results[rel_path].append(
215+
SearchMatch(
216+
status="success",
217+
line_number=line_number,
218+
line=content.strip(),
219+
match=match_text,
220+
)
221+
)
222+
except ValueError:
223+
# Skip lines with invalid line numbers
224+
continue
225+
226+
# Convert to SearchFileResult objects
227+
file_results = []
228+
for filepath, matches in all_results.items():
229+
file_results.append(
230+
SearchFileResult(
231+
status="success",
232+
filepath=filepath,
233+
matches=sorted(matches, key=lambda x: x.line_number),
234+
)
235+
)
137236

138-
Returns:
139-
SearchObservation containing search results with matches and their sources
237+
# Sort results by filepath
238+
file_results.sort(key=lambda x: x.filepath)
239+
240+
# Calculate pagination
241+
total_files = len(file_results)
242+
total_pages = (total_files + files_per_page - 1) // files_per_page
243+
start_idx = (page - 1) * files_per_page
244+
end_idx = start_idx + files_per_page
245+
246+
# Get the current page of results
247+
paginated_results = file_results[start_idx:end_idx]
248+
249+
return SearchObservation(
250+
status="success",
251+
query=query,
252+
page=page,
253+
total_pages=total_pages,
254+
total_files=total_files,
255+
files_per_page=files_per_page,
256+
results=paginated_results,
257+
)
258+
259+
except (subprocess.SubprocessError, FileNotFoundError) as e:
260+
# Let the caller handle this by falling back to Python implementation
261+
raise
262+
263+
264+
def _search_with_python(
265+
codebase: Codebase,
266+
query: str,
267+
target_directories: Optional[list[str]] = None,
268+
file_extensions: Optional[list[str]] = None,
269+
page: int = 1,
270+
files_per_page: int = 10,
271+
use_regex: bool = False,
272+
) -> SearchObservation:
273+
"""Search the codebase using Python's regex engine.
274+
275+
This is a fallback for when ripgrep is not available.
140276
"""
141277
# Validate pagination parameters
142278
if page < 1:
@@ -225,3 +361,41 @@ def search(
225361
files_per_page=files_per_page,
226362
results=paginated_results,
227363
)
364+
365+
366+
def search(
367+
codebase: Codebase,
368+
query: str,
369+
target_directories: Optional[list[str]] = None,
370+
file_extensions: Optional[list[str]] = None,
371+
page: int = 1,
372+
files_per_page: int = 10,
373+
use_regex: bool = False,
374+
) -> SearchObservation:
375+
"""Search the codebase using text search or regex pattern matching.
376+
377+
Uses ripgrep for performance when available, with fallback to Python's regex engine.
378+
If use_regex is True, performs a regex pattern match on each line.
379+
Otherwise, performs a case-insensitive text search.
380+
Returns matching lines with their line numbers, grouped by file.
381+
Results are paginated by files, with a default of 10 files per page.
382+
383+
Args:
384+
codebase: The codebase to operate on
385+
query: The text to search for or regex pattern to match
386+
target_directories: Optional list of directories to search in
387+
file_extensions: Optional list of file extensions to search (e.g. ['.py', '.ts']).
388+
If None, searches all files ('*')
389+
page: Page number to return (1-based, default: 1)
390+
files_per_page: Number of files to return per page (default: 10)
391+
use_regex: Whether to treat query as a regex pattern (default: False)
392+
393+
Returns:
394+
SearchObservation containing search results with matches and their sources
395+
"""
396+
# Try to use ripgrep first
397+
try:
398+
return _search_with_ripgrep(codebase, query, target_directories, file_extensions, page, files_per_page, use_regex)
399+
except (FileNotFoundError, subprocess.SubprocessError):
400+
# Fall back to Python implementation if ripgrep fails or isn't available
401+
return _search_with_python(codebase, query, target_directories, file_extensions, page, files_per_page, use_regex)

0 commit comments

Comments
 (0)