Skip to content

feat: ripgrep in search tool #663

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions codegen-examples/examples/swebench_agent_run/.env.template
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,5 @@ OPENAI_API_KEY= # Your OpenAI API key
ANTHROPIC_API_KEY= # Your Anthropic API key
LANGSMITH_API_KEY= # Your Langsmith API key
LANGCHAIN_TRACING_V2= # `true` for tracing, `false` for no tracing
LANGCHAIN_PROJECT= # Your Langchain project
RELACE_API= # Your Relace API key
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

image = (
modal.Image.debian_slim(python_version="3.13")
.apt_install("git")
.apt_install(["git", "ripgrep"])
.pip_install("fastapi[standard]")
.copy_local_dir("../../../", "/root/codegen", ignore=[".venv", "**/.venv", "tests", "**/tests"])
.run_commands("pip install -e /root/codegen")
Expand Down
15 changes: 11 additions & 4 deletions src/codegen/extensions/langchain/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,23 +111,30 @@ def _run(self, dirpath: str = "./", depth: int = 1) -> str:
class SearchInput(BaseModel):
    """Input for searching the codebase."""

    # NOTE: the class docstring and every `description` string below are emitted
    # as part of the tool's argument schema, so they are kept verbatim.

    # The text (or regex, when use_regex=True) to look for.
    query: str = Field(
        ...,
        description="The search query to find in the codebase. When ripgrep is available, this will be passed as a ripgrep pattern. For regex searches, set use_regex=True. Ripgrep is the preferred method.",
    )
    # Optional restriction of the search to specific directories.
    target_directories: Optional[list[str]] = Field(default=None, description="Optional list of directories to search in")
    # Optional restriction of the search to specific file extensions.
    file_extensions: Optional[list[str]] = Field(default=None, description="Optional list of file extensions to search (e.g. ['.py', '.ts'])")
    # Pagination controls: results are paged by file, not by match.
    page: int = Field(default=1, description="Page number to return (1-based, default: 1)")
    files_per_page: int = Field(default=10, description="Number of files to return per page (default: 10)")
    # When False the query is treated as literal text.
    use_regex: bool = Field(default=False, description="Whether to treat query as a regex pattern (default: False)")


class SearchTool(BaseTool):
    """Tool for searching the codebase."""

    name: ClassVar[str] = "search"
    description: ClassVar[str] = "Search the codebase using text search or regex pattern matching"
    args_schema: ClassVar[type[BaseModel]] = SearchInput
    # Excluded from serialization: the codebase is runtime state, not tool config.
    codebase: Codebase = Field(exclude=True)

    def __init__(self, codebase: Codebase) -> None:
        super().__init__(codebase=codebase)

    def _run(
        self,
        query: str,
        target_directories: Optional[list[str]] = None,
        file_extensions: Optional[list[str]] = None,
        page: int = 1,
        files_per_page: int = 10,
        use_regex: bool = False,
    ) -> str:
        """Run a search over ``self.codebase`` and return the rendered result.

        Every argument mirrors a field of ``SearchInput`` and is forwarded to
        ``search`` by keyword, so the two signatures must stay in sync.

        Args:
            query: Text or regex pattern to search for.
            target_directories: Optional list of directories to search in.
            file_extensions: Optional list of file extensions to search.
            page: 1-based page number to return.
            files_per_page: Number of files per page.
            use_regex: Whether ``query`` is a regex pattern.

        Returns:
            The string produced by the search result's ``render()`` method.
        """
        result = search(
            self.codebase,
            query,
            target_directories=target_directories,
            file_extensions=file_extensions,
            page=page,
            files_per_page=files_per_page,
            use_regex=use_regex,
        )
        return result.render()


Expand Down
11 changes: 7 additions & 4 deletions src/codegen/extensions/mcp/codebase_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,14 +39,17 @@ def reveal_symbol_tool(

@mcp.tool(name="search_codebase", description="Search the codebase using text search or regex pattern matching")
def search_codebase_tool(
    query: Annotated[str, "The search query to find in the codebase. When ripgrep is available, this will be passed as a ripgrep pattern. For regex searches, set use_regex=True."],
    codebase_dir: Annotated[str, "The root directory of your codebase"],
    codebase_language: Annotated[ProgrammingLanguage, "The language the codebase is written in"],
    target_directories: Annotated[Optional[list[str]], "list of directories to search within"] = None,
    file_extensions: Annotated[Optional[list[str]], "list of file extensions to search (e.g. ['.py', '.ts'])"] = None,
    page: Annotated[int, "page number to return (1-based)"] = 1,
    files_per_page: Annotated[int, "number of files to return per page"] = 10,
    use_regex: Annotated[bool, "use regex for the search query"] = False,
):
    """Search a codebase on disk and return the result serialized as JSON.

    A fresh ``Codebase`` is built from ``codebase_dir`` on every call, then the
    query and all filtering/pagination options are forwarded to ``search``.
    Required parameters come first; every optional one has a default so that
    clients may omit it.
    """
    codebase = Codebase(repo_path=codebase_dir, language=codebase_language)
    result = search(
        codebase,
        query,
        target_directories=target_directories,
        file_extensions=file_extensions,
        page=page,
        files_per_page=files_per_page,
        use_regex=use_regex,
    )
    # NOTE(review): `search` is annotated as returning a SearchObservation; confirm
    # that object is accepted by json.dumps as-is, otherwise this needs an explicit
    # conversion (e.g. the observation's own dump/render method).
    return json.dumps(result, indent=2)


Expand Down
208 changes: 191 additions & 17 deletions src/codegen/extensions/tools/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@
Results are paginated with a default of 10 files per page.
"""

import os
import re
import subprocess
from typing import ClassVar, Optional

from pydantic import Field
Expand Down Expand Up @@ -109,7 +111,7 @@ def render(self) -> str:
return "\n".join(lines)


def search(
def _is_under_any_directory(rel_path: str, directories: list[str]) -> bool:
    """Return True if ``rel_path`` lies inside (or equals) one of ``directories``.

    Matching is done on whole path components, so ``src`` matches ``src/a.py``
    but not ``src2/a.py``. ``.`` (what an empty string normalizes to) matches
    every path.
    """
    for directory in directories:
        if directory == os.curdir or rel_path == directory or rel_path.startswith(directory + os.sep):
            return True
    return False


def _search_with_ripgrep(
    codebase: Codebase,
    query: str,
    target_directories: Optional[list[str]] = None,
    file_extensions: Optional[list[str]] = None,
    page: int = 1,
    files_per_page: int = 10,
    use_regex: bool = False,
) -> SearchObservation:
    """Search the codebase by shelling out to ripgrep (``rg``).

    When ``use_regex`` is False the query is matched as a literal,
    case-insensitive string; otherwise it is handed to ripgrep as a regex.
    Matches are grouped by file (path relative to ``codebase.repo_path``),
    files are sorted by path and then paginated.

    Args:
        codebase: The codebase to operate on; only ``repo_path`` is read here.
        query: The text to search for or regex pattern to match.
        target_directories: Optional list of directories (relative to the repo
            root) to keep results from. Applied as a filter on ripgrep's output.
        file_extensions: Optional list of file extensions to search
            (e.g. ['.py', '.ts']); a leading dot is optional.
        page: Page number to return (1-based). Values below 1 are treated as 1.
        files_per_page: Number of files per page. Values below 1 fall back to 10.
        use_regex: Whether to treat query as a regex pattern.

    Returns:
        SearchObservation with status "success" and the requested page of
        results, or status "error" if ripgrep exits with a code other than
        0 (matches) or 1 (no matches).

    Raises:
        FileNotFoundError: If the ``rg`` executable is not installed.
        subprocess.SubprocessError: If the subprocess could not be run.
            Both are left to propagate so the caller can fall back to the
            Python implementation.
    """
    # Clamp nonsensical pagination values: a page below 1 would yield negative
    # slice indices, and a non-positive page size would divide by zero below.
    page = max(page, 1)
    if files_per_page < 1:
        files_per_page = 10

    cmd = [
        "rg",
        "--line-number",
        # Pin the output layout so it does not depend on tty detection or on a
        # user-level ripgrep config file.
        "--with-filename",
        "--no-heading",
        "--color",
        "never",
        # Terminate the path with NUL instead of ':' so that paths containing
        # colons (e.g. Windows drive letters) are parsed unambiguously.
        "--null",
    ]

    # Plain-text mode: literal, case-insensitive matching.
    if not use_regex:
        cmd.extend(["--fixed-strings", "--ignore-case"])

    # Restrict to the requested extensions via an ad-hoc file type. The
    # --type-add value must be a glob ("custom:*.py"), not a bare extension.
    if file_extensions:
        has_type_filter = False
        for ext in file_extensions:
            ext = ext[1:] if ext.startswith(".") else ext
            if not ext:
                continue
            cmd.extend(["--type-add", f"custom:*.{ext}"])
            has_type_filter = True
        if has_type_filter:
            cmd.extend(["--type", "custom"])

    # "--" ends option parsing so a query beginning with '-' is not mistaken
    # for a ripgrep flag.
    cmd.extend(["--", query, str(codebase.repo_path)])

    result = subprocess.run(
        cmd,
        capture_output=True,
        text=True,
        encoding="utf-8",
        errors="replace",  # matched lines may contain bytes that are not valid UTF-8
        check=False,  # exit code 1 just means "no matches"
    )

    # 0 = matches found, 1 = no matches; anything else is a real failure.
    if result.returncode not in (0, 1):
        return SearchObservation(
            status="error",
            error=f"ripgrep error: {result.stderr}",
            query=query,
            page=page,
            total_pages=0,
            total_files=0,
            files_per_page=files_per_page,
            results=[],
        )

    # Normalize directory filters once ("./src/" -> "src") so they compare
    # cleanly against the output of os.path.relpath.
    directory_filters = [os.path.normpath(d) for d in target_directories] if target_directories else []

    # Compile once, outside the loop. ripgrep's regex dialect is not identical
    # to Python's, so a pattern ripgrep accepted may fail to compile here; in
    # that case the raw query is reported as the match text.
    pattern = None
    if use_regex:
        try:
            pattern = re.compile(query)
        except re.error:
            pattern = None

    all_results: dict[str, list[SearchMatch]] = {}

    # Each output record is "<path>\0<line number>:<content>".
    for record in result.stdout.split("\n"):
        filepath, nul, remainder = record.partition("\0")
        line_number_str, colon, content = remainder.partition(":")
        if not nul or not colon:
            continue

        try:
            line_number = int(line_number_str)
        except ValueError:
            # Not a match record; skip it.
            continue

        rel_path = os.path.relpath(filepath, codebase.repo_path)
        if directory_filters and not _is_under_any_directory(rel_path, directory_filters):
            continue

        # Best-effort recovery of the matched text; ripgrep's --json output
        # would give exact match spans.
        match_text = query
        if pattern is not None:
            match_obj = pattern.search(content)
            if match_obj:
                match_text = match_obj.group(0)

        all_results.setdefault(rel_path, []).append(
            SearchMatch(
                status="success",
                line_number=line_number,
                line=content.strip(),
                match=match_text,
            )
        )

    # One result per file, ordered by path, with matches ordered by line.
    file_results = [
        SearchFileResult(
            status="success",
            filepath=filepath,
            matches=sorted(matches, key=lambda m: m.line_number),
        )
        for filepath, matches in sorted(all_results.items())
    ]

    # Paginate by file.
    total_files = len(file_results)
    total_pages = (total_files + files_per_page - 1) // files_per_page
    start_idx = (page - 1) * files_per_page
    paginated_results = file_results[start_idx : start_idx + files_per_page]

    return SearchObservation(
        status="success",
        query=query,
        page=page,
        total_pages=total_pages,
        total_files=total_files,
        files_per_page=files_per_page,
        results=paginated_results,
    )


def _search_with_python(
codebase: Codebase,
query: str,
target_directories: Optional[list[str]] = None,
file_extensions: Optional[list[str]] = None,
page: int = 1,
files_per_page: int = 10,
use_regex: bool = False,
) -> SearchObservation:
"""Search the codebase using Python's regex engine.

This is a fallback for when ripgrep is not available.
"""
# Validate pagination parameters
if page < 1:
Expand Down Expand Up @@ -225,3 +361,41 @@ def search(
files_per_page=files_per_page,
results=paginated_results,
)


def search(
    codebase: Codebase,
    query: str,
    target_directories: Optional[list[str]] = None,
    file_extensions: Optional[list[str]] = None,
    page: int = 1,
    files_per_page: int = 10,
    use_regex: bool = False,
) -> SearchObservation:
    """Find lines in the codebase that match a text query or a regex pattern.

    The ripgrep-backed implementation is attempted first for speed; if the
    ripgrep process cannot be run, the pure-Python implementation is used
    with exactly the same arguments.

    With use_regex=True the query is matched as a regex against each line;
    otherwise a case-insensitive text search is performed. Matching lines are
    reported with their line numbers, grouped by file, and the files are
    paginated (10 per page unless files_per_page says otherwise).

    Args:
        codebase: The codebase to operate on
        query: The text to search for or regex pattern to match
        target_directories: Optional list of directories to search in
        file_extensions: Optional list of file extensions to search (e.g. ['.py', '.ts']).
            If None, searches all files ('*')
        page: Page number to return (1-based, default: 1)
        files_per_page: Number of files to return per page (default: 10)
        use_regex: Whether to treat query as a regex pattern (default: False)

    Returns:
        SearchObservation containing search results with matches and their sources
    """
    # Both backends share one positional signature, so bundle the arguments once.
    backend_args = (
        codebase,
        query,
        target_directories,
        file_extensions,
        page,
        files_per_page,
        use_regex,
    )

    try:
        # Preferred path: delegate to ripgrep.
        return _search_with_ripgrep(*backend_args)
    except (FileNotFoundError, subprocess.SubprocessError):
        # ripgrep is missing or could not be executed: use the Python fallback.
        return _search_with_python(*backend_args)
Loading
Loading