feat: ripgrep in search tool (#663)

kopekC · web-flow · commit 9c7c6e902c8a · 2025-02-26T12:39:10.000-05:00
# Motivation

<!-- Why is this change necessary? -->

# Content

<!-- Please include a summary of the change -->

# Testing

<!-- How was the change tested? -->

# Please check the following before marking your PR as ready for review

- [x] I have added tests for my changes
- [x] I have updated the documentation or added new documentation as
  needed

---------

Co-authored-by: kopekC <28070492+kopekC@users.noreply.github.com>
diff --git a/codegen-examples/examples/swebench_agent_run/.env.template b/codegen-examples/examples/swebench_agent_run/.env.template
@@ -2,3 +2,5 @@ OPENAI_API_KEY= # Your OpenAI API key
 ANTHROPIC_API_KEY= # Your Anthropic API key
 LANGSMITH_API_KEY= # Your Langsmith API key
 LANGCHAIN_TRACING_V2= # `true` for tracing, `false` for no tracing
+LANGCHAIN_PROJECT= # Your Langchain project
+RELACE_API= # Your Relace API key
diff --git a/codegen-examples/examples/swebench_agent_run/entry_point.py b/codegen-examples/examples/swebench_agent_run/entry_point.py
@@ -4,7 +4,7 @@
 
 image = (
     modal.Image.debian_slim(python_version="3.13")
-    .apt_install("git")
+    .apt_install(["git", "ripgrep"])
     .pip_install("fastapi[standard]")
     .copy_local_dir("../../../", "/root/codegen", ignore=[".venv", "**/.venv", "tests", "**/tests"])
     .run_commands("pip install -e /root/codegen")
diff --git a/src/codegen/extensions/langchain/tools.py b/src/codegen/extensions/langchain/tools.py
@@ -111,23 +111,30 @@ def _run(self, dirpath: str = "./", depth: int = 1) -> str:
 class SearchInput(BaseModel):
     """Input for searching the codebase."""
 
-    query: str = Field(..., description="The search query, passed into python's re.match()")
+    query: str = Field(
+        ...,
+        description="The text or pattern to find in the codebase. By default the query is matched literally and case-insensitively; set use_regex=True to treat it as a regex pattern. ripgrep is used when available, with a Python fallback.",
+    )
     target_directories: Optional[list[str]] = Field(default=None, description="Optional list of directories to search in")
+    file_extensions: Optional[list[str]] = Field(default=None, description="Optional list of file extensions to search (e.g. ['.py', '.ts'])")
+    page: int = Field(default=1, description="Page number to return (1-based, default: 1)")
+    files_per_page: int = Field(default=10, description="Number of files to return per page (default: 10)")
+    use_regex: bool = Field(default=False, description="Whether to treat query as a regex pattern (default: False)")
 
 
 class SearchTool(BaseTool):
     """Tool for searching the codebase."""
 
     name: ClassVar[str] = "search"
-    description: ClassVar[str] = "Search the codebase using text search"
+    description: ClassVar[str] = "Search the codebase using text search or regex pattern matching"
     args_schema: ClassVar[type[BaseModel]] = SearchInput
     codebase: Codebase = Field(exclude=True)
 
     def __init__(self, codebase: Codebase) -> None:
         super().__init__(codebase=codebase)
 
-    def _run(self, query: str, target_directories: Optional[list[str]] = None) -> str:
-        result = search(self.codebase, query, target_directories)
+    def _run(self, query: str, target_directories: Optional[list[str]] = None, file_extensions: Optional[list[str]] = None, page: int = 1, files_per_page: int = 10, use_regex: bool = False) -> str:
+        result = search(self.codebase, query, target_directories=target_directories, file_extensions=file_extensions, page=page, files_per_page=files_per_page, use_regex=use_regex)
         return result.render()
 
 
diff --git a/src/codegen/extensions/mcp/codebase_tools.py b/src/codegen/extensions/mcp/codebase_tools.py
@@ -37,16 +37,19 @@ def reveal_symbol_tool(
     return json.dumps(result, indent=2)
 
 
-@mcp.tool(name="search_codebase", description="Search the codebase using text search or regex pattern matching")
+@mcp.tool(name="search_codebase", description="Search the codebase for a text or regex query. Uses ripgrep when available and returns results paginated by file.")
 def search_codebase_tool(
-    query: str,
-    target_directories: Annotated[Optional[list[str]], "list of directories to search within"],
+    query: Annotated[str, "The text or pattern to find in the codebase. Matched literally and case-insensitively unless use_regex=True, in which case it is treated as a regex pattern."],
     codebase_dir: Annotated[str, "The root directory of your codebase"],
     codebase_language: Annotated[ProgrammingLanguage, "The language the codebase is written in"],
-    use_regex: Annotated[bool, "use regex for the search query"],
+    target_directories: Annotated[Optional[list[str]], "list of directories to search within"] = None,
+    file_extensions: Annotated[Optional[list[str]], "list of file extensions to search (e.g. ['.py', '.ts'])"] = None,
+    page: Annotated[int, "page number to return (1-based)"] = 1,
+    files_per_page: Annotated[int, "number of files to return per page"] = 10,
+    use_regex: Annotated[bool, "use regex for the search query"] = False,
 ):
     codebase = Codebase(repo_path=codebase_dir, language=codebase_language)
-    result = search(codebase, query, target_directories, use_regex=use_regex)
+    result = search(codebase, query, target_directories=target_directories, file_extensions=file_extensions, page=page, files_per_page=files_per_page, use_regex=use_regex)
     return json.dumps(result, indent=2)
 
 
diff --git a/src/codegen/extensions/tools/search.py b/src/codegen/extensions/tools/search.py
@@ -5,7 +5,9 @@
 Results are paginated with a default of 10 files per page.
 """
 
+import os
 import re
+import subprocess
 from typing import ClassVar, Optional
 
 from pydantic import Field
@@ -109,7 +111,7 @@ def render(self) -> str:
         return "\n".join(lines)
 
 
-def search(
+def _search_with_ripgrep(
     codebase: Codebase,
     query: str,
     target_directories: Optional[list[str]] = None,
@@ -118,25 +120,159 @@ def search(
     files_per_page: int = 10,
     use_regex: bool = False,
 ) -> SearchObservation:
-    """Search the codebase using text search or regex pattern matching.
+    """Search the codebase using ripgrep.
 
-    If use_regex is True, performs a regex pattern match on each line.
-    Otherwise, performs a case-insensitive text search.
-    Returns matching lines with their line numbers, grouped by file.
-    Results are paginated by files, with a default of 10 files per page.
+    This is faster than the Python implementation, especially for large codebases.
+    Plain-text queries are matched literally and case-insensitively; with
+    use_regex=True the query is handed to ripgrep as a regex pattern.
+
+    Returns:
+        SearchObservation with the requested page of per-file matches, or one
+        with status "error" when ripgrep exits with a code other than 0 or 1.
+
+    Raises:
+        FileNotFoundError, subprocess.SubprocessError: propagated from
+            subprocess.run so the caller can fall back to the Python search.
+    """
+    # A page below 1 would slice from the end of the list and a page size below 1
+    # would divide by zero, so normalise both before they are used.
+    page = max(page, 1)
+    files_per_page = files_per_page if files_per_page >= 1 else 10
+
+    # Build ripgrep command
+    cmd = ["rg", "--line-number"]
+
+    # Add case insensitivity if not using regex
+    if not use_regex:
+        cmd.append("--fixed-strings")
+        cmd.append("--ignore-case")
+
+    # Add file extensions if specified
+    if file_extensions:
+        for ext in file_extensions:
+            # Remove leading dot if present
+            ext = ext[1:] if ext.startswith(".") else ext
+            # --type-add expects "name:glob"; a bare extension would only match a file named exactly that
+            cmd.extend(["--type-add", f"custom:*.{ext}", "--type", "custom"])
+
+    # target_directories is applied as a filter on the parsed results below,
+    # so ripgrep always scans the whole repository.
+    # -e keeps a query that starts with "-" from being parsed as an option.
+    cmd.extend(["-e", query])
+    cmd.append(codebase.repo_path)
+
+    # Decode as UTF-8, replacing undecodable bytes instead of raising UnicodeDecodeError
+    result = subprocess.run(
+        cmd,
+        capture_output=True,
+        text=True,
+        encoding="utf-8",
+        errors="replace",
+        check=False,  # Don't raise exception on non-zero exit code (no matches)
+    )
+
+    # Exit code 1 only means "no matches"; anything other than 0 or 1 is a real error
+    if result.returncode not in (0, 1):
+        return SearchObservation(
+            status="error",
+            error=f"ripgrep error: {result.stderr}",
+            query=query,
+            page=page,
+            total_pages=0,
+            total_files=0,
+            files_per_page=files_per_page,
+            results=[],
+        )
 
-    Args:
-        codebase: The codebase to operate on
-        query: The text to search for or regex pattern to match
-        target_directories: Optional list of directories to search in
-        file_extensions: Optional list of file extensions to search (e.g. ['.py', '.ts']).
-                        If None, searches all files ('*')
-        page: Page number to return (1-based, default: 1)
-        files_per_page: Number of files to return per page (default: 10)
-        use_regex: Whether to treat query as a regex pattern (default: False)
+    # Python's re is used only to recover the matched text for display; ripgrep's regex
+    # syntax differs, so a pattern Python rejects falls back to reporting the raw query.
+    pattern = None
+    if use_regex:
+        try:
+            pattern = re.compile(query)
+        except re.error:
+            pattern = None
+
+    # Parse output lines
+    all_results: dict[str, list[SearchMatch]] = {}
+    for line in result.stdout.splitlines():
+        # ripgrep output format: file:line:content
+        parts = line.split(":", 2)
+        if len(parts) < 3:
+            continue
+
+        filepath, line_number_str, content = parts
+
+        # Convert to relative path within the codebase
+        rel_path = os.path.relpath(filepath, codebase.repo_path)
+
+        # Skip if not in target directories
+        if target_directories and not any(rel_path.startswith(d) for d in target_directories):
+            continue
+
+        try:
+            line_number = int(line_number_str)
+        except ValueError:
+            # Skip lines with invalid line numbers
+            continue
+
+        # Report what actually matched when it can be recovered, otherwise the query itself
+        match_obj = pattern.search(content) if pattern is not None else None
+        match_text = match_obj.group(0) if match_obj else query
+
+        all_results.setdefault(rel_path, []).append(
+            SearchMatch(
+                status="success",
+                line_number=line_number,
+                line=content.strip(),
+                match=match_text,
+            )
+        )
+
+    # Convert to SearchFileResult objects
+    file_results = [
+        SearchFileResult(
+            status="success",
+            filepath=filepath,
+            matches=sorted(matches, key=lambda x: x.line_number),
+        )
+        for filepath, matches in all_results.items()
+    ]
 
-    Returns:
-        SearchObservation containing search results with matches and their sources
+    # Sort results by filepath
+    file_results.sort(key=lambda x: x.filepath)
+
+    # Calculate pagination
+    total_files = len(file_results)
+    total_pages = (total_files + files_per_page - 1) // files_per_page
+    start_idx = (page - 1) * files_per_page
+    end_idx = start_idx + files_per_page
+
+    # Get the current page of results
+    paginated_results = file_results[start_idx:end_idx]
+
+    return SearchObservation(
+        status="success",
+        query=query,
+        page=page,
+        total_pages=total_pages,
+        total_files=total_files,
+        files_per_page=files_per_page,
+        results=paginated_results,
+    )
+
+
+def _search_with_python(
+    codebase: Codebase,
+    query: str,
+    target_directories: Optional[list[str]] = None,
+    file_extensions: Optional[list[str]] = None,
+    page: int = 1,
+    files_per_page: int = 10,
+    use_regex: bool = False,
+) -> SearchObservation:
+    """Search the codebase using Python's regex engine.
+
+    This is a fallback for when ripgrep is not available.
     """
     # Validate pagination parameters
     if page < 1:
@@ -225,3 +361,41 @@ def search(
         files_per_page=files_per_page,
         results=paginated_results,
     )
+
+
+def search(
+    codebase: Codebase,
+    query: str,
+    target_directories: Optional[list[str]] = None,
+    file_extensions: Optional[list[str]] = None,
+    page: int = 1,
+    files_per_page: int = 10,
+    use_regex: bool = False,
+) -> SearchObservation:
+    """Search the codebase using text search or regex pattern matching.
+
+    Uses ripgrep for performance when available, with fallback to Python's regex engine.
+    If use_regex is True, performs a regex pattern match on each line.
+    Otherwise, performs a case-insensitive text search.
+    Returns matching lines with their line numbers, grouped by file.
+    Results are paginated by files, with a default of 10 files per page.
+
+    Args:
+        codebase: The codebase to operate on
+        query: The text to search for or regex pattern to match
+        target_directories: Optional list of directories to search in
+        file_extensions: Optional list of file extensions to search (e.g. ['.py', '.ts']).
+                        If None, searches all files ('*')
+        page: Page number to return (1-based, default: 1)
+        files_per_page: Number of files to return per page (default: 10)
+        use_regex: Whether to treat query as a regex pattern (default: False)
+
+    Returns:
+        SearchObservation containing search results with matches and their sources
+    """
+    # Try ripgrep first; both backends take the same arguments, passed by keyword so they cannot be misordered
+    try:
+        return _search_with_ripgrep(codebase, query, target_directories=target_directories, file_extensions=file_extensions, page=page, files_per_page=files_per_page, use_regex=use_regex)
+    except (OSError, subprocess.SubprocessError):
+        # OSError covers a missing rg binary (FileNotFoundError) and one that exists but cannot be executed (PermissionError)
+        return _search_with_python(codebase, query, target_directories=target_directories, file_extensions=file_extensions, page=page, files_per_page=files_per_page, use_regex=use_regex)
diff --git a/tests/unit/codegen/extensions/test_tools.py b/tests/unit/codegen/extensions/test_tools.py