feat: adds VectorIndex extension (#378)

jayhack · web-flow · commit e2b2da2c071b · 2025-02-09T15:28:22.000-08:00
diff --git a/docs/building-with-codegen/semantic-code-search.mdx b/docs/building-with-codegen/semantic-code-search.mdx
@@ -0,0 +1,111 @@
+---
+title: "Semantic Code Search"
+sidebarTitle: "Semantic Code Search"
+icon: "magnifying-glass"
+iconType: "solid"
+---
+
+Codegen's `VectorIndex` enables semantic code search capabilities using embeddings. This allows you to search codebases using natural language queries and find semantically related code, even when the exact terms aren't present.
+
+<Warning>This is under active development. Interested in an application? [Reach out to the team!](/introduction/about)</Warning>
+
+## Basic Usage
+
+Create and save a vector index for your codebase:
+
+```python
+from codegen.extensions import VectorIndex
+
+# Initialize with an existing Codebase instance (e.g. from Codebase.from_repo(...))
+index = VectorIndex(codebase)
+
+# Create embeddings for all files
+index.create()
+
+# Save to disk (defaults to .codegen/vector_index.pkl)
+index.save()
+```
+
+Later, load the index and perform semantic searches:
+
+```python
+# Create a codebase (assumes `from codegen import Codebase`)
+codebase = Codebase.from_repo('fastapi/fastapi')
+
+# Load a previously created index
+index = VectorIndex(codebase)
+index.load()
+
+# Search with natural language
+results = index.similarity_search(
+    "How does FastAPI handle dependency injection?",
+    k=5  # number of results
+)
+
+# Print results with previews
+for filepath, score in results:
+    print(f"\nScore: {score:.3f} | File: {filepath}")
+    file = codebase.get_file(filepath)
+    print(f"Preview: {file.content[:200]}...")
+```
+
+<Note>
+The search uses cosine similarity between embeddings to find the most semantically related files, regardless of exact keyword matches.
+</Note>
+
+## Getting Embeddings
+
+You can also get embeddings for arbitrary text using the same model:
+
+```python
+# Get embeddings for a list of texts
+texts = [
+    "Some code or text to embed",
+    "Another piece of text"
+]
+embeddings = index.get_embeddings(texts)  # shape: (n_texts, embedding_dim)
+```
+
+## How It Works
+
+The `VectorIndex` class:
+1. Processes each file in your codebase
+2. Splits large files into chunks that fit within token limits
+3. Uses OpenAI's `text-embedding-3-small` model to create embeddings
+4. Stores embeddings in a NumPy array for efficient similarity search
+5. Saves the index to disk for reuse
+
+When searching:
+1. Your query is converted to an embedding using the same model
+2. Cosine similarity is computed between the query and all file embeddings
+3. The most similar files are returned, along with their similarity scores
+
+<Warning>
+Both building the index and running a search call OpenAI to create embeddings, so an OpenAI API key with access to the embeddings endpoint is required.
+</Warning>
+
+## Example Searches
+
+Here are some example semantic searches that demonstrate the power of the system:
+
+```python
+# Find authentication-related code
+results = index.similarity_search(
+    "How is user authentication implemented?",
+    k=3
+)
+
+# Find error handling patterns
+results = index.similarity_search(
+    "Show me examples of error handling and custom exceptions",
+    k=3
+)
+
+# Find configuration management
+results = index.similarity_search(
+    "Where is the application configuration and settings handled?",
+    k=3
+)
+```
+
+The semantic search can understand concepts and return relevant results even when the exact terms aren't present in the code.
diff --git a/docs/mint.json b/docs/mint.json
@@ -134,6 +134,7 @@
 				"building-with-codegen/codebase-visualization",
 				"building-with-codegen/flagging-symbols",
 				"building-with-codegen/calling-out-to-llms",
+				"building-with-codegen/semantic-code-search",
 				"building-with-codegen/reducing-conditions"
 			]
 		},
diff --git a/pyproject.toml b/pyproject.toml
@@ -65,6 +65,7 @@ dependencies = [
   "langchain[openai]",
   "langchain_core",
   "langchain_openai",
+  "numpy>=2.2.2",
 ]
 
 license = { text = "Apache-2.0" }
diff --git a/src/codegen/extensions/__init__.py b/src/codegen/extensions/__init__.py
@@ -0,0 +1,5 @@
+"""Extensions for the codegen package."""
+
+from codegen.extensions.vector_index import VectorIndex
+
+__all__ = ["VectorIndex"]
diff --git a/src/codegen/extensions/langchain/tools.py b/src/codegen/extensions/langchain/tools.py
@@ -19,6 +19,7 @@
     reveal_symbol,
     search,
     semantic_edit,
+    semantic_search,
     view_file,
 )
 
@@ -317,3 +318,27 @@ def _run(
             include_dependencies=include_dependencies,
         )
         return json.dumps(result, indent=2)
+
+
+class SemanticSearchTool(BaseTool):
+    """Tool for semantic code search."""
+
+    name: ClassVar[str] = "semantic_search"
+    description: ClassVar[str] = "Search the codebase using natural language queries and semantic similarity"
+    args_schema: ClassVar[type[BaseModel]] = type(
+        "SemanticSearchInput",
+        (BaseModel,),
+        {
+            "query": (str, Field(..., description="The natural language search query")),
+            "k": (int, Field(default=5, description="Number of results to return")),
+            "preview_length": (int, Field(default=200, description="Length of content preview in characters")),
+        },
+    )
+    codebase: Codebase = Field(exclude=True)
+
+    def __init__(self, codebase: Codebase) -> None:
+        super().__init__(codebase=codebase)
+
+    def _run(self, query: str, k: int = 5, preview_length: int = 200) -> str:
+        result = semantic_search(self.codebase, query, k=k, preview_length=preview_length)
+        return json.dumps(result, indent=2)
diff --git a/src/codegen/extensions/tools/__init__.py b/src/codegen/extensions/tools/__init__.py
@@ -13,6 +13,7 @@
 from .reveal_symbol import reveal_symbol
 from .search import search
 from .semantic_edit import semantic_edit
+from .semantic_search import semantic_search
 
 __all__ = [
     "commit",
@@ -29,5 +30,6 @@
     "search",
     # Semantic edit
     "semantic_edit",
+    "semantic_search",
     "view_file",
 ]
diff --git a/src/codegen/extensions/tools/semantic_search.py b/src/codegen/extensions/tools/semantic_search.py
@@ -0,0 +1,88 @@
+"""Semantic search over codebase files."""
+
+from typing import Any, Optional
+
+from codegen import Codebase
+from codegen.extensions.vector_index import VectorIndex
+
+
+def semantic_search(
+    codebase: Codebase,
+    query: str,
+    k: int = 5,
+    preview_length: int = 200,
+    index_path: Optional[str] = None,
+) -> dict[str, Any]:
+    """Search the codebase using semantic similarity.
+
+    This function provides semantic search over a codebase by using OpenAI's embeddings.
+    Currently, it loads the index from disk on every call (creating and saving one first if
+    none is found), but could be optimized to maintain embeddings in memory for frequently accessed codebases.
+
+    TODO(CG-XXXX): Add support for maintaining embeddings in memory across searches,
+    potentially with an LRU cache or similar mechanism to avoid recomputing embeddings
+    for frequently searched codebases.
+
+    Args:
+        codebase: The codebase to search
+        query: The search query in natural language
+        k: Number of results to return (default: 5)
+        preview_length: Length of content preview in characters (default: 200)
+        index_path: Optional path to a saved vector index
+
+    Returns:
+        Dict containing search results or error information. Format:
+        {
+            "status": "success",
+            "query": str,
+            "results": [
+                {
+                    "filepath": str,
+                    "score": float,
+                    "preview": str
+                },
+                ...
+            ]
+        }
+        Or on error:
+        {
+            "error": str
+        }
+    """
+    try:
+        # Initialize vector index
+        index = VectorIndex(codebase)
+
+        # Try to load existing index
+        try:
+            if index_path:
+                index.load(index_path)
+            else:
+                index.load()
+        except FileNotFoundError:
+            # Create new index if none exists
+            index.create()
+            index.save(index_path)
+
+        # Perform search
+        results = index.similarity_search(query, k=k)
+
+        # Format results with previews
+        formatted_results = []
+        for filepath, score in results:
+            try:
+                file = codebase.get_file(filepath)
+                preview = file.content[:preview_length].replace("\n", " ").strip()
+                if len(file.content) > preview_length:
+                    preview += "..."
+
+                formatted_results.append({"filepath": filepath, "score": float(score), "preview": preview})
+            except Exception as e:
+                # Skip files that can't be read
+                print(f"Warning: Could not read file {filepath}: {e}")
+                continue
+
+        return {"status": "success", "query": query, "results": formatted_results}
+
+    except Exception as e:
+        return {"error": f"Failed to perform semantic search: {e!s}"}
diff --git a/src/codegen/extensions/vector_index.py b/src/codegen/extensions/vector_index.py

Original file line number	Diff line number	Diff line change
`@@ -134,6 +134,7 @@`
`134`	`134`	`"building-with-codegen/codebase-visualization",`
`135`	`135`	`"building-with-codegen/flagging-symbols",`
`136`	`136`	`"building-with-codegen/calling-out-to-llms",`
	`137`	`+ "building-with-codegen/semantic-code-search",`
`137`	`138`	`"building-with-codegen/reducing-conditions"`
`138`	`139`	`]`
`139`	`140`	`},`
Original file line number	Diff line number	Diff line change
`@@ -65,6 +65,7 @@ dependencies = [`
`65`	`65`	`"langchain[openai]",`
`66`	`66`	`"langchain_core",`
`67`	`67`	`"langchain_openai",`
	`68`	`+ "numpy>=2.2.2",`
`68`	`69`	`]`
`69`	`70`
`70`	`71`	`license = { text = "Apache-2.0" }`