Skip to content

Swe bench treehacks codegen (#552) #2

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ dependencies = [
"neo4j",
"modal>=0.73.45",
"slack-sdk",
"datasets",
]

license = { text = "Apache-2.0" }
Expand Down
Empty file.
80 changes: 80 additions & 0 deletions src/codegen/extensions/swe_bench/swe_bench_wrapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import shutil
from collections.abc import Generator
from typing import Any

from datasets import load_dataset

from codegen.extensions.swe_bench.utils import NO_ENV_SETUP, SWEBenchEntry, SWEBenchEnvSetup, SWEBenchSplit, construct_codebase
from codegen.sdk.core.codebase import Codebase


class SWEBenchWrapper:
def __init__(self, remove_after_run: bool = False):
print("Loading SWE-bench dataset...")
self.ds = load_dataset("princeton-nlp/SWE-bench")
print("SWE-bench dataset loaded.")
self.remove_after_run = remove_after_run
self.repo_groups = self.create_repo_groups()

def create_repo_groups(self) -> dict:
# Create a list of all possible splits
SPLITS: list[SWEBenchSplit] = ["train", "dev", "test"]

# Create a nested dictionary with explicit type hints
repo_groups: dict[SWEBenchSplit, dict[str, dict[str, list[Any]]]] = {}

# Group entries from all splits
for split in SPLITS:
repo_groups[split] = {}
for entry in self.ds[split]:
repo = entry["repo"]
environment_setup_commit = entry["environment_setup_commit"]

# Initialize nested dictionaries if they don't exist
if repo not in repo_groups[split]:
repo_groups[split][repo] = {}
if environment_setup_commit not in repo_groups[split][repo]:
repo_groups[split][repo][environment_setup_commit] = []

repo_groups[split][repo][environment_setup_commit].append(entry)

return repo_groups

def get_entries_for_split(self, split: SWEBenchSplit) -> Generator[tuple[SWEBenchEnvSetup | SWEBenchEntry, Codebase], None, None]:
# ===== [ For each repo in the split ] =====
for repo in self.repo_groups[split]:
# construct the codebase for the repo
codebase = construct_codebase(repo_full_name=repo)
# ===== [ For each environment setup commit ] =====
for environment_setup_commit in self.repo_groups[split][repo]:
# yield the environment setup commit
if environment_setup_commit:
# no need to parse the codebase on the environment commit
codebase.checkout(commit=environment_setup_commit, remote=True)
yield SWEBenchEnvSetup(split=split, environment_setup_commit=environment_setup_commit), codebase
else:
yield SWEBenchEnvSetup(split=split, environment_setup_commit=NO_ENV_SETUP), codebase
# ===== [ For each test setup commit ] =====
for entry in self.repo_groups[split][repo][environment_setup_commit]:
codebase.checkout(commit=entry["base_commit"], remote=True)
# yield the test entry with a parsed codebase object
yield SWEBenchEntry(entry=entry, split=split), codebase

if codebase and self.remove_after_run:
# remove the repo from the tmp_dir
shutil.rmtree(f"/tmp/codegen/{repo}")


if __name__ == "__main__":
swe_bench_wrapper = SWEBenchWrapper()
for entry, codebase in swe_bench_wrapper.get_entries_for_split("train"):
if isinstance(entry, SWEBenchEnvSetup):
print(f"Environment setup commit: {entry.environment_setup_commit}")
# install dependencies...
elif isinstance(entry, SWEBenchEntry):
print(f"Entry: {entry.entry['instance_id']}")
problem_statement = entry.entry["problem_statement"]
print(f"Task: {problem_statement[:20]}")
# send of agent to solve tasks....

print(f"Number of files: {len(codebase.files)}")
42 changes: 42 additions & 0 deletions src/codegen/extensions/swe_bench/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
from typing import Literal

from pydantic import BaseModel

from codegen.git.repo_operator.remote_repo_operator import RemoteRepoOperator
from codegen.git.schemas.repo_config import RepoConfig
from codegen.sdk.codebase.config import ProjectConfig
from codegen.sdk.core.codebase import Codebase, PyCodebaseType

# Define the SWEBenchSplit type using Literal
SWEBenchSplit = Literal["train", "dev", "test"]
NO_ENV_SETUP = "NO_ENV_SETUP"


class SWEBenchEnvSetup(BaseModel):
split: SWEBenchSplit
environment_setup_commit: str = NO_ENV_SETUP
Comment on lines +15 to +17
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing context for SWEBenchEnvSetup class category Documentation

Tell me more
What is the issue?

The class lacks documentation explaining its purpose and the significance of the environment_setup_commit default value.

Why this matters

Without understanding the purpose of environment setup and why NO_ENV_SETUP is used as default, developers might misuse this class or change its behavior incorrectly.

Suggested change ∙ Feature Preview
class SWEBenchEnvSetup(BaseModel):
    """Represents environment setup configuration for SWE-bench.
    
    The environment_setup_commit tracks the commit hash where environment setup
    was performed, defaulting to NO_ENV_SETUP when not applicable.
    """
    split: SWEBenchSplit
    environment_setup_commit: str = NO_ENV_SETUP
Provide feedback to improve future suggestions

Nice Catch Incorrect Not in Scope Not in coding standard Other

💬 Looking for more details? Reply to this comment to chat with Korbit.



class SWEBenchEntry(BaseModel):
split: SWEBenchSplit
entry: dict
Comment on lines +20 to +22
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Untyped Dictionary in Model category Design

Tell me more
What is the issue?

The 'entry' field uses a generic dict type without specifying its structure, making the model too permissive and less type-safe.

Why this matters

Using an untyped dictionary reduces code clarity, makes refactoring risky, and bypasses Pydantic's validation capabilities.

Suggested change ∙ Feature Preview

Create a specific model for the entry content:

class EntryContent(BaseModel):
    # Define specific fields expected in the entry
    field1: str
    field2: int

class SWEBenchEntry(BaseModel):
    split: SWEBenchSplit
    entry: EntryContent
Provide feedback to improve future suggestions

Nice Catch Incorrect Not in Scope Not in coding standard Other

💬 Looking for more details? Reply to this comment to chat with Korbit.



def construct_codebase(repo_full_name: str) -> PyCodebaseType:
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing function parameter documentation category Documentation

Tell me more
What is the issue?

The function signature has type hints but lacks a docstring explaining the purpose and expected format of repo_full_name.

Why this matters

Without clear documentation about the expected format of repo_full_name (e.g., 'owner/repo'), users might pass incorrectly formatted repository names.

Suggested change ∙ Feature Preview
def construct_codebase(repo_full_name: str) -> PyCodebaseType:
    """Clones a repository and constructs a PyCodebase object.

    Args:
        repo_full_name: Repository identifier in 'owner/repo' format.
    
    Returns:
        A PyCodebase instance for the specified repository.
    """
Provide feedback to improve future suggestions

Nice Catch Incorrect Not in Scope Not in coding standard Other

💬 Looking for more details? Reply to this comment to chat with Korbit.

repo_name = repo_full_name.split("/")[-1]
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unsafe Repository Name Extraction category Functionality

Tell me more
What is the issue?

The code assumes the repository name can be extracted by splitting on '/' and taking the last element, but this assumption might fail for repository names containing forward slashes.

Why this matters

If a repository name contains forward slashes, the extracted name will be incorrect, leading to potential issues in repository management and file operations.

Suggested change ∙ Feature Preview

Use a more robust method to extract the repository name, such as:

from urllib.parse import urlparse

def get_repo_name(repo_full_name: str) -> str:
    # Handle both URLs and organization/repo format
    path = urlparse(repo_full_name).path or repo_full_name
    return path.rstrip('/').split('/')[-1]
Provide feedback to improve future suggestions

Nice Catch Incorrect Not in Scope Not in coding standard Other

💬 Looking for more details? Reply to this comment to chat with Korbit.

repo_config = RepoConfig(name=repo_name, full_name=repo_full_name, base_dir="/tmp/codegen")
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Non-portable Temporary Directory Path category Functionality

Tell me more
What is the issue?

Hardcoding the temporary directory path may cause functionality issues on different operating systems or environments where /tmp is not available or writable.

Why this matters

The code may fail on Windows systems or in environments with restricted /tmp access, preventing the core functionality of cloning and managing repositories.

Suggested change ∙ Feature Preview

Use a platform-independent temporary directory:

import tempfile
import os

repo_config = RepoConfig(
    name=repo_name,
    full_name=repo_full_name,
    base_dir=os.path.join(tempfile.gettempdir(), 'codegen')
)
Provide feedback to improve future suggestions

Nice Catch Incorrect Not in Scope Not in coding standard Other

💬 Looking for more details? Reply to this comment to chat with Korbit.


# clone or pull the repo
print(f"Cloning or pulling {repo_full_name}...")
remote_operator = RemoteRepoOperator(repo_config=repo_config, bot_commit=False)
print(f"Cloned or pulled {repo_full_name}.")

# create the project config
projects = [ProjectConfig(repo_operator=remote_operator, base_path=None, subdirectories=None)]

# parse the codebase
print("Parsing codebase...")
codebase = Codebase(projects=projects)
print("Codebase parsed.")
Comment on lines +30 to +40
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Print Statements Instead of Logger category Logging

Tell me more
What is the issue?

Using print statements instead of proper logging framework for operational status updates.

Why this matters

Print statements don't provide timestamp, log level, or structured context that would be valuable for debugging and monitoring in production environments.

Suggested change ∙ Feature Preview
import logging

logger = logging.getLogger(__name__)

def construct_codebase(repo_full_name: str) -> PyCodebaseType:
    repo_name = repo_full_name.split("/")[-1]
    repo_config = RepoConfig(name=repo_name, full_name=repo_full_name, base_dir="/tmp/codegen")

    logger.info(f"Cloning or pulling repository {repo_full_name}")
    remote_operator = RemoteRepoOperator(repo_config=repo_config, bot_commit=False)
    logger.info(f"Successfully cloned/pulled repository {repo_full_name}")

    projects = [ProjectConfig(repo_operator=remote_operator, base_path=None, subdirectories=None)]

    logger.info("Starting codebase parsing")
    codebase = Codebase(projects=projects)
    logger.info("Successfully parsed codebase")

    return codebase
Provide feedback to improve future suggestions

Nice Catch Incorrect Not in Scope Not in coding standard Other

💬 Looking for more details? Reply to this comment to chat with Korbit.

Comment on lines +38 to +40
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing Progress Tracking category Performance

Tell me more
What is the issue?

The codebase parsing operation lacks progress tracking for large repositories, which can be a time-consuming operation.

Why this matters

Without progress tracking, users have no visibility into the parsing status of large codebases, leading to uncertainty about whether the operation is progressing or stuck.

Suggested change ∙ Feature Preview

Implement a progress callback or logging system to show parsing progress:

logger.info(f"Starting codebase parsing for {repo_full_name}")
codebase = Codebase(projects=projects, progress_callback=report_progress)
logger.info(f"Completed parsing {repo_full_name}")
Provide feedback to improve future suggestions

Nice Catch Incorrect Not in Scope Not in coding standard Other

💬 Looking for more details? Reply to this comment to chat with Korbit.

Comment on lines +25 to +40
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Mixed Logging with Business Logic category Design

Tell me more
What is the issue?

The function violates the Single Responsibility Principle by mixing logging/progress reporting with business logic.

Why this matters

Direct print statements make the function harder to test, reuse, and modify logging behavior. It also tightly couples progress reporting with codebase construction.

Suggested change ∙ Feature Preview

Extract logging to a separate logging utility or use a proper logging framework:

from logging import getLogger
logger = getLogger(__name__)

def construct_codebase(repo_full_name: str) -> PyCodebaseType:
    logger.info(f"Cloning or pulling {repo_full_name}...")
    # ... rest of the code
Provide feedback to improve future suggestions

Nice Catch Incorrect Not in Scope Not in coding standard Other

💬 Looking for more details? Reply to this comment to chat with Korbit.


return codebase
29 changes: 29 additions & 0 deletions src/codegen/extensions/swebench/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
## Codegen Harness and Evaluator for SWE Bennch Development Tool
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

issue (typo): Typo: "Bennch" should be "Bench"

Suggested change
## Codegen Harness and Evaluator for SWE Bennch Development Tool
## Codegen Harness and Evaluator for SWE Bench Development Tool


This folder contains a harness and evaluator for the SWE Bench leaderboard, and enables developers to test and evaluate their codegen models on the SWE Bench leaderboard.

It integrates directly into the Codegen agentic framework and can be built on top of.

### Setup

Remember to install all the dependencies for the environment.

### Usage

#### Edit agent.py, your codegen agent

This file contains the main logic for the agent.

The agent taps into the tree sitter using codegen. You can modify this by adding additional tools, extending its capabilities, prompts, and more.

It is invoked in the harness script.

#### Run harness.py to run the agent

This script will gather the correct dataset, run the agent, and save the results.

#### Run report.py to generate a report

This script will generate a report from the results. It will loop through all the results and generate a report to evaluate each. Currently, there is an error in the docker image.

There are currently example predictions in the `predictions/results` folder.
129 changes: 129 additions & 0 deletions src/codegen/extensions/swebench/agent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
from langchain_openai import ChatOpenAI
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

issue (complexity): Consider removing duplicate imports and separating the demo code from the main agent code using a main guard or a dedicated test script to improve code organization and clarity for production use.

**Actionable Suggestions to Reduce Complexity:**

1. **Remove Duplicate Imports**  
   Remove one of the duplicate lines for `ChatOpenAI` and `Codebase`. For example, keep only a single import at the top:
   ```python
   from langchain_openai import ChatOpenAI
   from codegen import Codebase
   # ... other imports
  1. Separate Demo Code from Production Code
    Move the ad‐hoc demonstration code (initializing the codebase, creating the agent, and invoking it) into a dedicated test script or wrap it in a main guard. For example:
    if __name__ == "__main__":
        # Initialize codebase
        codebase = Codebase.from_repo("fastapi/fastapi")
    
        # Create the agent with GPT-4
        agent = create_codebase_agent(
            codebase=codebase,
            model_name="gpt-4o",
            temperature=0,
            verbose=True
        )
    
        # Analyze dependencies
        result = agent.invoke(
            {"input": "What are the dependencies of the FastAPI class?"},
            config={"configurable": {"session_id": "demo"}}
        )
        print(result["output"])
    This separation helps keep the core agent logic isolated and reduces confusion for production use.

from codegen import Codebase

"""Demo implementation of an agent with Codegen tools."""

from langchain.agents import AgentExecutor
from langchain.agents.openai_functions_agent.base import OpenAIFunctionsAgent
from langchain.hub import pull
from langchain.tools import BaseTool
from langchain_core.chat_history import InMemoryChatMessageHistory
from langchain_core.messages import BaseMessage
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_openai import ChatOpenAI

from codegen import Codebase

from codegen.extensions.langchain.tools import (
CommitTool,
CreateFileTool,
DeleteFileTool,
EditFileTool,
GithubCreatePRCommentTool,
GithubCreatePRReviewCommentTool,
GithubCreatePRTool,
GithubViewPRTool,
ListDirectoryTool,
MoveSymbolTool,
RenameFileTool,
RevealSymbolTool,
SearchTool,
SemanticEditTool,
SemanticSearchTool,
ViewFileTool,
)


def create_codebase_agent(
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

issue (code-quality): Replace mutable default arguments with None (default-mutable-arg)

codebase: Codebase,
model_name: str = "gpt-4o",
temperature: float = 0,
verbose: bool = True,
chat_history: list[BaseMessage] = [],
) -> RunnableWithMessageHistory:
"""Create an agent with all codebase tools.

Args:
codebase: The codebase to operate on
model_name: Name of the model to use (default: gpt-4)
temperature: Model temperature (default: 0)
verbose: Whether to print agent's thought process (default: True)

Returns:
Initialized agent with message history
"""
# Initialize language model
llm = ChatOpenAI(
model_name=model_name,
temperature=temperature,
)

# Get all codebase tools
tools = [
ViewFileTool(codebase),
ListDirectoryTool(codebase),
SearchTool(codebase),
EditFileTool(codebase),
CreateFileTool(codebase),
DeleteFileTool(codebase),
RenameFileTool(codebase),
MoveSymbolTool(codebase),
# RevealSymbolTool(codebase),
SemanticEditTool(codebase),
SemanticSearchTool(codebase),
CommitTool(codebase),
GithubCreatePRTool(codebase),
GithubViewPRTool(codebase),
GithubCreatePRCommentTool(codebase),
GithubCreatePRReviewCommentTool(codebase),
]

# Get the prompt to use
prompt = pull("hwchase17/openai-functions-agent")

# Create the agent
agent = OpenAIFunctionsAgent(
llm=llm,
tools=tools,
prompt=prompt,
)

# Create the agent executor
agent_executor = AgentExecutor(
agent=agent,
tools=tools,
verbose=verbose,
)

# Create message history handler
message_history = InMemoryChatMessageHistory(messages=chat_history)
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unbounded Memory Growth Risk category Performance

Tell me more
What is the issue?

Using InMemoryChatMessageHistory can lead to memory issues with large chat histories as all messages are stored in RAM.

Why this matters

For long-running sessions or large volumes of messages, memory usage will grow unbounded, potentially causing out-of-memory errors and degraded performance.

Suggested change ∙ Feature Preview

Consider implementing a persistent storage solution or a size-limited message history cache. For example:

class LimitedMemoryHistory(InMemoryChatMessageHistory):
    def __init__(self, max_messages=1000, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.max_messages = max_messages
        
    def add_message(self, message):
        super().add_message(message)
        if len(self.messages) > self.max_messages:
            self.messages = self.messages[-self.max_messages:]
Provide feedback to improve future suggestions

Nice Catch Incorrect Not in Scope Not in coding standard Other

💬 Looking for more details? Reply to this comment to chat with Korbit.


# Wrap with message history
return RunnableWithMessageHistory(
agent_executor,
lambda session_id: message_history,
input_messages_key="input",
history_messages_key="chat_history",
)


# Initialize codebase
codebase = Codebase.from_repo("fastapi/fastapi")
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Inefficient Repository Loading category Performance

Tell me more
What is the issue?

Loading an entire repository synchronously without any caching strategy can be slow and resource-intensive.

Why this matters

Each time the script runs, it will download and parse the entire repository, causing unnecessary network I/O and computation overhead.

Suggested change ∙ Feature Preview

Implement repository caching and selective loading:

from pathlib import Path
import hashlib

def get_cached_repo(repo_url, cache_dir=Path("./repo_cache")):
    repo_hash = hashlib.sha256(repo_url.encode()).hexdigest()
    cache_path = cache_dir / repo_hash
    if cache_path.exists():
        return Codebase.from_path(cache_path)
    codebase = Codebase.from_repo(repo_url)
    codebase.save(cache_path)
    return codebase
Provide feedback to improve future suggestions

Nice Catch Incorrect Not in Scope Not in coding standard Other

💬 Looking for more details? Reply to this comment to chat with Korbit.


# Create the agent with GPT-4
agent = create_codebase_agent(
codebase=codebase,
model_name="gpt-4o",
temperature=0,
verbose=True
)



# Analyze dependencies
result = agent.invoke(
{"input": "What are the dependencies of the FastAPI class?"},
config={"configurable": {"session_id": "demo"}}
)
print(result["output"])

Loading
Loading