SWE-Bench harness development tool #521

Open · wants to merge 5 commits into base: develop
1 change: 1 addition & 0 deletions pyproject.toml
@@ -72,6 +72,7 @@ dependencies = [
"neo4j",
"modal>=0.73.45",
"slack-sdk",
"datasets",
]

license = { text = "Apache-2.0" }
Empty file.
80 changes: 80 additions & 0 deletions src/codegen/extensions/swe_bench/swe_bench_wrapper.py
@@ -0,0 +1,80 @@
import shutil
from collections.abc import Generator
from typing import Any

from datasets import load_dataset

from codegen.extensions.swe_bench.utils import NO_ENV_SETUP, SWEBenchEntry, SWEBenchEnvSetup, SWEBenchSplit, construct_codebase
from codegen.sdk.core.codebase import Codebase


class SWEBenchWrapper:
    def __init__(self, remove_after_run: bool = False):
        print("Loading SWE-bench dataset...")
        self.ds = load_dataset("princeton-nlp/SWE-bench")
        print("SWE-bench dataset loaded.")
        self.remove_after_run = remove_after_run
        self.repo_groups = self.create_repo_groups()

    def create_repo_groups(self) -> dict:
        # Create a list of all possible splits
        SPLITS: list[SWEBenchSplit] = ["train", "dev", "test"]

        # Create a nested dictionary with explicit type hints
        repo_groups: dict[SWEBenchSplit, dict[str, dict[str, list[Any]]]] = {}

        # Group entries from all splits
        for split in SPLITS:
            repo_groups[split] = {}
            for entry in self.ds[split]:
                repo = entry["repo"]
                environment_setup_commit = entry["environment_setup_commit"]

                # Initialize nested dictionaries if they don't exist
                if repo not in repo_groups[split]:
                    repo_groups[split][repo] = {}
                if environment_setup_commit not in repo_groups[split][repo]:
                    repo_groups[split][repo][environment_setup_commit] = []

                repo_groups[split][repo][environment_setup_commit].append(entry)

        return repo_groups

    def get_entries_for_split(self, split: SWEBenchSplit) -> Generator[tuple[SWEBenchEnvSetup | SWEBenchEntry, Codebase], None, None]:
        # ===== [ For each repo in the split ] =====
        for repo in self.repo_groups[split]:
            # construct the codebase for the repo
            codebase = construct_codebase(repo_full_name=repo)
            # ===== [ For each environment setup commit ] =====
            for environment_setup_commit in self.repo_groups[split][repo]:
                # yield the environment setup commit
                if environment_setup_commit:
                    # no need to parse the codebase on the environment commit
                    codebase.checkout(commit=environment_setup_commit, remote=True)
                    yield SWEBenchEnvSetup(split=split, environment_setup_commit=environment_setup_commit), codebase
                else:
                    yield SWEBenchEnvSetup(split=split, environment_setup_commit=NO_ENV_SETUP), codebase
                # ===== [ For each test setup commit ] =====
                for entry in self.repo_groups[split][repo][environment_setup_commit]:
                    codebase.checkout(commit=entry["base_commit"], remote=True)
                    # yield the test entry with a parsed codebase object
                    yield SWEBenchEntry(entry=entry, split=split), codebase

            if self.remove_after_run:
                # remove this repo's clone from the tmp dir once its entries are exhausted;
                # the clone lives under the bare repo name (see construct_codebase), not "owner/name"
                shutil.rmtree(f"/tmp/codegen/{repo.split('/')[-1]}")


if __name__ == "__main__":
    swe_bench_wrapper = SWEBenchWrapper()
    for entry, codebase in swe_bench_wrapper.get_entries_for_split("train"):
        if isinstance(entry, SWEBenchEnvSetup):
            print(f"Environment setup commit: {entry.environment_setup_commit}")
            # install dependencies...
        elif isinstance(entry, SWEBenchEntry):
            print(f"Entry: {entry.entry['instance_id']}")
            problem_statement = entry.entry["problem_statement"]
            print(f"Task: {problem_statement[:20]}")
            # send off the agent to solve the task...

        print(f"Number of files: {len(codebase.files)}")
42 changes: 42 additions & 0 deletions src/codegen/extensions/swe_bench/utils.py
@@ -0,0 +1,42 @@
from typing import Literal

from pydantic import BaseModel

from codegen.git.repo_operator.remote_repo_operator import RemoteRepoOperator
from codegen.git.schemas.repo_config import RepoConfig
from codegen.sdk.codebase.config import ProjectConfig
from codegen.sdk.core.codebase import Codebase, PyCodebaseType

# Define the SWEBenchSplit type using Literal
SWEBenchSplit = Literal["train", "dev", "test"]
NO_ENV_SETUP = "NO_ENV_SETUP"


class SWEBenchEnvSetup(BaseModel):
    split: SWEBenchSplit
    environment_setup_commit: str = NO_ENV_SETUP


class SWEBenchEntry(BaseModel):
    split: SWEBenchSplit
    entry: dict


def construct_codebase(repo_full_name: str) -> PyCodebaseType:
    repo_name = repo_full_name.split("/")[-1]
    repo_config = RepoConfig(name=repo_name, full_name=repo_full_name, base_dir="/tmp/codegen")

    # clone or pull the repo
    print(f"Cloning or pulling {repo_full_name}...")
    remote_operator = RemoteRepoOperator(repo_config=repo_config, bot_commit=False)
    print(f"Cloned or pulled {repo_full_name}.")

    # create the project config
    projects = [ProjectConfig(repo_operator=remote_operator, base_path=None, subdirectories=None)]

    # parse the codebase
    print("Parsing codebase...")
    codebase = Codebase(projects=projects)
    print("Codebase parsed.")

    return codebase
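
A minimal usage sketch; the repo name below is illustrative, and any clonable `owner/name` should work:

```python
from codegen.extensions.swe_bench.utils import construct_codebase

codebase = construct_codebase(repo_full_name="pallets/flask")  # illustrative repo
print(f"Parsed {len(codebase.files)} files")
```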
29 changes: 29 additions & 0 deletions src/codegen/extensions/swebench/README.md
@@ -0,0 +1,29 @@
## Codegen Harness and Evaluator for the SWE Bench Development Tool

This folder contains a harness and evaluator for SWE Bench, enabling developers to test and evaluate their codegen models against the SWE Bench leaderboard.

It integrates directly into the Codegen agentic framework and is designed to be built on top of.

### Setup

Remember to install all the dependencies for the environment.

### Usage

#### Edit agent.py, your codegen agent

This file contains the main logic for the agent.

The agent taps into tree-sitter via Codegen. You can modify it by adding additional tools, extending its capabilities, adjusting its prompts, and more; see the sketch below.

It is invoked in the harness script.
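
For example, a hypothetical extra tool could be appended to the `tools` list in `create_codebase_agent` (the names below are illustrative, not part of this PR):

```python
from langchain.tools import BaseTool


class LineCountTool(BaseTool):
    """Hypothetical tool: count the lines of a file in the repo clone."""

    name: str = "line_count"
    description: str = "Count the number of lines in a file, given its path."

    def _run(self, filepath: str) -> str:
        with open(filepath) as f:
            return str(sum(1 for _ in f))


# Then append LineCountTool() to the `tools` list in create_codebase_agent.
```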

#### Run harness.py to run the agent

This script will gather the correct dataset, run the agent, and save the results.
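
The harness itself is not shown in this diff, so the following is only a hedged sketch of the loop it describes, built on the wrapper from this PR; the `model_patch` key follows the usual SWE Bench predictions format, and the agent call is elided:

```python
from codegen.extensions.swe_bench.swe_bench_wrapper import SWEBenchWrapper
from codegen.extensions.swe_bench.utils import SWEBenchEntry

predictions = {}
wrapper = SWEBenchWrapper()
for item, codebase in wrapper.get_entries_for_split("test"):
    if isinstance(item, SWEBenchEntry):
        instance_id = item.entry["instance_id"]
        # ... run the agent on item.entry["problem_statement"] here ...
        predictions[instance_id] = {"model_patch": "..."}  # patch produced by the agent
```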

#### Run report.py to generate a report

This script generates a report from the results, looping through each result to evaluate it. Note that there is currently an error in the Docker image.

There are currently example predictions in the `predictions/results` folder.
129 changes: 129 additions & 0 deletions src/codegen/extensions/swebench/agent.py
@@ -0,0 +1,129 @@
"""Demo implementation of an agent with Codegen tools."""

from langchain.agents import AgentExecutor
from langchain.agents.openai_functions_agent.base import OpenAIFunctionsAgent
from langchain.hub import pull
from langchain_core.chat_history import InMemoryChatMessageHistory
from langchain_core.messages import BaseMessage
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_openai import ChatOpenAI

from codegen import Codebase
from codegen.extensions.langchain.tools import (
    CommitTool,
    CreateFileTool,
    DeleteFileTool,
    EditFileTool,
    GithubCreatePRCommentTool,
    GithubCreatePRReviewCommentTool,
    GithubCreatePRTool,
    GithubViewPRTool,
    ListDirectoryTool,
    MoveSymbolTool,
    RenameFileTool,
    RevealSymbolTool,
    SearchTool,
    SemanticEditTool,
    SemanticSearchTool,
    ViewFileTool,
)


def create_codebase_agent(
    codebase: Codebase,
    model_name: str = "gpt-4o",
    temperature: float = 0,
    verbose: bool = True,
    chat_history: list[BaseMessage] | None = None,
) -> RunnableWithMessageHistory:
    """Create an agent with all codebase tools.

    Args:
        codebase: The codebase to operate on
        model_name: Name of the model to use (default: gpt-4o)
        temperature: Model temperature (default: 0)
        verbose: Whether to print agent's thought process (default: True)
        chat_history: Optional messages to seed the conversation history

    Returns:
        Initialized agent with message history
    """
    # Initialize language model
    llm = ChatOpenAI(
        model_name=model_name,
        temperature=temperature,
    )

    # Get all codebase tools
    tools = [
        ViewFileTool(codebase),
        ListDirectoryTool(codebase),
        SearchTool(codebase),
        EditFileTool(codebase),
        CreateFileTool(codebase),
        DeleteFileTool(codebase),
        RenameFileTool(codebase),
        MoveSymbolTool(codebase),
        # RevealSymbolTool(codebase),
        SemanticEditTool(codebase),
        SemanticSearchTool(codebase),
        CommitTool(codebase),
        GithubCreatePRTool(codebase),
        GithubViewPRTool(codebase),
        GithubCreatePRCommentTool(codebase),
        GithubCreatePRReviewCommentTool(codebase),
    ]

    # Get the prompt to use
    prompt = pull("hwchase17/openai-functions-agent")

    # Create the agent
    agent = OpenAIFunctionsAgent(
        llm=llm,
        tools=tools,
        prompt=prompt,
    )

    # Create the agent executor
    agent_executor = AgentExecutor(
        agent=agent,
        tools=tools,
        verbose=verbose,
    )

    # Create message history handler
    message_history = InMemoryChatMessageHistory(messages=chat_history or [])

    # Wrap with message history
    return RunnableWithMessageHistory(
        agent_executor,
        lambda session_id: message_history,
        input_messages_key="input",
        history_messages_key="chat_history",
    )


if __name__ == "__main__":
    # Initialize codebase
    codebase = Codebase.from_repo("fastapi/fastapi")

    # Create the agent with GPT-4o
    agent = create_codebase_agent(
        codebase=codebase,
        model_name="gpt-4o",
        temperature=0,
        verbose=True,
    )

    # Analyze dependencies
    result = agent.invoke(
        {"input": "What are the dependencies of the FastAPI class?"},
        config={"configurable": {"session_id": "demo"}},
    )
    print(result["output"])
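
A hedged sketch of wiring `create_codebase_agent` (defined above) to the SWE-bench wrapper from this PR; the input prompt and session id come straight from the dataset entry:

```python
from codegen.extensions.swe_bench.swe_bench_wrapper import SWEBenchWrapper
from codegen.extensions.swe_bench.utils import SWEBenchEntry

wrapper = SWEBenchWrapper()
for item, bench_codebase in wrapper.get_entries_for_split("train"):
    if isinstance(item, SWEBenchEntry):
        bench_agent = create_codebase_agent(codebase=bench_codebase)
        result = bench_agent.invoke(
            {"input": item.entry["problem_statement"]},
            config={"configurable": {"session_id": item.entry["instance_id"]}},
        )
        print(result["output"])
        break  # stop after one task, for illustration
```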
