-
Notifications
You must be signed in to change notification settings - Fork 0
Swe bench treehacks codegen (#552) #2
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
import shutil | ||
from collections.abc import Generator | ||
from typing import Any | ||
|
||
from datasets import load_dataset | ||
|
||
from codegen.extensions.swe_bench.utils import NO_ENV_SETUP, SWEBenchEntry, SWEBenchEnvSetup, SWEBenchSplit, construct_codebase | ||
from codegen.sdk.core.codebase import Codebase | ||
|
||
|
||
class SWEBenchWrapper: | ||
def __init__(self, remove_after_run: bool = False): | ||
print("Loading SWE-bench dataset...") | ||
self.ds = load_dataset("princeton-nlp/SWE-bench") | ||
print("SWE-bench dataset loaded.") | ||
self.remove_after_run = remove_after_run | ||
self.repo_groups = self.create_repo_groups() | ||
|
||
def create_repo_groups(self) -> dict: | ||
# Create a list of all possible splits | ||
SPLITS: list[SWEBenchSplit] = ["train", "dev", "test"] | ||
|
||
# Create a nested dictionary with explicit type hints | ||
repo_groups: dict[SWEBenchSplit, dict[str, dict[str, list[Any]]]] = {} | ||
|
||
# Group entries from all splits | ||
for split in SPLITS: | ||
repo_groups[split] = {} | ||
for entry in self.ds[split]: | ||
repo = entry["repo"] | ||
environment_setup_commit = entry["environment_setup_commit"] | ||
|
||
# Initialize nested dictionaries if they don't exist | ||
if repo not in repo_groups[split]: | ||
repo_groups[split][repo] = {} | ||
if environment_setup_commit not in repo_groups[split][repo]: | ||
repo_groups[split][repo][environment_setup_commit] = [] | ||
|
||
repo_groups[split][repo][environment_setup_commit].append(entry) | ||
|
||
return repo_groups | ||
|
||
def get_entries_for_split(self, split: SWEBenchSplit) -> Generator[tuple[SWEBenchEnvSetup | SWEBenchEntry, Codebase], None, None]: | ||
# ===== [ For each repo in the split ] ===== | ||
for repo in self.repo_groups[split]: | ||
# construct the codebase for the repo | ||
codebase = construct_codebase(repo_full_name=repo) | ||
# ===== [ For each environment setup commit ] ===== | ||
for environment_setup_commit in self.repo_groups[split][repo]: | ||
# yield the environment setup commit | ||
if environment_setup_commit: | ||
# no need to parse the codebase on the environment commit | ||
codebase.checkout(commit=environment_setup_commit, remote=True) | ||
yield SWEBenchEnvSetup(split=split, environment_setup_commit=environment_setup_commit), codebase | ||
else: | ||
yield SWEBenchEnvSetup(split=split, environment_setup_commit=NO_ENV_SETUP), codebase | ||
# ===== [ For each test setup commit ] ===== | ||
for entry in self.repo_groups[split][repo][environment_setup_commit]: | ||
codebase.checkout(commit=entry["base_commit"], remote=True) | ||
# yield the test entry with a parsed codebase object | ||
yield SWEBenchEntry(entry=entry, split=split), codebase | ||
|
||
if codebase and self.remove_after_run: | ||
# remove the repo from the tmp_dir | ||
shutil.rmtree(f"/tmp/codegen/{repo}") | ||
|
||
|
||
if __name__ == "__main__": | ||
swe_bench_wrapper = SWEBenchWrapper() | ||
for entry, codebase in swe_bench_wrapper.get_entries_for_split("train"): | ||
if isinstance(entry, SWEBenchEnvSetup): | ||
print(f"Environment setup commit: {entry.environment_setup_commit}") | ||
# install dependencies... | ||
elif isinstance(entry, SWEBenchEntry): | ||
print(f"Entry: {entry.entry['instance_id']}") | ||
problem_statement = entry.entry["problem_statement"] | ||
print(f"Task: {problem_statement[:20]}") | ||
# send of agent to solve tasks.... | ||
|
||
print(f"Number of files: {len(codebase.files)}") |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
from typing import Literal | ||
|
||
from pydantic import BaseModel | ||
|
||
from codegen.git.repo_operator.remote_repo_operator import RemoteRepoOperator | ||
from codegen.git.schemas.repo_config import RepoConfig | ||
from codegen.sdk.codebase.config import ProjectConfig | ||
from codegen.sdk.core.codebase import Codebase, PyCodebaseType | ||
|
||
# Define the SWEBenchSplit type using Literal | ||
SWEBenchSplit = Literal["train", "dev", "test"] | ||
NO_ENV_SETUP = "NO_ENV_SETUP" | ||
|
||
|
||
class SWEBenchEnvSetup(BaseModel): | ||
split: SWEBenchSplit | ||
environment_setup_commit: str = NO_ENV_SETUP | ||
|
||
|
||
class SWEBenchEntry(BaseModel): | ||
split: SWEBenchSplit | ||
entry: dict | ||
Comment on lines
+20
to
+22
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Untyped Dictionary in Model
Tell me moreWhat is the issue?The 'entry' field uses a generic dict type without specifying its structure, making the model too permissive and less type-safe. Why this mattersUsing an untyped dictionary reduces code clarity, makes refactoring risky, and bypasses Pydantic's validation capabilities. Suggested change ∙ Feature PreviewCreate a specific model for the entry content: class EntryContent(BaseModel):
# Define specific fields expected in the entry
field1: str
field2: int
class SWEBenchEntry(BaseModel):
split: SWEBenchSplit
entry: EntryContent Provide feedback to improve future suggestions💬 Looking for more details? Reply to this comment to chat with Korbit. |
||
|
||
|
||
def construct_codebase(repo_full_name: str) -> PyCodebaseType: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Missing function parameter documentation
Tell me moreWhat is the issue?The function signature has type hints but lacks a docstring explaining the purpose and expected format of repo_full_name. Why this mattersWithout clear documentation about the expected format of repo_full_name (e.g., 'owner/repo'), users might pass incorrectly formatted repository names. Suggested change ∙ Feature Previewdef construct_codebase(repo_full_name: str) -> PyCodebaseType:
"""Clones a repository and constructs a PyCodebase object.
Args:
repo_full_name: Repository identifier in 'owner/repo' format.
Returns:
A PyCodebase instance for the specified repository.
""" Provide feedback to improve future suggestions💬 Looking for more details? Reply to this comment to chat with Korbit. |
||
repo_name = repo_full_name.split("/")[-1] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Unsafe Repository Name Extraction
Tell me moreWhat is the issue?The code assumes the repository name can be extracted by splitting on '/' and taking the last element, but this assumption might fail for repository names containing forward slashes. Why this mattersIf a repository name contains forward slashes, the extracted name will be incorrect, leading to potential issues in repository management and file operations. Suggested change ∙ Feature PreviewUse a more robust method to extract the repository name, such as: from urllib.parse import urlparse
def get_repo_name(repo_full_name: str) -> str:
# Handle both URLs and organization/repo format
path = urlparse(repo_full_name).path or repo_full_name
return path.rstrip('/').split('/')[-1] Provide feedback to improve future suggestions💬 Looking for more details? Reply to this comment to chat with Korbit. |
||
repo_config = RepoConfig(name=repo_name, full_name=repo_full_name, base_dir="/tmp/codegen") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Non-portable Temporary Directory Path
Tell me moreWhat is the issue?Hardcoding the temporary directory path may cause functionality issues on different operating systems or environments where /tmp is not available or writable. Why this mattersThe code may fail on Windows systems or in environments with restricted /tmp access, preventing the core functionality of cloning and managing repositories. Suggested change ∙ Feature PreviewUse a platform-independent temporary directory: import tempfile
import os
repo_config = RepoConfig(
name=repo_name,
full_name=repo_full_name,
base_dir=os.path.join(tempfile.gettempdir(), 'codegen')
) Provide feedback to improve future suggestions💬 Looking for more details? Reply to this comment to chat with Korbit. |
||
|
||
# clone or pull the repo | ||
print(f"Cloning or pulling {repo_full_name}...") | ||
remote_operator = RemoteRepoOperator(repo_config=repo_config, bot_commit=False) | ||
print(f"Cloned or pulled {repo_full_name}.") | ||
|
||
# create the project config | ||
projects = [ProjectConfig(repo_operator=remote_operator, base_path=None, subdirectories=None)] | ||
|
||
# parse the codebase | ||
print("Parsing codebase...") | ||
codebase = Codebase(projects=projects) | ||
print("Codebase parsed.") | ||
Comment on lines
+30
to
+40
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Print Statements Instead of Logger
Tell me moreWhat is the issue?Using print statements instead of proper logging framework for operational status updates. Why this mattersPrint statements don't provide timestamp, log level, or structured context that would be valuable for debugging and monitoring in production environments. Suggested change ∙ Feature Previewimport logging
logger = logging.getLogger(__name__)
def construct_codebase(repo_full_name: str) -> PyCodebaseType:
repo_name = repo_full_name.split("/")[-1]
repo_config = RepoConfig(name=repo_name, full_name=repo_full_name, base_dir="/tmp/codegen")
logger.info(f"Cloning or pulling repository {repo_full_name}")
remote_operator = RemoteRepoOperator(repo_config=repo_config, bot_commit=False)
logger.info(f"Successfully cloned/pulled repository {repo_full_name}")
projects = [ProjectConfig(repo_operator=remote_operator, base_path=None, subdirectories=None)]
logger.info("Starting codebase parsing")
codebase = Codebase(projects=projects)
logger.info("Successfully parsed codebase")
return codebase Provide feedback to improve future suggestions💬 Looking for more details? Reply to this comment to chat with Korbit.
Comment on lines
+38
to
+40
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Missing Progress Tracking
Tell me moreWhat is the issue?The codebase parsing operation lacks progress tracking for large repositories, which can be a time-consuming operation. Why this mattersWithout progress tracking, users have no visibility into the parsing status of large codebases, leading to uncertainty about whether the operation is progressing or stuck. Suggested change ∙ Feature PreviewImplement a progress callback or logging system to show parsing progress: logger.info(f"Starting codebase parsing for {repo_full_name}")
codebase = Codebase(projects=projects, progress_callback=report_progress)
logger.info(f"Completed parsing {repo_full_name}") Provide feedback to improve future suggestions💬 Looking for more details? Reply to this comment to chat with Korbit.
Comment on lines
+25
to
+40
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Mixed Logging with Business Logic
Tell me moreWhat is the issue?The function violates the Single Responsibility Principle by mixing logging/progress reporting with business logic. Why this mattersDirect print statements make the function harder to test, reuse, and modify logging behavior. It also tightly couples progress reporting with codebase construction. Suggested change ∙ Feature PreviewExtract logging to a separate logging utility or use a proper logging framework: from logging import getLogger
logger = getLogger(__name__)
def construct_codebase(repo_full_name: str) -> PyCodebaseType:
logger.info(f"Cloning or pulling {repo_full_name}...")
# ... rest of the code Provide feedback to improve future suggestions💬 Looking for more details? Reply to this comment to chat with Korbit. |
||
|
||
return codebase |
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
@@ -0,0 +1,29 @@ | ||||||
## Codegen Harness and Evaluator for SWE Bennch Development Tool | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. issue (typo): Typo: "Bennch" should be "Bench"
Suggested change
|
||||||
|
||||||
This folder contains a harness and evaluator for the SWE Bench leaderboard, and enables developers to test and evaluate their codegen models on the SWE Bench leaderboard. | ||||||
|
||||||
It integrates directly into the Codegen agentic framework and can be built on top of. | ||||||
|
||||||
### Setup | ||||||
|
||||||
Remember to install all the dependencies for the environment. | ||||||
|
||||||
### Usage | ||||||
|
||||||
#### Edit agent.py, your codegen agent | ||||||
|
||||||
This file contains the main logic for the agent. | ||||||
|
||||||
The agent taps into the tree sitter using codegen. You can modify this by adding additional tools, extending its capabilities, prompts, and more. | ||||||
|
||||||
It is invoked in the harness script. | ||||||
|
||||||
#### Run harness.py to run the agent | ||||||
|
||||||
This script will gather the correct dataset, run the agent, and save the results. | ||||||
|
||||||
#### Run report.py to generate a report | ||||||
|
||||||
This script will generate a report from the results. It will loop through all the results and generate a report to evaluate each. Currently, there is an error in the docker image. | ||||||
|
||||||
There are currently example predictions in the `predictions/results` folder. |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,129 @@ | ||
from langchain_openai import ChatOpenAI | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. issue (complexity): Consider removing duplicate imports and separating the demo code from the main agent code using a main guard or a dedicated test script to improve code organization and clarity for production use. **Actionable Suggestions to Reduce Complexity:**
1. **Remove Duplicate Imports**
Remove one of the duplicate lines for `ChatOpenAI` and `Codebase`. For example, keep only a single import at the top:
```python
from langchain_openai import ChatOpenAI
from codegen import Codebase
# ... other imports
|
||
from codegen import Codebase | ||
|
||
"""Demo implementation of an agent with Codegen tools.""" | ||
|
||
from langchain.agents import AgentExecutor | ||
from langchain.agents.openai_functions_agent.base import OpenAIFunctionsAgent | ||
from langchain.hub import pull | ||
from langchain.tools import BaseTool | ||
from langchain_core.chat_history import InMemoryChatMessageHistory | ||
from langchain_core.messages import BaseMessage | ||
from langchain_core.runnables.history import RunnableWithMessageHistory | ||
from langchain_openai import ChatOpenAI | ||
|
||
from codegen import Codebase | ||
|
||
from codegen.extensions.langchain.tools import ( | ||
CommitTool, | ||
CreateFileTool, | ||
DeleteFileTool, | ||
EditFileTool, | ||
GithubCreatePRCommentTool, | ||
GithubCreatePRReviewCommentTool, | ||
GithubCreatePRTool, | ||
GithubViewPRTool, | ||
ListDirectoryTool, | ||
MoveSymbolTool, | ||
RenameFileTool, | ||
RevealSymbolTool, | ||
SearchTool, | ||
SemanticEditTool, | ||
SemanticSearchTool, | ||
ViewFileTool, | ||
) | ||
|
||
|
||
def create_codebase_agent( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. issue (code-quality): Replace mutable default arguments with None ( |
||
codebase: Codebase, | ||
model_name: str = "gpt-4o", | ||
temperature: float = 0, | ||
verbose: bool = True, | ||
chat_history: list[BaseMessage] = [], | ||
) -> RunnableWithMessageHistory: | ||
"""Create an agent with all codebase tools. | ||
|
||
Args: | ||
codebase: The codebase to operate on | ||
model_name: Name of the model to use (default: gpt-4) | ||
temperature: Model temperature (default: 0) | ||
verbose: Whether to print agent's thought process (default: True) | ||
|
||
Returns: | ||
Initialized agent with message history | ||
""" | ||
# Initialize language model | ||
llm = ChatOpenAI( | ||
model_name=model_name, | ||
temperature=temperature, | ||
) | ||
|
||
# Get all codebase tools | ||
tools = [ | ||
ViewFileTool(codebase), | ||
ListDirectoryTool(codebase), | ||
SearchTool(codebase), | ||
EditFileTool(codebase), | ||
CreateFileTool(codebase), | ||
DeleteFileTool(codebase), | ||
RenameFileTool(codebase), | ||
MoveSymbolTool(codebase), | ||
# RevealSymbolTool(codebase), | ||
SemanticEditTool(codebase), | ||
SemanticSearchTool(codebase), | ||
CommitTool(codebase), | ||
GithubCreatePRTool(codebase), | ||
GithubViewPRTool(codebase), | ||
GithubCreatePRCommentTool(codebase), | ||
GithubCreatePRReviewCommentTool(codebase), | ||
] | ||
|
||
# Get the prompt to use | ||
prompt = pull("hwchase17/openai-functions-agent") | ||
|
||
# Create the agent | ||
agent = OpenAIFunctionsAgent( | ||
llm=llm, | ||
tools=tools, | ||
prompt=prompt, | ||
) | ||
|
||
# Create the agent executor | ||
agent_executor = AgentExecutor( | ||
agent=agent, | ||
tools=tools, | ||
verbose=verbose, | ||
) | ||
|
||
# Create message history handler | ||
message_history = InMemoryChatMessageHistory(messages=chat_history) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Unbounded Memory Growth Risk
Tell me moreWhat is the issue?Using InMemoryChatMessageHistory can lead to memory issues with large chat histories as all messages are stored in RAM. Why this mattersFor long-running sessions or large volumes of messages, memory usage will grow unbounded, potentially causing out-of-memory errors and degraded performance. Suggested change ∙ Feature PreviewConsider implementing a persistent storage solution or a size-limited message history cache. For example: class LimitedMemoryHistory(InMemoryChatMessageHistory):
def __init__(self, max_messages=1000, *args, **kwargs):
super().__init__(*args, **kwargs)
self.max_messages = max_messages
def add_message(self, message):
super().add_message(message)
if len(self.messages) > self.max_messages:
self.messages = self.messages[-self.max_messages:] Provide feedback to improve future suggestions💬 Looking for more details? Reply to this comment to chat with Korbit. |
||
|
||
# Wrap with message history | ||
return RunnableWithMessageHistory( | ||
agent_executor, | ||
lambda session_id: message_history, | ||
input_messages_key="input", | ||
history_messages_key="chat_history", | ||
) | ||
|
||
|
||
# Initialize codebase | ||
codebase = Codebase.from_repo("fastapi/fastapi") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Inefficient Repository Loading
Tell me moreWhat is the issue?Loading an entire repository synchronously without any caching strategy can be slow and resource-intensive. Why this mattersEach time the script runs, it will download and parse the entire repository, causing unnecessary network I/O and computation overhead. Suggested change ∙ Feature PreviewImplement repository caching and selective loading: from pathlib import Path
import hashlib
def get_cached_repo(repo_url, cache_dir=Path("./repo_cache")):
repo_hash = hashlib.sha256(repo_url.encode()).hexdigest()
cache_path = cache_dir / repo_hash
if cache_path.exists():
return Codebase.from_path(cache_path)
codebase = Codebase.from_repo(repo_url)
codebase.save(cache_path)
return codebase Provide feedback to improve future suggestions💬 Looking for more details? Reply to this comment to chat with Korbit. |
||
|
||
# Create the agent with GPT-4 | ||
agent = create_codebase_agent( | ||
codebase=codebase, | ||
model_name="gpt-4o", | ||
temperature=0, | ||
verbose=True | ||
) | ||
|
||
|
||
|
||
# Analyze dependencies | ||
result = agent.invoke( | ||
{"input": "What are the dependencies of the FastAPI class?"}, | ||
config={"configurable": {"session_id": "demo"}} | ||
) | ||
print(result["output"]) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Missing context for SWEBenchEnvSetup class
Tell me more
What is the issue?
The class lacks documentation explaining its purpose and the significance of the environment_setup_commit default value.
Why this matters
Without understanding the purpose of environment setup and why NO_ENV_SETUP is used as default, developers might misuse this class or change its behavior incorrectly.
Suggested change ∙ Feature Preview
Provide feedback to improve future suggestions
💬 Looking for more details? Reply to this comment to chat with Korbit.