
Commit 49a9bc9

Eval Run Improvements (#702)
# Motivation

Allow SWE-bench entries to be run by repo. The `--dataset` argument of the `run_eval` command now takes "lite", "full", or "verified" instead of the full dataset name.
1 parent 83e84d8 commit 49a9bc9

File tree: 10 files changed, +185 −66 lines


codegen-examples/examples/swebench_agent_run/README.md

Lines changed: 4 additions & 3 deletions

```diff
@@ -25,10 +25,11 @@
 Usage: run_eval.py [OPTIONS]
 
 Options:
-  --use-existing-preds TEXT  The run ID of the existing predictions to use.
-  --dataset [princeton-nlp/SWE-bench_Lite|princeton-nlp/SWE-bench|princeton-nlp/SWE-bench-verified]
-                             The dataset to use.
+  --use-existing-preds TEXT       The run ID of the existing predictions to
+                                  use.
+  --dataset [lite|full|verified]  The dataset to use.
   --length INTEGER                The number of examples to process.
   --instance-id TEXT              The instance ID of the example to process.
+  --repo TEXT                     The repo to use.
   --help                          Show this message and exit.
```
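For example, a run restricted to one repository might look like this (the repo value is illustrative, not taken from this commit):

```
python run_eval.py --dataset verified --repo "django/django" --length 5
```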
Lines changed: 1 addition & 22 deletions

```diff
@@ -1,8 +1,6 @@
 from codegen.extensions.swebench.utils import SweBenchExample
 from codegen.extensions.swebench.harness import run_agent_on_entry
 import modal
-import sys
-from codegen.sdk.core.codebase import Codebase
 
 image = (
     modal.Image.debian_slim(python_version="3.13")
@@ -15,26 +13,7 @@
 app = modal.App(name="swebench-agent-run", image=image, secrets=[modal.Secret.from_dotenv()])
 
 
-@app.function(timeout=5 * 60)
+@app.function(timeout=10 * 60)
 async def run_agent_modal(entry: SweBenchExample):
     """Modal function to process a single example from the SWE-bench dataset."""
     return run_agent_on_entry(entry)
-
-
-@app.cls(image=image, secrets=[modal.Secret.from_dotenv()], enable_memory_snapshot=True)
-class SwebenchAgentRun:
-    repo_full_name: str = modal.parameter()
-    commit: str = modal.parameter()
-    codebase: Codebase | None = None
-
-    @modal.enter(snap=True)
-    def load(self):
-        self.codebase = Codebase.from_repo(repo_full_name=self.repo_full_name, commit=self.commit, language="python")
-
-    @modal.exit()
-    def exit(self):
-        sys.exit(0)
-
-    @modal.method()
-    async def run(self, entry: SweBenchExample):
-        return run_agent_on_entry(entry, codebase=self.codebase)
```
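With the `SwebenchAgentRun` class gone, callers look up the plain Modal function by name. A minimal sketch of invoking the deployed function, assuming the app above has been deployed (the single-example lookup is illustrative):

```python
import modal

from codegen.extensions.swebench.utils import SWEBenchDataset, get_swe_bench_examples

# Look up the deployed function by app/function name, as run_eval.py now does.
run_agent_modal = modal.Function.from_name(app_name="swebench-agent-run", name="run_agent_modal")

example = get_swe_bench_examples(dataset=SWEBenchDataset.LITE, split="test", length=1)[0]
result = run_agent_modal.remote(example)  # run_eval.py batches these calls via .remote.aio(...)
```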
Lines changed: 90 additions & 0 deletions (new file)

```json
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from codegen.sdk.core.codebase import Codebase\n",
    "from codegen.extensions.swebench.utils import SWEBenchDataset, get_swe_bench_examples\n",
    "from run_eval import run_eval"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "examples = get_swe_bench_examples(dataset=SWEBenchDataset.LITE, split=\"test\", offset=0, length=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "codebase = Codebase.from_repo(examples[0].repo, commit=examples[0].base_commit, tmp_dir=f\"/tmp/{examples[0].instance_id}\")\n",
    "# this will allow us to reuse the codebase for multiple examples\n",
    "codebases = {examples[0].instance_id: codebase}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "await run_eval(use_existing_preds=None, dataset=\"lite\", length=None, instance_id=examples[0].instance_id, local=True, codebases=codebases)\n",
    "codebases[examples[0].instance_id].reset()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
```

codegen-examples/examples/swebench_agent_run/pyproject.toml

Lines changed: 1 addition & 1 deletion

```diff
@@ -4,7 +4,7 @@ version = "0.1.0"
 description = "Add your description here"
 readme = "README.md"
 requires-python = ">=3.12, <3.14"
-dependencies = ["swebench>=3.0.0", "modal>=0.73.25"]
+dependencies = ["modal>=0.73.25"]
 
 [tool.setuptools]
 py-modules = ["entry_point", "run_eval"]
```

codegen-examples/examples/swebench_agent_run/run_eval.py

Lines changed: 70 additions & 10 deletions

```diff
@@ -6,13 +6,15 @@
 import modal
 import click
 from datetime import datetime
+from codegen.extensions.swebench.harness import run_agent_on_entry
 from codegen.extensions.swebench.utils import SWEBenchDataset, SweBenchExample, get_swe_bench_examples
 from codegen.extensions.swebench.report import generate_report
+from codegen.sdk.core.codebase import Codebase
 
 PREDS_DNAME = Path(__file__).parent / "predictions"
 LOG_DIR = Path(__file__).parent / "logs"
 
-SwebenchAgentRun = modal.Cls.from_name(app_name="swebench-agent-run", name="SwebenchAgentRun")
+run_agent_modal = modal.Function.from_name(app_name="swebench-agent-run", name="run_agent_modal")
 
 
 async def process_batch(examples: list[SweBenchExample], batch_size=10):
@@ -31,7 +33,7 @@ async def process_batch(examples: list[SweBenchExample], batch_size=10):
         batch = examples[i : i + batch_size]
 
         # Create tasks for this batch
-        batch_tasks = [SwebenchAgentRun(repo_full_name=example.repo, commit=example.base_commit).run.remote.aio(example) for example in batch]
+        batch_tasks = [run_agent_modal.remote.aio(example) for example in batch]
 
         # Wait for all tasks in this batch to complete
         print(f"Processing batch {i // batch_size + 1}/{len(examples) // batch_size + 1} (examples {i + 1}-{min(i + batch_size, len(examples))})")
@@ -88,11 +90,63 @@ async def process_batch(examples: list[SweBenchExample], batch_size=10):
     return results
 
 
-async def run_eval(use_existing_preds: str | None, dataset: str, length: int, instance_id: str | None = None):
+def process_batch_sync(examples: list[SweBenchExample], batch_size=10, codebases: dict[str, Codebase] = {}):
+    """Process a batch of examples synchronously.
+
+    Args:
+        examples: List of SweBenchExample objects to process
+        batch_size: Number of examples to process in each batch.
+            Default is 10 to avoid overwhelming the system.
+    """
+    results = []
+
+    # Process examples in batches
+    for i in range(0, len(examples), batch_size):
+        batch = examples[i : i + batch_size]
+        print(f"Processing batch {i // batch_size + 1}/{len(examples) // batch_size + 1} (examples {i + 1}-{min(i + batch_size, len(examples))})")
+
+        # Process each example in the batch
+        for example in batch:
+            try:
+                # Run the agent locally instead of using modal
+                if codebases and example.instance_id in codebases:
+                    result = run_agent_on_entry(example, codebase=codebases[example.instance_id])
+                else:
+                    result = run_agent_on_entry(example)
+                results.append(result)
+
+            except Exception as e:
+                error_type = type(e).__name__
+                error_info = {
+                    "error_type": error_type,
+                    "error_message": str(e),
+                    "traceback": traceback.format_exc(),
+                }
+
+                print(f"Error processing {example.instance_id}:")
+                print(f"Type: {error_type}")
+                print(f"Message: {str(e)}")
+                print("Traceback:")
+                print(error_info["traceback"])
+
+                results.append({"instance_id": example.instance_id, "status": "error", "error_info": error_info})
+
+    return results
+
+
+async def run_eval(use_existing_preds: str | None, dataset: str, length: int, instance_id: str | None = None, local: bool = False, codebases: dict[str, Codebase] = {}, repo: str | None = None):
     run_id = use_existing_preds or str(uuid.uuid4())
+    print(f"Run ID: {run_id}")
     predictions_dir = PREDS_DNAME / f"results_{run_id}"
-    dataset = SWEBenchDataset(dataset)
-    examples = get_swe_bench_examples(dataset=dataset, length=length, instance_id=instance_id)
+    dataset_dict = {
+        "lite": SWEBenchDataset.LITE,
+        "full": SWEBenchDataset.FULL,
+        "verified": SWEBenchDataset.VERIFIED,
+    }
+    dataset_enum = dataset_dict[dataset]
+    print(repo)
+    examples = get_swe_bench_examples(dataset=dataset_enum, length=length, instance_id=instance_id, repo=repo)
+    print(f"Examples:\n{'\n'.join([f'{e.instance_id} - {e.repo} - {e.base_commit}' for e in examples])}")
 
     try:
         if use_existing_preds is None:
@@ -105,7 +159,10 @@ async def run_eval(use_existing_preds: str | None, dataset: str, length: int, in
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
 
         # Process all examples in parallel batches
-        results = await process_batch(examples)
+        if local:
+            results = process_batch_sync(examples, codebases=codebases)
+        else:
+            results = await process_batch(examples)
 
         # Save individual results
         for result in results:
@@ -147,7 +204,7 @@ async def run_eval(use_existing_preds: str | None, dataset: str, length: int, in
             print(f"  {error_type}: {count}")
 
         # Generate Report on Modal
-        generate_report(predictions_dir, LOG_DIR, dataset, run_id)
+        generate_report(predictions_dir, LOG_DIR, dataset_enum, run_id)
     except Exception:
         print("Fatal error in run_eval:")
         traceback.print_exc()
@@ -156,11 +213,14 @@ async def run_eval(use_existing_preds: str | None, dataset: str, length: int, in
 
 @click.command()
 @click.option("--use-existing-preds", help="The run ID of the existing predictions to use.", type=str, default=None)
-@click.option("--dataset", help="The dataset to use.", type=click.Choice([dataset.value for dataset in SWEBenchDataset]), default=SWEBenchDataset.LITE.value)
+@click.option("--dataset", help="The dataset to use.", type=click.Choice(["lite", "full", "verified"]), default="lite")
 @click.option("--length", help="The number of examples to process.", type=int, default=10)
 @click.option("--instance-id", help="The instance ID of the example to process.", type=str, default=None)
-def run_eval_command(use_existing_preds, dataset, length, instance_id):
-    asyncio.run(run_eval(use_existing_preds, dataset, length, instance_id))
+@click.option("--local", help="Run the evaluation locally.", is_flag=True, default=False)
+@click.option("--repo", help="The repo to use.", type=str, default=None)
+def run_eval_command(use_existing_preds, dataset, length, instance_id, local, repo):
+    print(f"Repo: {repo}")
+    asyncio.run(run_eval(use_existing_preds=use_existing_preds, dataset=dataset, length=length, instance_id=instance_id, codebases=None, local=local, repo=repo))
 
 
 if __name__ == "__main__":
```
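Besides the CLI, `run_eval` can be driven programmatically, which is how the new notebook uses it. A minimal sketch of a repo-filtered remote run under the same assumptions (the Modal app is deployed; the repo value is illustrative):

```python
import asyncio

from run_eval import run_eval

# Evaluate every lite-split entry for a single repository on Modal.
# codebases is only consulted when local=True, so the default empty dict is fine here.
asyncio.run(run_eval(use_existing_preds=None, dataset="lite", length=None, instance_id=None, repo="django/django"))
```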

pyproject.toml

Lines changed: 1 addition & 0 deletions

```diff
@@ -77,6 +77,7 @@ dependencies = [
     "urllib3>=2.0.0",
     "datasets",
     "colorlog>=6.9.0",
+    "langsmith",
 ]
 
 license = { text = "Apache-2.0" }
```

src/codegen/agents/code_agent.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -19,7 +19,7 @@
 class CodeAgent:
     """Agent for interacting with a codebase."""
 
-    def __init__(self, codebase: "Codebase", model_provider: str = "anthropic", model_name: str = "claude-3-7-sonnet-latest", memory: bool = True, tools: Optional[list[BaseTool]] = None, **kwargs):
+    def __init__(self, codebase: "Codebase", model_provider: str = "anthropic", model_name: str = "claude-3-5-sonnet-latest", memory: bool = True, tools: Optional[list[BaseTool]] = None, **kwargs):
         """Initialize a CodeAgent.
 
         Args:
```
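The only change here is the default model. A minimal instantiation sketch, assuming a locally cloned repo (the repo value is illustrative):

```python
from codegen.agents.code_agent import CodeAgent
from codegen.sdk.core.codebase import Codebase

codebase = Codebase.from_repo("django/django")  # illustrative repo
agent = CodeAgent(codebase=codebase)  # now defaults to model_name="claude-3-5-sonnet-latest"
```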

src/codegen/extensions/swebench/utils.py

Lines changed: 9 additions & 10 deletions

```diff
@@ -7,9 +7,6 @@
 
 from datasets import load_dataset
 
-# Add constant for cache directory
-CACHE_DIR = Path.home() / ".cache" / "swebench"
-
 
 class SWEBenchDataset(Enum):
     LITE = "princeton-nlp/SWE-bench_Lite"
@@ -73,30 +70,30 @@ def get_swe_bench_examples(
     offset: int = 0,
     length: int = 100,
     instance_id: str | None = None,
+    repo: str | None = None,
 ) -> list[SweBenchExample]:
     """Fetch examples from the SWE-bench dataset using the datasets library.
 
     Args:
-        dataset: The dataset to use (LITE, FULL, or VERIFIED)
+        dataset: The dataset to use ("lite", "full", or "verified")
         split: The dataset split to use
         offset: Starting index for examples
         length: Number of examples to fetch
+        instance_id: Optional specific instance ID to fetch
 
     Returns:
         List of SweBenchExample objects
     """
-    # Ensure cache directory exists
-    CACHE_DIR.mkdir(parents=True, exist_ok=True)
+    # Convert string dataset name to enum
 
     # Load the dataset with caching enabled
-    dataset_name = dataset.value
-    swe_bench_dataset = load_dataset(dataset_name, cache_dir=str(CACHE_DIR), download_mode="reuse_dataset_if_exists")
+    swe_bench_dataset = load_dataset(dataset.value, download_mode="reuse_dataset_if_exists")
 
     # Get the requested split
     split_data = swe_bench_dataset[split]
 
     # Apply offset and length
-    if instance_id:
+    if instance_id or repo:
         offset = 0
         end_idx = len(split_data)
     else:
@@ -113,6 +110,8 @@ def get_swe_bench_examples(
     for row in selected_rows:
         if instance_id and row["instance_id"] != instance_id:
             continue
+        if repo and row["repo"] != repo:
+            continue
         example = SweBenchExample(
             repo=row["repo"],
             instance_id=row["instance_id"],
@@ -129,4 +128,4 @@ def get_swe_bench_examples(
         )
         examples.append(example)
 
-    return examples
+    return examples[:length]
```
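A minimal sketch of the new repo filter (the repo value is illustrative); note that `length` now caps the result even when `instance_id` or `repo` widens the scan to the whole split:

```python
from codegen.extensions.swebench.utils import SWEBenchDataset, get_swe_bench_examples

# Scan the whole test split, keep only rows from one repo, cap at 10 examples.
examples = get_swe_bench_examples(dataset=SWEBenchDataset.LITE, split="test", length=10, repo="django/django")
print([e.instance_id for e in examples])
```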

src/codegen/extensions/tools/reflection.py

Lines changed: 8 additions & 0 deletions

```diff
@@ -61,21 +61,29 @@ def render(self) -> str:
 REFLECTION_SYSTEM_PROMPT = """You are an expert AI assistant specialized in reflection and strategic planning.
 Your task is to help organize thoughts, identify knowledge gaps, and create a strategic plan based on the information provided.
 
+**YOU MUST ABSTAIN FROM SUGGESTING THE AGENT WRITES NEW TESTS OR MODIFIES EXISTING TESTS.**
+
 You will be given:
 1. A summary of the current context and problem being solved
 2. Key information and insights gathered so far
 3. Current obstacles or questions that need to be addressed (if any)
 4. A specific aspect to focus the reflection on (if any)
 
+**YOU MUST ABSTAIN FROM SUGGESTING THE AGENT WRITES NEW TESTS OR MODIFIES EXISTING TESTS.**
+
 Your response should be structured into the following sections:
 1. Current Understanding - Summarize what you understand about the problem and context
 2. Key Insights - Highlight the most important findings and their implications
 3. Knowledge Gaps (if challenges are provided) - Identify what information is still missing
 4. Action Plan - Recommend specific next steps to move forward
 5. Alternative Approaches - Suggest other ways to tackle the problem
 
+**YOU MUST ABSTAIN FROM SUGGESTING THE AGENT WRITES NEW TESTS OR MODIFIES EXISTING TESTS.**
+
 Your reflection should be clear, insightful, and actionable. Focus on helping the agent make progress and double check its own work.
 You will not suggest the agent writes new tests or modifies existing tests.
+
+**YOU MUST ABSTAIN FROM SUGGESTING THE AGENT WRITES NEW TESTS OR MODIFIES EXISTING TESTS.**
 """
```