
Commit 49a9bc9

Eval Run Improvements (#702)
# Motivation

Allow SWE-bench entries to be run by repo. The `--dataset` argument of the `run_eval` command now takes "lite", "full", or "verified" instead of the full dataset name.
1 parent 83e84d8 commit 49a9bc9

File tree: 10 files changed, +185 −66 lines


codegen-examples/examples/swebench_agent_run/README.md

Lines changed: 4 additions & 3 deletions

```diff
@@ -25,10 +25,11 @@
 Usage: run_eval.py [OPTIONS]
 
 Options:
-  --use-existing-preds TEXT  The run ID of the existing predictions to use.
-  --dataset [princeton-nlp/SWE-bench_Lite|princeton-nlp/SWE-bench|princeton-nlp/SWE-bench-verified]
-                             The dataset to use.
+  --use-existing-preds TEXT       The run ID of the existing predictions to
+                                  use.
+  --dataset [lite|full|verified]  The dataset to use.
   --length INTEGER                The number of examples to process.
   --instance-id TEXT              The instance ID of the example to process.
+  --repo TEXT                     The repo to use.
   --help                          Show this message and exit.
```
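For example, a run restricted to one repository might look like this (the repo value is illustrative, not taken from this commit):

```
python run_eval.py --dataset verified --repo "django/django" --length 5
```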
Lines changed: 1 addition & 22 deletions

```diff
@@ -1,8 +1,6 @@
 from codegen.extensions.swebench.utils import SweBenchExample
 from codegen.extensions.swebench.harness import run_agent_on_entry
 import modal
-import sys
-from codegen.sdk.core.codebase import Codebase
 
 image = (
     modal.Image.debian_slim(python_version="3.13")
@@ -15,26 +13,7 @@
 app = modal.App(name="swebench-agent-run", image=image, secrets=[modal.Secret.from_dotenv()])
 
 
-@app.function(timeout=5 * 60)
+@app.function(timeout=10 * 60)
 async def run_agent_modal(entry: SweBenchExample):
     """Modal function to process a single example from the SWE-bench dataset."""
     return run_agent_on_entry(entry)
-
-
-@app.cls(image=image, secrets=[modal.Secret.from_dotenv()], enable_memory_snapshot=True)
-class SwebenchAgentRun:
-    repo_full_name: str = modal.parameter()
-    commit: str = modal.parameter()
-    codebase: Codebase | None = None
-
-    @modal.enter(snap=True)
-    def load(self):
-        self.codebase = Codebase.from_repo(repo_full_name=self.repo_full_name, commit=self.commit, language="python")
-
-    @modal.exit()
-    def exit(self):
-        sys.exit(0)
-
-    @modal.method()
-    async def run(self, entry: SweBenchExample):
-        return run_agent_on_entry(entry, codebase=self.codebase)
```
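With the `SwebenchAgentRun` class gone, callers look up the plain Modal function by name. A minimal sketch of invoking the deployed function, assuming the app above has been deployed (the single-example lookup is illustrative):

```python
import modal

from codegen.extensions.swebench.utils import SWEBenchDataset, get_swe_bench_examples

# Look up the deployed function by app/function name, as run_eval.py now does.
run_agent_modal = modal.Function.from_name(app_name="swebench-agent-run", name="run_agent_modal")

example = get_swe_bench_examples(dataset=SWEBenchDataset.LITE, split="test", length=1)[0]
result = run_agent_modal.remote(example)  # run_eval.py batches these calls via .remote.aio(...)
```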
Lines changed: 90 additions & 0 deletions (new file)

```json
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from codegen.sdk.core.codebase import Codebase\n",
    "from codegen.extensions.swebench.utils import SWEBenchDataset, get_swe_bench_examples\n",
    "from run_eval import run_eval"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "examples = get_swe_bench_examples(dataset=SWEBenchDataset.LITE, split=\"test\", offset=0, length=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "codebase = Codebase.from_repo(examples[0].repo, commit=examples[0].base_commit, tmp_dir=f\"/tmp/{examples[0].instance_id}\")\n",
    "# this will allow us to reuse the codebase for multiple examples\n",
    "codebases = {examples[0].instance_id: codebase}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "await run_eval(use_existing_preds=None, dataset=\"lite\", length=None, instance_id=examples[0].instance_id, local=True, codebases=codebases)\n",
    "codebases[examples[0].instance_id].reset()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
```

codegen-examples/examples/swebench_agent_run/pyproject.toml

Lines changed: 1 addition & 1 deletion

```diff
@@ -4,7 +4,7 @@ version = "0.1.0"
 description = "Add your description here"
 readme = "README.md"
 requires-python = ">=3.12, <3.14"
-dependencies = ["swebench>=3.0.0", "modal>=0.73.25"]
+dependencies = ["modal>=0.73.25"]
 
 [tool.setuptools]
 py-modules = ["entry_point", "run_eval"]
```

codegen-examples/examples/swebench_agent_run/run_eval.py

Lines changed: 70 additions & 10 deletions

```diff
@@ -6,13 +6,15 @@
 import modal
 import click
 from datetime import datetime
+from codegen.extensions.swebench.harness import run_agent_on_entry
 from codegen.extensions.swebench.utils import SWEBenchDataset, SweBenchExample, get_swe_bench_examples
 from codegen.extensions.swebench.report import generate_report
+from codegen.sdk.core.codebase import Codebase
 
 PREDS_DNAME = Path(__file__).parent / "predictions"
 LOG_DIR = Path(__file__).parent / "logs"
 
-SwebenchAgentRun = modal.Cls.from_name(app_name="swebench-agent-run", name="SwebenchAgentRun")
+run_agent_modal = modal.Function.from_name(app_name="swebench-agent-run", name="run_agent_modal")
 
 
 async def process_batch(examples: list[SweBenchExample], batch_size=10):
@@ -31,7 +33,7 @@ async def process_batch(examples: list[SweBenchExample], batch_size=10):
         batch = examples[i : i + batch_size]
 
         # Create tasks for this batch
-        batch_tasks = [SwebenchAgentRun(repo_full_name=example.repo, commit=example.base_commit).run.remote.aio(example) for example in batch]
+        batch_tasks = [run_agent_modal.remote.aio(example) for example in batch]
 
         # Wait for all tasks in this batch to complete
         print(f"Processing batch {i // batch_size + 1}/{len(examples) // batch_size + 1} (examples {i + 1}-{min(i + batch_size, len(examples))})")
@@ -88,11 +90,63 @@ async def process_batch(examples: list[SweBenchExample], batch_size=10):
     return results
 
 
-async def run_eval(use_existing_preds: str | None, dataset: str, length: int, instance_id: str | None = None):
+def process_batch_sync(examples: list[SweBenchExample], batch_size=10, codebases: dict[str, Codebase] = {}):
+    """Process a batch of examples synchronously.
+
+    Args:
+        examples: List of SweBenchExample objects to process
+        batch_size: Number of examples to process in each batch.
+            Default is 10 to avoid overwhelming the system.
+    """
+    results = []
+
+    # Process examples in batches
+    for i in range(0, len(examples), batch_size):
+        batch = examples[i : i + batch_size]
+        print(f"Processing batch {i // batch_size + 1}/{len(examples) // batch_size + 1} (examples {i + 1}-{min(i + batch_size, len(examples))})")
+
+        # Process each example in the batch
+        for example in batch:
+            try:
+                # Run the agent locally instead of using modal
+                if codebases and example.instance_id in codebases:
+                    result = run_agent_on_entry(example, codebase=codebases[example.instance_id])
+                else:
+                    result = run_agent_on_entry(example)
+                results.append(result)
+
+            except Exception as e:
+                error_type = type(e).__name__
+                error_info = {
+                    "error_type": error_type,
+                    "error_message": str(e),
+                    "traceback": traceback.format_exc(),
+                }
+
+                print(f"Error processing {example.instance_id}:")
+                print(f"Type: {error_type}")
+                print(f"Message: {str(e)}")
+                print("Traceback:")
+                print(error_info["traceback"])
+
+                results.append({"instance_id": example.instance_id, "status": "error", "error_info": error_info})
+
+    return results
+
+
+async def run_eval(use_existing_preds: str | None, dataset: str, length: int, instance_id: str | None = None, local: bool = False, codebases: dict[str, Codebase] = {}, repo: str | None = None):
     run_id = use_existing_preds or str(uuid.uuid4())
+    print(f"Run ID: {run_id}")
     predictions_dir = PREDS_DNAME / f"results_{run_id}"
-    dataset = SWEBenchDataset(dataset)
-    examples = get_swe_bench_examples(dataset=dataset, length=length, instance_id=instance_id)
+    dataset_dict = {
+        "lite": SWEBenchDataset.LITE,
+        "full": SWEBenchDataset.FULL,
+        "verified": SWEBenchDataset.VERIFIED,
+    }
+    dataset_enum = dataset_dict[dataset]
+    print(repo)
+    examples = get_swe_bench_examples(dataset=dataset_enum, length=length, instance_id=instance_id, repo=repo)
+    print(f"Examples:\n{'\n'.join([f'{e.instance_id} - {e.repo} - {e.base_commit}' for e in examples])}")
 
     try:
         if use_existing_preds is None:
@@ -105,7 +159,10 @@ async def run_eval(use_existing_preds: str | None, dataset: str, length: int, in
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
 
         # Process all examples in parallel batches
-        results = await process_batch(examples)
+        if local:
+            results = process_batch_sync(examples, codebases=codebases)
+        else:
+            results = await process_batch(examples)
 
         # Save individual results
         for result in results:
@@ -147,7 +204,7 @@ async def run_eval(use_existing_preds: str | None, dataset: str, length: int, in
             print(f"  {error_type}: {count}")
 
         # Generate Report on Modal
-        generate_report(predictions_dir, LOG_DIR, dataset, run_id)
+        generate_report(predictions_dir, LOG_DIR, dataset_enum, run_id)
     except Exception:
         print("Fatal error in run_eval:")
         traceback.print_exc()
@@ -156,11 +213,14 @@ async def run_eval(use_existing_preds: str | None, dataset: str, length: int, in
 
 @click.command()
 @click.option("--use-existing-preds", help="The run ID of the existing predictions to use.", type=str, default=None)
-@click.option("--dataset", help="The dataset to use.", type=click.Choice([dataset.value for dataset in SWEBenchDataset]), default=SWEBenchDataset.LITE.value)
+@click.option("--dataset", help="The dataset to use.", type=click.Choice(["lite", "full", "verified"]), default="lite")
 @click.option("--length", help="The number of examples to process.", type=int, default=10)
 @click.option("--instance-id", help="The instance ID of the example to process.", type=str, default=None)
-def run_eval_command(use_existing_preds, dataset, length, instance_id):
-    asyncio.run(run_eval(use_existing_preds, dataset, length, instance_id))
+@click.option("--local", help="Run the evaluation locally.", is_flag=True, default=False)
+@click.option("--repo", help="The repo to use.", type=str, default=None)
+def run_eval_command(use_existing_preds, dataset, length, instance_id, local, repo):
+    print(f"Repo: {repo}")
+    asyncio.run(run_eval(use_existing_preds=use_existing_preds, dataset=dataset, length=length, instance_id=instance_id, codebases=None, local=local, repo=repo))
 
 
 if __name__ == "__main__":
```
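Besides the CLI, `run_eval` can be driven programmatically, which is how the new notebook uses it. A minimal sketch of a repo-filtered remote run under the same assumptions (the Modal app is deployed; the repo value is illustrative):

```python
import asyncio

from run_eval import run_eval

# Evaluate every lite-split entry for a single repository on Modal.
# codebases is only consulted when local=True, so the default empty dict is fine here.
asyncio.run(run_eval(use_existing_preds=None, dataset="lite", length=None, instance_id=None, repo="django/django"))
```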

pyproject.toml

Lines changed: 1 addition & 0 deletions

```diff
@@ -77,6 +77,7 @@ dependencies = [
     "urllib3>=2.0.0",
     "datasets",
     "colorlog>=6.9.0",
+    "langsmith",
 ]
 
 license = { text = "Apache-2.0" }
```

src/codegen/agents/code_agent.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -19,7 +19,7 @@
 class CodeAgent:
     """Agent for interacting with a codebase."""
 
-    def __init__(self, codebase: "Codebase", model_provider: str = "anthropic", model_name: str = "claude-3-7-sonnet-latest", memory: bool = True, tools: Optional[list[BaseTool]] = None, **kwargs):
+    def __init__(self, codebase: "Codebase", model_provider: str = "anthropic", model_name: str = "claude-3-5-sonnet-latest", memory: bool = True, tools: Optional[list[BaseTool]] = None, **kwargs):
         """Initialize a CodeAgent.
 
         Args:
```
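The only change here is the default model. A minimal instantiation sketch, assuming a locally cloned repo (the repo value is illustrative):

```python
from codegen.agents.code_agent import CodeAgent
from codegen.sdk.core.codebase import Codebase

codebase = Codebase.from_repo("django/django")  # illustrative repo
agent = CodeAgent(codebase=codebase)  # now defaults to model_name="claude-3-5-sonnet-latest"
```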

src/codegen/extensions/swebench/utils.py

Lines changed: 9 additions & 10 deletions

```diff
@@ -7,9 +7,6 @@
 
 from datasets import load_dataset
 
-# Add constant for cache directory
-CACHE_DIR = Path.home() / ".cache" / "swebench"
-
 
 class SWEBenchDataset(Enum):
     LITE = "princeton-nlp/SWE-bench_Lite"
@@ -73,30 +70,30 @@ def get_swe_bench_examples(
     offset: int = 0,
     length: int = 100,
     instance_id: str | None = None,
+    repo: str | None = None,
 ) -> list[SweBenchExample]:
     """Fetch examples from the SWE-bench dataset using the datasets library.
 
     Args:
-        dataset: The dataset to use (LITE, FULL, or VERIFIED)
+        dataset: The dataset to use ("lite", "full", or "verified")
         split: The dataset split to use
         offset: Starting index for examples
         length: Number of examples to fetch
+        instance_id: Optional specific instance ID to fetch
 
     Returns:
         List of SweBenchExample objects
     """
-    # Ensure cache directory exists
-    CACHE_DIR.mkdir(parents=True, exist_ok=True)
+    # Convert string dataset name to enum
 
     # Load the dataset with caching enabled
-    dataset_name = dataset.value
-    swe_bench_dataset = load_dataset(dataset_name, cache_dir=str(CACHE_DIR), download_mode="reuse_dataset_if_exists")
+    swe_bench_dataset = load_dataset(dataset.value, download_mode="reuse_dataset_if_exists")
 
     # Get the requested split
     split_data = swe_bench_dataset[split]
 
     # Apply offset and length
-    if instance_id:
+    if instance_id or repo:
         offset = 0
         end_idx = len(split_data)
     else:
@@ -113,6 +110,8 @@ def get_swe_bench_examples(
     for row in selected_rows:
         if instance_id and row["instance_id"] != instance_id:
             continue
+        if repo and row["repo"] != repo:
+            continue
         example = SweBenchExample(
             repo=row["repo"],
             instance_id=row["instance_id"],
@@ -129,4 +128,4 @@ def get_swe_bench_examples(
         )
         examples.append(example)
 
-    return examples
+    return examples[:length]
```
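A minimal sketch of the new repo filter (the repo value is illustrative); note that `length` now caps the result even when `instance_id` or `repo` widens the scan to the whole split:

```python
from codegen.extensions.swebench.utils import SWEBenchDataset, get_swe_bench_examples

# Scan the whole test split, keep only rows from one repo, cap at 10 examples.
examples = get_swe_bench_examples(dataset=SWEBenchDataset.LITE, split="test", length=10, repo="django/django")
print([e.instance_id for e in examples])
```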

src/codegen/extensions/tools/reflection.py

Lines changed: 8 additions & 0 deletions

```diff
@@ -61,21 +61,29 @@ def render(self) -> str:
 REFLECTION_SYSTEM_PROMPT = """You are an expert AI assistant specialized in reflection and strategic planning.
 Your task is to help organize thoughts, identify knowledge gaps, and create a strategic plan based on the information provided.
 
+**YOU MUST ABSTAIN FROM SUGGESTING THE AGENT WRITES NEW TESTS OR MODIFIES EXISTING TESTS.**
+
 You will be given:
 1. A summary of the current context and problem being solved
 2. Key information and insights gathered so far
 3. Current obstacles or questions that need to be addressed (if any)
 4. A specific aspect to focus the reflection on (if any)
 
+**YOU MUST ABSTAIN FROM SUGGESTING THE AGENT WRITES NEW TESTS OR MODIFIES EXISTING TESTS.**
+
 Your response should be structured into the following sections:
 1. Current Understanding - Summarize what you understand about the problem and context
 2. Key Insights - Highlight the most important findings and their implications
 3. Knowledge Gaps (if challenges are provided) - Identify what information is still missing
 4. Action Plan - Recommend specific next steps to move forward
 5. Alternative Approaches - Suggest other ways to tackle the problem
 
+**YOU MUST ABSTAIN FROM SUGGESTING THE AGENT WRITES NEW TESTS OR MODIFIES EXISTING TESTS.**
+
 Your reflection should be clear, insightful, and actionable. Focus on helping the agent make progress and double check its own work.
 You will not suggest the agent writes new tests or modifies existing tests.
+
+**YOU MUST ABSTAIN FROM SUGGESTING THE AGENT WRITES NEW TESTS OR MODIFIES EXISTING TESTS.**
 """
```