Commit 61f6145
CG-10883: Improves dependency installation instructions (#608)

# Motivation

Improves the installation instructions.

# Content

Modifies how dependencies are installed.

# Testing

Tested by running locally.

# Please check the following before marking your PR as ready for review

- [ ] I have added tests for my changes
- [ ] I have updated the documentation or added new documentation as needed
1 parent 63be53c commit 61f6145

7 files changed: +290 −50 lines changed

codegen-examples/examples/swebench_agent_run/README.md

Lines changed: 10 additions & 9 deletions
@@ -1,32 +1,33 @@
 # INSTRUCTIONS
 
-1. Create a `.env` file in the root directory and add your API keys.
+1. Create a `.env` file in the `swebench_agent_run` directory (codegen-examples/examples/swebench_agent_run) and add your API keys.
 
 1. cd into the `codegen-examples/examples/swebench_agent_run` directory
 
 1. Create a `.venv` with `uv venv` and activate it with `source .venv/bin/activate`
 
+1. Install the dependencies with `uv pip install .`
+
 1. Install the codegen dependencies with `uv add codegen`
 
-   - Note: If you'd like to install the dependencies in the global environment, you can use `uv pip install -e ../../../`. This will allow you to test modifications to the codegen codebase. You will need to run `uv pip install -e ../../../` each time you make changes to the codebase.
+   - Note: If you'd like to install the dependencies using the global environment, use `uv pip install -e ../../../` instead of `uv pip install .`. This will allow you to test modifications to the codegen codebase. You will need to run `uv pip install -e ../../../` each time you make changes to the codebase.
 
-5. Ensure that you have a modal account and profile set up. If you don't have one, you can create one at https://modal.com/
+6. Ensure that you have a modal account and profile set up. If you don't have one, you can create one at https://modal.com/
 
-1. Activate the appropriate modal profile `uv modal profile activate <profile_name>`
+1. Activate the appropriate modal profile `python -m modal profile activate <profile_name>`
 
-1. Launch the modal app with `uv run modal deploy --env=<env_name> entry_point.py`
+1. Launch the modal app with `python -m modal deploy --env=<env_name> entry_point.py`
 
-1. Run the evaluation with `python run_eval.py` with the desired options:
+1. Run the evaluation with `python -m run_eval` with the desired options:
 
    ```bash
    $ python run_eval.py --help
    Usage: run_eval.py [OPTIONS]
 
    Options:
-     --use-existing-preds        Use existing predictions instead of
-                                 generating new ones.
+     --use-existing-preds TEXT   The run ID of the existing predictions to use.
      --dataset [princeton-nlp/SWE-bench_Lite|princeton-nlp/SWE-bench|princeton-nlp/SWE-bench-verified]
-                                 The dataset to use.
+                                 The dataset to use.
      --length INTEGER            The number of examples to process.
      --instance-id TEXT          The instance ID of the example to process.
      --help                      Show this message and exit.
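The updated steps above condense to the following shell session. This is a sketch assembled from the README instructions rather than an official script; `<profile_name>` and `<env_name>` are placeholders, and the final command assumes you run it from the example directory.

```bash
# From the repository root, as described in the updated README
cd codegen-examples/examples/swebench_agent_run

# The .env file with your API keys now lives in this directory
uv venv && source .venv/bin/activate

# Install the example's own dependencies (swebench, modal), then codegen
uv pip install .
uv add codegen
# or, to test local changes to the codegen codebase:
# uv pip install -e ../../../

# Set up Modal and deploy the entry point
python -m modal profile activate <profile_name>
python -m modal deploy --env=<env_name> entry_point.py

# Run the evaluation (see the --help output above for all options)
python run_eval.py --dataset princeton-nlp/SWE-bench_Lite --length 10
```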

codegen-examples/examples/swebench_agent_run/pyproject.toml

Lines changed: 4 additions & 1 deletion
@@ -4,4 +4,7 @@ version = "0.1.0"
 description = "Add your description here"
 readme = "README.md"
 requires-python = ">=3.12, <3.14"
-dependencies = []
+dependencies = ["swebench>=3.0.0", "modal>=0.73.25"]
+
+[tool.setuptools]
+py-modules = ["entry_point", "run_eval"]
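Declaring the dependencies plus the two top-level modules means `uv pip install .` should now pull in swebench and modal and install `entry_point` and `run_eval` as importable modules. A hypothetical smoke test of that assumption, run from the activated virtualenv:

```bash
# Assumed check: dependencies resolve and the declared py-modules import
uv pip install .
python -c "import swebench, modal; print('dependencies ok')"
python -c "import entry_point, run_eval; print('modules ok')"
```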

codegen-examples/examples/swebench_agent_run/run_eval.py

Lines changed: 13 additions & 11 deletions
@@ -2,6 +2,7 @@
 import json
 import traceback
 from pathlib import Path
+import uuid
 import modal
 import click
 from datetime import datetime
@@ -87,21 +88,21 @@ async def process_batch(examples, batch_size=10):
     return results
 
 
-async def run_eval(use_existing_preds, dataset, length, instance_id=None):
+async def run_eval(use_existing_preds: str | None, dataset: str, length: int, instance_id: str | None = None):
+    run_id = use_existing_preds or str(uuid.uuid4())
+    predictions_dir = PREDS_DNAME / f"results_{run_id}"
     dataset = SWEBenchDataset(dataset)
     if instance_id:
         examples = [get_swe_bench_example(instance_id, dataset=dataset)]
     else:
         examples = get_swe_bench_examples(dataset=dataset, length=length)
 
     try:
-        if not use_existing_preds:
+        if use_existing_preds is None:
             print(f"Processing {len(examples)} examples...")
 
             # Create output directory if it doesn't exist
-            PREDS_DNAME.mkdir(exist_ok=True)
-            results_dir = PREDS_DNAME / "results"
-            results_dir.mkdir(exist_ok=True)
+            predictions_dir.mkdir(exist_ok=True, parents=True)
 
             # Create a timestamp for this run
             timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
@@ -113,12 +114,13 @@ async def run_eval(use_existing_preds, dataset, length, instance_id=None):
             for result in results:
                 if result and "instance_id" in result:
                     instance_id = result["instance_id"]
-                    output_file = results_dir / f"{instance_id}.json"
+                    output_file = predictions_dir / f"{instance_id}.json"
+                    output_file.parent.mkdir(exist_ok=True, parents=True)
                     with open(output_file, "w") as f:
                         json.dump(result, f, indent=4)
 
             # Save summary file
-            summary_file = results_dir / f"summary_{timestamp}.json"
+            summary_file = predictions_dir / f"summary_{timestamp}.json"
             summary = {
                 "timestamp": timestamp,
                 "total_examples": len(examples),
@@ -138,7 +140,7 @@ async def run_eval(use_existing_preds, dataset, length, instance_id=None):
                 json.dump(summary, f, indent=4)
 
             print("\nProcessing complete!")
-            print(f"Results saved to: {results_dir}")
+            print(f"Results saved to: {predictions_dir}")
             print(f"Summary saved to: {summary_file}")
             print(f"Successful: {summary['successful']}/{summary['total_examples']}")
             print(f"Failed: {summary['failed']}/{summary['total_examples']}")
@@ -148,18 +150,18 @@ async def run_eval(use_existing_preds, dataset, length, instance_id=None):
                 print(f"  {error_type}: {count}")
 
         # Generate Report on Modal
-        generate_report(PREDS_DNAME, LOG_DIR, dataset)
+        generate_report(predictions_dir, LOG_DIR, dataset, run_id)
     except Exception:
         print("Fatal error in run_eval:")
         traceback.print_exc()
         raise
 
 
 @click.command()
-@click.option("--use-existing-preds", is_flag=True, help="Use existing predictions instead of generating new ones.")
+@click.option("--use-existing-preds", help="The run ID of the existing predictions to use.", type=str, default=None)
 @click.option("--dataset", help="The dataset to use.", type=click.Choice([dataset.value for dataset in SWEBenchDataset]), default=SWEBenchDataset.LITE.value)
 @click.option("--length", help="The number of examples to process.", type=int, default=10)
-@click.option("--instance-id", help="The instance ID of the example to process.")
+@click.option("--instance-id", help="The instance ID of the example to process.", type=str, default=None)
 def run_eval_command(use_existing_preds, dataset, length, instance_id):
     asyncio.run(run_eval(use_existing_preds, dataset, length, instance_id))
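The net effect is that `--use-existing-preds` changes from a boolean flag to a run ID: a fresh run generates a UUID and writes predictions into a `results_<run_id>` directory, while passing a previous run's ID reuses that directory and regenerates the report. A sketch of the two invocations, where `<run-id>` stands for the UUID suffix of an existing `results_<run-id>` folder and the parent directory is whatever `PREDS_DNAME` points to:

```bash
# Fresh run: a new UUID is generated and results land in results_<new-uuid>
python run_eval.py --dataset princeton-nlp/SWE-bench_Lite --length 5

# Reuse existing predictions from an earlier run and regenerate the report
python run_eval.py --use-existing-preds <run-id>
```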
