got e2b working on simpy simple examples

justinchiu-cohere · justinchiu-cohere · commit 203d23e27638 · 2025-01-28T01:37:45.000Z
diff --git a/commit0/harness/build.py b/commit0/harness/build.py
@@ -4,7 +4,7 @@
 from datasets import load_dataset
 from typing import Iterator, Union
 
-from commit0.harness.constants import RepoInstance, SimpleInstance, SPLIT
+from commit0.harness.constants import RepoInstance, SimpleInstance, SPLIT, ABSOLUTE_REPO_DIR
 from commit0.harness.docker_build import build_repo_images
 from commit0.harness.spec import make_spec
 
@@ -45,7 +45,7 @@ def main(
             repo_name = example["repo"].split("/")[-1]
             if split != "all" and repo_name not in SPLIT[split]:
                 continue
-        spec = make_spec(example, dataset_type)
+        spec = make_spec(example, dataset_type, absolute=True)  # Docker images use the absolute repo dir
         specs.append(spec)
 
     client = docker.from_env()
diff --git a/commit0/harness/constants.py b/commit0/harness/constants.py
@@ -65,6 +65,10 @@ def items(self) -> ItemsView[str, object]:
 
 # Evaluation backends
 EVAL_BACKENDS = ["local", "modal", "e2b"]
+# Absolute repo directory, used by the Docker and Modal backends (which have sudo access).
+ABSOLUTE_REPO_DIR = "/testbed"
+# Relative repo directory, used by the e2b backend (which has no sudo access).
+RELATIVE_REPO_DIR = "testbed"
 
 # available commands
 COMMANDS = [
diff --git a/commit0/harness/execution_context.py b/commit0/harness/execution_context.py
@@ -10,8 +10,10 @@
 import modal
 import modal.io_streams
 from enum import auto
+from e2b_code_interpreter import Sandbox
 from strenum import StrEnum
 from pathlib import Path
+import tempfile
 import time
 from typing import Optional, Type
 from types import TracebackType
@@ -220,3 +222,84 @@ def __exit__(
         exctb: Optional[TracebackType],
     ) -> None:
         close_logger(self.logger)
+
+
+class E2B(ExecutionContext):
+    """Execution context that runs the evaluation inside an E2B sandbox."""
+
+    def __init__(
+        self,
+        spec: Spec,
+        logger: logging.Logger,
+        timeout: int,
+        num_cpus: int,
+        log_dir: Path,
+        files_to_copy: Optional[Files] = None,
+        files_to_collect: Optional[list[str]] = None,
+        rebuild_image: bool = False,
+    ):
+        """Start a sandbox, run the spec's setup script and upload the files to copy.
+
+        ``rebuild_image`` is accepted but not used by this backend.
+        """
+        super().__init__(
+            spec,
+            logger,
+            timeout,
+            num_cpus,
+            log_dir,
+            files_to_copy=files_to_copy,
+            files_to_collect=files_to_collect,
+        )
+
+        # Result files are read from the spec's repo directory, not a hard-coded path.
+        self._repo_dir = spec.repo_directory
+        self.sb = Sandbox(timeout=timeout)
+        try:
+            self.sb.commands.run("curl -LsSf https://astral.sh/uv/install.sh | sh")
+
+            # setup sandbox env
+            self.sb.files.write("setup.sh", spec.setup_script)
+            self.sb.commands.run("bash setup.sh")
+
+            # prepare for eval
+            if files_to_copy:
+                for _, f in files_to_copy.items():
+                    with open(f["src"], "r") as fp:
+                        content = fp.read()
+                    # Only the file name of ``dest`` is used as the sandbox path.
+                    self.sb.files.write(f["dest"].name, content)  # type: ignore
+        except BaseException:
+            # __exit__ is never called when __init__ raises, so kill the sandbox here.
+            self.sb.kill()
+            raise
+
+    def exec_run_with_timeout(self, command: str) -> tuple[str, bool, float]:
+        """Run ``command`` in the sandbox and copy the collected files into ``log_dir``.
+
+        Returns ``(stderr, timed_out, elapsed_seconds)``; ``timed_out`` is always
+        ``False`` because no timeout is enforced yet.
+        """
+        # TODO: setup timeout
+        # NOTE(review): commands.run applies its own default timeout in the e2b SDK — confirm.
+        start_time = time.time()
+        result = self.sb.commands.run(command)
+        for fname in self.files_to_collect or []:
+            with (self.log_dir / fname).open("w") as f:
+                f.write(self.sb.files.read(f"{self._repo_dir}/{fname}"))
+        timed_out = False  # TODO: figure this out
+        end_time = time.time()
+        return result.stderr, timed_out, end_time - start_time
+
+    def __exit__(
+        self,
+        exctype: Optional[Type[BaseException]],
+        excinst: Optional[BaseException],
+        exctb: Optional[TracebackType],
+    ) -> None:
+        """Kill the sandbox and close the logger."""
+        try:
+            self.sb.kill()
+        finally:
+            # Close the logger even if killing the sandbox fails.
+            close_logger(self.logger)
diff --git a/commit0/harness/run_pytest_ids.py b/commit0/harness/run_pytest_ids.py
@@ -7,8 +7,10 @@
 
 from typing import Iterator, Union
 from commit0.harness.constants import (
+    ABSOLUTE_REPO_DIR,
     EVAL_BACKENDS,
     Files,
+    RELATIVE_REPO_DIR,
     RUN_PYTEST_LOG_DIR,
     RepoInstance,
     SimpleInstance,
@@ -53,6 +55,7 @@ def main(
         dataset_name, split=dataset_split
     )  # type: ignore
     dataset_name = dataset_name.lower()
+    absolute = backend.lower() != "e2b"  # case-insensitive, like the backend.upper() dispatch below
     spec = None
     example = None
     repo_name = None
@@ -77,7 +80,7 @@ def main(
         if repo_name in os.path.basename(repo_or_repo_dir) or repo_or_repo_dir.endswith(
             repo_name
         ):
-            spec = make_spec(example, dataset_type)
+            spec = make_spec(example, dataset_type, absolute)
             break
     assert spec is not None, "No spec available"
     assert example is not None, "No example available"
@@ -188,22 +191,22 @@ def main(
 
     backend = backend.upper()
     if ExecutionBackend(backend) == ExecutionBackend.MODAL:
-        logger.info("Runnning on Modal")
+        logger.info("Running on Modal")
         execution_context = Modal
     elif ExecutionBackend(backend) == ExecutionBackend.LOCAL:
-        logger.info("Runnning locally")
+        logger.info("Running locally")
         execution_context = Docker
     elif ExecutionBackend(backend) == ExecutionBackend.E2B:
-        logger.info("Runnning E2B")
+        logger.info("Running E2B")
         execution_context = E2B
     else:
         raise ValueError(
             f"Evaluation must be from {', '.join(EVAL_BACKENDS)}, but {backend} is provided."
         )
 
     files_to_copy = Files(
-        eval_script={"src": eval_file, "dest": Path("/eval.sh")},
-        patch={"src": patch_file, "dest": Path("/patch.diff")},
+        eval_script={"src": eval_file, "dest": Path("/eval.sh" if absolute else "eval.sh")},
+        patch={"src": patch_file, "dest": Path("/patch.diff" if absolute else "patch.diff")},
     )
     files_to_collect = [
         "report.json",
@@ -213,6 +216,8 @@ def main(
     if coverage:
         files_to_collect.append("coverage.json")
 
+
+    eval_command = "/bin/bash /eval.sh" if ExecutionBackend(backend) != ExecutionBackend.E2B else "/bin/bash eval.sh"
     try:
         with execution_context(
             spec,
@@ -225,7 +230,7 @@ def main(
             rebuild_image,
         ) as context:
             output, timed_out, total_runtime = context.exec_run_with_timeout(
-                "/bin/bash /eval.sh"
+                eval_command
             )
             logger.info(output)
             if timed_out:
diff --git a/commit0/harness/spec.py b/commit0/harness/spec.py
@@ -4,6 +4,8 @@
 from typing import Union, cast, Optional
 
 from commit0.harness.constants import (
+    ABSOLUTE_REPO_DIR,
+    RELATIVE_REPO_DIR,
     RepoInstance,
     SimpleInstance,
 )
@@ -17,6 +19,7 @@
 class Spec(ABC):
     """A dataclass that represents a test specification for a single instance of SWE-bench."""
 
+    absolute: bool
     repo: str
     # repo dir on docker
     repo_directory: str
@@ -164,11 +167,12 @@ def make_repo_script_list(self) -> list[str]:
 
     def make_eval_script_list(self) -> list[str]:
         """Run the tests."""
+        diff_path = "/patch.diff" if self.absolute else "../patch.diff"
         eval_script_list = [
             f"cd {self.repo_directory}",
             "source .venv/bin/activate",
             f"git reset --hard {self.instance['base_commit']}",
-            "git apply --allow-empty -v /patch.diff",
+            f"git apply --allow-empty -v {diff_path}",
             "git status",
             f"{self.instance['test']['test_cmd']} --json-report --json-report-file=report.json --continue-on-collection-errors{{coverage}} {{test_ids}} > test_output.txt 2>&1",
             "echo $? > pytest_exit_code.txt",
@@ -306,39 +310,53 @@ def make_eval_script_list(self) -> list[str]:
 def get_specs_from_dataset(
     dataset: Union[list[Union[RepoInstance, SimpleInstance]], list[Spec]],
     dataset_type: str,
+    absolute: bool = True,
 ) -> list[Spec]:
     """Idempotent function that converts a list of RepoInstance objects to a list of Spec objects."""
     if isinstance(dataset[0], Spec):
         return cast(list[Spec], dataset)
     return list(
         map(
-            lambda instance: make_spec(instance, dataset_type),
+            lambda instance: make_spec(instance, dataset_type, absolute),
             cast(list["RepoInstance"], dataset),
         )
     )
 
 
-def make_spec(instance: Union[RepoInstance, SimpleInstance], dataset_type: str) -> Spec:
+def make_spec(
+    instance: Union[RepoInstance, SimpleInstance],
+    dataset_type: str,
+    absolute: bool = True,
+) -> Spec:
+    """Build the Spec for ``instance`` according to ``dataset_type``.
+
+    ``absolute`` picks the repo directory: ABSOLUTE_REPO_DIR when True (the
+    default, matching the previous hard-coded "/testbed"), RELATIVE_REPO_DIR
+    when False. An ``instance`` that is already a Spec is returned unchanged.
+    """
     if isinstance(instance, Spec):
         return instance
-    repo_directory = "/testbed"
+    repo_directory = ABSOLUTE_REPO_DIR if absolute else RELATIVE_REPO_DIR
     if dataset_type == "commit0":
         return Commit0Spec(
             repo=instance["instance_id"],
             repo_directory=repo_directory,
             instance=instance,
+            absolute=absolute,
         )
     elif dataset_type == "swebench":
         return SWEBenchSpec(
             repo=instance["instance_id"],
             repo_directory=repo_directory,
             instance=instance,
+            absolute=absolute,
         )
     elif dataset_type == "simple":
         return SimpleSpec(
             repo="simple",  # all benchmarks with mere function writing will share the simple docker image
             repo_directory=repo_directory,
             instance=instance,
+            absolute=absolute,
         )
     else:
         raise NotImplementedError(
diff --git a/test_e2b.py b/test_e2b.py
@@ -3,7 +3,16 @@
 from e2b_code_interpreter import Sandbox
 
 sb = Sandbox()
-sb.commands.run("pip install commit0")
-sb.commands.run("commit0 setup tinydb")
+# install uv
+sb.commands.run("curl -LsSf https://astral.sh/uv/install.sh | sh")
+sb.commands.run("pip install git+https://github.com/commit-0/commit0.git@justin/e2b")
+# run setup script
+# copy diff
+# run eval script
+execution = sb.commands.run("commit0 setup tinydb")
+print(execution.stdout)
+execution = sb.commands.run("commit0 test simpy tests/test_event.py::test_succeed --reference --backend e2b")
+print(execution.stdout)
+execution = sb.commands.run("commit0 test simpy tests/test_event.py::test_succeed --backend e2b")
+print(execution.stdout)
 import pdb; pdb.set_trace()
-k