Commit 0af487f
chore: better linking between a swebench run and its langsmith trace (#761)

# Motivation

Currently there isn't a good way to find a LangSmith trace given a run_id and instance_id.

# Please check the following before marking your PR as ready for review

- [ ] I have added tests for my changes
- [ ] I have updated the documentation or added new documentation as needed
1 parent 16a114b commit 0af487f
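
For context on the motivation: once this change lands, a trace can be located by filtering on the new tags. A minimal sketch, assuming the tag scheme introduced in this commit, the default "RELACE" project name from code_agent.py, and LangSmith's documented filter syntax; the identifiers below are placeholders:

```python
# Minimal sketch: find LangSmith traces for one SWE-bench run/instance.
# Assumes the tagging scheme added in this commit; the run_id and
# instance_id values are hypothetical placeholders.
from langsmith import Client

client = Client()
run_id = "2024-02-swebench-eval"      # hypothetical eval run identifier
instance_id = "django__django-11099"  # hypothetical SWE-bench instance

# Both values are attached as tags, so a has(tags, ...) filter finds them.
runs = client.list_runs(
    project_name="RELACE",  # default project name used by CodeAgent
    filter=f'and(has(tags, "{run_id}"), has(tags, "{instance_id}"))',
)
for run in runs:
    print(run.id, run.name)
```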

4 files changed (+38, -14 lines)

codegen-examples/examples/swebench_agent_run/entry_point.py

Lines changed: 2 additions & 2 deletions

@@ -14,6 +14,6 @@
 
 
 @app.function(timeout=43200)
-async def run_agent_modal(entry: SweBenchExample):
+async def run_agent_modal(entry: SweBenchExample, run_id: str):
     """Modal function to process a single example from the SWE-bench dataset."""
-    return run_agent_on_entry(entry)
+    return run_agent_on_entry(entry, run_id=run_id)
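
For illustration, a hypothetical call site for the updated Modal function; the lookup mirrors the one in run_eval.py below, and `entry` and the run_id value are placeholders:

```python
# Hypothetical invocation of the updated Modal function. `entry` is
# assumed to be a SweBenchExample prepared elsewhere.
import modal

run_agent_modal = modal.Function.from_name(app_name="swebench-agent-run", name="run_agent_modal")
result = run_agent_modal.remote(entry, run_id="2024-02-swebench-eval")  # sync variant of .remote.aio
```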

codegen-examples/examples/swebench_agent_run/run_eval.py

Lines changed: 7 additions & 7 deletions

@@ -17,7 +17,7 @@
 run_agent_modal = modal.Function.from_name(app_name="swebench-agent-run", name="run_agent_modal")
 
 
-async def process_batch_modal(examples: list[SweBenchExample], num_workers=5, min_workers=1, max_retries=3):
+async def process_batch_modal(examples: list[SweBenchExample], run_id: str, num_workers=5, min_workers=1, max_retries=3):
     """Process a batch of examples concurrently using a queue system with incremental worker scaling.
 
     Args:
@@ -110,7 +110,7 @@ async def is_rate_limit_error(error):
 
     async def process_example(example, attempt, current_task):
         try:
-            result = await run_agent_modal.remote.aio(example)
+            result = await run_agent_modal.remote.aio(example, run_id=run_id)
 
             if result is None:
                 print(f"Warning: Null result for {example.instance_id}")
@@ -222,7 +222,7 @@ async def worker():
     return [results.get(example.instance_id, {"instance_id": example.instance_id, "status": "missing"}) for example in examples]
 
 
-def process_batch_local(examples: list[SweBenchExample], num_workers=5, codebases: dict[str, Codebase] = {}):
+def process_batch_local(examples: list[SweBenchExample], num_workers=5, codebases: dict[str, Codebase] = {}, run_id: str | None = None):
     """Process a batch of examples synchronously.
 
     Args:
@@ -242,9 +242,9 @@ def process_batch_local(examples: list[SweBenchExample], num_workers=5, codebase
        try:
            # Run the agent locally instead of using modal
            if codebases and example.instance_id in codebases:
-               result = run_agent_on_entry(example, codebase=codebases[example.instance_id])
+               result = run_agent_on_entry(example, codebase=codebases[example.instance_id], run_id=run_id)
            else:
-               result = run_agent_on_entry(example)
+               result = run_agent_on_entry(example, run_id=run_id)
            results.append(result)
 
        except Exception as e:
@@ -294,9 +294,9 @@ async def run_eval(
 
    # Process all examples in parallel batches
    if local:
-       results = process_batch_local(examples, codebases=codebases)
+       results = process_batch_local(examples, codebases=codebases, run_id=run_id)
    else:
-       results = await process_batch_modal(examples, num_workers=num_workers)
+       results = await process_batch_modal(examples, num_workers=num_workers, run_id=run_id)
 
    # Save individual results
    for result in results:
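
Taken together, these changes let one identifier flow through either execution path. A minimal sketch, assuming `examples` is a prepared `list[SweBenchExample]`:

```python
# Sketch: one run_id shared by every example in an eval, so all resulting
# LangSmith traces carry the same tag. `examples` is assumed to be a
# prepared list[SweBenchExample].
import asyncio
import uuid

run_id = str(uuid.uuid4())

# Local path: run_agent_on_entry is invoked in-process with run_id.
results = process_batch_local(examples, run_id=run_id)

# Modal path: the same identifier rides along with each remote call.
results = asyncio.run(process_batch_modal(examples, run_id=run_id))
```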

src/codegen/agents/code_agent.py

Lines changed: 27 additions & 3 deletions

@@ -4,6 +4,7 @@
 
 from langchain.tools import BaseTool
 from langchain_core.messages import AIMessage
+from langchain_core.runnables.config import RunnableConfig
 from langsmith import Client
 
 from codegen.extensions.langchain.agent import create_codebase_agent
@@ -16,7 +17,17 @@
 class CodeAgent:
     """Agent for interacting with a codebase."""
 
-    def __init__(self, codebase: "Codebase", model_provider: str = "anthropic", model_name: str = "claude-3-5-sonnet-latest", memory: bool = True, tools: Optional[list[BaseTool]] = None, **kwargs):
+    def __init__(
+        self,
+        codebase: "Codebase",
+        model_provider: str = "anthropic",
+        model_name: str = "claude-3-7-sonnet-latest",
+        memory: bool = True,
+        tools: Optional[list[BaseTool]] = None,
+        run_id: Optional[str] = None,
+        instance_id: Optional[str] = None,
+        **kwargs,
+    ):
         """Initialize a CodeAgent.
 
         Args:
@@ -34,6 +45,8 @@ def __init__(self, codebase: "Codebase", model_provider: str = "anthropic", mode
         self.codebase = codebase
         self.agent = create_codebase_agent(self.codebase, model_provider=model_provider, model_name=model_name, memory=memory, additional_tools=tools, **kwargs)
         self.langsmith_client = Client()
+        self.run_id = run_id
+        self.instance_id = instance_id
 
         # Get project name from environment variable or use a default
         self.project_name = os.environ.get("LANGCHAIN_PROJECT", "RELACE")
@@ -55,9 +68,20 @@ def run(self, prompt: str, thread_id: Optional[str] = None) -> str:
         # this message has a reducer which appends the current message to the existing history
         # see more https://langchain-ai.github.io/langgraph/concepts/low_level/#reducers
         input = {"messages": [("user", prompt)]}
-
+        metadata = {"project": self.project_name}
+        tags = []
+        # Add SWEBench run ID and instance ID to the metadata and tags for filtering
+        if self.run_id is not None:
+            metadata["swebench_run_id"] = self.run_id
+            tags.append(self.run_id)
+
+        if self.instance_id is not None:
+            metadata["swebench_instance_id"] = self.instance_id
+            tags.append(self.instance_id)
+
+        config = RunnableConfig(configurable={"thread_id": thread_id}, tags=tags, metadata=metadata, recursion_limit=100)
         # we stream the steps instead of invoke because it allows us to access intermediate nodes
-        stream = self.agent.stream(input, config={"configurable": {"thread_id": thread_id, "metadata": {"project": self.project_name}}, "recursion_limit": 100}, stream_mode="values")
+        stream = self.agent.stream(input, config=config, stream_mode="values")
 
         # Keep track of run IDs from the stream
         run_ids = []
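
The RunnableConfig built in run() is what links a trace back to a run: tags and metadata set on the config propagate to every LangSmith trace the agent emits. A minimal sketch of the resulting config, with placeholder identifiers:

```python
# Sketch of the config run() now builds, with hypothetical values.
# Tags and metadata on a RunnableConfig propagate to LangSmith traces,
# which is what makes them filterable by run_id / instance_id.
from langchain_core.runnables.config import RunnableConfig

run_id = "2024-02-swebench-eval"      # hypothetical
instance_id = "django__django-11099"  # hypothetical

config = RunnableConfig(
    configurable={"thread_id": "thread-1"},
    tags=[run_id, instance_id],
    metadata={"project": "RELACE", "swebench_run_id": run_id, "swebench_instance_id": instance_id},
    recursion_limit=100,
)
```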

src/codegen/extensions/swebench/harness.py

Lines changed: 2 additions & 2 deletions

@@ -49,7 +49,7 @@ def show_problems(dataset):
        print(f"{inst}: {problem}")
 
 
-def run_agent_on_entry(entry: SweBenchExample, codebase: Codebase | None = None):
+def run_agent_on_entry(entry: SweBenchExample, codebase: Codebase | None = None, run_id: str | None = None):
    """Process one `entry` from SWE Bench using the LLM `models` at the
    given `temperature`. Set `model_name_or_path` in the result json.
    """
@@ -70,7 +70,7 @@ def run_agent_on_entry(entry: SweBenchExample, codebase: Codebase | None = None)
        )
        codebase = Codebase.from_repo(repo_full_name=entry.repo, commit=base_commit, language="python", config=config)  # check out the repo
 
-    agent = CodeAgent(codebase=codebase)
+    agent = CodeAgent(codebase=codebase, run_id=run_id, instance_id=instance_id)
 
    pprint.pprint(instance_id)
    pprint.pprint(gold_files)
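
A hypothetical call site for the updated harness entry point; `entry` is a SweBenchExample fetched elsewhere, and instance_id is derived from the entry inside run_agent_on_entry, so only run_id needs to be passed:

```python
# Hypothetical usage: process one SWE-bench entry with trace linking.
# `entry` is assumed to be a SweBenchExample fetched elsewhere; run_id is
# any string shared across the eval run.
result = run_agent_on_entry(entry, run_id="2024-02-swebench-eval")
```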
