Commit 0af487f
chore: better linking between a swebench run and its langsmith trace (#761)

# Motivation

Currently there isn't a good way to find a LangSmith trace given a run_id and instance_id.

# Please check the following before marking your PR as ready for review

- [ ] I have added tests for my changes
- [ ] I have updated the documentation or added new documentation as needed
1 parent 16a114b commit 0af487f
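
For context on the motivation: once this change lands, a trace can be located by filtering on the new tags. A minimal sketch, assuming the tag scheme introduced in this commit, the default "RELACE" project name from code_agent.py, and LangSmith's documented filter syntax; the identifiers below are placeholders:

```python
# Minimal sketch: find LangSmith traces for one SWE-bench run/instance.
# Assumes the tagging scheme added in this commit; the run_id and
# instance_id values are hypothetical placeholders.
from langsmith import Client

client = Client()
run_id = "2024-02-swebench-eval"      # hypothetical eval run identifier
instance_id = "django__django-11099"  # hypothetical SWE-bench instance

# Both values are attached as tags, so a has(tags, ...) filter finds them.
runs = client.list_runs(
    project_name="RELACE",  # default project name used by CodeAgent
    filter=f'and(has(tags, "{run_id}"), has(tags, "{instance_id}"))',
)
for run in runs:
    print(run.id, run.name)
```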

4 files changed (+38, -14 lines)

codegen-examples/examples/swebench_agent_run/entry_point.py

Lines changed: 2 additions & 2 deletions

@@ -14,6 +14,6 @@
 
 
 @app.function(timeout=43200)
-async def run_agent_modal(entry: SweBenchExample):
+async def run_agent_modal(entry: SweBenchExample, run_id: str):
     """Modal function to process a single example from the SWE-bench dataset."""
-    return run_agent_on_entry(entry)
+    return run_agent_on_entry(entry, run_id=run_id)
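
For illustration, a hypothetical call site for the updated Modal function; the lookup mirrors the one in run_eval.py below, and `entry` and the run_id value are placeholders:

```python
# Hypothetical invocation of the updated Modal function. `entry` is
# assumed to be a SweBenchExample prepared elsewhere.
import modal

run_agent_modal = modal.Function.from_name(app_name="swebench-agent-run", name="run_agent_modal")
result = run_agent_modal.remote(entry, run_id="2024-02-swebench-eval")  # sync variant of .remote.aio
```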

codegen-examples/examples/swebench_agent_run/run_eval.py

Lines changed: 7 additions & 7 deletions

@@ -17,7 +17,7 @@
 run_agent_modal = modal.Function.from_name(app_name="swebench-agent-run", name="run_agent_modal")
 
 
-async def process_batch_modal(examples: list[SweBenchExample], num_workers=5, min_workers=1, max_retries=3):
+async def process_batch_modal(examples: list[SweBenchExample], run_id: str, num_workers=5, min_workers=1, max_retries=3):
     """Process a batch of examples concurrently using a queue system with incremental worker scaling.
 
     Args:
@@ -110,7 +110,7 @@ async def is_rate_limit_error(error):
 
     async def process_example(example, attempt, current_task):
         try:
-            result = await run_agent_modal.remote.aio(example)
+            result = await run_agent_modal.remote.aio(example, run_id=run_id)
 
             if result is None:
                 print(f"Warning: Null result for {example.instance_id}")
@@ -222,7 +222,7 @@ async def worker():
     return [results.get(example.instance_id, {"instance_id": example.instance_id, "status": "missing"}) for example in examples]
 
 
-def process_batch_local(examples: list[SweBenchExample], num_workers=5, codebases: dict[str, Codebase] = {}):
+def process_batch_local(examples: list[SweBenchExample], num_workers=5, codebases: dict[str, Codebase] = {}, run_id: str | None = None):
     """Process a batch of examples synchronously.
 
     Args:
@@ -242,9 +242,9 @@ def process_batch_local(examples: list[SweBenchExample], num_workers=5, codebase
        try:
            # Run the agent locally instead of using modal
            if codebases and example.instance_id in codebases:
-               result = run_agent_on_entry(example, codebase=codebases[example.instance_id])
+               result = run_agent_on_entry(example, codebase=codebases[example.instance_id], run_id=run_id)
            else:
-               result = run_agent_on_entry(example)
+               result = run_agent_on_entry(example, run_id=run_id)
            results.append(result)
 
        except Exception as e:
@@ -294,9 +294,9 @@ async def run_eval(
 
    # Process all examples in parallel batches
    if local:
-       results = process_batch_local(examples, codebases=codebases)
+       results = process_batch_local(examples, codebases=codebases, run_id=run_id)
    else:
-       results = await process_batch_modal(examples, num_workers=num_workers)
+       results = await process_batch_modal(examples, num_workers=num_workers, run_id=run_id)
 
    # Save individual results
    for result in results:
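
Taken together, these changes let one identifier flow through either execution path. A minimal sketch, assuming `examples` is a prepared `list[SweBenchExample]`:

```python
# Sketch: one run_id shared by every example in an eval, so all resulting
# LangSmith traces carry the same tag. `examples` is assumed to be a
# prepared list[SweBenchExample].
import asyncio
import uuid

run_id = str(uuid.uuid4())

# Local path: run_agent_on_entry is invoked in-process with run_id.
results = process_batch_local(examples, run_id=run_id)

# Modal path: the same identifier rides along with each remote call.
results = asyncio.run(process_batch_modal(examples, run_id=run_id))
```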

src/codegen/agents/code_agent.py

Lines changed: 27 additions & 3 deletions

@@ -4,6 +4,7 @@
 
 from langchain.tools import BaseTool
 from langchain_core.messages import AIMessage
+from langchain_core.runnables.config import RunnableConfig
 from langsmith import Client
 
 from codegen.extensions.langchain.agent import create_codebase_agent
@@ -16,7 +17,17 @@
 class CodeAgent:
     """Agent for interacting with a codebase."""
 
-    def __init__(self, codebase: "Codebase", model_provider: str = "anthropic", model_name: str = "claude-3-5-sonnet-latest", memory: bool = True, tools: Optional[list[BaseTool]] = None, **kwargs):
+    def __init__(
+        self,
+        codebase: "Codebase",
+        model_provider: str = "anthropic",
+        model_name: str = "claude-3-7-sonnet-latest",
+        memory: bool = True,
+        tools: Optional[list[BaseTool]] = None,
+        run_id: Optional[str] = None,
+        instance_id: Optional[str] = None,
+        **kwargs,
+    ):
         """Initialize a CodeAgent.
 
         Args:
@@ -34,6 +45,8 @@ def __init__(self, codebase: "Codebase", model_provider: str = "anthropic", mode
         self.codebase = codebase
         self.agent = create_codebase_agent(self.codebase, model_provider=model_provider, model_name=model_name, memory=memory, additional_tools=tools, **kwargs)
         self.langsmith_client = Client()
+        self.run_id = run_id
+        self.instance_id = instance_id
 
         # Get project name from environment variable or use a default
         self.project_name = os.environ.get("LANGCHAIN_PROJECT", "RELACE")
@@ -55,9 +68,20 @@ def run(self, prompt: str, thread_id: Optional[str] = None) -> str:
         # this message has a reducer which appends the current message to the existing history
         # see more https://langchain-ai.github.io/langgraph/concepts/low_level/#reducers
         input = {"messages": [("user", prompt)]}
-
+        metadata = {"project": self.project_name}
+        tags = []
+        # Add SWEBench run ID and instance ID to the metadata and tags for filtering
+        if self.run_id is not None:
+            metadata["swebench_run_id"] = self.run_id
+            tags.append(self.run_id)
+
+        if self.instance_id is not None:
+            metadata["swebench_instance_id"] = self.instance_id
+            tags.append(self.instance_id)
+
+        config = RunnableConfig(configurable={"thread_id": thread_id}, tags=tags, metadata=metadata, recursion_limit=100)
         # we stream the steps instead of invoke because it allows us to access intermediate nodes
-        stream = self.agent.stream(input, config={"configurable": {"thread_id": thread_id, "metadata": {"project": self.project_name}}, "recursion_limit": 100}, stream_mode="values")
+        stream = self.agent.stream(input, config=config, stream_mode="values")
 
         # Keep track of run IDs from the stream
         run_ids = []
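
The RunnableConfig built in run() is what links a trace back to a run: tags and metadata set on the config propagate to every LangSmith trace the agent emits. A minimal sketch of the resulting config, with placeholder identifiers:

```python
# Sketch of the config run() now builds, with hypothetical values.
# Tags and metadata on a RunnableConfig propagate to LangSmith traces,
# which is what makes them filterable by run_id / instance_id.
from langchain_core.runnables.config import RunnableConfig

run_id = "2024-02-swebench-eval"      # hypothetical
instance_id = "django__django-11099"  # hypothetical

config = RunnableConfig(
    configurable={"thread_id": "thread-1"},
    tags=[run_id, instance_id],
    metadata={"project": "RELACE", "swebench_run_id": run_id, "swebench_instance_id": instance_id},
    recursion_limit=100,
)
```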

src/codegen/extensions/swebench/harness.py

Lines changed: 2 additions & 2 deletions

@@ -49,7 +49,7 @@ def show_problems(dataset):
        print(f"{inst}: {problem}")
 
 
-def run_agent_on_entry(entry: SweBenchExample, codebase: Codebase | None = None):
+def run_agent_on_entry(entry: SweBenchExample, codebase: Codebase | None = None, run_id: str | None = None):
    """Process one `entry` from SWE Bench using the LLM `models` at the
    given `temperature`. Set `model_name_or_path` in the result json.
    """
@@ -70,7 +70,7 @@ def run_agent_on_entry(entry: SweBenchExample, codebase: Codebase | None = None)
        )
        codebase = Codebase.from_repo(repo_full_name=entry.repo, commit=base_commit, language="python", config=config)  # check out the repo
 
-    agent = CodeAgent(codebase=codebase)
+    agent = CodeAgent(codebase=codebase, run_id=run_id, instance_id=instance_id)
 
    pprint.pprint(instance_id)
    pprint.pprint(gold_files)
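
A hypothetical call site for the updated harness entry point; `entry` is a SweBenchExample fetched elsewhere, and instance_id is derived from the entry inside run_agent_on_entry, so only run_id needs to be passed:

```python
# Hypothetical usage: process one SWE-bench entry with trace linking.
# `entry` is assumed to be a SweBenchExample fetched elsewhere; run_id is
# any string shared across the eval run.
result = run_agent_on_entry(entry, run_id="2024-02-swebench-eval")
```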
