chore: better linking between a swebench run and its langsmith trace #761

Merged 3 commits on Mar 5, 2025
4 changes: 2 additions & 2 deletions codegen-examples/examples/swebench_agent_run/entry_point.py
@@ -14,6 +14,6 @@


@app.function(timeout=43200)
-async def run_agent_modal(entry: SweBenchExample):
+async def run_agent_modal(entry: SweBenchExample, run_id: str):
    """Modal function to process a single example from the SWE-bench dataset."""
-    return run_agent_on_entry(entry)
+    return run_agent_on_entry(entry, run_id=run_id)
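
For orientation, a minimal sketch of invoking the updated Modal entrypoint once deployed. The app and function names come from run_eval.py below; the uuid-minted run_id and the `example` value are illustrative assumptions, not part of this PR.

```python
import uuid

import modal

# Look up the deployed function by the names used in run_eval.py.
run_agent_modal = modal.Function.from_name(app_name="swebench-agent-run", name="run_agent_modal")

# One run_id shared across a whole evaluation groups its traces in LangSmith.
run_id = str(uuid.uuid4())  # assumption: the caller mints this once per eval
result = run_agent_modal.remote(example, run_id=run_id)  # `example` is a SweBenchExample loaded elsewhere
```
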
14 changes: 7 additions & 7 deletions codegen-examples/examples/swebench_agent_run/run_eval.py
@@ -17,7 +17,7 @@
run_agent_modal = modal.Function.from_name(app_name="swebench-agent-run", name="run_agent_modal")


-async def process_batch_modal(examples: list[SweBenchExample], num_workers=5, min_workers=1, max_retries=3):
+async def process_batch_modal(examples: list[SweBenchExample], run_id: str, num_workers=5, min_workers=1, max_retries=3):
"""Process a batch of examples concurrently using a queue system with incremental worker scaling.

Args:
@@ -110,7 +110,7 @@ async def is_rate_limit_error(error):

async def process_example(example, attempt, current_task):
    try:
-        result = await run_agent_modal.remote.aio(example)
+        result = await run_agent_modal.remote.aio(example, run_id=run_id)

        if result is None:
            print(f"Warning: Null result for {example.instance_id}")
@@ -222,7 +222,7 @@ async def worker():
return [results.get(example.instance_id, {"instance_id": example.instance_id, "status": "missing"}) for example in examples]


-def process_batch_local(examples: list[SweBenchExample], num_workers=5, codebases: dict[str, Codebase] = {}):
+def process_batch_local(examples: list[SweBenchExample], num_workers=5, codebases: dict[str, Codebase] = {}, run_id: str | None = None):
"""Process a batch of examples synchronously.

Args:
@@ -242,9 +242,9 @@ def process_batch_local(examples: list[SweBenchExample], num_workers=5, codebase
        try:
            # Run the agent locally instead of using modal
            if codebases and example.instance_id in codebases:
-                result = run_agent_on_entry(example, codebase=codebases[example.instance_id])
+                result = run_agent_on_entry(example, codebase=codebases[example.instance_id], run_id=run_id)
            else:
-                result = run_agent_on_entry(example)
+                result = run_agent_on_entry(example, run_id=run_id)
            results.append(result)

        except Exception as e:
@@ -294,9 +294,9 @@ async def run_eval(

    # Process all examples in parallel batches
    if local:
-        results = process_batch_local(examples, codebases=codebases)
+        results = process_batch_local(examples, codebases=codebases, run_id=run_id)
    else:
-        results = await process_batch_modal(examples, num_workers=num_workers)
+        results = await process_batch_modal(examples, num_workers=num_workers, run_id=run_id)

    # Save individual results
    for result in results:
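run_eval's full signature is truncated in the hunk above, but run_id is clearly in scope there and fans out to both execution paths. A hedged sketch of that flow; the uuid fallback is an assumption rather than the PR's actual behavior:

```python
import uuid


async def run_eval_sketch(examples, local=False, num_workers=5, codebases=None, run_id=None):
    # Assumption: mint a run_id when the caller does not supply one, so every
    # example in this evaluation shares the same LangSmith grouping key.
    run_id = run_id or str(uuid.uuid4())
    if local:
        return process_batch_local(examples, codebases=codebases or {}, run_id=run_id)
    return await process_batch_modal(examples, num_workers=num_workers, run_id=run_id)
```
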
30 changes: 27 additions & 3 deletions src/codegen/agents/code_agent.py
@@ -4,6 +4,7 @@

from langchain.tools import BaseTool
from langchain_core.messages import AIMessage
+from langchain_core.runnables.config import RunnableConfig
from langsmith import Client

from codegen.extensions.langchain.agent import create_codebase_agent
@@ -16,7 +17,17 @@
class CodeAgent:
"""Agent for interacting with a codebase."""

-    def __init__(self, codebase: "Codebase", model_provider: str = "anthropic", model_name: str = "claude-3-5-sonnet-latest", memory: bool = True, tools: Optional[list[BaseTool]] = None, **kwargs):
+    def __init__(
+        self,
+        codebase: "Codebase",
+        model_provider: str = "anthropic",
+        model_name: str = "claude-3-7-sonnet-latest",
+        memory: bool = True,
+        tools: Optional[list[BaseTool]] = None,
+        run_id: Optional[str] = None,
+        instance_id: Optional[str] = None,
+        **kwargs,
+    ):
"""Initialize a CodeAgent.

Args:
@@ -34,6 +45,8 @@ def __init__(self, codebase: "Codebase", model_provider: str = "anthropic", mode
        self.codebase = codebase
        self.agent = create_codebase_agent(self.codebase, model_provider=model_provider, model_name=model_name, memory=memory, additional_tools=tools, **kwargs)
        self.langsmith_client = Client()
+        self.run_id = run_id
+        self.instance_id = instance_id

        # Get project name from environment variable or use a default
        self.project_name = os.environ.get("LANGCHAIN_PROJECT", "RELACE")
@@ -55,9 +68,20 @@ def run(self, prompt: str, thread_id: Optional[str] = None) -> str:
        # this message has a reducer which appends the current message to the existing history
        # see more https://langchain-ai.github.io/langgraph/concepts/low_level/#reducers
        input = {"messages": [("user", prompt)]}
+
+        metadata = {"project": self.project_name}
+        tags = []
+        # Add SWEBench run ID and instance ID to the metadata and tags for filtering
+        if self.run_id is not None:
+            metadata["swebench_run_id"] = self.run_id
+            tags.append(self.run_id)
+
+        if self.instance_id is not None:
+            metadata["swebench_instance_id"] = self.instance_id
+            tags.append(self.instance_id)
+
+        config = RunnableConfig(configurable={"thread_id": thread_id}, tags=tags, metadata=metadata, recursion_limit=100)
        # we stream the steps instead of invoke because it allows us to access intermediate nodes
-        stream = self.agent.stream(input, config={"configurable": {"thread_id": thread_id, "metadata": {"project": self.project_name}}, "recursion_limit": 100}, stream_mode="values")
+        stream = self.agent.stream(input, config=config, stream_mode="values")

        # Keep track of run IDs from the stream
        run_ids = []
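The payoff of the tagging above is that a whole SWE-bench run can be pulled back out of LangSmith afterward. A sketch using the langsmith SDK's run filter language; the project name mirrors the RELACE default above, and the placeholder run_id is whatever was passed into the eval:

```python
from langsmith import Client

client = Client()
run_id = "..."  # placeholder: the swebench run_id used for the eval

# Every trace from the run carries run_id as a tag, so a has() filter finds them.
runs = client.list_runs(
    project_name="RELACE",  # default from LANGCHAIN_PROJECT above
    filter=f'has(tags, "{run_id}")',
)

for run in runs:
    # swebench_instance_id in the metadata tells the examples apart within a run
    print(run.id, (run.extra or {}).get("metadata", {}).get("swebench_instance_id"))
```
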
4 changes: 2 additions & 2 deletions src/codegen/extensions/swebench/harness.py
@@ -49,7 +49,7 @@ def show_problems(dataset):
print(f"{inst}: {problem}")


-def run_agent_on_entry(entry: SweBenchExample, codebase: Codebase | None = None):
+def run_agent_on_entry(entry: SweBenchExample, codebase: Codebase | None = None, run_id: str | None = None):
"""Process one `entry` from SWE Bench using the LLM `models` at the
given `temperature`. Set `model_name_or_path` in the result json.
"""
@@ -70,7 +70,7 @@ def run_agent_on_entry(entry: SweBenchExample, codebase: Codebase | None = None)
)
codebase = Codebase.from_repo(repo_full_name=entry.repo, commit=base_commit, language="python", config=config) # check out the repo

-    agent = CodeAgent(codebase=codebase)
+    agent = CodeAgent(codebase=codebase, run_id=run_id, instance_id=instance_id)

pprint.pprint(instance_id)
pprint.pprint(gold_files)
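Finally, a hedged example of exercising the harness directly for a single dataset entry; loading `entry` is elided, and minting the run_id here is an assumption since the PR leaves that to the caller:

```python
import uuid

from codegen.extensions.swebench.harness import run_agent_on_entry

run_id = str(uuid.uuid4())  # assumption: one id per evaluation run
# `entry` is a SweBenchExample loaded elsewhere; with no codebase passed in,
# the harness checks out entry.repo at its base commit itself.
result = run_agent_on_entry(entry, run_id=run_id)
```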