17
17
run_agent_modal = modal .Function .from_name (app_name = "swebench-agent-run" , name = "run_agent_modal" )
18
18
19
19
20
- async def process_batch_modal (examples : list [SweBenchExample ], run_id : str , num_workers = 5 , min_workers = 1 , max_retries = 3 ):
20
+ async def process_batch_modal (examples : list [SweBenchExample ], run_id : str , model : str , num_workers = 5 , min_workers = 1 , max_retries = 3 ):
21
21
"""Process a batch of examples concurrently using a queue system with incremental worker scaling.
22
22
23
23
Args:
@@ -110,7 +110,7 @@ async def is_rate_limit_error(error):
110
110
111
111
async def process_example (example , attempt , current_task ):
112
112
try :
113
- result = await run_agent_modal .remote .aio (example , run_id = run_id )
113
+ result = await run_agent_modal .remote .aio (example , run_id = run_id , model = model )
114
114
115
115
if result is None :
116
116
print (f"Warning: Null result for { example .instance_id } " )
@@ -222,7 +222,7 @@ async def worker():
222
222
return [results .get (example .instance_id , {"instance_id" : example .instance_id , "status" : "missing" }) for example in examples ]
223
223
224
224
225
- def process_batch_local (examples : list [SweBenchExample ], num_workers = 5 , codebases : dict [str , Codebase ] = {}, run_id : str | None = None ):
225
+ def process_batch_local (examples : list [SweBenchExample ], model : str , num_workers = 5 , codebases : dict [str , Codebase ] = {}, run_id : str | None = None ):
226
226
"""Process a batch of examples synchronously.
227
227
228
228
Args:
@@ -242,9 +242,9 @@ def process_batch_local(examples: list[SweBenchExample], num_workers=5, codebase
242
242
try :
243
243
# Run the agent locally instead of using modal
244
244
if codebases and example .instance_id in codebases :
245
- result = run_agent_on_entry (example , codebase = codebases [example .instance_id ], run_id = run_id )
245
+ result = run_agent_on_entry (example , model = model , codebase = codebases [example .instance_id ], run_id = run_id )
246
246
else :
247
- result = run_agent_on_entry (example , run_id = run_id )
247
+ result = run_agent_on_entry (example , model = model , run_id = run_id )
248
248
results .append (result )
249
249
250
250
except Exception as e :
@@ -267,7 +267,15 @@ def process_batch_local(examples: list[SweBenchExample], num_workers=5, codebase
267
267
268
268
269
269
async def run_eval (
270
- use_existing_preds : str | None , dataset : str , length : int , instance_id : str | None = None , local : bool = False , codebases : dict [str , Codebase ] = {}, repo : str | None = None , num_workers : int = 5
270
+ use_existing_preds : str | None ,
271
+ dataset : str ,
272
+ length : int ,
273
+ instance_id : str | None = None ,
274
+ local : bool = False ,
275
+ codebases : dict [str , Codebase ] = {},
276
+ repo : str | None = None ,
277
+ num_workers : int = 5 ,
278
+ model : str = "claude-3-7-sonnet-latest" ,
271
279
):
272
280
run_id = use_existing_preds or str (uuid .uuid4 ())
273
281
print (f"Run ID: { run_id } " )
@@ -294,9 +302,9 @@ async def run_eval(
294
302
295
303
# Process all examples in parallel batches
296
304
if local :
297
- results = process_batch_local (examples , codebases = codebases , run_id = run_id )
305
+ results = process_batch_local (examples , model = model , codebases = codebases , run_id = run_id )
298
306
else :
299
- results = await process_batch_modal (examples , num_workers = num_workers , run_id = run_id )
307
+ results = await process_batch_modal (examples , model = model , run_id = run_id , num_workers = num_workers )
300
308
301
309
# Save individual results
302
310
for result in results :
@@ -355,9 +363,11 @@ async def run_eval(
355
363
@click .option (
356
364
"--num-workers" , help = "The number of workers to use. This is the number of examples that will be processed concurrently. A large number may lead to rate limiting issues." , type = int , default = 5
357
365
)
358
- def run_eval_command (use_existing_preds , dataset , length , instance_id , local , repo , num_workers ):
366
+ @click .option ("--model" , help = "The model to use." , type = str , default = "claude-3-7-sonnet-latest" )
367
+ def run_eval_command (use_existing_preds , dataset , length , instance_id , local , repo , num_workers , model ):
359
368
print (f"Repo: { repo } " )
360
- asyncio .run (run_eval (use_existing_preds = use_existing_preds , dataset = dataset , length = length , instance_id = instance_id , codebases = None , local = local , repo = repo , num_workers = num_workers ))
369
+ print (f"Model: { model } " )
370
+ asyncio .run (run_eval (use_existing_preds = use_existing_preds , dataset = dataset , length = length , instance_id = instance_id , codebases = None , local = local , repo = repo , num_workers = num_workers , model = model ))
361
371
362
372
363
373
if __name__ == "__main__" :
0 commit comments