Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
124 changes: 67 additions & 57 deletions benchmarks/swebench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -330,74 +330,84 @@ def evaluate_instance(
delete_on_close=True,
)

logger.info("repo_path: %s", repo_path)
cp_testebed_repo = workspace.execute_command(
(f"mkdir -p {repo_path} ; cp -r /testbed/. {repo_path}")
)
assert cp_testebed_repo.exit_code == 0, (
f"cp_testebed_repo failed: {cp_testebed_repo.stderr}"
)

# git reset
git_reset = workspace.execute_command(f"cd {repo_path} ; git reset --hard")
assert git_reset.exit_code == 0, f"git reset failed: {git_reset.stderr}"
try:
logger.info("repo_path: %s", repo_path)
cp_testebed_repo = workspace.execute_command(
(f"mkdir -p {repo_path} ; cp -r /testbed/. {repo_path}")
)
assert cp_testebed_repo.exit_code == 0, (
f"cp_testebed_repo failed: {cp_testebed_repo.stderr}"
)

instruction = get_instruction(
instance=instance.data,
metadata=self.metadata,
workspace_path=workspace.working_dir,
)
with workspace_keepalive(self.metadata.agent_type, workspace):
conversation.send_message(instruction)
# Run conversation with fake user responses to handle agent messages
run_conversation_with_fake_user_response(conversation)

# git add
workspace.execute_command(f"cd {repo_path} ; git add -A")

# git commit
# Use --no-verify to bypass pre-commit hooks (e.g., husky) that can fail
workspace.execute_command(
f"cd {repo_path} && "
f"git config --global user.email '{constants.GIT_USER_EMAIL}' && "
f"git config --global user.name '{constants.GIT_USER_NAME}' && "
f"git commit --no-verify -m '{constants.GIT_COMMIT_MESSAGE}'"
)
# git reset
git_reset = workspace.execute_command(f"cd {repo_path} ; git reset --hard")
assert git_reset.exit_code == 0, f"git reset failed: {git_reset.stderr}"

# Get git patch
base_commit = instance.data["base_commit"]
git_patch_result = workspace.execute_command(
(f"cd {repo_path} ; git --no-pager diff --no-color {base_commit} HEAD")
)
assert git_patch_result.exit_code == 0, (
f"git diff failed: {git_patch_result.stderr}"
)
git_patch = git_patch_result.stdout
instruction = get_instruction(
instance=instance.data,
metadata=self.metadata,
workspace_path=workspace.working_dir,
)
with workspace_keepalive(self.metadata.agent_type, workspace):
conversation.send_message(instruction)
# Run conversation with fake user responses to handle agent messages
run_conversation_with_fake_user_response(conversation)

# git add
workspace.execute_command(f"cd {repo_path} ; git add -A")

# git commit
# Use --no-verify to bypass pre-commit hooks (e.g., husky) that can fail
workspace.execute_command(
f"cd {repo_path} && "
f"git config --global user.email '{constants.GIT_USER_EMAIL}' && "
f"git config --global user.name '{constants.GIT_USER_NAME}' && "
f"git commit --no-verify -m '{constants.GIT_COMMIT_MESSAGE}'"
)

# Log instance summary
summarize_instance(
instance_id=instance.id,
conversation=conversation,
git_patch=git_patch,
logger=logger,
)
# Get git patch
base_commit = instance.data["base_commit"]
git_patch_result = workspace.execute_command(
(f"cd {repo_path} ; git --no-pager diff --no-color {base_commit} HEAD")
)
assert git_patch_result.exit_code == 0, (
f"git diff failed: {git_patch_result.stderr}"
)
git_patch = git_patch_result.stdout

# Log instance summary
summarize_instance(
instance_id=instance.id,
conversation=conversation,
git_patch=git_patch,
logger=logger,
)

# Build test_result with git patch and optional ACP agent metadata
test_result: dict[str, Any] = {
"git_patch": git_patch,
}
if isinstance(agent, ACPAgent):
add_acp_agent_metadata(test_result, conversation)
# Build test_result with git patch and optional ACP agent metadata
test_result: dict[str, Any] = {
"git_patch": git_patch,
}
if isinstance(agent, ACPAgent):
add_acp_agent_metadata(test_result, conversation)

history = list(conversation.state.events)
metrics = conversation.conversation_stats.get_combined_metrics()
finally:
# Break the circular reference (RemoteConversation → WebSocket callback →
# run_complete_callback closure → self) so CPython's reference counter can
# reclaim the object immediately rather than waiting for the cyclic GC.
# Without this, ACP conversations with large event histories (no condenser,
# many tool calls) accumulate across instances and OOM the eval-container.
conversation.close()

# EvalOutput is your model; keep fields consistent with prior JSONL
out = EvalOutput(
instance_id=instance.id,
attempt=self.current_attempt,
test_result=test_result,
instruction=instruction,
error=None,
history=list(conversation.state.events),
metrics=conversation.conversation_stats.get_combined_metrics(),
history=history,
metrics=metrics,
)
return out

Expand Down
Loading
Loading