
eval: add commit0 benchmark (#5153)

Co-authored-by: Xingyao Wang <xingyao6@illinois.edu>
Co-authored-by: Xingyao Wang <xingyao@all-hands.dev>
Nan Jiang committed 1 year ago
Parent
Current commit
463d4e9a46

+ 1 - 0
.gitignore

@@ -175,6 +175,7 @@ evaluation/gaia/data
 evaluation/gorilla/data
 evaluation/toolqa/data
 evaluation/scienceagentbench/benchmark
+evaluation/commit0_bench/repos
 
 # openhands resolver
 output/

+ 82 - 0
evaluation/commit0_bench/README.md

@@ -0,0 +1,82 @@
+# Commit0 Evaluation with OpenHands
+
+This folder contains the evaluation harness that we built on top of the original [Commit0](https://commit-0.github.io/) ([paper](TBD)).
+
+The evaluation consists of two steps:
+
+1. Environment setup: [install the Python environment](../README.md#development-environment), [configure your LLM](../README.md#configure-openhands-and-your-llm).
+2. [Run Evaluation](#run-inference-on-commit0-instances): generate an edit patch for each Commit0 repo and collect the evaluation results.
+
+## Setup Environment and LLM Configuration
+
+Please follow the instructions [here](../README.md#setup) to set up your local development environment and LLM.
+
+## OpenHands Commit0 Instance-level Docker Support
+
+OpenHands supports using the Commit0 instance-level Docker images for **[inference](#run-inference-on-commit0-instances)**.
+This is now the default behavior.
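+
+If you want to warm the Docker cache before a run, you can pre-pull an instance image manually. A sketch, assuming the default image prefix and the `minitorch` repo (`run_infer.py` constructs image names as `<prefix>/<repo>:v0`):
+
+```bash
+# pre-pull one Commit0 instance image (assumes it exists on DockerHub)
+docker pull docker.io/wentingzhao/minitorch:v0
+```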
+
+
+## Run Inference on Commit0 Instances
+
+Make sure your Docker daemon is running, and that you have ample disk space (at least 200-500GB, depending on the Commit0 split you are running) for the [instance-level docker images](#openhands-commit0-instance-level-docker-support).
+
+When the `run_infer.sh` script starts, it automatically pulls the `lite` split of Commit0. For example, for instance ID `commit-0/minitorch`, it will try to pull our pre-built docker image `wentingzhao/minitorch` from DockerHub. This image is used to create an OpenHands runtime image in which the agent operates.
+
+```bash
+./evaluation/commit0_bench/scripts/run_infer.sh [repo_split] [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split]
+
+# Example
+./evaluation/commit0_bench/scripts/run_infer.sh lite llm.eval_sonnet HEAD CodeActAgent 16 100 8 wentingzhao/commit0_combined test
+```
+
+where `model_config` is mandatory, and the rest are optional.
+
+- `repo_split`, e.g. `lite`, is the split of the Commit0 dataset you would like to evaluate on. Available options are `lite`, `all`, or an individual repo name.
+- `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your
+LLM settings, as defined in your `config.toml`.
+- `git-version`, e.g. `HEAD`, is the git commit hash of the OpenHands version you would
+like to evaluate. It could also be a release tag like `0.6.2`.
+- `agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks, defaulting
+to `CodeActAgent`.
+- `eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit` instances. By
+default, the script evaluates the `lite` split of the Commit0 dataset (16 repos). Note:
+in order to use `eval_limit`, you must also set `agent`.
+- `max_iter`, e.g. `20`, is the maximum number of iterations for the agent to run. By
+default, it is set to 30.
+- `num_workers`, e.g. `3`, is the number of parallel workers to run the evaluation. By
+default, it is set to 1.
+- `dataset`, a Hugging Face dataset name, e.g. `wentingzhao/commit0_combined`, specifies which dataset to evaluate on.
+- `dataset_split`, the split of the Hugging Face dataset. Note that only `test` is supported for Commit0.
+
+Note that the `USE_INSTANCE_IMAGE` environment variable is always set to `true` for Commit0.
+
+Let's say you'd like to run 10 instances using `llm.eval_sonnet` and CodeActAgent. Your command would then be:
+
+```bash
+./evaluation/commit0_bench/scripts/run_infer.sh lite llm.eval_sonnet HEAD CodeActAgent 10 30 1 wentingzhao/commit0_combined test
+```
+
+### Run Inference on `RemoteRuntime` (experimental)
+
+This is in limited beta. Contact Xingyao over Slack if you want to try it out!
+
+```bash
+./evaluation/commit0_bench/scripts/run_infer.sh [repo_split] [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split]
+
+# Example - this runs CodeActAgent on 10 instances of the "wentingzhao/commit0_combined" test split, with at most 30 iterations per instance and 1 worker running in parallel
+ALLHANDS_API_KEY="YOUR-API-KEY" RUNTIME=remote SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" EVAL_DOCKER_IMAGE_PREFIX="docker.io/wentingzhao" \
+./evaluation/commit0_bench/scripts/run_infer.sh lite llm.eval_sonnet HEAD CodeActAgent 10 30 1 wentingzhao/commit0_combined test
+```
+
+To clean up all existing runtimes you've started, run:
+
+```bash
+ALLHANDS_API_KEY="YOUR-API-KEY" ./evaluation/commit0_bench/scripts/cleanup_remote_runtime.sh
+```
+
+### Specify a subset of tasks to run infer
+
+If you would like to benchmark a specific repo, pass its name via the `repo_split` option, as in the example below.
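+
+A sketch, assuming `minitorch` is one of the repos in the Commit0 dataset (arguments follow the same order as above):
+
+```bash
+./evaluation/commit0_bench/scripts/run_infer.sh minitorch llm.eval_sonnet HEAD CodeActAgent 1 100 1 wentingzhao/commit0_combined test
+```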

+ 606 - 0
evaluation/commit0_bench/run_infer.py

@@ -0,0 +1,606 @@
+import asyncio
+import json
+import os
+from collections import Counter
+from typing import Any
+
+import pandas as pd
+from commit0.harness.constants import SPLIT
+from datasets import load_dataset
+
+import openhands.agenthub
+from evaluation.utils.shared import (
+    EvalException,
+    EvalMetadata,
+    EvalOutput,
+    assert_and_raise,
+    codeact_user_response,
+    make_metadata,
+    prepare_dataset,
+    reset_logger_for_multiprocessing,
+    run_evaluation,
+    update_llm_config_for_completions_logging,
+)
+from openhands.controller.state.state import State
+from openhands.core.config import (
+    AgentConfig,
+    AppConfig,
+    SandboxConfig,
+    get_llm_config_arg,
+    get_parser,
+)
+from openhands.core.logger import openhands_logger as logger
+from openhands.core.main import create_runtime, run_controller
+from openhands.events.action import CmdRunAction, MessageAction
+from openhands.events.observation import CmdOutputObservation, ErrorObservation
+from openhands.events.serialization.event import event_to_dict
+from openhands.runtime.base import Runtime
+from openhands.utils.async_utils import call_async_from_sync
+from openhands.utils.shutdown_listener import sleep_if_should_continue
+
+USE_HINT_TEXT = os.environ.get('USE_HINT_TEXT', 'false').lower() == 'true'
+USE_INSTANCE_IMAGE = os.environ.get('USE_INSTANCE_IMAGE', 'false').lower() == 'true'
+RUN_WITH_BROWSING = os.environ.get('RUN_WITH_BROWSING', 'false').lower() == 'true'
+
+AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
+    'CodeActAgent': codeact_user_response,
+    'CodeActCommit0Agent': codeact_user_response,
+}
+
+
+def _get_commit0_workspace_dir_name(instance: pd.Series) -> str:
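+    # e.g. 'commit-0/minitorch' -> 'minitorch'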
+    return instance['repo'].split('/')[1]
+
+
+def get_instruction(instance: pd.Series, metadata: EvalMetadata):
+    workspace_dir_name = _get_commit0_workspace_dir_name(instance)
+    # Prepare instruction
+    test_cmd = instance['test']['test_cmd']
+    test_dir = instance['test']['test_dir']
+    # Instruction based on Anthropic's official trajectory
+    # https://github.com/eschluntz/swe-bench-experiments/tree/main/evaluation/verified/20241022_tools_claude-3-5-sonnet-updated/trajs
+    instruction = (
+        '<uploaded_files>\n'
+        f'/workspace/{workspace_dir_name}\n'
+        '</uploaded_files>\n'
+        f"I've uploaded a python code repository in the directory {workspace_dir_name}. Here is your task:\n\n"
+        'Here is your task:\n\n'
+        '  You need to complete the implementations for all functions (i.e., those with pass\n'
+        '  statements) and pass the unit tests.\n\n'
+        '  Do not change the names of existing functions or classes, as they may be referenced\n'
+        '  from other code like unit tests, etc.\n\n'
+        '  When you generate code, you must maintain the original formatting of the function\n'
+        '  stubs (such as whitespace), otherwise we will not be able to search/replace blocks\n'
+        '  for code modifications, and therefore you will receive a score of 0 for your generated\n'
+        '  code.'
+        '\n\n'
+        'Here is the command to run the unit tests:\n'
+        '<test_command>\n'
+        f'{test_cmd} {test_dir}\n'
+        '</test_command>\n\n'
+        'Make a local git commit for each agent step that changes code. If there is no change in the current step, do not make a commit.'
+    )
+
+    if RUN_WITH_BROWSING:
+        instruction += (
+            '<IMPORTANT!>\n'
+            'You SHOULD NEVER attempt to browse the web. '
+            '</IMPORTANT!>\n'
+        )
+    return instruction
+
+
+# TODO: migrate all swe-bench docker to ghcr.io/openhands
+DOCKER_IMAGE_PREFIX = os.environ.get(
+    'EVAL_DOCKER_IMAGE_PREFIX', 'docker.io/wentingzhao/'
+)
+logger.info(f'Using docker image prefix: {DOCKER_IMAGE_PREFIX}')
+
+
+def get_instance_docker_image(repo_name: str) -> str:
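+    # e.g. 'docker.io/wentingzhao/minitorch:v0' with the default prefix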
+    return (DOCKER_IMAGE_PREFIX.rstrip('/') + '/' + repo_name).lower() + ':v0'
+
+
+def get_config(
+    instance: pd.Series,
+    metadata: EvalMetadata,
+) -> AppConfig:
+    assert USE_INSTANCE_IMAGE
+    # We use a different instance image for each instance of the commit0 eval
+    repo_name = instance['repo'].split('/')[1]
+    base_container_image = get_instance_docker_image(repo_name)
+    logger.info(
+        f'Using instance container image: {base_container_image}. '
+        f'Please make sure this image exists. '
+        f'Submit an issue on https://github.com/All-Hands-AI/OpenHands if you run into any issues.'
+    )
+
+    config = AppConfig(
+        default_agent=metadata.agent_class,
+        run_as_openhands=False,
+        max_iterations=metadata.max_iterations,
+        runtime=os.environ.get('RUNTIME', 'eventstream'),
+        sandbox=SandboxConfig(
+            base_container_image=base_container_image,
+            enable_auto_lint=True,
+            use_host_network=False,
+            # large enough timeout, since some testcases take very long to run
+            timeout=300,
+            api_key=os.environ.get('ALLHANDS_API_KEY', None),
+            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
+            keep_runtime_alive=False,
+            remote_runtime_init_timeout=3600,
+        ),
+        # do not mount workspace
+        workspace_base=None,
+        workspace_mount_path=None,
+    )
+    config.set_llm_config(
+        update_llm_config_for_completions_logging(
+            metadata.llm_config, metadata.eval_output_dir, instance['instance_id']
+        )
+    )
+    agent_config = AgentConfig(
+        codeact_enable_jupyter=False,
+        codeact_enable_browsing=RUN_WITH_BROWSING,
+        codeact_enable_llm_editor=False,
+    )
+    config.set_agent_config(agent_config)
+    return config
+
+
+def initialize_runtime(
+    runtime: Runtime,
+    instance: pd.Series,  # used to get the repo URL and workspace dir name
+):
+    """Initialize the runtime for the agent.
+
+    This function is called before the runtime is used to run the agent.
+    """
+    logger.info('-' * 30)
+    logger.info('BEGIN Runtime Initialization Fn')
+    logger.info('-' * 30)
+    workspace_dir_name = _get_commit0_workspace_dir_name(instance)
+    obs: CmdOutputObservation
+
+    action = CmdRunAction(
+        command=f'git clone -b commit0_combined https://github.com/{instance["repo"]}.git'
+    )
+    action.timeout = 600
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    assert_and_raise(
+        obs.exit_code == 0,
+        f'Failed to git clone -b commit0_combined https://github.com/{instance["repo"]}.git: {str(obs)}',
+    )
+
+    action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
+    action.timeout = 600
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    assert_and_raise(
+        obs.exit_code == 0,
+        f'Failed to cd to /workspace/{workspace_dir_name}: {str(obs)}',
+    )
+
+    action = CmdRunAction(command='git checkout -b openhands')
+    action.timeout = 600
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    assert_and_raise(
+        obs.exit_code == 0, f'Failed to git checkout new branch openhands: {str(obs)}'
+    )
+
+    # Install commit0
+    action = CmdRunAction(command='/root/.cargo/bin/uv pip install commit0')
+    action.timeout = 600
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    # logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    assert_and_raise(
+        obs.exit_code == 0,
+        f'Failed to install commit0: {str(obs)}',
+    )
+    logger.info('-' * 30)
+    logger.info('END Runtime Initialization Fn')
+    logger.info('-' * 30)
+
+
+def complete_runtime(
+    runtime: Runtime,
+    instance: pd.Series,  # used to get the workspace dir name and test command
+) -> dict[str, Any]:
+    """Complete the runtime for the agent.
+
+    This function is called after the agent has finished running.
+    If you need to do something in the sandbox to get the correctness metric after
+    the agent has run, modify this function.
+    """
+    logger.info('-' * 30)
+    logger.info('BEGIN Runtime Completion Fn')
+    logger.info('-' * 30)
+    obs: CmdOutputObservation
+    workspace_dir_name = _get_commit0_workspace_dir_name(instance)
+
+    action = CmdRunAction(command='git add .')
+    action.timeout = 600
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    assert_and_raise(
+        isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
+        f'Failed to git add .: {str(obs)}',
+    )
+
+    action = CmdRunAction(command='git commit -m "openhands edits"')
+    action.timeout = 600
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    assert_and_raise(
+        isinstance(obs, CmdOutputObservation)
+        # exit code 1 from `git commit` means there was nothing to commit
+        and (obs.exit_code == 0 or obs.exit_code == 1),
+        f'Failed to git commit -m "openhands edits": {str(obs)}',
+    )
+
+    # Generate diff patch compared to base commit, excluding spec.pdf.bz2 files
+    n_retries = 0
+    git_patch = None
+    while n_retries < 5:
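+        # the ':(exclude)' pathspec keeps the large spec.pdf.bz2 out of the patch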
+        action = CmdRunAction(
+            command=f"git diff {instance['base_commit']} HEAD -- . ':(exclude)spec.pdf.bz2'"
+        )
+        action.timeout = 600 + 100 * n_retries
+        logger.info(action, extra={'msg_type': 'ACTION'})
+        obs = runtime.run_action(action)
+        # logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        n_retries += 1
+        if isinstance(obs, CmdOutputObservation):
+            if obs.exit_code == 0:
+                git_patch = obs.content.strip()
+                break
+            else:
+                logger.info('Failed to get git diff, retrying...')
+                sleep_if_should_continue(10)
+        elif isinstance(obs, ErrorObservation):
+            logger.error(f'Error occurred: {obs.content}. Retrying...')
+            sleep_if_should_continue(10)
+        else:
+            assert_and_raise(False, f'Unexpected observation type: {str(obs)}')
+
+    assert_and_raise(git_patch is not None, 'Failed to get git diff (None)')
+
+    test_dir = instance['test']['test_dir']
+    action = CmdRunAction(
+        command=f"{instance['test']['test_cmd']} --json-report --json-report-file=report.json --continue-on-collection-errors {test_dir} > test_output.txt 2>&1"
+    )
+    action.timeout = 600
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    assert_and_raise(
+        isinstance(obs, CmdOutputObservation),
+        f'Failed to run test command: {str(obs)}',
+    )
+    # Save pytest exit code (read $? immediately after the test command,
+    # before any subsequent command in the session overwrites it)
+    action = CmdRunAction(command='echo $?')
+    action.timeout = 600
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    # logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    assert_and_raise(
+        isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
+        f'Failed to save pytest exit code: {str(obs)}',
+    )
+    pytest_exit_code = obs.content.strip()
+    # logger.info(f'Pytest exit code: {pytest_exit_code}')
+
+    # Read test output
+    action = CmdRunAction(command='cat test_output.txt')
+    action.timeout = 600
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    # logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    assert_and_raise(
+        isinstance(obs, CmdOutputObservation),
+        f'Failed to read test output: {str(obs)}',
+    )
+    test_output = obs.content.strip()
+    # logger.info(f'Test output: {test_output}')
+
+    # Read the test report
+    action = CmdRunAction(command='cat report.json')
+    action.timeout = 600
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    # logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    assert_and_raise(
+        isinstance(obs, CmdOutputObservation),
+        f'Failed to read test report: {str(obs)}',
+    )
+    # Save the report content now; `obs` is overwritten by the next command
+    report_content = obs.content
+    # Get test IDs from instance
+    repo_name = instance['repo'].split('/')[1]
+    repo_name = repo_name.replace('.', '-')
+    action = CmdRunAction(command=f'commit0 get-tests {repo_name}')
+    action.timeout = 600
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    # logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    test_ids = obs.content.strip().split('\n')
+
+    try:
+        report = json.loads(report_content)
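+        # pytest-json-report: each entry in report['tests'] has a 'nodeid' and
+        # per-phase dicts ('setup', 'call', 'teardown'); 'call' carries the
+        # test body's 'outcome' and 'duration'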
+        tests = {x['nodeid']: x['call'] for x in report['tests'] if 'call' in x}
+
+        # Calculate test statistics
+        status = []
+        runtimes = []
+        no_runs = 0
+
+        for test_id in test_ids:
+            if test_id in tests and tests[test_id] is not None:
+                status.append(tests[test_id]['outcome'])
+                runtimes.append(tests[test_id]['duration'])
+                no_runs += 1
+            else:
+                status.append('failed')
+                runtimes.append(0)
+
+        status_counts = Counter(status)
+        total_runtime = sum(runtimes) if no_runs > 0 else 0
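+        # 'xfail' (expected failure) outcomes count as passing for the score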
+        num_passed = status_counts.get('passed', 0) + status_counts.get('xfail', 0)
+        passed_ratio = num_passed / len(status) if status else 0
+
+        eval_result = {
+            'name': workspace_dir_name,
+            'sum': total_runtime,
+            'passed': passed_ratio,
+            'num_passed': num_passed,
+            'num_tests': len(test_ids),
+        }
+
+    except json.JSONDecodeError:
+        logger.error('Failed to parse test report JSON')
+        eval_result = {
+            'name': workspace_dir_name,
+            'sum': 0,
+            'passed': 0,
+            'num_passed': 0,
+            'num_tests': len(test_ids),
+        }
+
+    # Copy the workspace out of the runtime (returned as a zip archive)
+    temp_zip = runtime.copy_from(f'/workspace/{workspace_dir_name}')
+
+    commit0_dir = os.path.dirname(__file__)
+    persistent_zip = os.path.join(commit0_dir, f'{workspace_dir_name}.zip')
+    with open(temp_zip, 'rb') as src, open(persistent_zip, 'wb') as dst:
+        dst.write(src.read())
+    zip_file = persistent_zip
+    return {
+        'eval_result': eval_result,
+        'git_patch': git_patch,
+        'test_output': test_output,
+        'pytest_exit_code': pytest_exit_code,
+        'zip_file': zip_file,
+    }
+
+
+def process_instance(
+    instance: pd.Series,
+    metadata: EvalMetadata,
+    reset_logger: bool = True,
+) -> EvalOutput:
+    config = get_config(instance, metadata)
+    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
+    if reset_logger:
+        log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
+        reset_logger_for_multiprocessing(logger, instance.instance_id, log_dir)
+    else:
+        logger.info(f'Starting evaluation for instance {instance.instance_id}.')
+
+    runtime = create_runtime(config)
+    call_async_from_sync(runtime.connect)
+    try:
+        initialize_runtime(runtime, instance)
+
+        instruction = get_instruction(instance, metadata)
+
+        # Here's how you can run the agent (similar to the `main` function) and get the final task state
+        state: State | None = asyncio.run(
+            run_controller(
+                config=config,
+                initial_user_action=MessageAction(content=instruction),
+                runtime=runtime,
+                fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
+                    metadata.agent_class
+                ],
+            )
+        )
+
+        # if fatal error, throw EvalError to trigger re-run
+        if (
+            state.last_error
+            and 'fatal error during agent execution' in state.last_error
+            and 'stuck in a loop' not in state.last_error
+        ):
+            raise EvalException('Fatal error detected: ' + state.last_error)
+
+        # ======= THIS IS Commit0 specific =======
+        # Get git patch
+        return_val = complete_runtime(runtime, instance)
+        eval_result = return_val['eval_result']
+        git_patch = return_val['git_patch']
+        test_output = return_val['test_output']
+        pytest_exit_code = return_val['pytest_exit_code']
+        zip_file = return_val['zip_file']
+
+        repo_name = instance['repo'].split('/')[1]
+        zip_dest = os.path.join(
+            metadata.eval_output_dir, 'repos', repo_name, f'{repo_name}.zip'
+        )
+        patch_file = os.path.join(
+            metadata.eval_output_dir, 'repos', repo_name, f'{repo_name}_patch.diff'
+        )
+        test_output_file = os.path.join(
+            metadata.eval_output_dir, 'repos', repo_name, f'{repo_name}_test_output.txt'
+        )
+        pytest_exit_code_file = os.path.join(
+            metadata.eval_output_dir,
+            'repos',
+            repo_name,
+            f'{repo_name}_pytest_exit_code.txt',
+        )
+
+        os.makedirs(os.path.dirname(zip_dest), exist_ok=True)
+        os.rename(zip_file, zip_dest)
+
+        write_targets = [
+            (patch_file, git_patch),
+            (test_output_file, test_output),
+            (pytest_exit_code_file, pytest_exit_code),
+        ]
+
+        for write_target in write_targets:
+            with open(write_target[0], 'w') as f:
+                f.write(write_target[1])
+
+        logger.info(
+            f'Got evaluation result for repo {instance.instance_id}:\n--------\n{eval_result}\n--------'
+        )
+    finally:
+        runtime.close()
+    # ==========================================
+
+    # ======= Commit0 evaluation result =======
+    # The tests were already run inside the sandbox in `complete_runtime`
+    # above; here we just package the result.
+    test_result = {
+        'eval_result': eval_result,
+    }
+
+    # If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
+    # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
+    if state is None:
+        raise ValueError('State should not be None.')
+
+    # NOTE: this is NO LONGER the event stream, but an agent history that includes delegate agent's events
+    histories = [event_to_dict(event) for event in state.history]
+    metrics = state.metrics.get() if state.metrics else None
+
+    # Save the output
+    output = EvalOutput(
+        instance_id=instance.instance_id,
+        instruction=instruction,
+        instance=instance.to_dict(),
+        test_result=test_result,
+        metadata=metadata,
+        history=histories,
+        metrics=metrics,
+        error=state.last_error if state and state.last_error else None,
+    )
+    return output
+
+
+def commit0_setup(dataset: pd.DataFrame, repo_split: str) -> pd.DataFrame:
+    """Setup Commit0 dataset based on split type.
+
+    Args:
+        dataset: Full Commit0 dataset
+        repo_split: Split type ('all', 'lite' or specific repo name)
+
+    Returns:
+        Filtered dataset based on split type
+    """
+
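+    # SPLIT (from commit0.harness.constants) maps a split name to the list of
+    # repos it covers; filter the dataset down to those repos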
+    filtered_dataset = pd.concat(
+        [
+            dataset[dataset['repo'].str.split('/').str[1] == repo]
+            for repo in SPLIT.get(repo_split, [])
+        ]
+    )
+
+    # Drop setup column if it exists
+    if 'setup' in filtered_dataset.columns:
+        filtered_dataset = filtered_dataset.drop('setup', axis=1)
+
+    # Use the repo name (the part after the slash) as the instance_id
+    filtered_dataset['instance_id'] = filtered_dataset['repo'].str.split('/').str[1]
+
+    return filtered_dataset
+
+
+if __name__ == '__main__':
+    parser = get_parser()
+    parser.add_argument(
+        '--dataset',
+        type=str,
+        default='wentingzhao/commit0_combined',
+        help='dataset to evaluate on, only test split exists for this HF dataset',
+    )
+    parser.add_argument(
+        '--split',
+        type=str,
+        default='test',
+        help='this is the HF dataset split',
+    )
+    parser.add_argument(
+        '--repo-split',
+        type=str,
+        default='lite',
+        help='all, lite, or each repo name',
+    )
+    args, _ = parser.parse_known_args()
+
+    # NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
+    # so we don't need to manage file uploading to OpenHands's repo
+    dataset = load_dataset(args.dataset, split=args.split)
+
+    commit0_datasets = commit0_setup(dataset.to_pandas(), args.repo_split)
+
+    logger.info(f'Loaded dataset {args.dataset} with repo split {args.repo_split}')
+
+    llm_config = None
+    if args.llm_config:
+        llm_config = get_llm_config_arg(args.llm_config)
+        llm_config.log_completions = True
+
+    if llm_config is None:
+        raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
+
+    details = {}
+    _agent_cls = openhands.agenthub.Agent.get_cls(args.agent_cls)
+
+    dataset_description = (
+        args.dataset.replace('/', '__') + '-' + args.repo_split.replace('/', '__')
+    )
+    metadata = make_metadata(
+        llm_config,
+        dataset_description,
+        args.agent_cls,
+        args.max_iterations,
+        args.eval_note,
+        args.eval_output_dir,
+        details=details,
+    )
+
+    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
+
+    instances = prepare_dataset(commit0_datasets, output_file, args.eval_n_limit)
+
+    run_evaluation(
+        instances,
+        metadata,
+        output_file,
+        args.eval_num_workers,
+        process_instance,
+        timeout_seconds=120 * 60,  # 2 hours per instance should be more than enough
+    )

+ 33 - 0
evaluation/commit0_bench/scripts/cleanup_remote_runtime.sh

@@ -0,0 +1,33 @@
+#!/bin/bash
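+# Stops every runtime returned by the runtime API's /list endpoint.
+# Requires curl, jq, and GNU parallel; ALLHANDS_API_KEY must be set.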
+
+
+# API base URL
+BASE_URL="https://runtime.eval.all-hands.dev"
+
+# Get the list of runtimes
+response=$(curl --silent --location --request GET "${BASE_URL}/list" \
+  --header "X-API-Key: ${ALLHANDS_API_KEY}")
+
+n_runtimes=$(echo "$response" | jq -r '.total')
+echo "Found ${n_runtimes} runtimes. Stopping them..."
+
+runtime_ids=$(echo "$response" | jq -r '.runtimes | .[].runtime_id')
+
+# Function to stop a single runtime
+stop_runtime() {
+  local runtime_id=$1
+  local counter=$2
+  echo "Stopping runtime ${counter}/${n_runtimes}: ${runtime_id}"
+  curl --silent --location --request POST "${BASE_URL}/stop" \
+    --header "X-API-Key: ${ALLHANDS_API_KEY}" \
+    --header "Content-Type: application/json" \
+    --data-raw "{\"runtime_id\": \"${runtime_id}\"}"
+  echo
+}
+export -f stop_runtime
+export BASE_URL ALLHANDS_API_KEY n_runtimes
+
+# Use GNU Parallel to stop runtimes in parallel
+echo "$runtime_ids" | parallel -j 16 --progress stop_runtime {} {#}
+
+echo "All runtimes have been stopped."

+ 125 - 0
evaluation/commit0_bench/scripts/run_infer.sh

@@ -0,0 +1,125 @@
+#!/bin/bash
+set -eo pipefail
+
+source "evaluation/utils/version_control.sh"
+
+REPO_SPLIT=$1
+MODEL_CONFIG=$2
+COMMIT_HASH=$3
+AGENT=$4
+EVAL_LIMIT=$5
+MAX_ITER=$6
+NUM_WORKERS=$7
+DATASET=$8
+SPLIT=$9
+N_RUNS=${10}
+
+if [ -z "$NUM_WORKERS" ]; then
+  NUM_WORKERS=1
+  echo "Number of workers not specified, use default $NUM_WORKERS"
+fi
+checkout_eval_branch
+
+if [ -z "$AGENT" ]; then
+  echo "Agent not specified, use default CodeActAgent"
+  AGENT="CodeActAgent"
+fi
+
+if [ -z "$MAX_ITER" ]; then
+  echo "MAX_ITER not specified, use default 100"
+  MAX_ITER=100
+fi
+
+if [ -z "$USE_INSTANCE_IMAGE" ]; then
+  echo "USE_INSTANCE_IMAGE not specified, use default true"
+  USE_INSTANCE_IMAGE=true
+fi
+
+if [ -z "$RUN_WITH_BROWSING" ]; then
+  echo "RUN_WITH_BROWSING not specified, use default false"
+  RUN_WITH_BROWSING=false
+fi
+
+
+if [ -z "$DATASET" ]; then
+  echo "DATASET not specified, use default wentingzhao/commit0_combined"
+  DATASET="wentingzhao/commit0_combined"
+fi
+
+if [ -z "$REPO_SPLIT" ]; then
+  echo "REPO_SPLIT not specified, use default lite"
+  REPO_SPLIT=0
+fi
+
+if [ -z "$SPLIT" ]; then
+  echo "HF SPLIT not specified, use default test"
+  SPLIT="test"
+fi
+
+export USE_INSTANCE_IMAGE=$USE_INSTANCE_IMAGE
+echo "USE_INSTANCE_IMAGE: $USE_INSTANCE_IMAGE"
+export RUN_WITH_BROWSING=$RUN_WITH_BROWSING
+echo "RUN_WITH_BROWSING: $RUN_WITH_BROWSING"
+
+get_agent_version
+
+echo "AGENT: $AGENT"
+echo "AGENT_VERSION: $AGENT_VERSION"
+echo "MODEL_CONFIG: $MODEL_CONFIG"
+echo "DATASET: $DATASET"
+echo "HF SPLIT: $SPLIT"
+echo "REPO SPLIT: $REPO_SPLIT"
+
+# Default to NOT use Hint
+if [ -z "$USE_HINT_TEXT" ]; then
+  export USE_HINT_TEXT=false
+fi
+echo "USE_HINT_TEXT: $USE_HINT_TEXT"
+EVAL_NOTE="$AGENT_VERSION"
+# if not using Hint, add -no-hint to the eval note
+if [ "$USE_HINT_TEXT" = false ]; then
+  EVAL_NOTE="$EVAL_NOTE-no-hint"
+fi
+
+if [ "$RUN_WITH_BROWSING" = true ]; then
+  EVAL_NOTE="$EVAL_NOTE-with-browsing"
+fi
+
+if [ -n "$EXP_NAME" ]; then
+  EVAL_NOTE="$EVAL_NOTE-$EXP_NAME"
+fi
+
+function run_eval() {
+  local eval_note=$1
+  COMMAND="poetry run python evaluation/commit0_bench/run_infer.py \
+    --agent-cls $AGENT \
+    --llm-config $MODEL_CONFIG \
+    --max-iterations $MAX_ITER \
+    --eval-num-workers $NUM_WORKERS \
+    --eval-note $eval_note \
+    --dataset $DATASET \
+    --split $SPLIT \
+    --repo-split $REPO_SPLIT"
+
+  if [ -n "$EVAL_LIMIT" ]; then
+    echo "EVAL_LIMIT: $EVAL_LIMIT"
+    COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
+  fi
+
+  # Run the command
+  eval $COMMAND
+}
+
+unset SANDBOX_ENV_GITHUB_TOKEN # prevent the agent from using the github token to push
+if [ -z "$N_RUNS" ]; then
+  N_RUNS=1
+  echo "N_RUNS not specified, use default $N_RUNS"
+fi
+
+for i in $(seq 1 $N_RUNS); do
+  current_eval_note="$EVAL_NOTE-run_$i"
+  echo "EVAL_NOTE: $current_eval_note"
+  run_eval "$current_eval_note"
+done
+
+checkout_original_branch

File diff too large to display
+ 385 - 386
poetry.lock


+ 4 - 1
pyproject.toml

@@ -60,7 +60,7 @@ whatthepatch = "^1.0.6"
 protobuf = "^4.21.6,<5.0.0" # chromadb currently fails on 5.0+
 opentelemetry-api = "1.25.0"
 opentelemetry-exporter-otlp-proto-grpc = "1.25.0"
-modal = "^0.64.145"
+modal = "^0.66.26"
 runloop-api-client = "0.7.0"
 pygithub = "^2.5.0"
 openhands-aci = "^0.1.1"
@@ -95,6 +95,7 @@ reportlab = "*"
 [tool.coverage.run]
 concurrency = ["gevent"]
 
+
 [tool.poetry.group.runtime.dependencies]
 jupyterlab = "*"
 notebook = "*"
@@ -125,12 +126,14 @@ ignore = ["D1"]
 [tool.ruff.lint.pydocstyle]
 convention = "google"
 
+
 [tool.poetry.group.evaluation.dependencies]
 streamlit = "*"
 whatthepatch = "*"
 retry = "*"
 evaluate = "*"
 swebench = { git = "https://github.com/All-Hands-AI/SWE-bench.git" }
+commit0 = "*"
 func_timeout = "*"
 sympy = "*"
 gdown = "*"

Some files were not shown because too many files changed in this diff