- """Implements evaluation of agents on ML-Bench, a benchmark for assessing the effectiveness of
- Large Language Models (LLMs) in leveraging existing functions in open-source libraries for
- machine learning tasks. The benchmark is introduced in the paper "ML-Bench: Evaluating Large
- Language Models for Code Generation in Repository-Level Machine Learning Tasks"
- (https://arxiv.org/abs/2311.09835).
- Please see https://ghcr.io/super-dainiu/ml_bench and https://huggingface.co/datasets/super-dainiu/ml-bench
- for more details on the dataset and docker image used in this evaluation script.
- TODOs:
- - Support additional evaluation settings, such as providing raw README content or using a
- retriever to extract relevant segments.
- - Clean up the code and docker image used for evaluation.
- """
import asyncio
import logging
import os
import pathlib
from typing import Any

from datasets import load_dataset

from evaluation.utils.shared import (
    EvalMetadata,
    codeact_user_response,
    make_metadata,
    prepare_dataset,
    run_evaluation,
)
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
from opendevin.core.config import get_llm_config_arg, get_parser, load_app_config
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import run_agent_controller
from opendevin.llm.llm import LLM
from opendevin.runtime.docker.ssh_box import DockerSSHBox

config = load_app_config()

AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
    'CodeActAgent': codeact_user_response,
}

AGENT_CLS_TO_INST_SUFFIX = {
    'CodeActAgent': 'When you think you have completed the task, please run the following command: <execute_bash> exit </execute_bash>.\n'
}

# Maps the `github_id` of an ML-Bench instance to the conda environment
# used for that repository's tasks inside the evaluation sandbox.
ID2CONDA = {
    1: 'dgl_DS',
    2: 'bert_DS',
    3: 'lavis_DS',
    4: 'if_DS',
    5: 'V2V_DS',
    6: 'esm_DS',
    7: 'OP_DS',
    8: 'TSL_DS',
    9: 'EAP_DS',
    10: 'PG_DS',
    11: 'PIM_DS',
    12: 'AD2_DS',
    13: 'L3_DS',
    14: 'MZ2_DS',
    15: 'GSA2_DS',
}

def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool = True):
    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))
    old_workspace_mount_path = config.workspace_mount_path
    old_workspace_base = config.workspace_base
    sandbox = None  # created below; kept as None so the `finally` block can close it safely

    try:
        workspace_mount_path = os.path.join(
            config.workspace_mount_path, '_eval_workspace'
        )
        # create a process-specific workspace dir
        # so that different agents don't interfere with each other.
        workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
        pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)

        # reset workspace to config
        config.workspace_base = workspace_mount_path
        config.workspace_mount_path = workspace_mount_path
        # Set up the logger properly, so that multiprocessing can be used to parallelize the evaluation
        if reset_logger:
            # Set up logger
            log_file = os.path.join(
                metadata.eval_output_dir,
                'logs',
                f"instance_{instance['id']}_pid_{os.getpid()}.log",
            )
            # Remove all existing handlers from the logger
            for handler in logger.handlers[:]:
                logger.removeHandler(handler)
            # add back the console handler to print ONE line
            logger.addHandler(get_console_handler())
            logger.info(
                f"Starting evaluation for instance {instance['id']}.\nLOG: tail -f {log_file}"
            )
            # Remove all existing handlers from the logger
            for handler in logger.handlers[:]:
                logger.removeHandler(handler)
            file_handler = logging.FileHandler(log_file)
            file_handler.setFormatter(
                logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
            )
            logger.addHandler(file_handler)

        logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
        # Create a sandbox, using the instance ID and PID as the session ID to avoid conflicts
        sid = str(instance['id']) + '_' + str(os.getpid())
        sandbox = DockerSSHBox(
            config=config.sandbox,
            persist_sandbox=False,
            workspace_mount_path=config.workspace_mount_path,
            sandbox_workspace_dir=config.workspace_mount_path_in_sandbox,
            cache_dir=config.cache_dir,
            run_as_devin=config.run_as_devin,
            sid=sid,
        )

        # Set up the task environment
        sandbox.execute(f'conda activate {ID2CONDA[instance["github_id"]]}')

        # Clone the task repo into the sandbox
        repo_url = instance['github']
        repo_name = repo_url.split('/')[-1]
        sandbox.execute(f'git clone {repo_url} /workspace/{repo_name}')
        sandbox.execute(f'chmod -R 777 /workspace/{repo_name}')

        # Navigate to the task's code path
        task_path = os.path.join('/workspace', repo_name, instance['path'][2:])
        sandbox.execute(f'cd {task_path}')

        # Prepare the task instruction
        instruction = (
            f'Please complete the Machine Learning task in the following repository: {repo_name}\n\n'
            f'The task is: {instance["task"]}\n\n'
            f'{instance["instruction"]}\n\n'
            'You should create a script named `run.sh` under the specified path in the repo to run the task.\n\n'
            f'You can find the task repo at: {task_path}\n\n'
            + (
                'Here is the prefix code for the task:\n'
                '```bash\n'
                f'{instance["prefix_code"]}\n'
                '```\n\n'
                if instance['prefix_code']
                else ''
            )
            + 'You should terminate the subprocess after running the task (e.g., call subprocess.Popen(args).wait()).'
        )
        instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]

        # Run the agent
        state: State | None = asyncio.run(
            run_agent_controller(
                agent,
                instruction,
                max_iterations=metadata.max_iterations,
                max_budget_per_task=config.max_budget_per_task,
                fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
                    agent.__class__.__name__
                ),
                sandbox=sandbox,
                sid=sid,
            )
        )
        assert state is not None
        metrics = state.metrics.get() if state.metrics else {}

        # Evaluate the agent's script
        eval_script = os.path.join(task_path, 'run.sh')
        logger.info(f'Running evaluation script: {eval_script}')
        try:
            _, eval_script_content = sandbox.execute(f'cat {eval_script}')
        except Exception as e:
            logger.error(f'Error reading evaluation script: {e}')
            eval_script_content = ''

        try:
            exit_code, eval_output = sandbox.execute(
                f'timeout 120s conda run -n {ID2CONDA[instance["github_id"]]} bash {eval_script}',
                timeout=600,
            )
        except Exception as e:
            logger.error(f'Error running evaluation script: {e}')
            exit_code = -1
            eval_output = ''

        if exit_code != 0 and exit_code != 124:
            logger.warning(f'Evaluation script failed with exit code {exit_code}')
            logger.warning(f'Output: {eval_output}')
            metrics['success'] = int(
                'KeyboardInterrupt' in eval_output
            )  # super-dainiu: assume ``KeyboardInterrupt`` is a success as is done in ML-Bench
        else:
            logger.info(f'Evaluation script succeeded with exit code {exit_code}')
            logger.info(f'Output: {eval_output}')
            metrics['success'] = 1

        # history is now available as a stream of events, rather than a list of (Action, Observation) pairs;
        # for compatibility with the existing output format, we remake the pairs here.
        # Remove this when it becomes unnecessary.
        histories = state.history.compatibility_for_eval_history_pairs()

        # Save the output
        output = {
            'instance_id': instance['id'],
            'repo': repo_url,
            'instruction': instruction,
            'metadata': metadata.model_dump(),
            'history': histories,
            'eval_script': eval_script_content,
            'eval_exit_code': exit_code,
            'eval_output': eval_output,
            'metrics': metrics,
        }
    except Exception as e:
        logger.error(f'Error processing instance {instance["id"]}: {e}')
        raise
    finally:
        config.workspace_mount_path = old_workspace_mount_path
        config.workspace_base = old_workspace_base

        # Shutdown the sandbox (it may not have been created if setup failed early)
        if sandbox is not None:
            sandbox.close()

    return output

if __name__ == '__main__':
    parser = get_parser()
    parser.add_argument(
        '-s',
        '--eval-split',
        type=str,
        default='quarter',
        choices=['full', 'quarter'],
        help='data split to evaluate on, either full or quarter',
    )
    args, _ = parser.parse_known_args()
    data_split = args.eval_split

    # NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
    # so we don't need to manage file uploading to OpenDevin's repo
    ml_bench = load_dataset('super-dainiu/ml-bench', split=data_split).to_pandas()
    id_column = 'instance_id'

    llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
    logger.info(f'Config for evaluation: {config}')

    metadata = make_metadata(
        llm_config,
        args.dataset_name,
        args.agent_cls,
        args.max_iterations,
        args.eval_note,
        args.eval_output_dir,
    )
    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
    instances = prepare_dataset(ml_bench, output_file, args.eval_n_limit, id_column)

    run_evaluation(
        instances,
        metadata,
        output_file,
        args.eval_num_workers,
        process_instance,
        id_column,
    )
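
    # Example invocation (a minimal sketch, not a definitive command: apart from `--eval-split`,
    # which is added above, the flag names are assumptions about the shared `get_parser()` CLI;
    # the script path and the `eval_gpt4` llm config section name are hypothetical placeholders):
    #
    #   python evaluation/ml_bench/run_infer.py \
    #       --agent-cls CodeActAgent \
    #       --llm-config eval_gpt4 \
    #       --max-iterations 10 \
    #       --eval-num-workers 4 \
    #       --eval-split quarter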