Просмотр исходного кода

[Agent, Eval] Fixes LLM config issue for delegation & Add eval to measure the delegation accuracy (#2948)

* fix json import

* pass llm to delegation action so that sub-agent shares the same llm for cost accum purpose

* add inference script for browser delegation

* add readme

* Update agenthub/codeact_agent/action_parser.py

Co-authored-by: Graham Neubig <neubig@gmail.com>

* revert action parser changes.

* Rework --llm-config CLI arg

* Revert "pass llm to delegation action so that sub-agent shares the same llm for cost accum purpose"

This reverts commit 81034c486e6fa5733f1fdb6c5222851d72dd0d0d.

* remove view summary

* update readme

* update comment

* update readme

---------

Co-authored-by: Graham Neubig <neubig@gmail.com>
Co-authored-by: Boxuan Li <liboxuan@connect.hku.hk>
Xingyao Wang 1 год назад
Родитель
Сommit
f45a2ff04e

+ 51 - 0
evaluation/browsing_delegation/README.md

@@ -0,0 +1,51 @@
+# Browsing Delegation Evaluation
+
+Some of OpenDevin's agents support the agent delegation action; for example, CodeActAgent can delegate browsing tasks to BrowsingAgent.
+
+This evaluation tests whether CodeActAgent can correctly delegate the instruction from WebArena and MiniWob benchmark to the BrowsingAgent.
+If so, the browsing performance upper-bound of CodeActAgent will be the performance of BrowsingAgent.
+
+
+## Setup Environment
+
+Please follow [this document](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md) to set up a local development environment for OpenDevin.
+
+## Configure OpenDevin and your LLM
+
+Create a `config.toml` file if it does not exist at the root of the workspace.
+
+Add the following configurations:
+
+```toml
+# TODO: Change these to the model you want to evaluate
+[llm.eval_gpt4_1106_preview_llm]
+model = "gpt-4-1106-preview"
+api_key = "XXX"
+temperature = 0.0
+
+[llm.eval_some_openai_compatible_model_llm]
+model = "openai/MODEL_NAME"
+base_url = "https://OPENAI_COMPATIBLE_URL/v1"
+api_key = "XXX"
+temperature = 0.0
+```
+
+## Run Inference
+
+```bash
+./evaluation/browsing_delegation/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit]
+# e.g., ./evaluation/browsing_delegation/scripts/run_infer.sh llm.eval_gpt4_1106_preview_llm HEAD CodeActAgent 300
+```
+
+where `model_config` and `git-version` are mandatory, while `agent` and `eval_limit` are optional.
+
+`model_config`, e.g. `llm.eval_gpt4_1106_preview_llm`, is the config group name for your
+LLM settings, as defined in your `config.toml`.
+
+`git-version`, e.g. `HEAD`, is the git commit hash of the OpenDevin version you would
+like to evaluate. It could also be a release tag like `0.6.2`.
+
+`agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks, defaulting
+to `CodeActAgent`.
+
+`eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit` instances.

+ 164 - 0
evaluation/browsing_delegation/run_infer.py

@@ -0,0 +1,164 @@
+import asyncio
+import logging
+import os
+import re
+
+import nltk
+import pandas as pd
+from datasets import load_dataset
+
+from evaluation.utils.shared import (
+    EvalMetadata,
+    make_metadata,
+    prepare_dataset,
+    run_evaluation,
+)
+from opendevin.controller.agent import Agent
+from opendevin.controller.state.state import State
+from opendevin.core.config import config, get_llm_config_arg, parse_arguments
+from opendevin.core.logger import get_console_handler
+from opendevin.core.logger import opendevin_logger as logger
+from opendevin.core.main import run_agent_controller
+from opendevin.llm.llm import LLM
+
+# Only CodeActAgent can delegate to BrowsingAgent
+SUPPORTED_AGENT_CLS = {'CodeActAgent'}
+
+
def process_instance(
    instance: pd.Series,
    metadata: EvalMetadata,
    reset_logger: bool = True,
):
    """Run a single delegation-eval instance and score the delegated query.

    Creates a fresh agent from ``metadata``, prompts it to delegate
    ``instance.instruction`` to a browser agent, then measures how faithfully
    the delegated task reproduces the original instruction (edit distance and
    exact match).

    Args:
        instance: Row with ``instance_id`` and ``instruction`` fields.
        metadata: Evaluation metadata (agent class, LLM config, output dir, ...).
        reset_logger: When True, redirect all logs to a per-instance file so
            parallel workers do not interleave their output.

    Returns:
        A JSON-serializable dict with the instruction, history, metrics,
        error (if any) and the delegation-accuracy test result.

    Raises:
        ValueError: If the agent controller returned no final state.
    """
    # Create the agent
    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
    env_id = instance.instance_id
    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
    if reset_logger:
        log_file = os.path.join(
            metadata.eval_output_dir, 'logs', f'instance_{env_id}.log'
        )
        # Remove all existing handlers, then print ONE console line that points
        # the operator at the per-instance log file.
        for handler in logger.handlers[:]:
            logger.removeHandler(handler)
        logger.addHandler(get_console_handler())
        logger.info(
            f'Starting evaluation for instance {env_id}.\nHint: run "tail -f {log_file}" to see live logs in a separate shell'
        )
        # Swap the console handler for a file handler so everything further
        # goes to the per-instance log only.
        for handler in logger.handlers[:]:
            logger.removeHandler(handler)
        file_handler = logging.FileHandler(log_file)
        file_handler.setFormatter(
            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        )
        logger.addHandler(file_handler)
    else:
        logger.info(f'Starting evaluation for instance {env_id}.')

    instruction = (
        f'You can delegate browsing tasks to a browser agent. '
        f"For example, for query 'Who is the president of the United States?', you can delegate the task to a browser agent via <execute_browse> Who is the president of the United States? </execute_browse>.\n"
        f'Now, solve the following query: "{instance.instruction}"\n'
        f'NOTE: You should copy the "query" as is into the <execute_browse> tag. DO NOT change ANYTHING in the query.'
    )

    state: State | None = asyncio.run(
        run_agent_controller(
            agent,
            instruction,
            max_iterations=metadata.max_iterations,
            sid=env_id,
        )
    )

    # ======= Attempt to evaluate the agent's environment impact =======

    # If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
    # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.

    if state is None:
        raise ValueError('State should not be None.')

    metrics = state.metrics.get() if state.metrics else None
    # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
    # for compatibility with the existing output format, we can remake the pairs here
    # remove when it becomes unnecessary
    histories = state.history.compatibility_for_eval_history_pairs()

    # Score the LAST delegate action seen: compare the delegated task against
    # the original instruction.
    last_delegate_action = None
    result = {}
    for action, _ in histories:
        if action['action'] == 'delegate':
            last_delegate_action = action
            instruction_for_delegate = action['args']['inputs']['task']
            # The delegated task is formatted as
            #   f'{thought}. I should start with: {browse_actions}'
            # so strip everything up to the marker before comparing.
            # FIX: re.search returns None when the marker is absent; the old
            # unconditional .group(1) raised AttributeError in that case.
            # Fall back to comparing the raw task string instead.
            match = re.search(
                r'I should start with: (.*)', instruction_for_delegate
            )
            if match is not None:
                instruction_for_delegate = match.group(1)

            # Edit distance and exact-match between the original instruction
            # and what was actually delegated.
            result['edit_distance'] = nltk.edit_distance(
                instance.instruction, instruction_for_delegate
            )
            result['is_exact_match'] = (
                instance.instruction.strip() == instruction_for_delegate.strip()
            )

    # Save the output
    output = {
        'instance_id': env_id,
        'instruction': instruction,
        'metadata': metadata.model_dump(),
        'history': histories,
        'metrics': metrics,
        'error': state.last_error if state and state.last_error else None,
        'test_result': {
            'query': instance.instruction,
            'action': last_delegate_action,
            'result': result,
        },
    }

    return output
+
+
if __name__ == '__main__':
    # CLI entry point: run the browsing-delegation evaluation end to end.
    args = parse_arguments()

    # Pull the evaluation instructions from the HuggingFace hub and use the
    # 'train' split as a DataFrame.
    dataset = load_dataset('OpenDevin/eval-browsing-instructions')
    dataset = dataset['train'].to_pandas()
    # process_instance relies on exactly these two columns.
    assert dataset.columns.tolist() == ['instance_id', 'instruction']
    id_column = 'instance_id'
    # Use the LLM config group named via --llm-config, falling back to the
    # default [llm] section of config.toml.
    llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
    # NOTE(review): this logs the full config object, which may include API
    # keys — confirm secrets are redacted before running in shared logs.
    logger.info(f'Config for evaluation: {config}')

    metadata = make_metadata(
        llm_config,
        'browsing_delegation',
        args.agent_cls,
        args.max_iterations,
        args.eval_note,
        args.eval_output_dir,
    )
    # Only agents that can delegate to BrowsingAgent make sense for this eval.
    if metadata.agent_class not in SUPPORTED_AGENT_CLS:
        raise ValueError(
            f'Agent class {metadata.agent_class} not supported with AgentDelegation.'
        )

    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
    # presumably filters already-finished instances in output.jsonl and applies
    # --eval-n-limit — verify against prepare_dataset in evaluation.utils.shared
    instances = prepare_dataset(dataset, output_file, args.eval_n_limit, id_column)
    # Fan process_instance out over --eval-num-workers workers, appending
    # results to output.jsonl.
    run_evaluation(
        instances,
        metadata,
        output_file,
        args.eval_num_workers,
        process_instance,
        id_column,
    )

+ 45 - 0
evaluation/browsing_delegation/scripts/run_infer.sh

@@ -0,0 +1,45 @@
#!/bin/bash
# Run the browsing-delegation evaluation (evaluation/browsing_delegation/run_infer.py).
#
# Usage:
#   ./evaluation/browsing_delegation/scripts/run_infer.sh <model_config> <git-version> [agent] [eval_limit] [num_workers]
set -eo pipefail

source "evaluation/utils/version_control.sh"

# Positional arguments; see the README next to this script for details.
MODEL_CONFIG=$1
COMMIT_HASH=$2
AGENT=$3
EVAL_LIMIT=$4
NUM_WORKERS=$5

# Default to a single worker when not specified.
if [ -z "$NUM_WORKERS" ]; then
  NUM_WORKERS=1
  echo "Number of workers not specified, use default $NUM_WORKERS"
fi
# Check out the OpenDevin version to evaluate — presumably consumes
# COMMIT_HASH; defined in version_control.sh.
checkout_eval_branch

if [ -z "$AGENT" ]; then
  echo "Agent not specified, use default CodeActAgent"
  AGENT="CodeActAgent"
fi

# Sets AGENT_VERSION for the chosen agent (defined in version_control.sh).
get_agent_version

echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"

# Tag the output with the agent version so runs are distinguishable.
EVAL_NOTE="$AGENT_VERSION"

# --max-iterations 1: the eval only checks whether the agent's action is a
# correct delegation, so one step suffices.
COMMAND="poetry run python evaluation/browsing_delegation/run_infer.py \
  --agent-cls $AGENT \
  --llm-config $MODEL_CONFIG \
  --max-iterations 1 \
  --max-chars 10000000 \
  --eval-num-workers $NUM_WORKERS \
  --eval-note $EVAL_NOTE"

# Optionally limit to the first EVAL_LIMIT instances.
if [ -n "$EVAL_LIMIT" ]; then
  echo "EVAL_LIMIT: $EVAL_LIMIT"
  COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
fi

# Run the command
eval $COMMAND

+ 1 - 1
opendevin/events/stream.py

@@ -1,11 +1,11 @@
 import asyncio
 import asyncio
-import json
 import threading
 import threading
 from datetime import datetime
 from datetime import datetime
 from enum import Enum
 from enum import Enum
 from typing import Callable, Iterable
 from typing import Callable, Iterable
 
 
 from opendevin.core.logger import opendevin_logger as logger
 from opendevin.core.logger import opendevin_logger as logger
+from opendevin.core.utils import json
 from opendevin.events.serialization.event import event_from_dict, event_to_dict
 from opendevin.events.serialization.event import event_from_dict, event_to_dict
 from opendevin.storage import FileStore, get_file_store
 from opendevin.storage import FileStore, get_file_store