Browse Source

[Agent, Eval] Fix LLM config issue for delegation & add eval to measure delegation accuracy (#2948)

* fix json import

* pass llm to delegation action so that sub-agent shares the same llm for cost accum purpose

* add inference script for browser delegation

* add readme

* Update agenthub/codeact_agent/action_parser.py

Co-authored-by: Graham Neubig <neubig@gmail.com>

* revert action parser changes.

* Rework --llm-config CLI arg

* Revert "pass llm to delegation action so that sub-agent shares the same llm for cost accum purpose"

This reverts commit 81034c486e6fa5733f1fdb6c5222851d72dd0d0d.

* remove view summary

* update readme

* update comment

* update readme

---------

Co-authored-by: Graham Neubig <neubig@gmail.com>
Co-authored-by: Boxuan Li <liboxuan@connect.hku.hk>
Xingyao Wang 1 year ago
parent
commit
f45a2ff04e

+ 51 - 0
evaluation/browsing_delegation/README.md

@@ -0,0 +1,51 @@
+# Browsing Delegation Evaluation
+
+Some of OpenDevin's agents support the agent delegation action; for example, CodeActAgent can delegate browsing tasks to BrowsingAgent.
+
+This evaluation tests whether CodeActAgent can correctly delegate instructions from the WebArena and MiniWoB benchmarks to BrowsingAgent.
+If so, CodeActAgent's browsing performance upper bound is the performance of BrowsingAgent.
+
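+The harness scores delegation fidelity by comparing the query that actually reaches BrowsingAgent against the original instruction. A minimal sketch of that check with a hypothetical helper name, mirroring the logic in `run_infer.py` (which uses `nltk` for the distance metric):
+
+```python
+import nltk
+
+
+def delegation_fidelity(original: str, delegated: str) -> dict:
+    """Hypothetical helper: compare the instruction the sub-agent received with the original."""
+    return {
+        # Levenshtein distance: 0 means the query was copied verbatim
+        'edit_distance': nltk.edit_distance(original, delegated),
+        'is_exact_match': original.strip() == delegated.strip(),
+    }
+```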
+
+## Setup Environment
+
+Please follow [this document](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md) to set up a local development environment for OpenDevin.
+
+## Configure OpenDevin and your LLM
+
+Create a `config.toml` file at the root of the workspace if it does not already exist.
+
+Add the following configurations:
+
+```toml
+# TODO: Change these to the model you want to evaluate
+[llm.eval_gpt4_1106_preview_llm]
+model = "gpt-4-1106-preview"
+api_key = "XXX"
+temperature = 0.0
+
+[llm.eval_some_openai_compatible_model_llm]
+model = "openai/MODEL_NAME"
+base_url = "https://OPENAI_COMPATIBLE_URL/v1"
+api_key = "XXX"
+temperature = 0.0
+```
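+
+At run time, the `--llm-config` CLI arg selects one of these named groups. A rough sketch of the resolution under a plain `tomllib` parse, with a hypothetical function name (the real lookup is `get_llm_config_arg` in `opendevin.core.config`):
+
+```python
+import tomllib  # Python 3.11+
+
+
+def resolve_llm_config(name: str, path: str = 'config.toml') -> dict:
+    """Hypothetical sketch: return the [llm.<name>] table from config.toml."""
+    with open(path, 'rb') as f:
+        cfg = tomllib.load(f)
+    # accept either 'llm.eval_..._llm' or the bare group name
+    return cfg['llm'][name.removeprefix('llm.')]
+```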
+
+## Run Inference
+
+```bash
+./evaluation/browsing_delegation/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [num_workers]
+# e.g., ./evaluation/browsing_delegation/scripts/run_infer.sh llm.eval_gpt4_1106_preview_llm HEAD CodeActAgent 300
+```
+
+where `model_config` and `git-version` are mandatory, while the remaining arguments are optional (`num_workers` defaults to `1`).
+
+`model_config`, e.g. `llm.eval_gpt4_1106_preview_llm`, is the config group name for your
+LLM settings, as defined in your `config.toml`.
+
+`git-version`, e.g. `HEAD`, is the git commit hash of the OpenDevin version you would
+like to evaluate. It could also be a release tag like `0.6.2`.
+
+`agent`, e.g. `CodeActAgent`, is the name of the agent to evaluate, defaulting
+to `CodeActAgent` (currently the only agent supported by this benchmark).
+
+`eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit` instances.
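+
+Results are written to `output.jsonl` under the eval output directory. A hedged snippet for aggregating delegation accuracy from it (field names follow `run_infer.py`; the path is a placeholder for your actual run directory):
+
+```python
+import pandas as pd
+
+# hypothetical path: substitute your actual eval output directory
+df = pd.read_json('output.jsonl', lines=True)
+results = df['test_result'].apply(lambda t: t['result'])
+exact = results.apply(lambda r: r.get('is_exact_match', False))
+print(f'exact-match rate: {exact.mean():.2%}')
+```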

+ 164 - 0
evaluation/browsing_delegation/run_infer.py

@@ -0,0 +1,164 @@
+import asyncio
+import logging
+import os
+import re
+
+import nltk
+import pandas as pd
+from datasets import load_dataset
+
+from evaluation.utils.shared import (
+    EvalMetadata,
+    make_metadata,
+    prepare_dataset,
+    run_evaluation,
+)
+from opendevin.controller.agent import Agent
+from opendevin.controller.state.state import State
+from opendevin.core.config import config, get_llm_config_arg, parse_arguments
+from opendevin.core.logger import get_console_handler
+from opendevin.core.logger import opendevin_logger as logger
+from opendevin.core.main import run_agent_controller
+from opendevin.llm.llm import LLM
+
+# Only CodeActAgent can delegate to BrowsingAgent
+SUPPORTED_AGENT_CLS = {'CodeActAgent'}
+
+
+def process_instance(
+    instance: pd.Series,
+    metadata: EvalMetadata,
+    reset_logger: bool = True,
+):
+    # Create the agent
+    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
+    env_id = instance.instance_id
+    # Set up the logger properly so that multiprocessing can parallelize the evaluation
+    if reset_logger:
+        # Set up logger
+        log_file = os.path.join(
+            metadata.eval_output_dir, 'logs', f'instance_{env_id}.log'
+        )
+        # Remove all existing handlers from logger
+        for handler in logger.handlers[:]:
+            logger.removeHandler(handler)
+        # add back the console handler to print ONE line
+        logger.addHandler(get_console_handler())
+        logger.info(
+            f'Starting evaluation for instance {env_id}.\nHint: run "tail -f {log_file}" to see live logs in a separate shell'
+        )
+        # Remove the console handler again so the rest of this instance's logs go to the file only
+        for handler in logger.handlers[:]:
+            logger.removeHandler(handler)
+        file_handler = logging.FileHandler(log_file)
+        file_handler.setFormatter(
+            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
+        )
+        logger.addHandler(file_handler)
+    else:
+        logger.info(f'Starting evaluation for instance {env_id}.')
+
+    instruction = (
+        f'You can delegate browsing tasks to a browser agent. '
+        f"For example, for query 'Who is the president of the United States?', you can delegate the task to a browser agent via <execute_browse> Who is the president of the United States? </execute_browse>.\n"
+        f'Now, solve the following query: "{instance.instruction}"\n'
+        f'NOTE: You should copy the "query" as is into the <execute_browse> tag. DO NOT change ANYTHING in the query.'
+    )
+
+    state: State | None = asyncio.run(
+        run_agent_controller(
+            agent,
+            instruction,
+            max_iterations=metadata.max_iterations,
+            sid=env_id,
+        )
+    )
+
+    # ======= Evaluate the delegation accuracy =======
+
+    # This benchmark does not evaluate the final model output; instead, we look for
+    # the last `delegate` action in `state.history` and compare its task string
+    # against the original query.
+
+    if state is None:
+        raise ValueError('State should not be None.')
+
+    metrics = state.metrics.get() if state.metrics else None
+    # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
+    # for compatibility with the existing output format, we can remake the pairs here
+    # remove when it becomes unnecessary
+    histories = state.history.compatibility_for_eval_history_pairs()
+
+    # find the last delegate action
+    last_delegate_action = None
+    result = {}
+    for action, _ in histories:
+        if action['action'] == 'delegate':
+            last_delegate_action = action
+            instruction_for_delegate = action['args']['inputs']['task']
+            # parse `browse_actions` from `instruction_for_delegate`
+            # task = f'{thought}. I should start with: {browse_actions}'
+            match = re.search(
+                r'I should start with: (.*)', instruction_for_delegate
+            )
+            # Guard against tasks that do not follow the expected format
+            if match is not None:
+                instruction_for_delegate = match.group(1)
+
+            # calculate the edit distance between the instance.instruction and the instruction_for_delegate
+            edit_distance = nltk.edit_distance(
+                instance.instruction, instruction_for_delegate
+            )
+            is_exact_match = (
+                instance.instruction.strip() == instruction_for_delegate.strip()
+            )
+            result['edit_distance'] = edit_distance
+            result['is_exact_match'] = is_exact_match
+
+    # Save the output
+    output = {
+        'instance_id': env_id,
+        'instruction': instruction,
+        'metadata': metadata.model_dump(),
+        'history': histories,
+        'metrics': metrics,
+        'error': state.last_error if state and state.last_error else None,
+        'test_result': {
+            'query': instance.instruction,
+            'action': last_delegate_action,
+            'result': result,
+        },
+    }
+
+    return output
+
+
+if __name__ == '__main__':
+    args = parse_arguments()
+
+    dataset = load_dataset('OpenDevin/eval-browsing-instructions')
+    dataset = dataset['train'].to_pandas()
+    assert dataset.columns.tolist() == ['instance_id', 'instruction']
+    id_column = 'instance_id'
+    llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
+    logger.info(f'Config for evaluation: {config}')
+
+    metadata = make_metadata(
+        llm_config,
+        'browsing_delegation',
+        args.agent_cls,
+        args.max_iterations,
+        args.eval_note,
+        args.eval_output_dir,
+    )
+    if metadata.agent_class not in SUPPORTED_AGENT_CLS:
+        raise ValueError(
+            f'Agent class {metadata.agent_class} does not support AgentDelegation.'
+        )
+
+    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
+    instances = prepare_dataset(dataset, output_file, args.eval_n_limit, id_column)
+    run_evaluation(
+        instances,
+        metadata,
+        output_file,
+        args.eval_num_workers,
+        process_instance,
+        id_column,
+    )
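
For reference, the delegate action this script matches on has roughly the following shape in the serialized history (inferred from the parsing code above, with hypothetical values; this is not a canonical event schema):

```python
# Sketch of the history entry run_infer.py looks for (inferred, hypothetical values):
delegate_action = {
    'action': 'delegate',
    'args': {
        'inputs': {
            # CodeActAgent composes the task as f'{thought}. I should start with: {browse_actions}'
            'task': 'I need to browse. I should start with: Who is the president of the United States?',
        }
    },
}
```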

+ 45 - 0
evaluation/browsing_delegation/scripts/run_infer.sh

@@ -0,0 +1,45 @@
+#!/bin/bash
+set -eo pipefail
+
+source "evaluation/utils/version_control.sh"
+
+MODEL_CONFIG=$1
+COMMIT_HASH=$2
+AGENT=$3
+EVAL_LIMIT=$4
+NUM_WORKERS=$5
+
+if [ -z "$NUM_WORKERS" ]; then
+  NUM_WORKERS=1
+  echo "Number of workers not specified, use default $NUM_WORKERS"
+fi
+checkout_eval_branch
+
+if [ -z "$AGENT" ]; then
+  echo "Agent not specified, use default CodeActAgent"
+  AGENT="CodeActAgent"
+fi
+
+get_agent_version
+
+echo "AGENT: $AGENT"
+echo "AGENT_VERSION: $AGENT_VERSION"
+echo "MODEL_CONFIG: $MODEL_CONFIG"
+
+EVAL_NOTE="$AGENT_VERSION"
+
+COMMAND="poetry run python evaluation/browsing_delegation/run_infer.py \
+  --agent-cls $AGENT \
+  --llm-config $MODEL_CONFIG \
+  --max-iterations 1 \
+  --max-chars 10000000 \
+  --eval-num-workers $NUM_WORKERS \
+  --eval-note $EVAL_NOTE"
+
+if [ -n "$EVAL_LIMIT" ]; then
+  echo "EVAL_LIMIT: $EVAL_LIMIT"
+  COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
+fi
+
+# Run the command
+eval $COMMAND

+ 1 - 1
opendevin/events/stream.py

@@ -1,11 +1,11 @@
 import asyncio
-import json
 import threading
 from datetime import datetime
 from enum import Enum
 from typing import Callable, Iterable
 
 from opendevin.core.logger import opendevin_logger as logger
+from opendevin.core.utils import json
 from opendevin.events.serialization.event import event_from_dict, event_to_dict
 from opendevin.storage import FileStore, get_file_store
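
The replacement import points at a project-local wrapper rather than the stdlib module. A hypothetical sketch of what such a shim typically looks like, assuming its main job is a datetime-aware encoder for event serialization (the actual `opendevin.core.utils.json` may differ):

```python
# Hypothetical json shim in the spirit of opendevin.core.utils.json:
# stdlib-compatible dumps/loads plus a default encoder for types that
# appear in events (e.g. datetime), so call sites keep using json.dumps(...).
import json as _json
from datetime import datetime


def _default(obj):
    if isinstance(obj, datetime):
        return obj.isoformat()
    raise TypeError(f'Object of type {type(obj).__name__} is not JSON serializable')


def dumps(obj, **kwargs):
    return _json.dumps(obj, default=_default, **kwargs)


loads = _json.loads
```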