Просмотр исходного кода

[Agent, Eval] Fixes LLM config issue for delegation & Add eval to measure the delegation accuracy (#2948)

* fix json import

* pass llm to delegation action so that sub-agent shares the same llm for cost accum purpose

* add inference script for browser delegation

* add readme

* Update agenthub/codeact_agent/action_parser.py

Co-authored-by: Graham Neubig <neubig@gmail.com>

* revert action parser changes.

* Rework --llm-config CLI arg

* Revert "pass llm to delegation action so that sub-agent shares the same llm for cost accum purpose"

This reverts commit 81034c486e6fa5733f1fdb6c5222851d72dd0d0d.

* remove view summary

* update readme

* update comment

* update readme

---------

Co-authored-by: Graham Neubig <neubig@gmail.com>
Co-authored-by: Boxuan Li <liboxuan@connect.hku.hk>
Xingyao Wang 1 год назад
Родитель
Сommit
f45a2ff04e

+ 51 - 0
evaluation/browsing_delegation/README.md

@@ -0,0 +1,51 @@
+# Browsing Delegation Evaluation
+
+Some of OpenDevin's agents support the agent delegation action; for example, CodeActAgent can delegate browsing tasks to BrowsingAgent.
+
+This evaluation tests whether CodeActAgent can correctly delegate the instruction from WebArena and MiniWob benchmark to the BrowsingAgent.
+If so, the browsing performance upper-bound of CodeActAgent will be the performance of BrowsingAgent.
+
+
+## Setup Environment
+
+Please follow [this document](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md) to set up a local development environment for OpenDevin.
+
+## Configure OpenDevin and your LLM
+
+Create a `config.toml` file if it does not exist at the root of the workspace.
+
+Add the following configurations:
+
+```toml
+# TODO: Change these to the model you want to evaluate
+[llm.eval_gpt4_1106_preview_llm]
+model = "gpt-4-1106-preview"
+api_key = "XXX"
+temperature = 0.0
+
+[llm.eval_some_openai_compatible_model_llm]
+model = "openai/MODEL_NAME"
+base_url = "https://OPENAI_COMPATIBLE_URL/v1"
+api_key = "XXX"
+temperature = 0.0
+```
+
+## Run Inference
+
+```bash
+./evaluation/browsing_delegation/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit]
+# e.g., ./evaluation/browsing_delegation/scripts/run_infer.sh llm.eval_gpt4_1106_preview_llm HEAD CodeActAgent 300
+```
+
+where `model_config` and `git-version` are mandatory, while `agent` and `eval_limit` are optional.
+
+`model_config`, e.g. `llm.eval_gpt4_1106_preview_llm`, is the config group name for your
+LLM settings, as defined in your `config.toml`.
+
+`git-version`, e.g. `HEAD`, is the git commit hash of the OpenDevin version you would
+like to evaluate. It could also be a release tag like `0.6.2`.
+
+`agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks, defaulting
+to `CodeActAgent`.
+
+`eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit` instances.

+ 164 - 0
evaluation/browsing_delegation/run_infer.py

@@ -0,0 +1,164 @@
+import asyncio
+import logging
+import os
+import re
+
+import nltk
+import pandas as pd
+from datasets import load_dataset
+
+from evaluation.utils.shared import (
+    EvalMetadata,
+    make_metadata,
+    prepare_dataset,
+    run_evaluation,
+)
+from opendevin.controller.agent import Agent
+from opendevin.controller.state.state import State
+from opendevin.core.config import config, get_llm_config_arg, parse_arguments
+from opendevin.core.logger import get_console_handler
+from opendevin.core.logger import opendevin_logger as logger
+from opendevin.core.main import run_agent_controller
+from opendevin.llm.llm import LLM
+
+# Only CodeActAgent can delegate to BrowsingAgent
+SUPPORTED_AGENT_CLS = {'CodeActAgent'}
+
+
def process_instance(
    instance: pd.Series,
    metadata: EvalMetadata,
    reset_logger: bool = True,
):
    """Run a single delegation-eval instance and score the delegated query.

    Creates a fresh agent from ``metadata``, prompts it to delegate
    ``instance.instruction`` to a browser agent, then measures how faithfully
    the delegated task reproduces the original instruction (edit distance and
    exact match).

    Args:
        instance: Row with ``instance_id`` and ``instruction`` fields.
        metadata: Evaluation metadata (agent class, LLM config, output dir, ...).
        reset_logger: When True, redirect all logs to a per-instance file so
            parallel workers do not interleave their output.

    Returns:
        A JSON-serializable dict with the instruction, history, metrics,
        error (if any) and the delegation-accuracy test result.

    Raises:
        ValueError: If the agent controller returned no final state.
    """
    # Create the agent
    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
    env_id = instance.instance_id
    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
    if reset_logger:
        log_file = os.path.join(
            metadata.eval_output_dir, 'logs', f'instance_{env_id}.log'
        )
        # Remove all existing handlers, then print ONE console line that points
        # the operator at the per-instance log file.
        for handler in logger.handlers[:]:
            logger.removeHandler(handler)
        logger.addHandler(get_console_handler())
        logger.info(
            f'Starting evaluation for instance {env_id}.\nHint: run "tail -f {log_file}" to see live logs in a separate shell'
        )
        # Swap the console handler for a file handler so everything further
        # goes to the per-instance log only.
        for handler in logger.handlers[:]:
            logger.removeHandler(handler)
        file_handler = logging.FileHandler(log_file)
        file_handler.setFormatter(
            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        )
        logger.addHandler(file_handler)
    else:
        logger.info(f'Starting evaluation for instance {env_id}.')

    instruction = (
        f'You can delegate browsing tasks to a browser agent. '
        f"For example, for query 'Who is the president of the United States?', you can delegate the task to a browser agent via <execute_browse> Who is the president of the United States? </execute_browse>.\n"
        f'Now, solve the following query: "{instance.instruction}"\n'
        f'NOTE: You should copy the "query" as is into the <execute_browse> tag. DO NOT change ANYTHING in the query.'
    )

    state: State | None = asyncio.run(
        run_agent_controller(
            agent,
            instruction,
            max_iterations=metadata.max_iterations,
            sid=env_id,
        )
    )

    # ======= Attempt to evaluate the agent's environment impact =======

    # If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
    # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.

    if state is None:
        raise ValueError('State should not be None.')

    metrics = state.metrics.get() if state.metrics else None
    # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
    # for compatibility with the existing output format, we can remake the pairs here
    # remove when it becomes unnecessary
    histories = state.history.compatibility_for_eval_history_pairs()

    # Score the LAST delegate action seen: compare the delegated task against
    # the original instruction.
    last_delegate_action = None
    result = {}
    for action, _ in histories:
        if action['action'] == 'delegate':
            last_delegate_action = action
            instruction_for_delegate = action['args']['inputs']['task']
            # The delegated task is formatted as
            #   f'{thought}. I should start with: {browse_actions}'
            # so strip everything up to the marker before comparing.
            # FIX: re.search returns None when the marker is absent; the old
            # unconditional .group(1) raised AttributeError in that case.
            # Fall back to comparing the raw task string instead.
            match = re.search(
                r'I should start with: (.*)', instruction_for_delegate
            )
            if match is not None:
                instruction_for_delegate = match.group(1)

            # Edit distance and exact-match between the original instruction
            # and what was actually delegated.
            result['edit_distance'] = nltk.edit_distance(
                instance.instruction, instruction_for_delegate
            )
            result['is_exact_match'] = (
                instance.instruction.strip() == instruction_for_delegate.strip()
            )

    # Save the output
    output = {
        'instance_id': env_id,
        'instruction': instruction,
        'metadata': metadata.model_dump(),
        'history': histories,
        'metrics': metrics,
        'error': state.last_error if state and state.last_error else None,
        'test_result': {
            'query': instance.instruction,
            'action': last_delegate_action,
            'result': result,
        },
    }

    return output
+
+
if __name__ == '__main__':
    # CLI entry point: run the browsing-delegation evaluation end to end.
    args = parse_arguments()

    # Pull the evaluation instructions from the HuggingFace hub and use the
    # 'train' split as a DataFrame.
    dataset = load_dataset('OpenDevin/eval-browsing-instructions')
    dataset = dataset['train'].to_pandas()
    # process_instance relies on exactly these two columns.
    assert dataset.columns.tolist() == ['instance_id', 'instruction']
    id_column = 'instance_id'
    # Use the LLM config group named via --llm-config, falling back to the
    # default [llm] section of config.toml.
    llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
    # NOTE(review): this logs the full config object, which may include API
    # keys — confirm secrets are redacted before running in shared logs.
    logger.info(f'Config for evaluation: {config}')

    metadata = make_metadata(
        llm_config,
        'browsing_delegation',
        args.agent_cls,
        args.max_iterations,
        args.eval_note,
        args.eval_output_dir,
    )
    # Only agents that can delegate to BrowsingAgent make sense for this eval.
    if metadata.agent_class not in SUPPORTED_AGENT_CLS:
        raise ValueError(
            f'Agent class {metadata.agent_class} not supported with AgentDelegation.'
        )

    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
    # presumably filters already-finished instances in output.jsonl and applies
    # --eval-n-limit — verify against prepare_dataset in evaluation.utils.shared
    instances = prepare_dataset(dataset, output_file, args.eval_n_limit, id_column)
    # Fan process_instance out over --eval-num-workers workers, appending
    # results to output.jsonl.
    run_evaluation(
        instances,
        metadata,
        output_file,
        args.eval_num_workers,
        process_instance,
        id_column,
    )

+ 45 - 0
evaluation/browsing_delegation/scripts/run_infer.sh

@@ -0,0 +1,45 @@
#!/bin/bash
# Run the browsing-delegation evaluation (evaluation/browsing_delegation/run_infer.py).
#
# Usage:
#   ./evaluation/browsing_delegation/scripts/run_infer.sh <model_config> <git-version> [agent] [eval_limit] [num_workers]
set -eo pipefail

source "evaluation/utils/version_control.sh"

# Positional arguments; see the README next to this script for details.
MODEL_CONFIG=$1
COMMIT_HASH=$2
AGENT=$3
EVAL_LIMIT=$4
NUM_WORKERS=$5

# Default to a single worker when not specified.
if [ -z "$NUM_WORKERS" ]; then
  NUM_WORKERS=1
  echo "Number of workers not specified, use default $NUM_WORKERS"
fi
# Check out the OpenDevin version to evaluate — presumably consumes
# COMMIT_HASH; defined in version_control.sh.
checkout_eval_branch

if [ -z "$AGENT" ]; then
  echo "Agent not specified, use default CodeActAgent"
  AGENT="CodeActAgent"
fi

# Sets AGENT_VERSION for the chosen agent (defined in version_control.sh).
get_agent_version

echo "AGENT: $AGENT"
echo "AGENT_VERSION: $AGENT_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"

# Tag the output with the agent version so runs are distinguishable.
EVAL_NOTE="$AGENT_VERSION"

# --max-iterations 1: the eval only checks whether the agent's action is a
# correct delegation, so one step suffices.
COMMAND="poetry run python evaluation/browsing_delegation/run_infer.py \
  --agent-cls $AGENT \
  --llm-config $MODEL_CONFIG \
  --max-iterations 1 \
  --max-chars 10000000 \
  --eval-num-workers $NUM_WORKERS \
  --eval-note $EVAL_NOTE"

# Optionally limit to the first EVAL_LIMIT instances.
if [ -n "$EVAL_LIMIT" ]; then
  echo "EVAL_LIMIT: $EVAL_LIMIT"
  COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
fi

# Run the command
eval $COMMAND

+ 1 - 1
opendevin/events/stream.py

@@ -1,11 +1,11 @@
 import asyncio
 import asyncio
-import json
 import threading
 import threading
 from datetime import datetime
 from datetime import datetime
 from enum import Enum
 from enum import Enum
 from typing import Callable, Iterable
 from typing import Callable, Iterable
 
 
 from opendevin.core.logger import opendevin_logger as logger
 from opendevin.core.logger import opendevin_logger as logger
+from opendevin.core.utils import json
 from opendevin.events.serialization.event import event_from_dict, event_to_dict
 from opendevin.events.serialization.event import event_from_dict, event_to_dict
 from opendevin.storage import FileStore, get_file_store
 from opendevin.storage import FileStore, get_file_store