import asyncio
import logging
import os
import re
import nltk
import pandas as pd
from datasets import load_dataset
from evaluation.utils.shared import (
EvalMetadata,
make_metadata,
prepare_dataset,
run_evaluation,
)
from opendevin.controller.agent import Agent
from opendevin.controller.state.state import State
from opendevin.core.config import config, get_llm_config_arg, parse_arguments
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import run_agent_controller
from opendevin.llm.llm import LLM
# Agent classes this benchmark accepts: only CodeActAgent can delegate to
# BrowsingAgent, so any other agent class is rejected before the run starts.
SUPPORTED_AGENT_CLS: set[str] = {'CodeActAgent'}
# Marker the delegating agent is expected to put in front of the delegated
# query: task = f'{thought}. I should start with: {browse_actions}'
_DELEGATE_QUERY_RE = re.compile(r'I should start with: (.*)')


def _redirect_logger_to_instance_file(log_file: str, env_id) -> None:
    """Print one console hint, then send all further logs to `log_file`."""
    # Remove all existing handlers from logger
    for handler in logger.handlers[:]:
        logger.removeHandler(handler)
    # add back the console handler to print ONE line
    logger.addHandler(get_console_handler())
    logger.info(
        f'Starting evaluation for instance {env_id}.\nHint: run "tail -f {log_file}" to see live logs in a separate shell'
    )
    # Remove all existing handlers from logger (drops the console handler again)
    for handler in logger.handlers[:]:
        logger.removeHandler(handler)
    # FileHandler opens the file right away, so its directory must exist first.
    os.makedirs(os.path.dirname(log_file), exist_ok=True)
    file_handler = logging.FileHandler(log_file)
    file_handler.setFormatter(
        logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    )
    logger.addHandler(file_handler)


def _extract_delegated_query(task: str) -> str:
    """Return the text after 'I should start with: ' in `task`.

    The capture stops at the end of the line containing the marker, because
    `.` does not match newlines here. If the marker is missing, the whole
    task is returned so the caller can still score it instead of crashing
    on a `None` match.
    """
    match = _DELEGATE_QUERY_RE.search(task)
    if match is None:
        logger.warning(
            'Delegate task does not contain "I should start with: "; '
            'comparing against the full task text instead.'
        )
        return task
    return match.group(1)


def process_instance(
    instance: pd.Series,
    metadata: EvalMetadata,
    reset_logger: bool = True,
) -> dict:
    """Run the agent on one instance and score the query it delegated.

    The agent is asked to hand `instance.instruction` to a browser agent
    verbatim. The last `delegate` action in the returned history is compared
    with the original instruction.

    Args:
        instance: Dataset row; `instance_id` and `instruction` are read.
        metadata: Evaluation settings; `agent_class`, `llm_config`,
            `eval_output_dir` and `max_iterations` are read.
        reset_logger: When true, replace the logger's handlers with a
            per-instance file handler under `<eval_output_dir>/logs/`, so
            parallel workers do not interleave their output.

    Returns:
        A dict with the instance id, the prompt, dumped metadata, the history
        pairs, metrics, the last error (or None) and a `test_result` entry.
        `test_result` holds the original query, the last delegate action
        (or None) and a `result` dict with `edit_distance` and
        `is_exact_match`. `result` is empty when nothing was delegated.

    Raises:
        ValueError: If the agent controller returns no state.
    """
    # Create the agent
    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
    env_id = instance.instance_id

    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
    if reset_logger:
        log_file = os.path.join(
            metadata.eval_output_dir, 'logs', f'instance_{env_id}.log'
        )
        _redirect_logger_to_instance_file(log_file, env_id)
    else:
        logger.info(f'Starting evaluation for instance {env_id}.')

    # NOTE(review): the prompt refers to "the tag" but no tag name appears in
    # these literals; markup may have been stripped from the string at some
    # point. Left unchanged here; confirm against the delegation syntax the
    # agent actually expects.
    instruction = (
        f'You can delegate browsing tasks to a browser agent. '
        f"For example, for query 'Who is the president of the United States?', you can delegate the task to a browser agent via Who is the president of the United States? .\n"
        f'Now, solve the following query: "{instance.instruction}"\n'
        f'NOTE: You should copy the "query" as is into the tag. DO NOT change ANYTHING in the query.'
    )

    state: State | None = asyncio.run(
        run_agent_controller(
            agent,
            instruction,
            max_iterations=metadata.max_iterations,
            sid=env_id,
        )
    )

    # ======= Attempt to evaluate the agent's environment impact =======
    # If you are working on a simpler benchmark that only evaluates the final model output (e.g., in a MessageAction),
    # you can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
    if state is None:
        raise ValueError('State should not be None.')

    metrics = state.metrics.get() if state.metrics else None

    # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
    # for compatibility with the existing output format, we can remake the pairs here
    # remove when it becomes unnecessary
    histories = state.history.compatibility_for_eval_history_pairs()

    # find the last delegate action
    last_delegate_action = None
    for action, _ in histories:
        if action['action'] == 'delegate':
            last_delegate_action = action

    # Only the last delegation is reported, so score it once rather than
    # recomputing the edit distance for every delegate action in the history.
    result = {}
    if last_delegate_action is not None:
        instruction_for_delegate = _extract_delegated_query(
            last_delegate_action['args']['inputs']['task']
        )
        # calculate the edit distance between the instance.instruction and the instruction_for_delegate
        result['edit_distance'] = nltk.edit_distance(
            instance.instruction, instruction_for_delegate
        )
        result['is_exact_match'] = (
            instance.instruction.strip() == instruction_for_delegate.strip()
        )

    # Save the output
    output = {
        'instance_id': env_id,
        'instruction': instruction,
        'metadata': metadata.model_dump(),
        'history': histories,
        'metrics': metrics,
        # `state` is known to be non-None here; a falsy last_error is stored as None.
        'error': state.last_error if state.last_error else None,
        'test_result': {
            'query': instance.instruction,
            'action': last_delegate_action,
            'result': result,
        },
    }
    return output
if __name__ == '__main__':
    # Script entry point: load the browsing-instruction dataset, build the
    # evaluation metadata, and run `process_instance` over the instances.
    args = parse_arguments()

    dataset = load_dataset('OpenDevin/eval-browsing-instructions')
    dataset = dataset['train'].to_pandas()

    # Validate the schema with an explicit error: an `assert` would be
    # stripped under `python -O`, and a changed dataset would then only fail
    # later, inside process_instance.
    expected_columns = ['instance_id', 'instruction']
    actual_columns = dataset.columns.tolist()
    if actual_columns != expected_columns:
        raise ValueError(
            f'Unexpected dataset columns {actual_columns}; expected {expected_columns}.'
        )
    id_column = 'instance_id'

    # Use the named LLM config when one is given, else the default one.
    llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
    logger.info(f'Config for evaluation: {config}')

    metadata = make_metadata(
        llm_config,
        'browsing_delegation',
        args.agent_cls,
        args.max_iterations,
        args.eval_note,
        args.eval_output_dir,
    )
    # This benchmark measures delegation, so reject agents that cannot delegate.
    if metadata.agent_class not in SUPPORTED_AGENT_CLS:
        raise ValueError(
            f'Agent class {metadata.agent_class} not supported with AgentDelegation.'
        )

    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
    instances = prepare_dataset(dataset, output_file, args.eval_n_limit, id_column)
    run_evaluation(
        instances,
        metadata,
        output_file,
        args.eval_num_workers,
        process_instance,
        id_column,
    )