|
|
@@ -16,7 +16,7 @@ from evaluation.utils.shared import (
|
|
|
)
|
|
|
from opendevin.controller.agent import Agent
|
|
|
from opendevin.controller.state.state import State
|
|
|
-from opendevin.core.config import LLMConfig, config, get_llm_config_arg, get_parser
|
|
|
+from opendevin.core.config import config, get_llm_config_arg, get_parser
|
|
|
from opendevin.core.logger import get_console_handler
|
|
|
from opendevin.core.logger import opendevin_logger as logger
|
|
|
from opendevin.core.main import run_agent_controller
|
|
|
@@ -36,9 +36,8 @@ AGENT_CLS_TO_INST_SUFFIX = {
|
|
|
}
|
|
|
|
|
|
|
|
|
-def process_instance(
|
|
|
- agent: Agent, instance: Any, metadata: EvalMetadata, reset_logger: bool = True
|
|
|
-):
|
|
|
+def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool = True):
|
|
|
+ agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
|
|
|
# create process-specific workspace dir
|
|
|
# we will create a workspace directory for EACH process
|
|
|
# so that different agent don't interfere with each other.
|
|
|
@@ -83,6 +82,7 @@ def process_instance(
|
|
|
run_agent_controller(
|
|
|
agent,
|
|
|
instruction,
|
|
|
+ max_iterations=metadata.max_iterations,
|
|
|
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
|
|
|
agent.__class__.__name__
|
|
|
],
|
|
|
@@ -143,6 +143,8 @@ if __name__ == '__main__':
|
|
|
default='YOUR_WOLFRAMALPHA_APPID',
|
|
|
)
|
|
|
args, _ = parser.parse_known_args()
|
|
|
+ llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
|
|
|
+ logger.info(f'Config for evaluation: {config}')
|
|
|
|
|
|
dataset = ''
|
|
|
hardness = ''
|
|
|
@@ -172,20 +174,16 @@ if __name__ == '__main__':
|
|
|
toolqa_tool_path = download_tools(workspace_mount_path, args.wolfram_alpha_appid)
|
|
|
|
|
|
id_column = 'qid'
|
|
|
- llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else LLMConfig()
|
|
|
metadata = make_metadata(
|
|
|
llm_config,
|
|
|
f'toolqa-{args.dataset}-{args.hardness}',
|
|
|
args.agent_cls,
|
|
|
- args.max_iterations,
|
|
|
args.eval_note,
|
|
|
args.eval_output_dir,
|
|
|
)
|
|
|
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
|
|
|
instances = prepare_dataset(toolqa_test, output_file, args.eval_n_limit, id_column)
|
|
|
- agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config))
|
|
|
run_evaluation(
|
|
|
- agent,
|
|
|
instances,
|
|
|
metadata,
|
|
|
output_file,
|