# run_infer.py
  1. import asyncio
  2. import json
  3. import logging
  4. import os
  5. import browsergym.webarena # noqa F401 register webarena tasks as gym environments
  6. import gymnasium as gym
  7. import pandas as pd
  8. from evaluation.utils.shared import (
  9. EvalMetadata,
  10. make_metadata,
  11. prepare_dataset,
  12. run_evaluation,
  13. )
  14. from opendevin.controller.agent import Agent
  15. from opendevin.controller.state.state import State
  16. from opendevin.core.config import config, get_llm_config_arg, parse_arguments
  17. from opendevin.core.logger import get_console_handler
  18. from opendevin.core.logger import opendevin_logger as logger
  19. from opendevin.core.main import run_agent_controller
  20. from opendevin.llm.llm import LLM
  21. from opendevin.runtime.docker.ssh_box import DockerSSHBox
  22. from opendevin.runtime.tools import RuntimeTool
# Agent classes this evaluation script is designed to run with.
SUPPORTED_AGENT_CLS = {'BrowsingAgent'}

# Process-wide sandbox instance; created lazily by get_sandbox() so the Docker
# container is only started when it is actually needed.
docker_ssh_box: DockerSSHBox | None = None
  25. def get_sandbox():
  26. global docker_ssh_box
  27. if docker_ssh_box is None:
  28. docker_ssh_box = DockerSSHBox()
  29. return docker_ssh_box
def process_instance(
    instance: pd.Series,
    metadata: EvalMetadata,
    reset_logger: bool = True,
) -> dict:
    """Run the agent on one browsergym environment and collect the evaluation result.

    Args:
        instance: Dataset row whose `id` column names the gym environment
            (e.g. 'browsergym/<task>').
        metadata: Evaluation settings (agent class, LLM config, output dir, ...).
        reset_logger: When True, route this instance's logs to a per-instance
            file so parallel workers do not interleave their output.

    Returns:
        A dict with the instance id, instruction, metadata, event history,
        metrics, last error (if any), and the maximum browsergym reward.

    Raises:
        ValueError: If the agent controller returns no final state.
    """
    # Create the agent
    agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))
    env_id = instance.id
    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
    if reset_logger:
        # Set up logger
        # NOTE(review): assumes `<eval_output_dir>/logs` already exists —
        # logging.FileHandler does not create parent directories; confirm the
        # directory is created upstream (e.g. by make_metadata).
        log_file = os.path.join(
            metadata.eval_output_dir, 'logs', f'instance_{env_id}.log'
        )
        # Remove all existing handlers from logger
        for handler in logger.handlers[:]:
            logger.removeHandler(handler)
        # add back the console handler to print ONE line
        logger.addHandler(get_console_handler())
        logger.info(
            f'Starting evaluation for instance {env_id}.\nHint: run "tail -f {log_file}" to see live logs in a separate shell'
        )
        # Remove all existing handlers from logger
        for handler in logger.handlers[:]:
            logger.removeHandler(handler)
        # From here on, everything logged for this instance goes only to the
        # per-instance log file.
        file_handler = logging.FileHandler(log_file)
        file_handler.setFormatter(
            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        )
        logger.addHandler(file_handler)
    else:
        logger.info(f'Starting evaluation for instance {env_id}.')
    # Here's how you can run the agent (similar to the `main` function) and get the final task state
    # The task string is a placeholder: the browser runtime presumably obtains
    # the real goal from the browsergym env named by 'browsergym_eval' — verify
    # against the runtime implementation.
    runtime_tools_config = {
        RuntimeTool.BROWSER: {
            'browsergym_eval': env_id,
            'browsergym_eval_save_dir': metadata.eval_output_dir,
        }
    }
    state: State | None = asyncio.run(
        run_agent_controller(
            agent,
            'PLACEHOLDER_GOAL',
            max_iterations=metadata.max_iterations,
            max_budget_per_task=config.max_budget_per_task,
            runtime_tools_config=runtime_tools_config,
            sandbox=get_sandbox(),
            sid=env_id,
        )
    )
    # ======= Attempt to evaluate the agent's environment impact =======
    # If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
    # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
    if state is None:
        raise ValueError('State should not be None.')
    metrics = state.metrics.get() if state.metrics else None
    # Per-instance artifact dir: 'browsergym/<task>' -> '<eval_output_dir>/<task>'.
    browsergym_eval_dir = os.path.join(metadata.eval_output_dir, env_id.split('/')[1])
    # read goal
    with open(
        os.path.join(browsergym_eval_dir, 'goal.txt'), 'r', encoding='utf-8'
    ) as f:
        instruction = f.read()
    # read reward
    with open(
        os.path.join(browsergym_eval_dir, 'rewards.json'), 'r', encoding='utf-8'
    ) as f:
        rewards = json.load(f)
    # Best reward across the episode is taken as the test result.
    reward = max(rewards)
    # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
    # for compatibility with the existing output format, we can remake the pairs here
    # remove when it becomes unnecessary
    histories = state.history.compatibility_for_eval_history_pairs()
    # Save the output
    output = {
        'instance_id': env_id,
        'instruction': instruction,
        'metadata': metadata.model_dump(),
        'history': histories,
        'metrics': metrics,
        'error': state.last_error if state and state.last_error else None,
        'test_result': reward,
    }
    return output
  113. if __name__ == '__main__':
  114. args = parse_arguments()
  115. env_ids = [
  116. id for id in gym.envs.registry.keys() if id.startswith('browsergym/webarena')
  117. ]
  118. dataset = pd.DataFrame(
  119. {
  120. 'id': [
  121. id
  122. for id in gym.envs.registry.keys()
  123. if id.startswith('browsergym/miniwob')
  124. ]
  125. }
  126. )
  127. id_column = 'id'
  128. llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
  129. logger.info(f'Config for evaluation: {config}')
  130. metadata = make_metadata(
  131. llm_config,
  132. args.dataset_name,
  133. args.agent_cls,
  134. args.max_iterations,
  135. args.eval_note,
  136. args.eval_output_dir,
  137. )
  138. output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
  139. instances = prepare_dataset(dataset, output_file, args.eval_n_limit, id_column)
  140. _ = get_sandbox() # Initialize the sandbox
  141. run_evaluation(
  142. instances,
  143. metadata,
  144. output_file,
  145. args.eval_num_workers,
  146. process_instance,
  147. id_column,
  148. )