run_infer.py

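"""Evaluate OpenDevin's BrowsingAgent on the WebArena benchmark via BrowserGym.

For each registered `browsergym/webarena.*` task, the agent is run inside a Docker
sandbox and one JSON record (instruction, history, metrics, reward) is appended to
`output.jsonl` in the evaluation output directory.
"""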
import asyncio
import json
import logging
import os
import pathlib
import subprocess
import time

import browsergym.webarena  # noqa F401 register webarena tasks as gym environments
import gymnasium as gym
from tqdm import tqdm

from opendevin.controller.state.state import State
from opendevin.core.config import args, config, get_llm_config_arg
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import main
from opendevin.events.serialization.event import event_to_dict
from opendevin.runtime.docker.ssh_box import DockerSSHBox
from opendevin.runtime.tools import RuntimeTool

SUPPORTED_AGENT_CLS = {'BrowsingAgent'}


def process_instance(
    env_id: str,
    metadata: dict,
    eval_output_dir: str,
    docker_sandbox: DockerSSHBox,
    reset_logger: bool = True,
):
    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
    if reset_logger:
        # Set up logger
        log_file = os.path.join(eval_output_dir, 'logs', f'instance_{env_id}.log')
        # Remove all existing handlers from logger
        for handler in logger.handlers[:]:
            logger.removeHandler(handler)
        # add back the console handler to print ONE line
        logger.addHandler(get_console_handler())
        logger.info(
            f'Starting evaluation for instance {env_id}.\nHint: run "tail -f {log_file}" to see live logs in a separate shell'
        )
        # Remove all existing handlers from logger
        for handler in logger.handlers[:]:
            logger.removeHandler(handler)
        file_handler = logging.FileHandler(log_file)
        file_handler.setFormatter(
            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        )
        logger.addHandler(file_handler)
    else:
        logger.info(f'Starting evaluation for instance {env_id}.')

    # Here's how you can run the agent (similar to the `main` function) and get the final task state
    runtime_tools_config = {
        RuntimeTool.BROWSER: {
            'browsergym_eval': env_id,
            'browsergym_eval_save_dir': eval_output_dir,
        }
    }
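
    # NOTE: 'PLACEHOLDER_GOAL' below is not the real task; the goal is defined by the
    # browsergym environment configured above and is read back from `goal.txt` later.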
    state: State = asyncio.run(
        main(
            'PLACEHOLDER_GOAL',
            runtime_tools_config=runtime_tools_config,
            sandbox=docker_sandbox,
        )
    )

    # ======= Attempt to evaluate the agent's environment impact =======

    # If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
    # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
    if state is None:
        raise ValueError('State should not be None.')

    metrics = state.metrics.get() if state.metrics else None
    browsergym_eval_dir = os.path.join(eval_output_dir, env_id.split('/')[1])
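    # `goal.txt` and `rewards.json` are expected to have been written under
    # `browsergym_eval_save_dir` by the browser runtime tool configured above.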
    # read goal
    with open(
        os.path.join(browsergym_eval_dir, 'goal.txt'), 'r', encoding='utf-8'
    ) as f:
        instruction = f.read()
    # read reward
    with open(
        os.path.join(browsergym_eval_dir, 'rewards.json'), 'r', encoding='utf-8'
    ) as f:
        rewards = json.load(f)
        reward = max(rewards)

    # Save the output
    output = {
        'instance_id': env_id,
        'instruction': instruction,
        'metadata': metadata,
        'history': [
            (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
        ],
        'metrics': metrics,
        'error': state.error if state and state.error else None,
        'test_result': reward,
    }
    return output


if __name__ == '__main__':
    env_ids = [
        id for id in gym.envs.registry.keys() if id.startswith('browsergym/webarena')
    ]

    # Check https://github.com/OpenDevin/OpenDevin/blob/main/evaluation/swe_bench/README.md#configure-opendevin-and-your-llm
    # for details of how to set `llm_config`
    if args.llm_config:
        specified_llm_config = get_llm_config_arg(args.llm_config)
        if specified_llm_config:
            config.llm = specified_llm_config
    logger.info(f'Config for evaluation: {config}')

    # TEST METADATA
    agent_class = args.agent_cls
    assert agent_class in SUPPORTED_AGENT_CLS, f'Unsupported agent class: {agent_class}'
    model_name = config.llm.model.split('/')[-1]
    max_iterations = args.max_iterations
    eval_note = ''
    if args.eval_note is not None:
        eval_note += '_N_' + args.eval_note
    eval_output_dir = os.path.join(
        args.eval_output_dir,
        'webarena',
        agent_class,
        model_name + '_maxiter_' + str(max_iterations) + eval_note,
    )
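    # e.g. <eval_output_dir>/webarena/BrowsingAgent/<model>_maxiter_<N>[_N_<eval_note>]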
    pathlib.Path(eval_output_dir).mkdir(parents=True, exist_ok=True)
    pathlib.Path(os.path.join(eval_output_dir, 'logs')).mkdir(
        parents=True, exist_ok=True
    )
    logger.info(f'Using evaluation output directory: {eval_output_dir}')

    metadata = {
        'agent_class': agent_class,
        'model_name': model_name,
        'max_iterations': max_iterations,
        'eval_output_dir': eval_output_dir,
        'start_time': time.strftime('%Y-%m-%d %H:%M:%S'),
        # get the commit id of current repo for reproducibility
        'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD'])
        .decode('utf-8')
        .strip(),
    }
    logger.info(f'Metadata: {metadata}')
    with open(os.path.join(eval_output_dir, 'metadata.json'), 'w') as f:
        json.dump(metadata, f)

    # LIMIT EVALUATION
    eval_n_limit = args.eval_n_limit
    if eval_n_limit:
        env_ids = env_ids[:eval_n_limit]
        logger.info(f'Limiting evaluation to first {eval_n_limit} instances.')

    # OUTPUT FILE
    output_file = os.path.join(eval_output_dir, 'output.jsonl')
    logger.info(f'Writing evaluation output to {output_file}')
    finished_instance_ids = set()
    if os.path.exists(output_file):
        with open(output_file, 'r') as f:
            for line in f:
                data = json.loads(line)
                finished_instance_ids.add(data['instance_id'])
        logger.warning(
            f'Output file {output_file} already exists. Loaded {len(finished_instance_ids)} finished instances.'
        )
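    # `output.jsonl` is opened in append mode so a rerun resumes: instances already
    # present in the file are skipped below.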
    output_fp = open(output_file, 'a')

    logger.info(
        f'Evaluation started with Agent {agent_class}, model {model_name}, max iterations {max_iterations}.'
    )

    # =============================================
    # filter out finished instances
    new_env_ids = []
    for idx in env_ids:
        if idx in finished_instance_ids:
            logger.info(f'Skipping instance {idx} as it is already finished.')
            continue
        new_env_ids.append(idx)
    env_ids = new_env_ids
    logger.info(
        f'Finished instances: {len(finished_instance_ids)}, Remaining instances: {len(env_ids)}'
    )
    # =============================================
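    # a single Docker sandbox is created once and reused for every instance in the loop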
    docker_sandbox = DockerSSHBox()
    for env_id in tqdm(env_ids):
        try:
            output = process_instance(
                env_id=env_id,
                metadata=metadata,
                eval_output_dir=eval_output_dir,
                docker_sandbox=docker_sandbox,
                reset_logger=False,
            )
            output_fp.write(json.dumps(output) + '\n')
            output_fp.flush()
        except Exception as e:
            logger.error(f'Error processing instance {env_id}: {e}')

    output_fp.close()
    logger.info('Evaluation finished.')
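
# Example invocation (a sketch, not from the source; the script path, flag names, and
# values are assumptions inferred from the `args` attributes used above and the repo's
# evaluation layout; check opendevin.core.config for the actual argument parser):
#
#   python evaluation/webarena/run_infer.py \
#       --agent-cls BrowsingAgent \
#       --llm-config <your_llm_config_name> \
#       --max-iterations 15 \
#       --eval-n-limit 5 \
#       --eval-output-dir evaluation/evaluation_outputs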