# run_infer.py
  1. import asyncio
  2. import logging
  3. import os
  4. import re
  5. import shutil
  6. import docker
  7. import pandas as pd
  8. from datasets import load_dataset
  9. from evaluation.agent_bench.helper import (
  10. FAKE_RESPONSES,
  11. INST_SUFFIXES,
  12. compare_results,
  13. create_sh_file,
  14. )
  15. from evaluation.utils.shared import (
  16. EvalMetadata,
  17. make_metadata,
  18. prepare_dataset,
  19. run_evaluation,
  20. )
  21. from opendevin.controller.agent import Agent
  22. from opendevin.controller.state.state import State
  23. from opendevin.core.config import config, get_llm_config_arg, parse_arguments
  24. from opendevin.core.logger import get_console_handler
  25. from opendevin.core.logger import opendevin_logger as logger
  26. from opendevin.core.main import run_agent_controller
  27. from opendevin.events.action import CmdRunAction, MessageAction
  28. from opendevin.llm.llm import LLM
  29. from opendevin.runtime.docker.ssh_box import DockerSSHBox
  30. def process_instance(
  31. instance: pd.Series,
  32. metadata: EvalMetadata,
  33. reset_logger: bool = True,
  34. ):
  35. # Create the agent
  36. agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
  37. inst_id = instance.instance_id
  38. question = instance.description
  39. # create a directory for the instance's workspace
  40. instance_workspace = str(os.path.join(config.workspace_base, inst_id))
  41. container_inst_workspace = str(
  42. os.path.join(config.workspace_mount_path_in_sandbox, inst_id)
  43. )
  44. if os.path.exists(instance_workspace):
  45. shutil.rmtree(instance_workspace)
  46. os.makedirs(instance_workspace, exist_ok=True)
  47. # Set up the logger properly, so you can run multiprocessing to parallel the evaluation
  48. if reset_logger:
  49. # Set up logger
  50. log_file = os.path.join(
  51. metadata.eval_output_dir, 'logs', f'instance_{inst_id}.log'
  52. )
  53. # Remove all existing handlers from logger
  54. for handler in logger.handlers[:]:
  55. logger.removeHandler(handler)
  56. # add back the console handler to print ONE line
  57. logger.addHandler(get_console_handler())
  58. logger.info(
  59. f'Starting evaluation for instance {inst_id}.\nHint: run "tail -f {log_file}" to see live logs in a separate shell'
  60. )
  61. # Remove all existing handlers from logger
  62. for handler in logger.handlers[:]:
  63. logger.removeHandler(handler)
  64. file_handler = logging.FileHandler(log_file)
  65. file_handler.setFormatter(
  66. logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
  67. )
  68. logger.addHandler(file_handler)
  69. # =============================================
  70. # build instruction
  71. # =============================================
  72. # Prepare instruction
  73. instruction = (
  74. f'Please fix the following issue.\n'
  75. 'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
  76. 'Please encapsulate your final answer (answer ONLY) within <solution> and </solution>.\n'
  77. 'For example: The answer to the question is <solution> 42 </solution>.\n'
  78. '# Problem \n'
  79. f'{question}\n\n'
  80. )
  81. instruction += (
  82. 'IMPORTANT: You should ONLY interact with the environment provided '
  83. 'to you AND NEVER ASK FOR HUMAN HELP.\n'
  84. )
  85. # NOTE: You can actually set slightly different instruction for different agents
  86. instruction += INST_SUFFIXES[agent.__class__.__name__]
  87. # =============================================
  88. # create sandbox and run the agent
  89. # =============================================
  90. sandbox = DockerSSHBox()
  91. sandbox.execute(f'cd {inst_id}')
  92. init_cmd = instance.init
  93. if init_cmd is not None:
  94. scpt_name = f'{instance.instance_id}_init.sh'
  95. scpt_path = os.path.join(container_inst_workspace, scpt_name)
  96. host_scpt_path = os.path.join(instance_workspace, scpt_name)
  97. create_sh_file(host_scpt_path, init_cmd)
  98. logger.info(f'Running init script: {scpt_path}')
  99. _, init_res = sandbox.execute(scpt_path)
  100. logger.info(f'Init script result: {init_res}')
  101. # Here's how you can run the agent (similar to the `main` function) and get the final task state
  102. state: State | None = asyncio.run(
  103. run_agent_controller(
  104. agent,
  105. instruction,
  106. max_iterations=metadata.max_iterations,
  107. fake_user_response_fn=FAKE_RESPONSES[agent.__class__.__name__],
  108. sandbox=sandbox,
  109. sid=inst_id,
  110. )
  111. )
  112. if state is None:
  113. raise ValueError('State should not be None.')
  114. # get the ground truth
  115. # OSBenchSSHBox.get_ground_truth(instance, state)
  116. # =============================================
  117. # result evaluation
  118. # =============================================
  119. agent_answer = ''
  120. get_agent_result_cmd = instance.get_agent_result
  121. if get_agent_result_cmd is not None:
  122. scpt_name = f'{instance.instance_id}_get_agent_result.sh'
  123. scpt_path = os.path.join(container_inst_workspace, scpt_name)
  124. host_scpt_path = os.path.join(instance_workspace, scpt_name)
  125. create_sh_file(host_scpt_path, get_agent_result_cmd)
  126. logger.info(f'Running get agent result cmd: {scpt_path}')
  127. _, agent_answer = sandbox.execute(scpt_path)
  128. else:
  129. logger.info('Retrieving agent answer from history.')
  130. raw_ans = ''
  131. # retrieve the last agent message or thought
  132. for event in state.history.get_events(reverse=True):
  133. if isinstance(event, MessageAction) and event.source == 'agent':
  134. raw_ans = event.content
  135. elif isinstance(event, CmdRunAction) and event.source == 'agent':
  136. raw_ans = event.thought
  137. # parse the answer for a solution tag
  138. agent_answer = re.findall(r'<solution>(.*?)</solution>', raw_ans)
  139. if len(agent_answer) == 0:
  140. logger.warning(f'Failed to parse model answer: {raw_ans}')
  141. agent_answer = raw_ans
  142. else:
  143. agent_answer = agent_answer[0]
  144. final_ans = ''
  145. if instance.ground_truth is not None:
  146. final_ans = instance.ground_truth
  147. else:
  148. get_ground_truth_cmd = instance.get_ground_truth
  149. if get_ground_truth_cmd is not None:
  150. scpt_name = f'{instance.instance_id}_get_ground_truth.sh'
  151. scpt_path = os.path.join(container_inst_workspace, scpt_name)
  152. host_scpt_path = os.path.join(instance_workspace, scpt_name)
  153. create_sh_file(host_scpt_path, get_ground_truth_cmd)
  154. logger.info(f'Running get ground truth cmd: {scpt_path}')
  155. sandbox.execute(f'cd {container_inst_workspace}')
  156. _, final_ans = sandbox.execute(scpt_path)
  157. comparison_method = instance.comparison_method
  158. logger.info(
  159. f'Final message: {agent_answer} | Ground truth: {final_ans} | Comparison method: {comparison_method}'
  160. )
  161. test_result = compare_results(comparison_method, agent_answer, final_ans)
  162. # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
  163. # for compatibility with the existing output format, we can remake the pairs here
  164. # remove when it becomes unnecessary
  165. histories = state.history.compatibility_for_eval_history_pairs()
  166. metrics = state.metrics.get() if state.metrics else None
  167. # Save the output
  168. output = {
  169. 'instance_id': inst_id,
  170. 'instance': instance.to_dict(),
  171. 'instruction': instruction,
  172. 'metadata': metadata.model_dump(),
  173. 'history': histories,
  174. 'metrics': metrics,
  175. 'error': state.last_error if state and state.last_error else None,
  176. 'test_result': {
  177. 'agent_answer': agent_answer,
  178. 'final_answer': final_ans,
  179. 'check_method': comparison_method,
  180. 'result': test_result,
  181. },
  182. }
  183. # clean up
  184. if os.path.exists(instance_workspace):
  185. shutil.rmtree(instance_workspace)
  186. # Close the sandbox
  187. try:
  188. sandbox.close()
  189. except docker.errors.NotFound as e:
  190. logger.error(f'Failed to close sandbox: {e}')
  191. return output
  192. if __name__ == '__main__':
  193. id_column = 'instance_id'
  194. args = parse_arguments()
  195. dataset = load_dataset('iFurySt/AgentBench')
  196. agent_bench_tests = dataset['osbench'].to_pandas()
  197. llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
  198. logger.info(f'Config for evaluation: {config}')
  199. metadata = make_metadata(
  200. llm_config,
  201. args.dataset_name,
  202. args.agent_cls,
  203. args.max_iterations,
  204. args.eval_note,
  205. args.eval_output_dir,
  206. )
  207. output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
  208. instances = prepare_dataset(dataset, output_file, args.eval_n_limit, id_column)
  209. run_evaluation(
  210. instances,
  211. metadata,
  212. output_file,
  213. args.eval_num_workers,
  214. process_instance,
  215. id_column,
  216. )