# run_infer.py — MINT benchmark evaluation driver for OpenDevin.
  1. import asyncio
  2. import functools
  3. import logging
  4. import os
  5. import pathlib
  6. from typing import Any, Dict
  7. from datasets import load_dataset
  8. from evaluation.swe_bench.swe_env_box import DockerSSHBox
  9. from evaluation.utils.shared import (
  10. EvalMetadata,
  11. make_metadata,
  12. prepare_dataset,
  13. run_evaluation,
  14. )
  15. from opendevin.controller.agent import Agent
  16. from opendevin.controller.state.state import State
  17. from opendevin.core.config import get_llm_config_arg, get_parser, load_app_config
  18. from opendevin.core.logger import get_console_handler
  19. from opendevin.core.logger import opendevin_logger as logger
  20. from opendevin.core.main import run_agent_controller
  21. from opendevin.llm.llm import LLM
  22. from .datatypes import TaskState
  23. from .env import SimplifiedEnv
  24. from .prompts import ToolPromptTemplate
  25. from .tasks import Task
  26. config = load_app_config()
  27. def codeact_user_response_mint(state: State, task: Task, task_config: Dict[str, int]):
  28. logger.info(f'Gold reference: {task.reference}')
  29. logger.info(f'Task config: {task_config}')
  30. env = SimplifiedEnv(
  31. agent_state=state,
  32. task=task,
  33. task_config=task_config,
  34. )
  35. last_action = state.history.get_last_action()
  36. result_state: TaskState = env.step(last_action.message or '')
  37. state.task_state = result_state
  38. if not result_state.latest_output:
  39. # Task is finished
  40. msg = '/exit'
  41. else:
  42. msg = result_state.latest_output['content']
  43. logger.info('User response:' + msg)
  44. return msg
  45. AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
  46. 'CodeActAgent': codeact_user_response_mint,
  47. }
  48. AGENT_CLS_TO_INST_SUFFIX = {
  49. 'CodeActAgent': '\nIMPORTANT: When your answer is confirmed by the user to be correct, you can exit using the following command: <execute_bash> exit </execute_bash>.\n'
  50. }
  51. def process_instance(
  52. instance: Any,
  53. metadata: EvalMetadata,
  54. reset_logger: bool = True,
  55. ):
  56. agent = Agent.get_cls(metadata.agent_class)(llm=LLM(metadata.llm_config))
  57. workspace_mount_path = os.path.join(config.workspace_mount_path, '_eval_workspace')
  58. # create process-specific workspace dir
  59. workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
  60. pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
  61. # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
  62. if reset_logger:
  63. # Set up logger
  64. log_file = os.path.join(
  65. metadata.eval_output_dir, 'logs', f'instance_{instance.task_id}.log'
  66. )
  67. # Remove all existing handlers from logger
  68. for handler in logger.handlers[:]:
  69. logger.removeHandler(handler)
  70. # add back the console handler to print ONE line
  71. logger.addHandler(get_console_handler())
  72. logger.info(
  73. f'Starting evaluation for instance {instance.task_id}.\nHint: run "tail -f {log_file}" to see live logs in a separate shell'
  74. )
  75. # Remove all existing handlers from logger
  76. for handler in logger.handlers[:]:
  77. logger.removeHandler(handler)
  78. file_handler = logging.FileHandler(log_file)
  79. file_handler.setFormatter(
  80. logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
  81. )
  82. logger.addHandler(file_handler)
  83. logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
  84. # use a session id for concurrent processing
  85. sid = instance.task_id + '_' + str(os.getpid())
  86. sandbox = DockerSSHBox(
  87. config=config.sandbox,
  88. persist_sandbox=False,
  89. workspace_mount_path=config.workspace_mount_path,
  90. sandbox_workspace_dir=config.workspace_mount_path_in_sandbox,
  91. cache_dir=config.cache_dir,
  92. run_as_devin=config.run_as_devin,
  93. sid=sid,
  94. )
  95. requirements_host_src = 'evaluation/mint/requirements.txt'
  96. requirements_sandbox_dest = '/opendevin/plugins/mint/requirements.txt'
  97. sandbox.copy_to(
  98. host_src=requirements_host_src,
  99. sandbox_dest=requirements_sandbox_dest,
  100. recursive=False,
  101. )
  102. logger.info(
  103. f'Copied files from [{requirements_host_src}] to [{requirements_sandbox_dest}] inside sandbox.'
  104. )
  105. exit_code, output = sandbox.execute(f'pip install -r {requirements_sandbox_dest}')
  106. # Prepare instruction
  107. assert metadata.details is not None
  108. instruction = ToolPromptTemplate(use_tool=True)(
  109. max_total_steps=metadata.max_iterations,
  110. max_propose_solution=metadata.details['max_propose_solution'],
  111. in_context_example=instance.in_context_example(
  112. use_tool=True, with_feedback=False
  113. ),
  114. task_prompt='Task:\n' + instance.prompt,
  115. )
  116. instruction += 'IMPORTANT: You should ONLY interact with the environment provided to you or provide the concise RESULT inside <solution> tag AND NEVER ASK FOR HUMAN HELP.\n'
  117. # NOTE: You can actually set slightly different instruction for different agents
  118. instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]
  119. # Here's how you can run the agent (similar to the `main` function) and get the final task state
  120. fake_user_response_fn = functools.partial(
  121. AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[agent.__class__.__name__],
  122. task=instance,
  123. task_config={
  124. 'max_iterations': metadata.max_iterations,
  125. 'max_propose_solution': metadata.details['max_propose_solution'],
  126. },
  127. )
  128. state: State | None = asyncio.run(
  129. run_agent_controller(
  130. agent,
  131. instruction,
  132. max_iterations=metadata.max_iterations,
  133. max_budget_per_task=config.max_budget_per_task,
  134. fake_user_response_fn=fake_user_response_fn,
  135. sandbox=sandbox,
  136. sid=sid,
  137. )
  138. )
  139. if state is None:
  140. raise ValueError('State should not be None.')
  141. task_state = None
  142. if hasattr(state, 'task_state'):
  143. task_state = state.task_state
  144. logger.info('Task state: ' + str(task_state.to_dict()))
  145. metrics = state.metrics.get() if state.metrics else None
  146. # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
  147. # for compatibility with the existing output format, we can remake the pairs here
  148. # remove when it becomes unnecessary
  149. histories = state.history.compatibility_for_eval_history_pairs()
  150. # Save the output
  151. output = {
  152. 'id': instance.task_id,
  153. 'instance': instance.to_dict(),
  154. 'instruction': instruction,
  155. 'metadata': metadata.model_dump(),
  156. 'history': histories,
  157. 'metrics': metrics,
  158. 'error': state.last_error if state and state.last_error else None,
  159. 'test_result': task_state.success if task_state else False,
  160. }
  161. # Close the sandbox
  162. sandbox.close()
  163. return output
  164. if __name__ == '__main__':
  165. parser = get_parser()
  166. parser.add_argument(
  167. '--subset',
  168. default='math',
  169. choices=['math', 'gsm8k', 'mmlu', 'theoremqa', 'mbpp', 'humaneval'],
  170. type=str,
  171. help='subset of the dataset to be used',
  172. )
  173. parser.add_argument(
  174. '--max-propose-solution',
  175. default=2,
  176. type=int,
  177. help='maximum number of times the agent can propose a solution',
  178. )
  179. args, _ = parser.parse_known_args()
  180. # NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
  181. # so we don't need to manage file uploading to OpenDevin's repo
  182. mint_dataset = load_dataset(
  183. 'ryanhoangt/xingyaoww-mint-bench', name=args.subset, split='test'
  184. )
  185. logger.info(f'Evaluating MINT - {args.subset} subset')
  186. mint_tests = mint_dataset.to_pandas()
  187. id_column = 'id'
  188. llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
  189. logger.info(f'Config for evaluation: {config}')
  190. metadata = make_metadata(
  191. llm_config,
  192. args.dataset_name,
  193. args.agent_cls,
  194. args.max_iterations,
  195. args.eval_note,
  196. args.eval_output_dir,
  197. details={'max_propose_solution': args.max_propose_solution},
  198. )
  199. output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
  200. instances = prepare_dataset(mint_dataset, output_file, args.eval_n_limit, id_column)
  201. run_evaluation(
  202. instances,
  203. metadata,
  204. output_file,
  205. args.eval_num_workers,
  206. process_instance,
  207. id_column,
  208. )