# run_infer.py
  1. """Implements evaluation of agents on ML-Bench, a benchmark for assessing the effectiveness of
  2. Large Language Models (LLMs) in leveraging existing functions in open-source libraries for
  3. machine learning tasks. The benchmark is introduced in the paper "ML-Bench: Evaluating Large
  4. Language Models for Code Generation in Repository-Level Machine Learning Tasks"
  5. (https://arxiv.org/abs/2311.09835).
  6. Please see https://ghcr.io/super-dainiu/ml_bench and https://huggingface.co/datasets/super-dainiu/ml-bench
  7. for more details on the dataset and docker image used in this evaluation script.
  8. TODOs:
  9. - Support additional evaluation settings, such as providing raw README content or using a
  10. retriever to extract relevant segments.
  11. - Clean up the code and docker image used for evaluation.
  12. """
  13. import asyncio
  14. import logging
  15. import os
  16. import pathlib
  17. from typing import Any
  18. from datasets import load_dataset
  19. from evaluation.utils.shared import (
  20. EvalMetadata,
  21. codeact_user_response,
  22. make_metadata,
  23. prepare_dataset,
  24. run_evaluation,
  25. )
  26. from opendevin.controller.agent import Agent
  27. from opendevin.controller.state.state import State
  28. from opendevin.core.config import get_llm_config_arg, get_parser, load_app_config
  29. from opendevin.core.logger import get_console_handler
  30. from opendevin.core.logger import opendevin_logger as logger
  31. from opendevin.core.main import run_agent_controller
  32. from opendevin.llm.llm import LLM
  33. from opendevin.runtime.docker.ssh_box import DockerSSHBox
# Global application config, loaded once at import time. NOTE(review):
# process_instance mutates workspace fields on this shared object per instance,
# so parallel workers must be separate processes (as the PID-based paths imply).
config = load_app_config()

# Maps agent class name -> function that fabricates the "user" reply when the
# agent asks for input during non-interactive evaluation.
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
    'CodeActAgent': codeact_user_response,
}

# Per-agent suffix appended to the task instruction, telling the agent how to
# signal task completion.
AGENT_CLS_TO_INST_SUFFIX = {
    'CodeActAgent': 'When you think you have completed the task, please run the following command: <execute_bash> exit </execute_bash>.\n'
}

# Maps ML-Bench `github_id` -> name of the pre-built conda environment inside
# the evaluation docker image (see module docstring for the image reference).
ID2CONDA = {
    1: 'dgl_DS',
    2: 'bert_DS',
    3: 'lavis_DS',
    4: 'if_DS',
    5: 'V2V_DS',
    6: 'esm_DS',
    7: 'OP_DS',
    8: 'TSL_DS',
    9: 'EAP_DS',
    10: 'PG_DS',
    11: 'PIM_DS',
    12: 'AD2_DS',
    13: 'L3_DS',
    14: 'MZ2_DS',
    15: 'GSA2_DS',
}
  58. def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool = True):
  59. agent = Agent.get_cls(metadata.agent_class)(llm=LLM(config=metadata.llm_config))
  60. old_workspace_mount_path = config.workspace_mount_path
  61. old_workspace_base = config.workspace_base
  62. try:
  63. workspace_mount_path = os.path.join(
  64. config.workspace_mount_path, '_eval_workspace'
  65. )
  66. # create process-specific workspace dir
  67. # so that different agent don't interfere with each other.
  68. workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
  69. pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
  70. # reset workspace to config
  71. config.workspace_base = workspace_mount_path
  72. config.workspace_mount_path = workspace_mount_path
  73. # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
  74. if reset_logger:
  75. # Set up logger
  76. log_file = os.path.join(
  77. metadata.eval_output_dir,
  78. 'logs',
  79. f"instance_{instance['id']}_pid_{os.getpid()}.log",
  80. )
  81. # Remove all existing handlers from logger
  82. for handler in logger.handlers[:]:
  83. logger.removeHandler(handler)
  84. # add back the console handler to print ONE line
  85. logger.addHandler(get_console_handler())
  86. logger.info(
  87. f"Starting evaluation for instance {instance['id']}.\nLOG: tail -f {log_file}"
  88. )
  89. # Remove all existing handlers from logger
  90. for handler in logger.handlers[:]:
  91. logger.removeHandler(handler)
  92. file_handler = logging.FileHandler(log_file)
  93. file_handler.setFormatter(
  94. logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
  95. )
  96. logger.addHandler(file_handler)
  97. logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
  98. # Create a sandbox, using the instance ID and PID as the session ID to avoid conflicts
  99. sid = str(instance['id']) + '_' + str(os.getpid())
  100. sandbox = DockerSSHBox(
  101. config=config.sandbox,
  102. persist_sandbox=False,
  103. workspace_mount_path=config.workspace_mount_path,
  104. sandbox_workspace_dir=config.workspace_mount_path_in_sandbox,
  105. cache_dir=config.cache_dir,
  106. run_as_devin=config.run_as_devin,
  107. sid=sid,
  108. )
  109. # Set up the task environment
  110. sandbox.execute(f'conda activate {ID2CONDA[instance["github_id"]]}')
  111. # Clone the task repo into the sandbox
  112. repo_url = instance['github']
  113. repo_name = repo_url.split('/')[-1]
  114. sandbox.execute(f'git clone {repo_url} /workspace/{repo_name}')
  115. sandbox.execute(f'chmod -R 777 /workspace/{repo_name}')
  116. # Navigate to the task's code path
  117. task_path = os.path.join('/workspace', repo_name, instance['path'][2:])
  118. sandbox.execute(f'cd {task_path}')
  119. # Prepare the task instruction
  120. instruction = (
  121. f'Please complete the Machine Learning task in the following repository: {repo_name}\n\n'
  122. f'The task is: {instance["task"]}\n\n'
  123. f'{instance["instruction"]}\n\n'
  124. 'You should create a script named `run.sh` under the specified path in the repo to run the task.\n\n'
  125. f'You can find the task repo at: {task_path}\n\n'
  126. + (
  127. 'Here is the prefix code for the task:\n'
  128. '```bash\n'
  129. f'{instance["prefix_code"]}\n'
  130. '```\n\n'
  131. if instance['prefix_code']
  132. else ''
  133. )
  134. + 'You should terminate the subprocess after running the task (e.g., call subprocess.Popen(args).wait()).'
  135. )
  136. instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]
  137. # Run the agent
  138. state: State | None = asyncio.run(
  139. run_agent_controller(
  140. agent,
  141. instruction,
  142. max_iterations=metadata.max_iterations,
  143. max_budget_per_task=config.max_budget_per_task,
  144. fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
  145. agent.__class__.__name__
  146. ),
  147. sandbox=sandbox,
  148. sid=sid,
  149. )
  150. )
  151. assert state is not None
  152. metrics = state.metrics.get() if state.metrics else {}
  153. # Evaluate the agent's script
  154. eval_script = os.path.join(task_path, 'run.sh')
  155. logger.info(f'Running evaluation script: {eval_script}')
  156. try:
  157. _, eval_script_content = sandbox.execute(f'cat {eval_script}')
  158. except Exception as e:
  159. logger.error(f'Error reading evaluation script: {e}')
  160. eval_script_content = ''
  161. try:
  162. exit_code, eval_output = sandbox.execute(
  163. f'timeout 120s conda run -n {ID2CONDA[instance["github_id"]]} bash {eval_script}',
  164. timeout=600,
  165. )
  166. except Exception as e:
  167. logger.error(f'Error running evaluation script: {e}')
  168. exit_code = -1
  169. eval_output = ''
  170. if exit_code != 0 and exit_code != 124:
  171. logger.warning(f'Evaluation script failed with exit code {exit_code}')
  172. logger.warning(f'Output: {eval_output}')
  173. metrics['success'] = int(
  174. 'KeyboardInterrupt' in eval_output
  175. ) # super-dainiu: assume ``KeyboardInterrupt`` is a success as is done in ML-Bench
  176. else:
  177. logger.info(f'Evaluation script succeeded with exit code {exit_code}')
  178. logger.info(f'Output: {eval_output}')
  179. metrics['success'] = 1
  180. # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
  181. # for compatibility with the existing output format, we can remake the pairs here
  182. # remove when it becomes unnecessary
  183. histories = state.history.compatibility_for_eval_history_pairs()
  184. # Save the output
  185. output = {
  186. 'instance_id': instance['id'],
  187. 'repo': repo_url,
  188. 'instruction': instruction,
  189. 'metadata': metadata.model_dump(),
  190. 'history': histories,
  191. 'eval_script': eval_script_content,
  192. 'eval_exit_code': exit_code,
  193. 'eval_output': eval_output,
  194. 'metrics': metrics,
  195. }
  196. except Exception as e:
  197. logger.error(f'Error processing instance {instance["id"]}: {e}')
  198. raise
  199. finally:
  200. config.workspace_mount_path = old_workspace_mount_path
  201. config.workspace_base = old_workspace_base
  202. # Shutdown the sandbox
  203. sandbox.close()
  204. return output
  205. if __name__ == '__main__':
  206. parser = get_parser()
  207. parser.add_argument(
  208. '-s',
  209. '--eval-split',
  210. type=str,
  211. default='quarter',
  212. choices=['full', 'quarter'],
  213. help='data split to evaluate on, either full or quarter',
  214. )
  215. args, _ = parser.parse_known_args()
  216. data_split = args.eval_split
  217. # NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
  218. # so we don't need to manage file uploading to OpenDevin's repo
  219. ml_bench = load_dataset('super-dainiu/ml-bench', split=data_split).to_pandas()
  220. id_column = 'instance_id'
  221. llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
  222. logger.info(f'Config for evaluation: {config}')
  223. metadata = make_metadata(
  224. llm_config,
  225. args.dataset_name,
  226. args.agent_cls,
  227. args.max_iterations,
  228. args.eval_note,
  229. args.eval_output_dir,
  230. )
  231. output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
  232. instances = prepare_dataset(ml_bench, output_file, args.eval_n_limit, id_column)
  233. run_evaluation(
  234. instances,
  235. metadata,
  236. output_file,
  237. args.eval_num_workers,
  238. process_instance,
  239. id_column,
  240. )