# run_infer.py — ToolQA evaluation driver
  1. import asyncio
  2. import logging
  3. import os
  4. import pathlib
  5. from typing import Any
  6. import pandas as pd
  7. from evaluation.utils.shared import (
  8. EvalMetadata,
  9. codeact_user_response,
  10. make_metadata,
  11. monologue_user_response,
  12. prepare_dataset,
  13. run_evaluation,
  14. )
  15. from opendevin.controller.agent import Agent
  16. from opendevin.controller.state.state import State
  17. from opendevin.core.config import config, get_llm_config_arg, get_parser
  18. from opendevin.core.logger import get_console_handler
  19. from opendevin.core.logger import opendevin_logger as logger
  20. from opendevin.core.main import run_agent_controller
  21. from opendevin.llm.llm import LLM
  22. from .utils import download_data, download_tools, encode_question, eval_answer, get_data
  23. AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
  24. 'CodeActAgent': codeact_user_response,
  25. 'MonologueAgent': monologue_user_response,
  26. }
  27. AGENT_CLS_TO_INST_SUFFIX = {
  28. 'CodeActAgent': 'When you think you have completed the request, please run the following command: <execute_bash> exit </execute_bash>.\n'
  29. }
  30. def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool = True):
  31. agent = Agent.get_cls(metadata.agent_class)(llm=LLM(llm_config=metadata.llm_config))
  32. # create process-specific workspace dir
  33. # we will create a workspace directory for EACH process
  34. # so that different agent don't interfere with each other.
  35. workspace_mount_path = config.workspace_mount_path
  36. pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
  37. # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
  38. eval_output_dir = metadata.eval_output_dir
  39. qid = instance.qid
  40. question = instance.question
  41. answer = instance.answer
  42. if reset_logger:
  43. # Set up logger
  44. log_file = os.path.join(eval_output_dir, 'logs', f'instance_{qid}.log')
  45. # Remove all existing handlers from logger
  46. for handler in logger.handlers[:]:
  47. logger.removeHandler(handler)
  48. # add back the console handler to print ONE line
  49. logger.addHandler(get_console_handler())
  50. logger.info(
  51. f'Starting evaluation for instance {qid}.\nHint: run "tail -f {log_file}" to see live logs in a separate shell'
  52. )
  53. # Remove all existing handlers from logger
  54. for handler in logger.handlers[:]:
  55. logger.removeHandler(handler)
  56. file_handler = logging.FileHandler(log_file)
  57. file_handler.setFormatter(
  58. logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
  59. )
  60. logger.addHandler(file_handler)
  61. logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
  62. # Prepare instruction
  63. instruction = encode_question(question)
  64. instruction += 'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
  65. # NOTE: You can actually set slightly different instruction for different agents
  66. instruction += AGENT_CLS_TO_INST_SUFFIX[agent.__class__.__name__]
  67. # logger.info(f'Instruction:\n{instruction}', extra={'msg_type': 'OBSERVATION'})
  68. # Here's how you can run the agent (similar to the `main` function) and get the final task state
  69. state: State | None = asyncio.run(
  70. run_agent_controller(
  71. agent,
  72. instruction,
  73. max_iterations=metadata.max_iterations,
  74. fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
  75. agent.__class__.__name__
  76. ],
  77. sid=qid,
  78. )
  79. )
  80. # ======= Attempt to evaluate the agent's edits =======
  81. # If you are working on simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
  82. # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
  83. if state is None:
  84. raise ValueError('State should not be None.')
  85. # retrieve the last message from the agent
  86. model_answer_raw = state.history.get_last_agent_message()
  87. # attempt to parse model_answer
  88. correct = eval_answer(str(model_answer_raw), str(answer))
  89. logger.info(f'Final message: {model_answer_raw} | Correctness: {correct}')
  90. metrics = state.metrics.get() if state.metrics else None
  91. # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
  92. # for compatibility with the existing output format, we can remake the pairs here
  93. # remove when it becomes unnecessary
  94. histories = state.history.compatibility_for_eval_history_pairs()
  95. # Save the output
  96. output = {
  97. 'qid': qid,
  98. 'text': model_answer_raw,
  99. 'correct': correct,
  100. 'answer_id': 'None',
  101. 'model_id': metadata.model_name,
  102. 'metadata': metadata.model_dump(),
  103. 'history': histories,
  104. 'metrics': metrics,
  105. 'error': state.last_error if state and state.last_error else None,
  106. }
  107. return output
  108. if __name__ == '__main__':
  109. parser = get_parser()
  110. parser.add_argument(
  111. '--dataset',
  112. type=str,
  113. help='Which dataset to evaluate from ToolQA. ToolQA contains 8 datasets, namely agenda, airbnb, coffee, dblp, flight, gsm8k, scirex, yelp. For example, the default is --dataset flight.',
  114. default='flight',
  115. )
  116. parser.add_argument(
  117. '--hardness',
  118. type=str,
  119. help='Which level of difficulty to evaluate from ToolQA. ToolQA contains 2 levels of hardness, namely easy and hard. For example, the default is --hardness easy.',
  120. default='easy',
  121. )
  122. parser.add_argument(
  123. '--wolfram_alpha_appid',
  124. type=str,
  125. help='wolfram alpha appid to use for wolfram alpha related tests',
  126. default='YOUR_WOLFRAMALPHA_APPID',
  127. )
  128. args, _ = parser.parse_known_args()
  129. llm_config = get_llm_config_arg(args.llm_config) if args.llm_config else config.llm
  130. logger.info(f'Config for evaluation: {config}')
  131. dataset = ''
  132. hardness = ''
  133. dataset_choices = [
  134. 'agenda',
  135. 'airbnb',
  136. 'coffee',
  137. 'dblp',
  138. 'flight',
  139. 'gsm8k',
  140. 'scirex',
  141. 'yelp',
  142. 'genda',
  143. ]
  144. if args.dataset not in dataset_choices:
  145. raise ValueError(
  146. 'Please choose from agenda, airbnb, coffee, dblp, flight, gsm8k, scirex, yelp for dataset.'
  147. )
  148. if args.hardness not in ['easy', 'hard']:
  149. raise ValueError('Please choose from easy and hard for hardness.')
  150. # workspace_mount_path = os.path.join(config.workspace_mount_path, '_eval_workspace')
  151. workspace_mount_path = config.workspace_mount_path
  152. pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
  153. toolqa_test = pd.DataFrame(get_data(dataset, hardness))
  154. toolqa_data_path = download_data(workspace_mount_path)
  155. toolqa_tool_path = download_tools(workspace_mount_path, args.wolfram_alpha_appid)
  156. id_column = 'qid'
  157. metadata = make_metadata(
  158. llm_config,
  159. f'toolqa-{args.dataset}-{args.hardness}',
  160. args.agent_cls,
  161. args.eval_note,
  162. args.eval_output_dir,
  163. )
  164. output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
  165. instances = prepare_dataset(toolqa_test, output_file, args.eval_n_limit, id_column)
  166. run_evaluation(
  167. instances,
  168. metadata,
  169. output_file,
  170. args.eval_num_workers,
  171. process_instance,
  172. id_column,
  173. )