shared.py 6.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213
  1. import json
  2. import multiprocessing as mp
  3. import os
  4. import pathlib
  5. import subprocess
  6. import time
  7. from asyncio.log import logger
  8. from concurrent.futures import ProcessPoolExecutor
  9. from typing import Any, Callable
  10. import pandas as pd
  11. from pydantic import BaseModel
  12. from tqdm import tqdm
  13. from opendevin.controller.state.state import State
  14. from opendevin.core.config import LLMConfig
  15. from opendevin.events.action import Action
  16. from opendevin.events.action.message import MessageAction
  17. class EvalMetadata(BaseModel):
  18. agent_class: str
  19. llm_config: LLMConfig
  20. max_iterations: int
  21. eval_output_dir: str
  22. start_time: str
  23. git_commit: str
  24. dataset: str | None = None
  25. data_split: str | None = None
  26. details: dict[str, Any] | None = None
  27. def model_dump_json(self, *args, **kwargs):
  28. dumped = super().model_dump_json(*args, **kwargs)
  29. dumped_dict = json.loads(dumped)
  30. logger.debug(f'Dumped metadata: {dumped_dict}')
  31. # avoid leaking sensitive information
  32. dumped_dict['llm_config'] = self.llm_config.to_safe_dict()
  33. return json.dumps(dumped_dict)
  34. def codeact_user_response(
  35. state: State,
  36. encapsulate_solution: bool = False,
  37. try_parse: Callable[[Action], str] | None = None,
  38. ) -> str:
  39. encaps_str = (
  40. (
  41. 'Please encapsulate your final answer (answer ONLY) within <solution> and </solution>.\n'
  42. 'For example: The answer to the question is <solution> 42 </solution>.\n'
  43. )
  44. if encapsulate_solution
  45. else ''
  46. )
  47. msg = (
  48. 'Please continue working on the task on whatever approach you think is suitable.\n'
  49. 'If you think you have solved the task, please first send your answer to user through message and then <execute_bash> exit </execute_bash>.\n'
  50. f'{encaps_str}'
  51. 'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP.\n'
  52. )
  53. if state.history:
  54. # check if the last action has an answer, if so, early exit
  55. if try_parse is not None:
  56. last_action = state.history.get_last_action()
  57. ans = try_parse(last_action)
  58. if ans is not None:
  59. return '/exit'
  60. # check if the agent has tried to talk to the user 3 times, if so, let the agent know it can give up
  61. user_msgs = [
  62. event
  63. for event in state.history.get_events()
  64. if isinstance(event, MessageAction) and event.source == 'user'
  65. ]
  66. if len(user_msgs) >= 2:
  67. # let the agent know that it can give up when it has tried 3 times
  68. return (
  69. msg
  70. + 'If you want to give up, run: <execute_bash> exit </execute_bash>.\n'
  71. )
  72. return msg
  73. def cleanup():
  74. print('Cleaning up child processes...')
  75. for process in mp.active_children():
  76. print(f'Terminating child process: {process.name}')
  77. process.terminate()
  78. process.join()
  79. def make_metadata(
  80. llm_config: LLMConfig,
  81. dataset_name: str,
  82. agent_class: str,
  83. max_iterations: int,
  84. eval_note: str | None,
  85. eval_output_dir: str,
  86. data_split: str | None = None,
  87. details: dict[str, Any] | None = None,
  88. ) -> EvalMetadata:
  89. model_name = llm_config.model.split('/')[-1]
  90. eval_note = f'_N_{eval_note}' if eval_note else ''
  91. eval_output_path = os.path.join(
  92. eval_output_dir,
  93. dataset_name,
  94. agent_class,
  95. f'{model_name}_maxiter_{max_iterations}{eval_note}',
  96. )
  97. pathlib.Path(eval_output_path).mkdir(parents=True, exist_ok=True)
  98. pathlib.Path(os.path.join(eval_output_path, 'logs')).mkdir(
  99. parents=True, exist_ok=True
  100. )
  101. logger.info(f'Using evaluation output directory: {eval_output_path}')
  102. metadata = EvalMetadata(
  103. agent_class=agent_class,
  104. llm_config=llm_config,
  105. max_iterations=max_iterations,
  106. eval_output_dir=eval_output_path,
  107. start_time=time.strftime('%Y-%m-%d %H:%M:%S'),
  108. git_commit=subprocess.check_output(['git', 'rev-parse', 'HEAD'])
  109. .decode('utf-8')
  110. .strip(),
  111. dataset=dataset_name,
  112. data_split=data_split,
  113. details=details,
  114. )
  115. metadata_json = metadata.model_dump_json()
  116. logger.info(f'Metadata: {metadata_json}')
  117. with open(os.path.join(eval_output_path, 'metadata.json'), 'w') as f:
  118. f.write(metadata_json)
  119. return metadata
  120. def prepare_dataset(dataset: pd.DataFrame, output_file, eval_n_limit, id_column):
  121. logger.info(f'Writing evaluation output to {output_file}')
  122. finished_ids = set()
  123. if os.path.exists(output_file):
  124. with open(output_file, 'r') as f:
  125. for line in f:
  126. data = json.loads(line)
  127. finished_ids.add(data[id_column])
  128. logger.warning(
  129. f'Output file {output_file} already exists. Loaded {len(finished_ids)} finished instances.'
  130. )
  131. if eval_n_limit:
  132. dataset = dataset.head(eval_n_limit)
  133. logger.info(f'Limiting evaluation to first {eval_n_limit} instances.')
  134. new_dataset = [
  135. instance
  136. for _, instance in dataset.iterrows()
  137. if instance[id_column] not in finished_ids
  138. ]
  139. logger.info(
  140. f'Finished instances: {len(finished_ids)}, Remaining instances: {len(new_dataset)}'
  141. )
  142. return pd.DataFrame(new_dataset)
  143. def run_evaluation(
  144. dataset: pd.DataFrame,
  145. metadata: EvalMetadata,
  146. output_file: str,
  147. num_workers: int,
  148. process_instance_func: Callable[[pd.Series, EvalMetadata, bool], Any],
  149. id_column: str,
  150. ):
  151. logger.info(
  152. f'Evaluation started with Agent {metadata.agent_class}, '
  153. f'model {metadata.llm_config.model}, max iterations {metadata.max_iterations}.'
  154. )
  155. pbar = tqdm(total=len(dataset))
  156. output_fp = open(output_file, 'a')
  157. def update_progress(future):
  158. pbar.update(1)
  159. output = future.result()
  160. pbar.set_description(f'Instance {output[id_column]}')
  161. pbar.set_postfix_str(f'Test Result: {output["test_result"]["result"]}')
  162. logger.info(
  163. f'Finished evaluation for instance {output[id_column]}: {output["test_result"]["result"]}'
  164. )
  165. output_fp.write(json.dumps(output) + '\n')
  166. output_fp.flush()
  167. try:
  168. with ProcessPoolExecutor(num_workers) as executor:
  169. futures = []
  170. for _, instance in dataset.iterrows():
  171. future = executor.submit(
  172. process_instance_func,
  173. instance,
  174. metadata,
  175. bool(num_workers > 1),
  176. )
  177. future.add_done_callback(update_progress)
  178. futures.append(future)
  179. for future in futures:
  180. future.result()
  181. except KeyboardInterrupt:
  182. print('KeyboardInterrupt received. Cleaning up...')
  183. cleanup()
  184. output_fp.close()
  185. logger.info('Evaluation finished.')