eval_infer.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349
  1. import os
  2. import tempfile
  3. import time
  4. import pandas as pd
  5. from pydantic import BaseModel
  6. from swebench.harness.grading import get_eval_report
  7. from swebench.harness.run_evaluation import (
  8. APPLY_PATCH_FAIL,
  9. APPLY_PATCH_PASS,
  10. )
  11. from swebench.harness.test_spec import SWEbenchInstance, TestSpec, make_test_spec
  12. from swebench.harness.utils import load_swebench_dataset
  13. from evaluation.swe_bench.run_infer import get_instance_docker_image
  14. from evaluation.utils.shared import (
  15. EvalMetadata,
  16. EvalOutput,
  17. prepare_dataset,
  18. reset_logger_for_multiprocessing,
  19. run_evaluation,
  20. )
  21. from openhands.core.config import (
  22. AppConfig,
  23. SandboxConfig,
  24. get_parser,
  25. )
  26. from openhands.core.logger import openhands_logger as logger
  27. from openhands.core.main import create_runtime
  28. from openhands.events.action import CmdRunAction
  29. from openhands.events.observation import CmdOutputObservation
# TODO: migrate all swe-bench docker to ghcr.io/openhands
# Registry/namespace prefix for the per-instance SWE-bench images; can be
# overridden with the EVAL_DOCKER_IMAGE_PREFIX environment variable.
DOCKER_IMAGE_PREFIX = os.environ.get('EVAL_DOCKER_IMAGE_PREFIX', 'docker.io/xingyaoww/')
logger.info(f'Using docker image prefix: {DOCKER_IMAGE_PREFIX}')
  33. def get_config(instance: pd.Series) -> AppConfig:
  34. # We use a different instance image for the each instance of swe-bench eval
  35. base_container_image = get_instance_docker_image(instance['instance_id'])
  36. logger.info(
  37. f'Using instance container image: {base_container_image}. '
  38. f'Please make sure this image exists. '
  39. f'Submit an issue on https://github.com/All-Hands-AI/OpenHands if you run into any issues.'
  40. )
  41. config = AppConfig(
  42. run_as_openhands=False,
  43. runtime=os.environ.get('RUNTIME', 'eventstream'),
  44. sandbox=SandboxConfig(
  45. base_container_image=base_container_image,
  46. use_host_network=False,
  47. # large enough timeout, since some testcases take very long to run
  48. timeout=1800,
  49. api_key=os.environ.get('ALLHANDS_API_KEY', None),
  50. ),
  51. # do not mount workspace
  52. workspace_base=None,
  53. workspace_mount_path=None,
  54. )
  55. return config
class SWEBenchEvalResult(BaseModel):
    """Schema of a single SWE-bench evaluation result.

    NOTE(review): not referenced anywhere else in this file — it appears to
    document the shape of the `test_result` payload that `process_instance`
    builds; confirm whether it is consumed by downstream tooling.
    """

    # ID of the evaluated SWE-bench instance.
    instance_id: str
    # Raw sandbox output from applying the model patch.
    apply_patch_output: str
    # Raw output of the evaluation test script.
    test_output: str
    # Whether the instance was graded as resolved.
    resolved: bool
def process_instance(
    instance: pd.Series,
    metadata: EvalMetadata | None = None,
    reset_logger: bool = True,
) -> EvalOutput:
    """Apply a model-generated patch inside the instance sandbox and grade it.

    Args:
        instance: Row carrying at least 'instance_id', 'model_patch' and a
            pre-built 'test_spec' (added in the __main__ block below).
        metadata: Unused here; kept so the signature matches the
            run_evaluation callback contract.
        reset_logger: When True, redirect logs to a per-instance log dir so
            multi-process runs don't interleave output.

    Returns:
        EvalOutput whose test_result carries the apply-patch output, the raw
        test output, and a report dict with the flags
        empty_generation / resolved / failed_apply_patch / error_eval.
    """
    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
    if reset_logger:
        # NOTE(review): relies on the module-level `output_file` assigned in
        # the __main__ block — this function assumes it is already set.
        global output_file
        log_dir = output_file.replace('.jsonl', '.logs')
        os.makedirs(log_dir, exist_ok=True)
        reset_logger_for_multiprocessing(logger, instance.instance_id, log_dir)
    else:
        logger.info(f'Starting evaluation for instance {instance.instance_id}.')

    config = get_config(instance)
    instance_id = instance.instance_id
    model_patch = instance['model_patch']
    test_spec: TestSpec = instance['test_spec']
    logger.info(f'Starting evaluation for instance {instance_id}.')

    # Initialize the report with all failure flags cleared; the branches
    # below flip exactly the flag that applies.
    if 'test_result' not in instance.keys():
        instance['test_result'] = {}
    instance['test_result']['report'] = {
        'empty_generation': False,
        'resolved': False,
        'failed_apply_patch': False,
        'error_eval': False,
    }

    # Short-circuit: an empty patch needs no sandbox at all.
    if model_patch == '':
        instance['test_result']['report']['empty_generation'] = True
        return EvalOutput(
            instance_id=instance_id,
            test_result=instance['test_result'],
        )

    # NOTE(review): the runtime is created before the try/finally below, so a
    # failure between here and the `try` (e.g. the chmod assert) leaves the
    # runtime open — consider widening the try block.
    runtime = create_runtime(config, sid=instance_id)

    # Get patch and save it to /tmp/patch.diff
    with tempfile.TemporaryDirectory() as temp_dir:
        # Patch file
        patch_file_path = os.path.join(temp_dir, 'patch.diff')
        with open(patch_file_path, 'w') as f:
            f.write(model_patch)
        runtime.copy_to(patch_file_path, '/tmp')
        # Eval script
        eval_script_path = os.path.join(temp_dir, 'eval.sh')
        with open(eval_script_path, 'w') as f:
            f.write(test_spec.eval_script)
        runtime.copy_to(eval_script_path, '/tmp')

    # Set +x
    action = CmdRunAction(command='chmod +x /tmp/eval.sh')
    action.timeout = 600
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert obs.exit_code == 0

    # Apply patch: try `git apply` first, fall back to `patch` with fuzz, and
    # echo a sentinel string so the outcome can be detected from the output.
    exec_command = (
        'cd /testbed && '
        "(git apply -v /tmp/patch.diff && echo 'APPLY_PATCH_PASS' || "
        "(echo 'Failed to apply patch with git apply, trying with patch command...' && "
        "(patch --batch --fuzz=5 -p1 -i /tmp/patch.diff && echo 'APPLY_PATCH_PASS' || "
        "echo 'APPLY_PATCH_FAIL')))"
    )
    action = CmdRunAction(command=exec_command, keep_prompt=False)
    action.timeout = 600
    obs = runtime.run_action(action)
    assert isinstance(obs, CmdOutputObservation)
    apply_patch_output = obs.content
    assert isinstance(apply_patch_output, str)
    instance['test_result']['apply_patch_output'] = apply_patch_output

    try:
        if 'APPLY_PATCH_FAIL' in apply_patch_output:
            logger.info(f'[{instance_id}] {APPLY_PATCH_FAIL}:\n{apply_patch_output}')
            instance['test_result']['report']['failed_apply_patch'] = True
            return EvalOutput(
                instance_id=instance_id,
                test_result=instance['test_result'],
            )
        elif 'APPLY_PATCH_PASS' in apply_patch_output:
            logger.info(f'[{instance_id}] {APPLY_PATCH_PASS}:\n{apply_patch_output}')

            # Run eval script in background and save output to log file;
            # `echo $!` returns the background PID so we can poll it.
            log_file = '/tmp/eval_output.log'
            action = CmdRunAction(
                command=f'/tmp/eval.sh > {log_file} 2>&1 & echo $!', keep_prompt=False
            )
            action.timeout = 60  # Short timeout just to get the process ID
            obs = runtime.run_action(action)

            if isinstance(obs, CmdOutputObservation) and obs.exit_code == 0:
                pid = obs.content.split()[-1].strip()
                logger.info(
                    f'[{instance_id}] Evaluation process started with PID: {pid}'
                )

                # Poll for completion: `ps -p <pid>` exits 1 once the process
                # is gone; give up after the overall timeout.
                start_time = time.time()
                timeout = 900  # 15 minutes
                while True:
                    seconds_elapsed = time.time() - start_time
                    if seconds_elapsed > timeout:
                        logger.info(
                            f'[{instance_id}] Evaluation timed out after {timeout} seconds'
                        )
                        break
                    check_action = CmdRunAction(
                        command=f'ps -p {pid} > /dev/null; echo $?', keep_prompt=False
                    )
                    check_action.timeout = 60
                    check_obs = runtime.run_action(check_action)
                    if (
                        isinstance(check_obs, CmdOutputObservation)
                        and check_obs.content.split()[-1].strip() == '1'
                    ):
                        logger.info(
                            f'[{instance_id}] Evaluation process completed after {seconds_elapsed} seconds'
                        )
                        break
                    logger.info(
                        f'[{instance_id}] [{seconds_elapsed:.0f}s] Evaluation still running, waiting...'
                    )
                    time.sleep(30)  # Wait for 30 seconds before checking again

                # Read the log file
                cat_action = CmdRunAction(command=f'cat {log_file}', keep_prompt=False)
                cat_action.timeout = 300
                cat_obs = runtime.run_action(cat_action)

                # Grade answer
                # NOTE(review): if the cat fails, no report flag is set and the
                # instance silently stays unresolved — confirm that's intended.
                if isinstance(cat_obs, CmdOutputObservation) and cat_obs.exit_code == 0:
                    test_output = cat_obs.content
                    assert isinstance(test_output, str)
                    instance['test_result']['test_output'] = test_output

                    # Get report from test output
                    logger.info(f'[{instance_id}] Grading answer...')
                    with tempfile.TemporaryDirectory() as temp_dir:
                        # Create a directory structure that matches the expected format
                        # NOTE: this is a hack to make the eval report format consistent
                        # with the original SWE-Bench eval script
                        log_dir = os.path.join(temp_dir, 'logs', instance_id)
                        os.makedirs(log_dir, exist_ok=True)
                        test_output_path = os.path.join(log_dir, 'test_output.txt')
                        with open(test_output_path, 'w') as f:
                            f.write(test_output)
                        _report = get_eval_report(
                            test_spec=test_spec,
                            prediction={
                                'model_patch': model_patch,
                                'instance_id': instance_id,
                            },
                            log_path=test_output_path,
                            include_tests_status=True,
                        )
                        report = _report[instance_id]
                        logger.info(
                            f"[{instance_id}] report: {report}\nResult for {instance_id}: resolved: {report['resolved']}"
                        )
                        instance['test_result']['report']['resolved'] = report[
                            'resolved'
                        ]
            else:
                logger.info(f'[{instance_id}] Error when starting eval:\n{obs.content}')
                instance['test_result']['report']['error_eval'] = True

            return EvalOutput(
                instance_id=instance_id,
                test_result=instance['test_result'],
            )
        else:
            logger.info(
                f'[{instance_id}] Unexpected output when applying patch:\n{apply_patch_output}'
            )
            # NOTE(review): passing `logger` as a RuntimeError argument is
            # unusual — it ends up stringified in the exception args.
            raise RuntimeError(
                instance_id,
                f'Unexpected output when applying patch:\n{apply_patch_output}',
                logger,
            )
    finally:
        # Always tear down the sandbox once the try block is entered.
        runtime.close()
  231. if __name__ == '__main__':
  232. parser = get_parser()
  233. parser.add_argument(
  234. '--input-file',
  235. type=str,
  236. help='Path to input predictions file',
  237. required=True,
  238. )
  239. parser.add_argument(
  240. '--dataset',
  241. type=str,
  242. default='princeton-nlp/SWE-bench',
  243. help='data set to evaluate on, either full-test or lite-test',
  244. )
  245. parser.add_argument(
  246. '--split',
  247. type=str,
  248. default='test',
  249. help='split to evaluate on',
  250. )
  251. args, _ = parser.parse_known_args()
  252. # Load SWE-Bench dataset
  253. full_dataset: list[SWEbenchInstance] = load_swebench_dataset(
  254. args.dataset, args.split
  255. )
  256. instance_id_to_instance = {
  257. instance['instance_id']: instance for instance in full_dataset
  258. }
  259. logger.info(
  260. f'Loaded dataset {args.dataset} with split {args.split} to run inference on.'
  261. )
  262. # Load predictions
  263. assert args.input_file.endswith('.jsonl'), 'Input file must be a jsonl file.'
  264. predictions = pd.read_json(args.input_file, lines=True)
  265. assert (
  266. 'instance_id' in predictions.columns
  267. ), 'Input file must contain instance_id column.'
  268. if 'model_patch' not in predictions.columns and (
  269. 'test_result' in predictions.columns
  270. and 'model_patch' in predictions['test_result'].iloc[0]
  271. ):
  272. raise ValueError(
  273. 'Input file must contain model_patch column OR test_result column with model_patch field.'
  274. )
  275. assert len(predictions['instance_id'].unique()) == len(
  276. predictions
  277. ), 'instance_id column must be unique.'
  278. if 'model_patch' not in predictions.columns:
  279. predictions['model_patch'] = predictions['test_result'].apply(
  280. lambda x: x['git_patch']
  281. )
  282. assert {'instance_id', 'model_patch'}.issubset(
  283. set(predictions.columns)
  284. ), 'Input file must contain instance_id and model_patch columns.'
  285. # Merge predictions with dataset
  286. predictions['instance'] = predictions['instance_id'].apply(
  287. lambda x: instance_id_to_instance[x]
  288. )
  289. predictions['test_spec'] = predictions['instance'].apply(make_test_spec)
  290. # Prepare dataset
  291. output_file = args.input_file.replace('.jsonl', '.swebench_eval.jsonl')
  292. instances = prepare_dataset(predictions, output_file, args.eval_n_limit)
  293. run_evaluation(
  294. instances,
  295. metadata=None,
  296. output_file=output_file,
  297. num_workers=args.eval_num_workers,
  298. process_instance_func=process_instance,
  299. )
  300. # Load evaluated predictions & print number of resolved predictions
  301. evaluated_predictions = pd.read_json(output_file, lines=True)
  302. fields = ['resolved', 'failed_apply_patch', 'error_eval', 'empty_generation']
  303. def count_report_field(row, field):
  304. return row['test_result']['report'][field]
  305. for field in fields:
  306. count = evaluated_predictions.apply(
  307. count_report_field, args=(field,), axis=1
  308. ).sum()
  309. logger.info(
  310. f'# {field}: {count} / {len(evaluated_predictions)}. ({count / len(evaluated_predictions):.2%})'
  311. )