summarize_outputs.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300
  1. #!/usr/bin/env python3
  2. import argparse
  3. import glob
  4. import json
  5. import os
  6. import random
  7. from collections import Counter
  8. import numpy as np
  9. import pandas as pd
  10. from openhands.events.serialization import event_from_dict
  11. from openhands.events.utils import get_pairs_from_events
# Substrings that process_file() searches for in the raw text of an output
# line; the first keyword found is the one counted for that line.
ERROR_KEYWORDS = [
    'Agent encountered an error while processing the last action',
    'APIError',
    'Action execution failed',
    'litellm.Timeout: APITimeoutError',
]
  18. def get_bootstrap_accuracy_error_bars(
  19. values: float | int | bool, num_samples: int = 1000, p_value=0.05
  20. ) -> tuple[float, float]:
  21. sorted_vals = np.sort(
  22. [np.mean(random.sample(values, len(values) // 2)) for _ in range(num_samples)]
  23. )
  24. bottom_idx = int(num_samples * p_value / 2)
  25. top_idx = int(num_samples * (1.0 - p_value / 2))
  26. return (sorted_vals[bottom_idx], sorted_vals[top_idx])
  27. def process_file(file_path):
  28. with open(file_path, 'r') as file:
  29. lines = file.readlines()
  30. num_lines = len(lines)
  31. num_error_lines = 0
  32. num_agent_stuck_in_loop = 0
  33. num_resolved = 0
  34. resolved_arr = []
  35. num_empty_patch = 0
  36. num_unfinished_runs = 0
  37. error_counter = Counter()
  38. main_agent_cost = []
  39. editor_cost = []
  40. num_turns = []
  41. for line in lines:
  42. _d = json.loads(line)
  43. if 'metrics' not in _d or _d['metrics'] is None:
  44. # this is a failed run
  45. num_unfinished_runs += 1
  46. continue
  47. # Cost
  48. costs = _d['metrics'].get('costs', [])
  49. _cur_main_agent_cost = 0
  50. _cur_editor_cost = 0
  51. for cost in costs:
  52. if isinstance(cost, float):
  53. # backward compatible
  54. _cur_main_agent_cost += cost
  55. else:
  56. if 'draft_editor' in cost['model']:
  57. _cur_editor_cost += cost['cost']
  58. else:
  59. _cur_main_agent_cost += cost['cost']
  60. main_agent_cost.append(_cur_main_agent_cost)
  61. editor_cost.append(_cur_editor_cost)
  62. # Turn status
  63. history = _d.get('history', [])
  64. events = [event_from_dict(event) for event in history]
  65. pairs = get_pairs_from_events(events)
  66. num_turns.append(len(pairs))
  67. # Patch & resolve status
  68. patch = _d.get('test_result', {}).get('git_patch', '')
  69. if patch == '':
  70. num_empty_patch += 1
  71. continue
  72. report = _d.get('report', {}) or {}
  73. resolved = report.get('resolved', False)
  74. if resolved:
  75. num_resolved += 1
  76. resolved_arr.append(1)
  77. else:
  78. resolved_arr.append(0)
  79. # Error
  80. error = _d.get('error', None)
  81. if error is not None and isinstance(error, str):
  82. agent_stuck_in_loop = 'Agent got stuck in a loop' in error
  83. contains_error = bool(error) and not agent_stuck_in_loop
  84. if agent_stuck_in_loop:
  85. error_counter['Agent got stuck in a loop'] += 1
  86. num_agent_stuck_in_loop += 1
  87. elif contains_error:
  88. error_counter[error] += 1
  89. continue
  90. for keyword in ERROR_KEYWORDS:
  91. if keyword in line:
  92. error_counter[keyword] += 1
  93. num_error_lines += 1
  94. break
  95. return {
  96. 'file_path': file_path,
  97. 'total_instances': num_lines,
  98. 'resolved': {
  99. 'count': num_resolved,
  100. 'percentage': (num_resolved / num_lines * 100) if num_lines > 0 else 0,
  101. 'ci': tuple(
  102. x * 100 for x in get_bootstrap_accuracy_error_bars(resolved_arr)
  103. ),
  104. },
  105. 'empty_patches': {
  106. 'count': num_empty_patch,
  107. 'percentage': (num_empty_patch / num_lines * 100) if num_lines > 0 else 0,
  108. },
  109. 'unfinished_runs': {
  110. 'count': num_unfinished_runs,
  111. 'percentage': (num_unfinished_runs / num_lines * 100)
  112. if num_lines > 0
  113. else 0,
  114. },
  115. 'errors': {
  116. 'total': num_error_lines,
  117. 'percentage': (num_error_lines / num_lines * 100) if num_lines > 0 else 0,
  118. 'stuck_in_loop': {
  119. 'count': num_agent_stuck_in_loop,
  120. 'percentage': (num_agent_stuck_in_loop / num_lines * 100)
  121. if num_lines > 0
  122. else 0,
  123. },
  124. 'breakdown': {
  125. str(error): {
  126. 'count': count,
  127. 'percentage': (count / num_lines * 100) if num_lines > 0 else 0,
  128. }
  129. for error, count in error_counter.items()
  130. },
  131. },
  132. 'costs': {
  133. 'main_agent': sum(main_agent_cost),
  134. 'editor': sum(editor_cost),
  135. 'total': sum(main_agent_cost) + sum(editor_cost),
  136. },
  137. 'statistics': {
  138. 'avg_turns': sum(num_turns) / num_lines if num_lines > 0 else 0,
  139. 'costs': {
  140. 'main_agent': sum(main_agent_cost) / num_lines if num_lines > 0 else 0,
  141. 'editor': sum(editor_cost) / num_lines if num_lines > 0 else 0,
  142. 'total': (sum(main_agent_cost) + sum(editor_cost)) / num_lines
  143. if num_lines > 0
  144. else 0,
  145. },
  146. },
  147. }
  148. def aggregate_directory(input_path) -> pd.DataFrame:
  149. # Process all output.jsonl files in subdirectories
  150. pattern = os.path.join(input_path, '**/output.jsonl')
  151. files = glob.glob(pattern, recursive=True)
  152. print(f'Processing {len(files)} files from directory {input_path}')
  153. # Process each file silently and collect results
  154. results = []
  155. for file_path in files:
  156. try:
  157. result = process_file(file_path)
  158. results.append(result)
  159. except Exception as e:
  160. print(f'Error processing {file_path}: {str(e)}')
  161. import traceback
  162. traceback.print_exc()
  163. continue
  164. # Convert results to pandas DataFrame and sort by resolve rate
  165. df = pd.DataFrame(results)
  166. # Extract directory name from file path
  167. df['directory'] = df['file_path'].apply(
  168. lambda x: os.path.basename(os.path.dirname(x))
  169. )
  170. df['resolve_rate'] = df['resolved'].apply(lambda x: x['percentage'])
  171. df['resolve_rate_ci'] = df['resolved'].apply(lambda x: x['ci'])
  172. df['empty_patch_rate'] = df['empty_patches'].apply(lambda x: x['percentage'])
  173. df['unfinished_rate'] = df['unfinished_runs'].apply(lambda x: x['percentage'])
  174. df['avg_turns'] = df['statistics'].apply(lambda x: x['avg_turns'])
  175. df['error_rate'] = df['errors'].apply(lambda x: x['percentage'])
  176. df['avg_cost'] = df['statistics'].apply(lambda x: x['costs']['total'])
  177. df = df.sort_values('resolve_rate', ascending=False)
  178. return df
  179. if __name__ == '__main__':
  180. parser = argparse.ArgumentParser()
  181. parser.add_argument(
  182. 'input_path', type=str, help='The file or directory to summarize'
  183. )
  184. parser.add_argument(
  185. '--output',
  186. type=str,
  187. help='Output JSONL file for results',
  188. default='summary_results.jsonl',
  189. )
  190. args = parser.parse_args()
  191. if os.path.isdir(args.input_path):
  192. df = aggregate_directory(args.input_path)
  193. # Create the summary string
  194. columns = [
  195. 'directory',
  196. 'resolve_rate',
  197. 'empty_patch_rate',
  198. 'unfinished_rate',
  199. 'error_rate',
  200. 'avg_turns',
  201. 'avg_cost',
  202. 'total_instances',
  203. ]
  204. summary_str = df[columns].to_string(
  205. float_format=lambda x: '{:.2f}'.format(x),
  206. formatters={
  207. 'directory': lambda x: x[:90]
  208. }, # Truncate directory names to 20 chars
  209. index=False,
  210. )
  211. # Print to console
  212. print('\nResults summary (sorted by resolve rate):')
  213. print(summary_str)
  214. # Save to text file
  215. txt_output = args.output.rsplit('.', 1)[0] + '.txt'
  216. with open(txt_output, 'w') as f:
  217. f.write('Results summary (sorted by resolve rate):\n')
  218. f.write(summary_str)
  219. # Save
  220. df.to_json(args.output, lines=True, orient='records')
  221. df[columns].to_csv(args.output.rsplit('.', 1)[0] + '.csv', index=False)
  222. else:
  223. # Process single file with detailed output
  224. results = []
  225. try:
  226. result = process_file(args.input_path)
  227. results.append(result)
  228. # Print detailed results for single file
  229. print(f'\nResults for {args.input_path}:')
  230. print(
  231. f"Number of resolved: {result['resolved']['count']} / {result['total_instances']} ({result['resolved']['percentage']:.2f}% [{result['resolved']['ci'][0]:.2f}%, {result['resolved']['ci'][1]:.2f}%])"
  232. )
  233. print(
  234. f"Number of empty patch: {result['empty_patches']['count']} / {result['total_instances']} ({result['empty_patches']['percentage']:.2f}%)"
  235. )
  236. print(
  237. f"Number of error lines: {result['errors']['total']} / {result['total_instances']} ({result['errors']['percentage']:.2f}%)"
  238. )
  239. print(
  240. f"Number of agent stuck in loop: {result['errors']['stuck_in_loop']['count']} / {result['total_instances']} ({result['errors']['stuck_in_loop']['percentage']:.2f}%)"
  241. )
  242. print(
  243. f"Number of unfinished runs: {result['unfinished_runs']['count']} / {result['total_instances']} ({result['unfinished_runs']['percentage']:.2f}%)"
  244. )
  245. print(f"Total cost: {result['costs']['total']:.2f} USD")
  246. print('## Statistics')
  247. print(
  248. f"Avg. num of turns per instance: {result['statistics']['avg_turns']:.2f}"
  249. )
  250. print(
  251. f"Avg. agent cost per instance: {result['statistics']['costs']['main_agent']:.2f} USD"
  252. )
  253. print(
  254. f"Avg. editor cost per instance: {result['statistics']['costs']['editor']:.2f} USD"
  255. )
  256. print(
  257. f"Avg. total cost per instance: {result['statistics']['costs']['total']:.2f} USD"
  258. )
  259. print('## Detailed error breakdown:')
  260. for error, data in result['errors']['breakdown'].items():
  261. print(f"{error}: {data['count']} ({data['percentage']:.2f}%)")
  262. except Exception as e:
  263. print(f'Error processing {args.input_path}: {str(e)}')