summarize_outputs.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299
  1. #!/usr/bin/env python3
  2. import argparse
  3. import glob
  4. import json
  5. import os
  6. from collections import Counter
  7. import pandas as pd
  8. import random
  9. import numpy as np
  10. from openhands.events.serialization import event_from_dict
  11. from openhands.events.utils import get_pairs_from_events
  12. ERROR_KEYWORDS = [
  13. 'Agent encountered an error while processing the last action',
  14. 'APIError',
  15. 'Action execution failed',
  16. 'litellm.Timeout: APITimeoutError',
  17. ]
  18. def get_bootstrap_accuracy_error_bars(values: float | int | bool, num_samples: int = 1000, p_value=0.05) -> tuple[float, float]:
  19. sorted_vals = np.sort(
  20. [
  21. np.mean(random.sample(values, len(values) // 2))
  22. for _ in range(num_samples)
  23. ]
  24. )
  25. bottom_idx = int(num_samples * p_value / 2)
  26. top_idx = int(num_samples * (1.0 - p_value / 2))
  27. return (sorted_vals[bottom_idx], sorted_vals[top_idx])
  28. def process_file(file_path):
  29. with open(file_path, 'r') as file:
  30. lines = file.readlines()
  31. num_lines = len(lines)
  32. num_error_lines = 0
  33. num_agent_stuck_in_loop = 0
  34. num_resolved = 0
  35. resolved_arr = []
  36. num_empty_patch = 0
  37. num_unfinished_runs = 0
  38. error_counter = Counter()
  39. main_agent_cost = []
  40. editor_cost = []
  41. num_turns = []
  42. for line in lines:
  43. _d = json.loads(line)
  44. if 'metrics' not in _d or _d['metrics'] is None:
  45. # this is a failed run
  46. num_unfinished_runs += 1
  47. continue
  48. # Cost
  49. costs = _d['metrics'].get('costs', [])
  50. _cur_main_agent_cost = 0
  51. _cur_editor_cost = 0
  52. for cost in costs:
  53. if isinstance(cost, float):
  54. # backward compatible
  55. _cur_main_agent_cost += cost
  56. else:
  57. if 'draft_editor' in cost['model']:
  58. _cur_editor_cost += cost['cost']
  59. else:
  60. _cur_main_agent_cost += cost['cost']
  61. main_agent_cost.append(_cur_main_agent_cost)
  62. editor_cost.append(_cur_editor_cost)
  63. # Turn status
  64. history = _d.get('history', [])
  65. events = [event_from_dict(event) for event in history]
  66. pairs = get_pairs_from_events(events)
  67. num_turns.append(len(pairs))
  68. # Patch & resolve status
  69. patch = _d.get('test_result', {}).get('git_patch', '')
  70. if patch == '':
  71. num_empty_patch += 1
  72. continue
  73. report = _d.get('report', {}) or {}
  74. resolved = report.get('resolved', False)
  75. if resolved:
  76. num_resolved += 1
  77. resolved_arr.append(1)
  78. else:
  79. resolved_arr.append(0)
  80. # Error
  81. error = _d.get('error', None)
  82. if error is not None and isinstance(error, str):
  83. agent_stuck_in_loop = 'Agent got stuck in a loop' in error
  84. contains_error = bool(error) and not agent_stuck_in_loop
  85. if agent_stuck_in_loop:
  86. error_counter['Agent got stuck in a loop'] += 1
  87. num_agent_stuck_in_loop += 1
  88. elif contains_error:
  89. error_counter[error] += 1
  90. continue
  91. for keyword in ERROR_KEYWORDS:
  92. if keyword in line:
  93. error_counter[keyword] += 1
  94. num_error_lines += 1
  95. break
  96. return {
  97. 'file_path': file_path,
  98. 'total_instances': num_lines,
  99. 'resolved': {
  100. 'count': num_resolved,
  101. 'percentage': (num_resolved / num_lines * 100) if num_lines > 0 else 0,
  102. 'ci': tuple(x * 100 for x in get_bootstrap_accuracy_error_bars(resolved_arr)),
  103. },
  104. 'empty_patches': {
  105. 'count': num_empty_patch,
  106. 'percentage': (num_empty_patch / num_lines * 100) if num_lines > 0 else 0,
  107. },
  108. 'unfinished_runs': {
  109. 'count': num_unfinished_runs,
  110. 'percentage': (num_unfinished_runs / num_lines * 100)
  111. if num_lines > 0
  112. else 0,
  113. },
  114. 'errors': {
  115. 'total': num_error_lines,
  116. 'percentage': (num_error_lines / num_lines * 100) if num_lines > 0 else 0,
  117. 'stuck_in_loop': {
  118. 'count': num_agent_stuck_in_loop,
  119. 'percentage': (num_agent_stuck_in_loop / num_lines * 100)
  120. if num_lines > 0
  121. else 0,
  122. },
  123. 'breakdown': {
  124. str(error): {
  125. 'count': count,
  126. 'percentage': (count / num_lines * 100) if num_lines > 0 else 0,
  127. }
  128. for error, count in error_counter.items()
  129. },
  130. },
  131. 'costs': {
  132. 'main_agent': sum(main_agent_cost),
  133. 'editor': sum(editor_cost),
  134. 'total': sum(main_agent_cost) + sum(editor_cost),
  135. },
  136. 'statistics': {
  137. 'avg_turns': sum(num_turns) / num_lines if num_lines > 0 else 0,
  138. 'costs': {
  139. 'main_agent': sum(main_agent_cost) / num_lines if num_lines > 0 else 0,
  140. 'editor': sum(editor_cost) / num_lines if num_lines > 0 else 0,
  141. 'total': (sum(main_agent_cost) + sum(editor_cost)) / num_lines
  142. if num_lines > 0
  143. else 0,
  144. },
  145. },
  146. }
  147. def aggregate_directory(input_path) -> pd.DataFrame:
  148. # Process all output.jsonl files in subdirectories
  149. pattern = os.path.join(input_path, '**/output.jsonl')
  150. files = glob.glob(pattern, recursive=True)
  151. print(f'Processing {len(files)} files from directory {input_path}')
  152. # Process each file silently and collect results
  153. results = []
  154. for file_path in files:
  155. try:
  156. result = process_file(file_path)
  157. results.append(result)
  158. except Exception as e:
  159. print(f'Error processing {file_path}: {str(e)}')
  160. import traceback
  161. traceback.print_exc()
  162. continue
  163. # Convert results to pandas DataFrame and sort by resolve rate
  164. df = pd.DataFrame(results)
  165. # Extract directory name from file path
  166. df['directory'] = df['file_path'].apply(
  167. lambda x: os.path.basename(os.path.dirname(x))
  168. )
  169. df['resolve_rate'] = df['resolved'].apply(lambda x: x['percentage'])
  170. df['resolve_rate_ci'] = df['resolved'].apply(lambda x: x['ci'])
  171. df['empty_patch_rate'] = df['empty_patches'].apply(lambda x: x['percentage'])
  172. df['unfinished_rate'] = df['unfinished_runs'].apply(lambda x: x['percentage'])
  173. df['avg_turns'] = df['statistics'].apply(lambda x: x['avg_turns'])
  174. df['error_rate'] = df['errors'].apply(lambda x: x['percentage'])
  175. df['avg_cost'] = df['statistics'].apply(lambda x: x['costs']['total'])
  176. df = df.sort_values('resolve_rate', ascending=False)
  177. return df
  178. if __name__ == '__main__':
  179. parser = argparse.ArgumentParser()
  180. parser.add_argument(
  181. 'input_path', type=str, help='The file or directory to summarize'
  182. )
  183. parser.add_argument(
  184. '--output',
  185. type=str,
  186. help='Output JSONL file for results',
  187. default='summary_results.jsonl',
  188. )
  189. args = parser.parse_args()
  190. if os.path.isdir(args.input_path):
  191. df = aggregate_directory(args.input_path)
  192. # Create the summary string
  193. columns = [
  194. 'directory',
  195. 'resolve_rate',
  196. 'empty_patch_rate',
  197. 'unfinished_rate',
  198. 'error_rate',
  199. 'avg_turns',
  200. 'avg_cost',
  201. 'total_instances',
  202. ]
  203. summary_str = df[columns].to_string(
  204. float_format=lambda x: '{:.2f}'.format(x),
  205. formatters={
  206. 'directory': lambda x: x[:90]
  207. }, # Truncate directory names to 20 chars
  208. index=False,
  209. )
  210. # Print to console
  211. print('\nResults summary (sorted by resolve rate):')
  212. print(summary_str)
  213. # Save to text file
  214. txt_output = args.output.rsplit('.', 1)[0] + '.txt'
  215. with open(txt_output, 'w') as f:
  216. f.write('Results summary (sorted by resolve rate):\n')
  217. f.write(summary_str)
  218. # Save
  219. df.to_json(args.output, lines=True, orient='records')
  220. df[columns].to_csv(args.output.rsplit('.', 1)[0] + '.csv', index=False)
  221. else:
  222. # Process single file with detailed output
  223. results = []
  224. try:
  225. result = process_file(args.input_path)
  226. results.append(result)
  227. # Print detailed results for single file
  228. print(f'\nResults for {args.input_path}:')
  229. print(
  230. f"Number of resolved: {result['resolved']['count']} / {result['total_instances']} ({result['resolved']['percentage']:.2f}% [{result['resolved']['ci'][0]:.2f}%, {result['resolved']['ci'][1]:.2f}%])"
  231. )
  232. print(
  233. f"Number of empty patch: {result['empty_patches']['count']} / {result['total_instances']} ({result['empty_patches']['percentage']:.2f}%)"
  234. )
  235. print(
  236. f"Number of error lines: {result['errors']['total']} / {result['total_instances']} ({result['errors']['percentage']:.2f}%)"
  237. )
  238. print(
  239. f"Number of agent stuck in loop: {result['errors']['stuck_in_loop']['count']} / {result['total_instances']} ({result['errors']['stuck_in_loop']['percentage']:.2f}%)"
  240. )
  241. print(
  242. f"Number of unfinished runs: {result['unfinished_runs']['count']} / {result['total_instances']} ({result['unfinished_runs']['percentage']:.2f}%)"
  243. )
  244. print(f"Total cost: {result['costs']['total']:.2f} USD")
  245. print('## Statistics')
  246. print(
  247. f"Avg. num of turns per instance: {result['statistics']['avg_turns']:.2f}"
  248. )
  249. print(
  250. f"Avg. agent cost per instance: {result['statistics']['costs']['main_agent']:.2f} USD"
  251. )
  252. print(
  253. f"Avg. editor cost per instance: {result['statistics']['costs']['editor']:.2f} USD"
  254. )
  255. print(
  256. f"Avg. total cost per instance: {result['statistics']['costs']['total']:.2f} USD"
  257. )
  258. print('## Detailed error breakdown:')
  259. for error, data in result['errors']['breakdown'].items():
  260. print(f"{error}: {data['count']} ({data['percentage']:.2f}%)")
  261. except Exception as e:
  262. print(f'Error processing {args.input_path}: {str(e)}')