# summarize_outputs.py
  1. #!/usr/bin/env python3
  2. import argparse
  3. import glob
  4. import json
  5. import os
  6. from collections import Counter
  7. import pandas as pd
  8. from openhands.events.serialization import event_from_dict
  9. from openhands.events.utils import get_pairs_from_events
  10. ERROR_KEYWORDS = [
  11. 'Agent encountered an error while processing the last action',
  12. 'APIError',
  13. 'Action execution failed',
  14. 'litellm.Timeout: APITimeoutError',
  15. ]
  16. def process_file(file_path):
  17. with open(file_path, 'r') as file:
  18. lines = file.readlines()
  19. num_lines = len(lines)
  20. num_error_lines = 0
  21. num_agent_stuck_in_loop = 0
  22. num_resolved = 0
  23. num_empty_patch = 0
  24. num_unfinished_runs = 0
  25. error_counter = Counter()
  26. main_agent_cost = []
  27. editor_cost = []
  28. num_turns = []
  29. for line in lines:
  30. _d = json.loads(line)
  31. if 'metrics' not in _d or _d['metrics'] is None:
  32. # this is a failed run
  33. num_unfinished_runs += 1
  34. continue
  35. # Cost
  36. costs = _d['metrics'].get('costs', [])
  37. _cur_main_agent_cost = 0
  38. _cur_editor_cost = 0
  39. for cost in costs:
  40. if isinstance(cost, float):
  41. # backward compatible
  42. _cur_main_agent_cost += cost
  43. else:
  44. if 'draft_editor' in cost['model']:
  45. _cur_editor_cost += cost['cost']
  46. else:
  47. _cur_main_agent_cost += cost['cost']
  48. main_agent_cost.append(_cur_main_agent_cost)
  49. editor_cost.append(_cur_editor_cost)
  50. # Turn status
  51. history = _d.get('history', [])
  52. events = [event_from_dict(event) for event in history]
  53. pairs = get_pairs_from_events(events)
  54. num_turns.append(len(pairs))
  55. # Patch & resolve status
  56. patch = _d.get('test_result', {}).get('git_patch', '')
  57. if patch == '':
  58. num_empty_patch += 1
  59. continue
  60. report = _d.get('report', {}) or {}
  61. resolved = report.get('resolved', False)
  62. if resolved:
  63. num_resolved += 1
  64. # Error
  65. error = _d.get('error', None)
  66. if error is not None and isinstance(error, str):
  67. agent_stuck_in_loop = 'Agent got stuck in a loop' in error
  68. contains_error = bool(error) and not agent_stuck_in_loop
  69. if agent_stuck_in_loop:
  70. error_counter['Agent got stuck in a loop'] += 1
  71. num_agent_stuck_in_loop += 1
  72. elif contains_error:
  73. error_counter[error] += 1
  74. continue
  75. for keyword in ERROR_KEYWORDS:
  76. if keyword in line:
  77. error_counter[keyword] += 1
  78. num_error_lines += 1
  79. break
  80. return {
  81. 'file_path': file_path,
  82. 'total_instances': num_lines,
  83. 'resolved': {
  84. 'count': num_resolved,
  85. 'percentage': (num_resolved / num_lines * 100) if num_lines > 0 else 0,
  86. },
  87. 'empty_patches': {
  88. 'count': num_empty_patch,
  89. 'percentage': (num_empty_patch / num_lines * 100) if num_lines > 0 else 0,
  90. },
  91. 'unfinished_runs': {
  92. 'count': num_unfinished_runs,
  93. 'percentage': (num_unfinished_runs / num_lines * 100)
  94. if num_lines > 0
  95. else 0,
  96. },
  97. 'errors': {
  98. 'total': num_error_lines,
  99. 'percentage': (num_error_lines / num_lines * 100) if num_lines > 0 else 0,
  100. 'stuck_in_loop': {
  101. 'count': num_agent_stuck_in_loop,
  102. 'percentage': (num_agent_stuck_in_loop / num_lines * 100)
  103. if num_lines > 0
  104. else 0,
  105. },
  106. 'breakdown': {
  107. str(error): {
  108. 'count': count,
  109. 'percentage': (count / num_lines * 100) if num_lines > 0 else 0,
  110. }
  111. for error, count in error_counter.items()
  112. },
  113. },
  114. 'statistics': {
  115. 'avg_turns': sum(num_turns) / num_lines if num_lines > 0 else 0,
  116. 'costs': {
  117. 'main_agent': sum(main_agent_cost) / num_lines if num_lines > 0 else 0,
  118. 'editor': sum(editor_cost) / num_lines if num_lines > 0 else 0,
  119. 'total': (sum(main_agent_cost) + sum(editor_cost)) / num_lines
  120. if num_lines > 0
  121. else 0,
  122. },
  123. },
  124. }
  125. def aggregate_directory(input_path) -> pd.DataFrame:
  126. # Process all output.jsonl files in subdirectories
  127. pattern = os.path.join(input_path, '**/output.jsonl')
  128. files = glob.glob(pattern, recursive=True)
  129. print(f'Processing {len(files)} files from directory {input_path}')
  130. # Process each file silently and collect results
  131. results = []
  132. for file_path in files:
  133. try:
  134. result = process_file(file_path)
  135. results.append(result)
  136. except Exception as e:
  137. print(f'Error processing {file_path}: {str(e)}')
  138. import traceback
  139. traceback.print_exc()
  140. continue
  141. # Convert results to pandas DataFrame and sort by resolve rate
  142. df = pd.DataFrame(results)
  143. # Extract directory name from file path
  144. df['directory'] = df['file_path'].apply(
  145. lambda x: os.path.basename(os.path.dirname(x))
  146. )
  147. df['resolve_rate'] = df['resolved'].apply(lambda x: x['percentage'])
  148. df['empty_patch_rate'] = df['empty_patches'].apply(lambda x: x['percentage'])
  149. df['unfinished_rate'] = df['unfinished_runs'].apply(lambda x: x['percentage'])
  150. df['avg_turns'] = df['statistics'].apply(lambda x: x['avg_turns'])
  151. df['error_rate'] = df['errors'].apply(lambda x: x['percentage'])
  152. df['avg_cost'] = df['statistics'].apply(lambda x: x['costs']['total'])
  153. df = df.sort_values('resolve_rate', ascending=False)
  154. return df
  155. if __name__ == '__main__':
  156. parser = argparse.ArgumentParser()
  157. parser.add_argument(
  158. 'input_path', type=str, help='The file or directory to summarize'
  159. )
  160. parser.add_argument(
  161. '--output',
  162. type=str,
  163. help='Output JSONL file for results',
  164. default='summary_results.jsonl',
  165. )
  166. args = parser.parse_args()
  167. if os.path.isdir(args.input_path):
  168. df = aggregate_directory(args.input_path)
  169. # Create the summary string
  170. columns = [
  171. 'directory',
  172. 'resolve_rate',
  173. 'empty_patch_rate',
  174. 'unfinished_rate',
  175. 'error_rate',
  176. 'avg_turns',
  177. 'avg_cost',
  178. 'total_instances',
  179. ]
  180. summary_str = df[columns].to_string(
  181. float_format=lambda x: '{:.2f}'.format(x),
  182. formatters={
  183. 'directory': lambda x: x[:90]
  184. }, # Truncate directory names to 20 chars
  185. index=False,
  186. )
  187. # Print to console
  188. print('\nResults summary (sorted by resolve rate):')
  189. print(summary_str)
  190. # Save to text file
  191. txt_output = args.output.rsplit('.', 1)[0] + '.txt'
  192. with open(txt_output, 'w') as f:
  193. f.write('Results summary (sorted by resolve rate):\n')
  194. f.write(summary_str)
  195. # Save
  196. df.to_json(args.output, lines=True, orient='records')
  197. df[columns].to_csv(args.output.rsplit('.', 1)[0] + '.csv', index=False)
  198. else:
  199. # Process single file with detailed output
  200. results = []
  201. try:
  202. result = process_file(args.input_path)
  203. results.append(result)
  204. # Print detailed results for single file
  205. print(f'\nResults for {args.input_path}:')
  206. print(
  207. f"Number of resolved: {result['resolved']['count']} / {result['total_instances']} ({result['resolved']['percentage']:.2f}%)"
  208. )
  209. print(
  210. f"Number of empty patch: {result['empty_patches']['count']} / {result['total_instances']} ({result['empty_patches']['percentage']:.2f}%)"
  211. )
  212. print(
  213. f"Number of error lines: {result['errors']['total']} / {result['total_instances']} ({result['errors']['percentage']:.2f}%)"
  214. )
  215. print(
  216. f"Number of agent stuck in loop: {result['errors']['stuck_in_loop']['count']} / {result['total_instances']} ({result['errors']['stuck_in_loop']['percentage']:.2f}%)"
  217. )
  218. print(
  219. f"Number of unfinished runs: {result['unfinished_runs']['count']} / {result['total_instances']} ({result['unfinished_runs']['percentage']:.2f}%)"
  220. )
  221. print('## Statistics')
  222. print(
  223. f"Avg. num of turns per instance: {result['statistics']['avg_turns']:.2f}"
  224. )
  225. print(
  226. f"Avg. agent cost per instance: {result['statistics']['costs']['main_agent']:.2f} USD"
  227. )
  228. print(
  229. f"Avg. editor cost per instance: {result['statistics']['costs']['editor']:.2f} USD"
  230. )
  231. print(
  232. f"Avg. total cost per instance: {result['statistics']['costs']['total']:.2f} USD"
  233. )
  234. print('## Detailed error breakdown:')
  235. for error, data in result['errors']['breakdown'].items():
  236. print(f"{error}: {data['count']} ({data['percentage']:.2f}%)")
  237. except Exception as e:
  238. print(f'Error processing {args.input_path}: {str(e)}')