summarize_outputs.py 9.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279
  1. #!/usr/bin/env python3
  2. import argparse
  3. import glob
  4. import json
  5. import os
  6. from collections import Counter
  7. import pandas as pd
  8. from openhands.events.serialization import event_from_dict
  9. from openhands.events.utils import get_pairs_from_events
# Substrings scanned for in each raw JSONL line to classify failures when the
# record carries no structured 'error' field (see the keyword loop in
# process_file). Matching is plain substring search on the raw line text.
ERROR_KEYWORDS = [
    'Agent encountered an error while processing the last action',
    'APIError',
    'Action execution failed',
    'litellm.Timeout: APITimeoutError',
]
def process_file(file_path):
    """Summarize a single evaluation ``output.jsonl`` file.

    Each line of the file is one JSON record describing an evaluated
    instance. Returns a nested dict of counts and percentages covering
    resolution status, empty patches, unfinished runs, an error breakdown,
    plus total and per-instance cost/turn statistics. All percentages use
    the total line count as the denominator.
    """
    with open(file_path, 'r') as file:
        lines = file.readlines()
    num_lines = len(lines)
    num_error_lines = 0
    num_agent_stuck_in_loop = 0
    num_resolved = 0
    num_empty_patch = 0
    num_unfinished_runs = 0
    error_counter = Counter()
    main_agent_cost = []
    editor_cost = []
    num_turns = []
    for line in lines:
        # NOTE(review): a malformed JSON line raises here and aborts the
        # whole file (the caller catches per-file exceptions).
        _d = json.loads(line)
        if 'metrics' not in _d or _d['metrics'] is None:
            # this is a failed run
            num_unfinished_runs += 1
            continue
        # Cost
        costs = _d['metrics'].get('costs', [])
        _cur_main_agent_cost = 0
        _cur_editor_cost = 0
        for cost in costs:
            if isinstance(cost, float):
                # backward compatible: older records store bare floats,
                # all attributed to the main agent
                _cur_main_agent_cost += cost
            else:
                # newer format: dict — split by model name; presumably
                # always has 'model' and 'cost' keys (TODO confirm)
                if 'draft_editor' in cost['model']:
                    _cur_editor_cost += cost['cost']
                else:
                    _cur_main_agent_cost += cost['cost']
        main_agent_cost.append(_cur_main_agent_cost)
        editor_cost.append(_cur_editor_cost)
        # Turn status: one "turn" is an (action, observation) pair
        # reconstructed from the serialized event history
        history = _d.get('history', [])
        events = [event_from_dict(event) for event in history]
        pairs = get_pairs_from_events(events)
        num_turns.append(len(pairs))
        # Patch & resolve status
        patch = _d.get('test_result', {}).get('git_patch', '')
        if patch == '':
            # empty-patch rows are counted but skip the error scan below
            num_empty_patch += 1
            continue
        report = _d.get('report', {}) or {}
        resolved = report.get('resolved', False)
        if resolved:
            num_resolved += 1
        # Error: prefer the structured 'error' field when present; such
        # rows skip the keyword scan (note: structured errors other than
        # stuck-in-loop do not increment num_error_lines)
        error = _d.get('error', None)
        if error is not None and isinstance(error, str):
            agent_stuck_in_loop = 'Agent got stuck in a loop' in error
            contains_error = bool(error) and not agent_stuck_in_loop
            if agent_stuck_in_loop:
                error_counter['Agent got stuck in a loop'] += 1
                num_agent_stuck_in_loop += 1
            elif contains_error:
                error_counter[error] += 1
            continue
        # Fallback: no structured error — scan the raw line for known
        # error signatures; first match wins
        for keyword in ERROR_KEYWORDS:
            if keyword in line:
                error_counter[keyword] += 1
                num_error_lines += 1
                break
    return {
        'file_path': file_path,
        'total_instances': num_lines,
        'resolved': {
            'count': num_resolved,
            'percentage': (num_resolved / num_lines * 100) if num_lines > 0 else 0,
        },
        'empty_patches': {
            'count': num_empty_patch,
            'percentage': (num_empty_patch / num_lines * 100) if num_lines > 0 else 0,
        },
        'unfinished_runs': {
            'count': num_unfinished_runs,
            'percentage': (num_unfinished_runs / num_lines * 100)
            if num_lines > 0
            else 0,
        },
        'errors': {
            'total': num_error_lines,
            'percentage': (num_error_lines / num_lines * 100) if num_lines > 0 else 0,
            'stuck_in_loop': {
                'count': num_agent_stuck_in_loop,
                'percentage': (num_agent_stuck_in_loop / num_lines * 100)
                if num_lines > 0
                else 0,
            },
            'breakdown': {
                str(error): {
                    'count': count,
                    'percentage': (count / num_lines * 100) if num_lines > 0 else 0,
                }
                for error, count in error_counter.items()
            },
        },
        'costs': {
            'main_agent': sum(main_agent_cost),
            'editor': sum(editor_cost),
            'total': sum(main_agent_cost) + sum(editor_cost),
        },
        # Averages are over ALL lines, including unfinished runs that
        # contributed no turns/costs
        'statistics': {
            'avg_turns': sum(num_turns) / num_lines if num_lines > 0 else 0,
            'costs': {
                'main_agent': sum(main_agent_cost) / num_lines if num_lines > 0 else 0,
                'editor': sum(editor_cost) / num_lines if num_lines > 0 else 0,
                'total': (sum(main_agent_cost) + sum(editor_cost)) / num_lines
                if num_lines > 0
                else 0,
            },
        },
    }
  130. def aggregate_directory(input_path) -> pd.DataFrame:
  131. # Process all output.jsonl files in subdirectories
  132. pattern = os.path.join(input_path, '**/output.jsonl')
  133. files = glob.glob(pattern, recursive=True)
  134. print(f'Processing {len(files)} files from directory {input_path}')
  135. # Process each file silently and collect results
  136. results = []
  137. for file_path in files:
  138. try:
  139. result = process_file(file_path)
  140. results.append(result)
  141. except Exception as e:
  142. print(f'Error processing {file_path}: {str(e)}')
  143. import traceback
  144. traceback.print_exc()
  145. continue
  146. # Convert results to pandas DataFrame and sort by resolve rate
  147. df = pd.DataFrame(results)
  148. # Extract directory name from file path
  149. df['directory'] = df['file_path'].apply(
  150. lambda x: os.path.basename(os.path.dirname(x))
  151. )
  152. df['resolve_rate'] = df['resolved'].apply(lambda x: x['percentage'])
  153. df['empty_patch_rate'] = df['empty_patches'].apply(lambda x: x['percentage'])
  154. df['unfinished_rate'] = df['unfinished_runs'].apply(lambda x: x['percentage'])
  155. df['avg_turns'] = df['statistics'].apply(lambda x: x['avg_turns'])
  156. df['error_rate'] = df['errors'].apply(lambda x: x['percentage'])
  157. df['avg_cost'] = df['statistics'].apply(lambda x: x['costs']['total'])
  158. df = df.sort_values('resolve_rate', ascending=False)
  159. return df
  160. if __name__ == '__main__':
  161. parser = argparse.ArgumentParser()
  162. parser.add_argument(
  163. 'input_path', type=str, help='The file or directory to summarize'
  164. )
  165. parser.add_argument(
  166. '--output',
  167. type=str,
  168. help='Output JSONL file for results',
  169. default='summary_results.jsonl',
  170. )
  171. args = parser.parse_args()
  172. if os.path.isdir(args.input_path):
  173. df = aggregate_directory(args.input_path)
  174. # Create the summary string
  175. columns = [
  176. 'directory',
  177. 'resolve_rate',
  178. 'empty_patch_rate',
  179. 'unfinished_rate',
  180. 'error_rate',
  181. 'avg_turns',
  182. 'avg_cost',
  183. 'total_instances',
  184. ]
  185. summary_str = df[columns].to_string(
  186. float_format=lambda x: '{:.2f}'.format(x),
  187. formatters={
  188. 'directory': lambda x: x[:90]
  189. }, # Truncate directory names to 20 chars
  190. index=False,
  191. )
  192. # Print to console
  193. print('\nResults summary (sorted by resolve rate):')
  194. print(summary_str)
  195. # Save to text file
  196. txt_output = args.output.rsplit('.', 1)[0] + '.txt'
  197. with open(txt_output, 'w') as f:
  198. f.write('Results summary (sorted by resolve rate):\n')
  199. f.write(summary_str)
  200. # Save
  201. df.to_json(args.output, lines=True, orient='records')
  202. df[columns].to_csv(args.output.rsplit('.', 1)[0] + '.csv', index=False)
  203. else:
  204. # Process single file with detailed output
  205. results = []
  206. try:
  207. result = process_file(args.input_path)
  208. results.append(result)
  209. # Print detailed results for single file
  210. print(f'\nResults for {args.input_path}:')
  211. print(
  212. f"Number of resolved: {result['resolved']['count']} / {result['total_instances']} ({result['resolved']['percentage']:.2f}%)"
  213. )
  214. print(
  215. f"Number of empty patch: {result['empty_patches']['count']} / {result['total_instances']} ({result['empty_patches']['percentage']:.2f}%)"
  216. )
  217. print(
  218. f"Number of error lines: {result['errors']['total']} / {result['total_instances']} ({result['errors']['percentage']:.2f}%)"
  219. )
  220. print(
  221. f"Number of agent stuck in loop: {result['errors']['stuck_in_loop']['count']} / {result['total_instances']} ({result['errors']['stuck_in_loop']['percentage']:.2f}%)"
  222. )
  223. print(
  224. f"Number of unfinished runs: {result['unfinished_runs']['count']} / {result['total_instances']} ({result['unfinished_runs']['percentage']:.2f}%)"
  225. )
  226. print(f"Total cost: {result['costs']['total']:.2f} USD")
  227. print('## Statistics')
  228. print(
  229. f"Avg. num of turns per instance: {result['statistics']['avg_turns']:.2f}"
  230. )
  231. print(
  232. f"Avg. agent cost per instance: {result['statistics']['costs']['main_agent']:.2f} USD"
  233. )
  234. print(
  235. f"Avg. editor cost per instance: {result['statistics']['costs']['editor']:.2f} USD"
  236. )
  237. print(
  238. f"Avg. total cost per instance: {result['statistics']['costs']['total']:.2f} USD"
  239. )
  240. print('## Detailed error breakdown:')
  241. for error, data in result['errors']['breakdown'].items():
  242. print(f"{error}: {data['count']} ({data['percentage']:.2f}%)")
  243. except Exception as e:
  244. print(f'Error processing {args.input_path}: {str(e)}')