summarize_outputs.py 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118
  1. #!/usr/bin/env python3
  2. import argparse
  3. import json
  4. from collections import Counter
  5. from openhands.events.serialization import event_from_dict
  6. from openhands.events.utils import get_pairs_from_events
  7. ERROR_KEYWORDS = [
  8. 'Agent encountered an error while processing the last action',
  9. 'APIError',
  10. 'Action execution failed',
  11. ]
  12. if __name__ == '__main__':
  13. parser = argparse.ArgumentParser()
  14. parser.add_argument('output_file', type=str, help='The file to summarize')
  15. args = parser.parse_args()
  16. with open(args.output_file, 'r') as file:
  17. lines = file.readlines()
  18. num_lines = len(lines)
  19. num_error_lines = 0
  20. num_agent_stuck_in_loop = 0
  21. num_resolved = 0
  22. num_empty_patch = 0
  23. error_counter = Counter()
  24. main_agent_cost = []
  25. editor_cost = []
  26. num_turns = []
  27. for line in lines:
  28. _d = json.loads(line)
  29. # Cost
  30. costs = _d['metrics'].get('costs', [])
  31. _cur_main_agent_cost = 0
  32. _cur_editor_cost = 0
  33. for cost in costs:
  34. if isinstance(cost, float):
  35. # backward compatible
  36. _cur_main_agent_cost += cost
  37. else:
  38. if 'draft_editor' in cost['model']:
  39. _cur_editor_cost += cost['cost']
  40. else:
  41. _cur_main_agent_cost += cost['cost']
  42. main_agent_cost.append(_cur_main_agent_cost)
  43. editor_cost.append(_cur_editor_cost)
  44. # Turn status
  45. history = _d.get('history', [])
  46. events = [event_from_dict(event) for event in history]
  47. pairs = get_pairs_from_events(events)
  48. num_turns.append(len(pairs))
  49. # Patch & resolve status
  50. patch = _d.get('test_result', {}).get('git_patch', '')
  51. if patch == '':
  52. num_empty_patch += 1
  53. continue
  54. report = _d.get('report', {}) or {}
  55. resolved = report.get('resolved', False)
  56. if resolved:
  57. num_resolved += 1
  58. # Error
  59. error = _d.get('error', None)
  60. if error is not None and isinstance(error, str):
  61. agent_stuck_in_loop = 'Agent got stuck in a loop' in error
  62. contains_error = bool(error) and not agent_stuck_in_loop
  63. if agent_stuck_in_loop:
  64. error_counter['Agent got stuck in a loop'] += 1
  65. num_agent_stuck_in_loop += 1
  66. elif contains_error:
  67. error_counter[error] += 1
  68. continue
  69. for keyword in ERROR_KEYWORDS:
  70. if keyword in line:
  71. error_counter[keyword] += 1
  72. num_error_lines += 1
  73. break
  74. # print the error counter (with percentage)
  75. print(
  76. f'Number of resolved: {num_resolved} / {num_lines} ({num_resolved / num_lines * 100:.2f}%)'
  77. )
  78. print(
  79. f'Number of empty patch: {num_empty_patch} / {num_lines} ({num_empty_patch / num_lines * 100:.2f}%)'
  80. )
  81. print(
  82. f'Number of error lines: {num_error_lines} / {num_lines} ({num_error_lines / num_lines * 100:.2f}%)'
  83. )
  84. print(
  85. f'Number of agent stuck in loop: {num_agent_stuck_in_loop} / {num_lines} ({num_agent_stuck_in_loop / num_lines * 100:.2f}%)'
  86. )
  87. assert len(num_turns) == num_lines
  88. assert len(main_agent_cost) == num_lines
  89. assert len(editor_cost) == num_lines
  90. print('## Statistics')
  91. print(f'Avg. num of turns per instance: {sum(num_turns) / num_lines:.2f}')
  92. print(f'Avg. agent cost per instance: {sum(main_agent_cost) / num_lines:.2f} USD')
  93. print(f'Avg. editor cost per instance: {sum(editor_cost) / num_lines:.2f} USD')
  94. print(
  95. f'Avg. total cost per instance: {(sum(main_agent_cost) + sum(editor_cost)) / num_lines:.2f} USD'
  96. )
  97. print('## Detailed error breakdown:')
  98. for error, count in error_counter.items():
  99. print(f'{error}: {count} ({count / num_lines * 100:.2f}%)')