| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118 |
- #!/usr/bin/env python3
- import argparse
- import json
- from collections import Counter
- from openhands.events.serialization import event_from_dict
- from openhands.events.utils import get_pairs_from_events
# Substrings searched for in the raw JSON line of an instance whose 'error'
# field is not a string. Keywords are tried in list order and the first one
# found is used as that instance's label in the error breakdown.
ERROR_KEYWORDS = [
    'Agent encountered an error while processing the last action',
    'APIError',
    'Action execution failed',
]
def _safe_div(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when ``denominator`` is 0."""
    return numerator / denominator if denominator else 0.0


def _percent(count, total):
    """Return ``count`` as a percentage of ``total`` (0.0 when ``total`` is 0)."""
    return _safe_div(count, total) * 100


def _split_costs(costs):
    """Split one instance's cost entries into ``(main_agent_cost, editor_cost)``.

    Each entry is either a bare number (legacy format, attributed to the main
    agent) or a dict with 'model' and 'cost' keys. A dict entry whose model
    name contains 'draft_editor' is attributed to the editor; every other dict
    entry is attributed to the main agent.
    """
    main_cost = 0
    editor_cost = 0
    for entry in costs:
        # Accept ints as well as floats: a legacy cost serialized as `0`
        # must not be treated as a dict.
        if isinstance(entry, (int, float)):
            main_cost += entry
        elif 'draft_editor' in entry['model']:
            editor_cost += entry['cost']
        else:
            main_cost += entry['cost']
    return main_cost, editor_cost


def main(argv=None):
    """Summarize a JSON-lines output file and print the statistics to stdout.

    Args:
        argv: Command-line arguments (without the program name). ``None``
            makes argparse fall back to ``sys.argv[1:]``.

    For every record the cost and turn count are always collected. A record
    with an empty patch is then counted as such and skipped; otherwise its
    resolve status and error (if any) are tallied.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('output_file', type=str, help='The file to summarize')
    args = parser.parse_args(argv)

    with open(args.output_file, 'r', encoding='utf-8') as file:
        # Drop blank lines (e.g. a trailing newline) so json.loads never
        # receives an empty record.
        lines = [line for line in file if line.strip()]

    num_lines = len(lines)
    num_error_lines = 0
    num_agent_stuck_in_loop = 0
    num_resolved = 0
    num_empty_patch = 0
    error_counter = Counter()

    main_agent_cost = []
    editor_cost = []
    num_turns = []

    for line in lines:
        record = json.loads(line)

        # Cost ('metrics' / 'costs' may be missing or null)
        metrics = record.get('metrics') or {}
        cur_main_cost, cur_editor_cost = _split_costs(metrics.get('costs') or [])
        main_agent_cost.append(cur_main_cost)
        editor_cost.append(cur_editor_cost)

        # Turn status
        history = record.get('history') or []
        events = [event_from_dict(event) for event in history]
        num_turns.append(len(get_pairs_from_events(events)))

        # Patch & resolve status ('test_result' / 'git_patch' may be null)
        patch = (record.get('test_result') or {}).get('git_patch') or ''
        if not patch:
            num_empty_patch += 1
            continue

        report = record.get('report') or {}
        if report.get('resolved', False):
            num_resolved += 1

        # Error: a string-valued 'error' field takes precedence over the
        # keyword scan, even when the string is empty.
        error = record.get('error')
        if isinstance(error, str):
            if 'Agent got stuck in a loop' in error:
                error_counter['Agent got stuck in a loop'] += 1
                num_agent_stuck_in_loop += 1
            elif error:
                error_counter[error] += 1
            continue

        # Fall back to scanning the raw line; only the first keyword counts.
        for keyword in ERROR_KEYWORDS:
            if keyword in line:
                error_counter[keyword] += 1
                num_error_lines += 1
                break

    # print the error counter (with percentage)
    for label, count in (
        ('resolved', num_resolved),
        ('empty patch', num_empty_patch),
        ('error lines', num_error_lines),
        ('agent stuck in loop', num_agent_stuck_in_loop),
    ):
        print(
            f'Number of {label}: {count} / {num_lines} ({_percent(count, num_lines):.2f}%)'
        )

    # Sanity checks: every record contributes exactly one entry to each list.
    assert len(num_turns) == num_lines
    assert len(main_agent_cost) == num_lines
    assert len(editor_cost) == num_lines

    total_main_cost = sum(main_agent_cost)
    total_editor_cost = sum(editor_cost)

    print('## Statistics')
    print(
        f'Avg. num of turns per instance: {_safe_div(sum(num_turns), num_lines):.2f}'
    )
    print(
        f'Avg. agent cost per instance: {_safe_div(total_main_cost, num_lines):.2f} USD'
    )
    print(
        f'Avg. editor cost per instance: {_safe_div(total_editor_cost, num_lines):.2f} USD'
    )
    print(
        f'Avg. total cost per instance: {_safe_div(total_main_cost + total_editor_cost, num_lines):.2f} USD'
    )

    print('## Detailed error breakdown:')
    for error, count in error_counter.items():
        print(f'{error}: {count} ({_percent(count, num_lines):.2f}%)')


if __name__ == '__main__':
    main()
|