#!/usr/bin/env python3
- import argparse
- import glob
- import json
- import os
- from collections import Counter
- import pandas as pd
- from openhands.events.serialization import event_from_dict
- from openhands.events.utils import get_pairs_from_events
# Substrings looked for in the raw JSONL text of an instance record.
# process_file() consults this list only for records that do not carry a
# string 'error' field, and attributes the record to the first keyword found.
ERROR_KEYWORDS = [
    'Agent encountered an error while processing the last action',
    'APIError',
    'Action execution failed',
    'litellm.Timeout: APITimeoutError',
]
def _safe_div(numerator, denominator):
    """Return ``numerator / denominator``, or 0 when the denominator is not positive."""
    return numerator / denominator if denominator > 0 else 0


def _count_entry(count, total):
    """Build the ``{'count', 'percentage'}`` dict used for every tallied category."""
    return {'count': count, 'percentage': _safe_div(count, total) * 100}


def _split_costs(costs):
    """Sum one instance's cost entries into ``(main_agent, editor)`` totals."""
    main_agent = 0
    editor = 0
    for cost in costs:
        if isinstance(cost, (int, float)):
            # Backward compatible: older outputs store bare numbers. JSON
            # decodes ``0`` as int, so ints must be accepted alongside floats.
            main_agent += cost
        elif 'draft_editor' in cost['model']:
            editor += cost['cost']
        else:
            main_agent += cost['cost']
    return main_agent, editor


def process_file(file_path):
    """Summarize one ``output.jsonl`` file of evaluation results.

    Each non-blank line is parsed as a JSON object describing one instance.
    Per instance, the function tallies in this order (a ``continue`` at any
    step skips the later ones):

    1. Records without ``metrics`` count as unfinished runs.
    2. Costs are split into main-agent and editor (model name containing
       ``'draft_editor'``) totals; turns are the number of event pairs in
       ``history``.
    3. Records with an empty (or null) ``git_patch`` count as empty patches.
    4. ``report['resolved']`` marks resolved instances.
    5. A string ``error`` field is counted in the breakdown (with a dedicated
       bucket for 'Agent got stuck in a loop'); otherwise the raw line is
       scanned for ``ERROR_KEYWORDS`` and the first hit is counted.

    All percentages and averages use the total number of instances in the
    file as the denominator, including unfinished runs. ``errors['total']``
    counts keyword matches only, not records with an explicit ``error`` field.

    Args:
        file_path: Path to the JSONL file.

    Returns:
        A nested dict with keys ``file_path``, ``total_instances``,
        ``resolved``, ``empty_patches``, ``unfinished_runs``, ``errors``,
        ``costs`` and ``statistics``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # Whitespace-only lines (e.g. a trailing newline) are not instances;
        # feeding them to json.loads would abort the whole file.
        lines = [line for line in file if line.strip()]

    num_lines = len(lines)
    num_error_lines = 0
    num_agent_stuck_in_loop = 0
    num_resolved = 0
    num_empty_patch = 0
    num_unfinished_runs = 0
    error_counter = Counter()
    main_agent_cost = []
    editor_cost = []
    num_turns = []

    for line in lines:
        _d = json.loads(line)

        metrics = _d.get('metrics')
        if metrics is None:
            # this is a failed run
            num_unfinished_runs += 1
            continue

        # Cost ('or []' guards against an explicit null in the record)
        _cur_main_agent_cost, _cur_editor_cost = _split_costs(
            metrics.get('costs') or []
        )
        main_agent_cost.append(_cur_main_agent_cost)
        editor_cost.append(_cur_editor_cost)

        # Turn status
        history = _d.get('history') or []
        events = [event_from_dict(event) for event in history]
        pairs = get_pairs_from_events(events)
        num_turns.append(len(pairs))

        # Patch & resolve status; a null test_result / git_patch is handled
        # the same way 'report' is below instead of raising.
        patch = (_d.get('test_result') or {}).get('git_patch')
        if not patch:
            num_empty_patch += 1
            continue

        report = _d.get('report', {}) or {}
        if report.get('resolved', False):
            num_resolved += 1

        # Error: an explicit string error takes precedence over keyword scan.
        error = _d.get('error', None)
        if isinstance(error, str):
            if 'Agent got stuck in a loop' in error:
                error_counter['Agent got stuck in a loop'] += 1
                num_agent_stuck_in_loop += 1
            elif error:
                error_counter[error] += 1
            continue

        # Fall back to scanning the raw line; only the first keyword counts.
        for keyword in ERROR_KEYWORDS:
            if keyword in line:
                error_counter[keyword] += 1
                num_error_lines += 1
                break

    total_main_agent_cost = sum(main_agent_cost)
    total_editor_cost = sum(editor_cost)
    total_cost = total_main_agent_cost + total_editor_cost

    return {
        'file_path': file_path,
        'total_instances': num_lines,
        'resolved': _count_entry(num_resolved, num_lines),
        'empty_patches': _count_entry(num_empty_patch, num_lines),
        'unfinished_runs': _count_entry(num_unfinished_runs, num_lines),
        'errors': {
            'total': num_error_lines,
            'percentage': _safe_div(num_error_lines, num_lines) * 100,
            'stuck_in_loop': _count_entry(num_agent_stuck_in_loop, num_lines),
            'breakdown': {
                str(error): _count_entry(count, num_lines)
                for error, count in error_counter.items()
            },
        },
        'costs': {
            'main_agent': total_main_agent_cost,
            'editor': total_editor_cost,
            'total': total_cost,
        },
        'statistics': {
            'avg_turns': _safe_div(sum(num_turns), num_lines),
            'costs': {
                'main_agent': _safe_div(total_main_agent_cost, num_lines),
                'editor': _safe_div(total_editor_cost, num_lines),
                'total': _safe_div(total_cost, num_lines),
            },
        },
    }
def aggregate_directory(input_path) -> pd.DataFrame:
    """Summarize every ``output.jsonl`` found under ``input_path``.

    The directory is searched recursively. Each file is summarized with
    ``process_file``; a file that raises is reported (message plus traceback)
    and skipped so that one bad file does not abort the whole aggregation.

    Args:
        input_path: Root directory to search.

    Returns:
        A DataFrame with one row per successfully processed file, holding the
        raw ``process_file`` fields plus the flattened columns ``directory``,
        ``resolve_rate``, ``empty_patch_rate``, ``unfinished_rate``,
        ``avg_turns``, ``error_rate`` and ``avg_cost``, sorted by
        ``resolve_rate`` in descending order. When nothing could be processed
        the frame is empty but still carries all of these columns.
    """
    import traceback

    # Process all output.jsonl files in subdirectories
    pattern = os.path.join(input_path, '**/output.jsonl')
    files = glob.glob(pattern, recursive=True)
    print(f'Processing {len(files)} files from directory {input_path}')

    # Process each file and collect results; failures are logged and skipped.
    results = []
    for file_path in files:
        try:
            results.append(process_file(file_path))
        except Exception as e:
            print(f'Error processing {file_path}: {str(e)}')
            traceback.print_exc()

    if not results:
        # pd.DataFrame([]) has no columns at all, so the column lookups below
        # (and the callers' column selection) would raise KeyError. Return an
        # empty frame that still exposes the full schema.
        return pd.DataFrame(
            columns=[
                'file_path',
                'total_instances',
                'resolved',
                'empty_patches',
                'unfinished_runs',
                'errors',
                'costs',
                'statistics',
                'directory',
                'resolve_rate',
                'empty_patch_rate',
                'unfinished_rate',
                'avg_turns',
                'error_rate',
                'avg_cost',
            ]
        )

    df = pd.DataFrame(results)

    # Name of the directory that directly contains each output.jsonl
    df['directory'] = df['file_path'].apply(
        lambda x: os.path.basename(os.path.dirname(x))
    )

    # Flatten the nested summary dicts into plain numeric columns.
    df['resolve_rate'] = df['resolved'].apply(lambda x: x['percentage'])
    df['empty_patch_rate'] = df['empty_patches'].apply(lambda x: x['percentage'])
    df['unfinished_rate'] = df['unfinished_runs'].apply(lambda x: x['percentage'])
    df['avg_turns'] = df['statistics'].apply(lambda x: x['avg_turns'])
    df['error_rate'] = df['errors'].apply(lambda x: x['percentage'])
    df['avg_cost'] = df['statistics'].apply(lambda x: x['costs']['total'])

    df = df.sort_values('resolve_rate', ascending=False)
    return df
def main(argv=None):
    """Command-line entry point: summarize one output file or a directory.

    When ``input_path`` is a directory, every ``output.jsonl`` beneath it is
    aggregated; the table is printed and written next to ``--output`` as
    ``.txt`` and ``.csv`` files, and the full records are written to
    ``--output`` itself as JSON lines. When ``input_path`` is a single file, a
    detailed report is printed and nothing is written to disk.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Raises:
        SystemExit: With status 1 when the single input file cannot be
            processed (and, as usual for argparse, on invalid arguments).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'input_path', type=str, help='The file or directory to summarize'
    )
    parser.add_argument(
        '--output',
        type=str,
        help='Output JSONL file for results',
        default='summary_results.jsonl',
    )
    args = parser.parse_args(argv)

    if os.path.isdir(args.input_path):
        df = aggregate_directory(args.input_path)

        # Create the summary string
        columns = [
            'directory',
            'resolve_rate',
            'empty_patch_rate',
            'unfinished_rate',
            'error_rate',
            'avg_turns',
            'avg_cost',
            'total_instances',
        ]
        summary_str = df[columns].to_string(
            float_format='{:.2f}'.format,
            # Truncate directory names to 90 chars
            formatters={'directory': lambda x: x[:90]},
            index=False,
        )

        # Print to console
        print('\nResults summary (sorted by resolve rate):')
        print(summary_str)

        # splitext only strips a real extension of the final path component;
        # splitting on the last '.' would truncate paths like './out/summary'.
        output_stem = os.path.splitext(args.output)[0]

        # Save to text file
        with open(output_stem + '.txt', 'w', encoding='utf-8') as f:
            f.write('Results summary (sorted by resolve rate):\n')
            f.write(summary_str)

        # Save full records as JSON lines and the summary columns as CSV
        df.to_json(args.output, lines=True, orient='records')
        df[columns].to_csv(output_stem + '.csv', index=False)
    else:
        # Process single file with detailed output
        try:
            result = process_file(args.input_path)
        except Exception as e:
            import traceback

            print(f'Error processing {args.input_path}: {str(e)}')
            traceback.print_exc()
            # Signal the failure to the calling shell instead of exiting 0.
            raise SystemExit(1)

        total = result['total_instances']
        errors = result['errors']
        avg_costs = result['statistics']['costs']

        print(f'\nResults for {args.input_path}:')
        tallies = [
            ('resolved', result['resolved']),
            ('empty patch', result['empty_patches']),
            ('error lines', {'count': errors['total'], 'percentage': errors['percentage']}),
            ('agent stuck in loop', errors['stuck_in_loop']),
            ('unfinished runs', result['unfinished_runs']),
        ]
        for label, entry in tallies:
            print(
                f"Number of {label}: {entry['count']} / {total} ({entry['percentage']:.2f}%)"
            )
        print(f"Total cost: {result['costs']['total']:.2f} USD")

        print('## Statistics')
        print(
            f"Avg. num of turns per instance: {result['statistics']['avg_turns']:.2f}"
        )
        print(f"Avg. agent cost per instance: {avg_costs['main_agent']:.2f} USD")
        print(f"Avg. editor cost per instance: {avg_costs['editor']:.2f} USD")
        print(f"Avg. total cost per instance: {avg_costs['total']:.2f} USD")

        print('## Detailed error breakdown:')
        for error, data in errors['breakdown'].items():
            print(f"{error}: {data['count']} ({data['percentage']:.2f}%)")


if __name__ == '__main__':
    main()
|