|
@@ -6,6 +6,8 @@ import os
|
|
|
from collections import Counter
|
|
from collections import Counter
|
|
|
|
|
|
|
|
import pandas as pd
|
|
import pandas as pd
|
|
|
|
|
+import random
|
|
|
|
|
+import numpy as np
|
|
|
|
|
|
|
|
from openhands.events.serialization import event_from_dict
|
|
from openhands.events.serialization import event_from_dict
|
|
|
from openhands.events.utils import get_pairs_from_events
|
|
from openhands.events.utils import get_pairs_from_events
|
|
@@ -18,6 +20,18 @@ ERROR_KEYWORDS = [
|
|
|
]
|
|
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
+def get_bootstrap_accuracy_error_bars(values: float | int | bool, num_samples: int = 1000, p_value=0.05) -> tuple[float, float]:
|
|
|
|
|
+ sorted_vals = np.sort(
|
|
|
|
|
+ [
|
|
|
|
|
+ np.mean(random.sample(values, len(values) // 2))
|
|
|
|
|
+ for _ in range(num_samples)
|
|
|
|
|
+ ]
|
|
|
|
|
+ )
|
|
|
|
|
+ bottom_idx = int(num_samples * p_value / 2)
|
|
|
|
|
+ top_idx = int(num_samples * (1.0 - p_value / 2))
|
|
|
|
|
+ return (sorted_vals[bottom_idx], sorted_vals[top_idx])
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
def process_file(file_path):
|
|
def process_file(file_path):
|
|
|
with open(file_path, 'r') as file:
|
|
with open(file_path, 'r') as file:
|
|
|
lines = file.readlines()
|
|
lines = file.readlines()
|
|
@@ -26,6 +40,7 @@ def process_file(file_path):
|
|
|
num_error_lines = 0
|
|
num_error_lines = 0
|
|
|
num_agent_stuck_in_loop = 0
|
|
num_agent_stuck_in_loop = 0
|
|
|
num_resolved = 0
|
|
num_resolved = 0
|
|
|
|
|
+ resolved_arr = []
|
|
|
num_empty_patch = 0
|
|
num_empty_patch = 0
|
|
|
num_unfinished_runs = 0
|
|
num_unfinished_runs = 0
|
|
|
error_counter = Counter()
|
|
error_counter = Counter()
|
|
@@ -74,6 +89,9 @@ def process_file(file_path):
|
|
|
resolved = report.get('resolved', False)
|
|
resolved = report.get('resolved', False)
|
|
|
if resolved:
|
|
if resolved:
|
|
|
num_resolved += 1
|
|
num_resolved += 1
|
|
|
|
|
+ resolved_arr.append(1)
|
|
|
|
|
+ else:
|
|
|
|
|
+ resolved_arr.append(0)
|
|
|
|
|
|
|
|
# Error
|
|
# Error
|
|
|
error = _d.get('error', None)
|
|
error = _d.get('error', None)
|
|
@@ -100,6 +118,7 @@ def process_file(file_path):
|
|
|
'resolved': {
|
|
'resolved': {
|
|
|
'count': num_resolved,
|
|
'count': num_resolved,
|
|
|
'percentage': (num_resolved / num_lines * 100) if num_lines > 0 else 0,
|
|
'percentage': (num_resolved / num_lines * 100) if num_lines > 0 else 0,
|
|
|
|
|
+ 'ci': tuple(x * 100 for x in get_bootstrap_accuracy_error_bars(resolved_arr)),
|
|
|
},
|
|
},
|
|
|
'empty_patches': {
|
|
'empty_patches': {
|
|
|
'count': num_empty_patch,
|
|
'count': num_empty_patch,
|
|
@@ -174,6 +193,7 @@ def aggregate_directory(input_path) -> pd.DataFrame:
|
|
|
)
|
|
)
|
|
|
|
|
|
|
|
df['resolve_rate'] = df['resolved'].apply(lambda x: x['percentage'])
|
|
df['resolve_rate'] = df['resolved'].apply(lambda x: x['percentage'])
|
|
|
|
|
+ df['resolve_rate_ci'] = df['resolved'].apply(lambda x: x['ci'])
|
|
|
df['empty_patch_rate'] = df['empty_patches'].apply(lambda x: x['percentage'])
|
|
df['empty_patch_rate'] = df['empty_patches'].apply(lambda x: x['percentage'])
|
|
|
df['unfinished_rate'] = df['unfinished_runs'].apply(lambda x: x['percentage'])
|
|
df['unfinished_rate'] = df['unfinished_runs'].apply(lambda x: x['percentage'])
|
|
|
df['avg_turns'] = df['statistics'].apply(lambda x: x['avg_turns'])
|
|
df['avg_turns'] = df['statistics'].apply(lambda x: x['avg_turns'])
|
|
@@ -242,7 +262,7 @@ if __name__ == '__main__':
|
|
|
# Print detailed results for single file
|
|
# Print detailed results for single file
|
|
|
print(f'\nResults for {args.input_path}:')
|
|
print(f'\nResults for {args.input_path}:')
|
|
|
print(
|
|
print(
|
|
|
- f"Number of resolved: {result['resolved']['count']} / {result['total_instances']} ({result['resolved']['percentage']:.2f}%)"
|
|
|
|
|
|
|
+ f"Number of resolved: {result['resolved']['count']} / {result['total_instances']} ({result['resolved']['percentage']:.2f}% [{result['resolved']['ci'][0]:.2f}%, {result['resolved']['ci'][1]:.2f}%])"
|
|
|
)
|
|
)
|
|
|
print(
|
|
print(
|
|
|
f"Number of empty patch: {result['empty_patches']['count']} / {result['total_instances']} ({result['empty_patches']['percentage']:.2f}%)"
|
|
f"Number of empty patch: {result['empty_patches']['count']} / {result['total_instances']} ({result['empty_patches']['percentage']:.2f}%)"
|