# summarize_results.py: summarize AiderBench results stored in a JSONL file.
  1. import argparse
  2. import numpy as np
  3. import pandas as pd
  4. def extract_test_results(df: pd.DataFrame) -> tuple[list[str], list[str]]:
  5. passed = []
  6. failed = []
  7. for _, row in df.iterrows():
  8. instance_id = row['instance_id']
  9. resolved = False
  10. if 'test_result' in row and 'exit_code' in row['test_result']:
  11. resolved = row['test_result']['exit_code'] == 0
  12. if resolved:
  13. passed.append(instance_id)
  14. else:
  15. failed.append(instance_id)
  16. return passed, failed
  17. def visualize_results(df: pd.DataFrame):
  18. df1 = pd.DataFrame()
  19. df1['cost'] = df['metrics'].apply(pd.Series)['accumulated_cost']
  20. df1['result'] = (
  21. df['test_result'].apply(pd.Series)['exit_code'].map({0: 'Pass', 1: 'Fail'})
  22. )
  23. df1['actions'] = pd.Series([len(a) - 1 for a in df['history']])
  24. passed = np.sum(df1['result'] == 'Pass')
  25. total = df.shape[0]
  26. resolve_rate = round((passed / total) * 100, 2)
  27. print('Number of passed tests:', f'{passed}/{total} {resolve_rate:.2f}%')
  28. print('\nDescriptive statistics for number of actions:')
  29. print(df1['actions'].describe())
  30. print('\nDescriptive statistics for costs:')
  31. print(df1['cost'].describe())
  32. # Bin counts for actions
  33. action_bins = pd.cut(df1['actions'], bins=range(0, 32, 2))
  34. print('\nAction bin counts:')
  35. print(action_bins.value_counts().sort_index())
  36. # Bin counts for costs
  37. cost_bins = pd.cut(df1['cost'], bins=10)
  38. print('\nCost bin counts:')
  39. print(cost_bins.value_counts().sort_index())
  40. return resolve_rate
  41. if __name__ == '__main__':
  42. parser = argparse.ArgumentParser(description='Summarize AiderBench results')
  43. parser.add_argument('input_filepath', type=str, help='Path to the JSONL file')
  44. args = parser.parse_args()
  45. # Create DataFrame from JSONL file
  46. df = pd.read_json(args.input_filepath, lines=True)
  47. passed_tests, failed_tests = extract_test_results(df)
  48. resolve_rate = visualize_results(df)
  49. print(
  50. f'\nPassed {len(passed_tests)} tests, failed {len(failed_tests)} tests, resolve rate = {resolve_rate:.2f}%'
  51. )
  52. print('PASSED TESTS:')
  53. print(passed_tests)
  54. print('FAILED TESTS:')
  55. print(failed_tests)