compare_outputs.py 2.1 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667
  1. #!/usr/bin/env python3
  2. import argparse
  3. import pandas as pd
  4. parser = argparse.ArgumentParser(
  5. description='Compare two swe_bench output JSONL files and print the resolved diff'
  6. )
  7. parser.add_argument('input_file_1', type=str)
  8. parser.add_argument('input_file_2', type=str)
  9. args = parser.parse_args()
  10. df1 = pd.read_json(args.input_file_1, orient='records', lines=True)
  11. df2 = pd.read_json(args.input_file_2, orient='records', lines=True)
  12. # Get the intersection of the instance_ids
  13. df = pd.merge(df1, df2, on='instance_id', how='inner')
  14. def _get_resolved(report):
  15. if report is None:
  16. return False
  17. if isinstance(report, float):
  18. return False
  19. else:
  20. return report.get('resolved', False)
  21. df['resolved_x'] = df['report_x'].apply(_get_resolved)
  22. df['resolved_y'] = df['report_y'].apply(_get_resolved)
  23. df['diff'] = df.apply(lambda x: x['resolved_x'] != x['resolved_y'], axis=1)
  24. df_diff = df[df['diff']].sort_values(
  25. by=['resolved_x', 'resolved_y'], ascending=[False, False]
  26. )
  27. # skip if any of the resolved is nan, which means one of the eval is not finished yet
  28. df_diff = df_diff[df_diff['resolved_x'].notna() & df_diff['resolved_y'].notna()]
  29. print(f'X={args.input_file_1}')
  30. print(f'Y={args.input_file_2}')
  31. print(f'# diff={df_diff.shape[0]}')
  32. df_diff = df_diff[['instance_id', 'resolved_x', 'resolved_y', 'report_x', 'report_y']]
  33. # x resolved but y not
  34. print('-' * 100)
  35. df_diff_x_only = df_diff[df_diff['resolved_x'] & ~df_diff['resolved_y']].sort_values(
  36. by='instance_id'
  37. )
  38. print(f'# x resolved but y not={df_diff_x_only.shape[0]}')
  39. print(df_diff_x_only[['instance_id', 'report_x', 'report_y']])
  40. # y resolved but x not
  41. print('-' * 100)
  42. df_diff_y_only = df_diff[~df_diff['resolved_x'] & df_diff['resolved_y']].sort_values(
  43. by='instance_id'
  44. )
  45. print(f'# y resolved but x not={df_diff_y_only.shape[0]}')
  46. print(df_diff_y_only[['instance_id', 'report_x', 'report_y']])
  47. # get instance_id from df_diff_y_only
  48. print('-' * 100)
  49. print('Instances that x resolved but y not:')
  50. print(df_diff_x_only['instance_id'].tolist())
  51. print('-' * 100)
  52. print('Instances that y resolved but x not:')
  53. print(df_diff_y_only['instance_id'].tolist())