update_output_with_eval.py

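"""Merge SWE-bench evaluation results back into an agent output file.

Reads an output ``.jsonl`` file, looks for a ``report.json`` (produced by the
official SWE-bench docker evaluation) in the same directory, writes a
``README.md`` summary next to it, and updates the ``report`` field of each
row before rewriting the input file (keeping a ``.bak`` backup).

Usage:
    python update_output_with_eval.py <path/to/output.jsonl>
"""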
import argparse
import json
import os
from collections import defaultdict

import pandas as pd

parser = argparse.ArgumentParser()
parser.add_argument('input_file', type=str)
args = parser.parse_args()

dirname = os.path.dirname(args.input_file)
report_json = os.path.join(dirname, 'report.json')

df = pd.read_json(args.input_file, lines=True)

output_md_filepath = os.path.join(dirname, 'README.md')
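# Default per-instance status; entries are overwritten below for instances
# that appear in report.json's id lists.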
instance_id_to_status = defaultdict(
    lambda: {'resolved': False, 'empty_generation': False}
)
if os.path.exists(report_json):
    with open(report_json, 'r') as f:
        report = json.load(f)

    output_md = (
        "# SWE-bench Report\n"
        "This folder contains the evaluation results of SWE-bench using the [official evaluation docker containerization](https://github.com/princeton-nlp/SWE-bench/blob/main/docs/20240627_docker/README.md#choosing-the-right-cache_level).\n\n"
        "## Summary\n"
        f"- total instances: {report['total_instances']}\n"
        f"- submitted instances: {report['submitted_instances']}\n"
        f"- completed instances: {report['completed_instances']}\n"
        f"- empty patch instances: {report['empty_patch_instances']}\n"
        f"- resolved instances: {report['resolved_instances']}\n"
        f"- unresolved instances: {report['unresolved_instances']}\n"
        f"- error instances: {report['error_instances']}\n"
        f"- unstopped instances: {report['unstopped_instances']}\n"
    )

    output_md += '\n## Resolved Instances\n'
    # instance_id to status
    for instance_id in report['resolved_ids']:
        instance_id_to_status[instance_id]['resolved'] = True
        output_md += (
            f'- [{instance_id}](./eval_outputs/{instance_id}/run_instance.log)\n'
        )

    output_md += '\n## Unresolved Instances\n'
    for instance_id in report['unresolved_ids']:
        output_md += (
            f'- [{instance_id}](./eval_outputs/{instance_id}/run_instance.log)\n'
        )

    output_md += '\n## Error Instances\n'
    for instance_id in report['error_ids']:
        instance_id_to_status[instance_id]['error_eval'] = True
        output_md += (
            f'- [{instance_id}](./eval_outputs/{instance_id}/run_instance.log)\n'
        )

    output_md += '\n## Empty Patch Instances\n'
    for instance_id in report['empty_patch_ids']:
        instance_id_to_status[instance_id]['empty_generation'] = True
        output_md += (
            f'- [{instance_id}](./eval_outputs/{instance_id}/run_instance.log)\n'
        )

    output_md += '\n## Incomplete Instances\n'
    for instance_id in report['incomplete_ids']:
        output_md += (
            f'- [{instance_id}](./eval_outputs/{instance_id}/run_instance.log)\n'
        )

# Apply the status to the dataframe
def apply_report(row):
    instance_id = row['instance_id']
    if instance_id in instance_id_to_status:
        return dict(instance_id_to_status[instance_id])
    return row.get('report', {})


df['report'] = df.apply(apply_report, axis=1)

if os.path.exists(args.input_file + '.bak'):
    conf = input('Existing backup file found. Do you want to overwrite it? (y/n)')
    if conf != 'y':
        exit()
    os.remove(args.input_file + '.bak')

# backup the original file
os.rename(args.input_file, args.input_file + '.bak')
df.to_json(args.input_file, orient='records', lines=True)

# Write the Markdown report only when report.json was found above; otherwise
# output_md is never defined.
if os.path.exists(report_json):
    with open(output_md_filepath, 'w') as f:
        f.write(output_md)