# update_output_with_eval.py
  1. import argparse
  2. import json
  3. import os
  4. from collections import defaultdict
  5. import pandas as pd
  6. parser = argparse.ArgumentParser()
  7. parser.add_argument('input_file', type=str)
  8. args = parser.parse_args()
  9. dirname = os.path.dirname(args.input_file)
  10. df = pd.read_json(args.input_file, lines=True)
  11. instance_id_to_status = defaultdict(
  12. lambda: {
  13. 'empty_generation': False,
  14. 'resolved': False,
  15. 'failed_apply_patch': False,
  16. 'error_eval': False,
  17. 'test_timeout': False,
  18. }
  19. )
  20. # Apply the status to the dataframe
  21. def apply_report(row):
  22. instance_id = row['instance_id']
  23. if instance_id in instance_id_to_status:
  24. return dict(instance_id_to_status[instance_id])
  25. return row.get('report', {})
# Locate whichever evaluation report exists next to the input file:
# either the official SWE-bench docker-harness summary (report.json in the
# same directory) or the OpenHands remote-eval per-instance JSONL.
swebench_official_report_json = os.path.join(dirname, 'report.json')
openhands_remote_report_jsonl = args.input_file.replace(
    '.jsonl', '.swebench_eval.jsonl'
)
if os.path.exists(swebench_official_report_json):
    # --- Official harness report: one summary JSON with id lists. ---
    output_md_filepath = os.path.join(dirname, 'README.md')
    with open(swebench_official_report_json, 'r') as f:
        report = json.load(f)
    output_md = (
        "# SWE-bench Report\n"
        "This folder contains the evaluation results of the SWE-bench using the [official evaluation docker containerization](https://github.com/princeton-nlp/SWE-bench/blob/main/docs/20240627_docker/README.md#choosing-the-right-cache_level).\n\n"
        "## Summary\n"
        f"- total instances: {report['total_instances']}\n"
        f"- submitted instances: {report['submitted_instances']}\n"
        f"- completed instances: {report['completed_instances']}\n"
        f"- empty patch instances: {report['empty_patch_instances']}\n"
        f"- resolved instances: {report['resolved_instances']}\n"
        f"- unresolved instances: {report['unresolved_instances']}\n"
        f"- error instances: {report['error_instances']}\n"
        f"- unstopped instances: {report['unstopped_instances']}\n"
    )
    output_md += '\n## Resolved Instances\n'
    # Record per-instance status while emitting the markdown link list.
    # NOTE: these mutations must happen BEFORE df.apply(apply_report) below.
    for instance_id in report['resolved_ids']:
        instance_id_to_status[instance_id]['resolved'] = True
        output_md += (
            f'- [{instance_id}](./eval_outputs/{instance_id}/run_instance.log)\n'
        )
    output_md += '\n## Unresolved Instances\n'
    # Unresolved ids set no flag: their defaultdict entry is never created,
    # so apply_report leaves their original 'report' value untouched.
    for instance_id in report['unresolved_ids']:
        output_md += (
            f'- [{instance_id}](./eval_outputs/{instance_id}/run_instance.log)\n'
        )
    output_md += '\n## Error Instances\n'
    for instance_id in report['error_ids']:
        instance_id_to_status[instance_id]['error_eval'] = True
        output_md += (
            f'- [{instance_id}](./eval_outputs/{instance_id}/run_instance.log)\n'
        )
    output_md += '\n## Empty Patch Instances\n'
    for instance_id in report['empty_patch_ids']:
        instance_id_to_status[instance_id]['empty_generation'] = True
        output_md += (
            f'- [{instance_id}](./eval_outputs/{instance_id}/run_instance.log)\n'
        )
    output_md += '\n## Incomplete Instances\n'
    for instance_id in report['incomplete_ids']:
        output_md += (
            f'- [{instance_id}](./eval_outputs/{instance_id}/run_instance.log)\n'
        )
    # Attach the collected statuses to the dataframe, then write README.md.
    df['report'] = df.apply(apply_report, axis=1)
    with open(output_md_filepath, 'w') as f:
        f.write(output_md)
elif os.path.exists(openhands_remote_report_jsonl):
    # --- OpenHands remote-eval report: one JSON line per instance. ---
    output_md_filepath = args.input_file.replace('.jsonl', '.swebench_eval.md')
    df_eval = pd.read_json(openhands_remote_report_jsonl, lines=True, orient='records')
    # Instance ids must be unique on both sides for the id-keyed merge below.
    assert len(df['instance_id'].unique()) == len(
        df
    ), 'There are duplicate instance ids in the original output which is not allowed'
    assert len(df_eval['instance_id'].unique()) == len(
        df_eval
    ), 'There are duplicate instance ids in the eval report which is not allowed'
    # Take each instance's status directly from the eval report, then attach.
    for _, row in df_eval.iterrows():
        instance_id_to_status[row['instance_id']] = row['test_result']['report']
    df['report'] = df.apply(apply_report, axis=1)
    # Summary counts derived from the merged 'report' column.
    _n_instances = len(df)
    _n_resolved = len(df[df['report'].apply(lambda x: x.get('resolved', False))])
    _n_unresolved = _n_instances - _n_resolved
    _n_empty_patch = len(
        df[df['report'].apply(lambda x: x.get('empty_generation', False))]
    )
    _n_error = len(df[df['report'].apply(lambda x: x.get('error_eval', False))])
    output_md = (
        '# SWE-bench Report\n'
        'This folder contains the evaluation results of the SWE-bench using the [official evaluation docker containerization](https://github.com/princeton-nlp/SWE-bench/blob/main/docs/20240627_docker/README.md#choosing-the-right-cache_level).\n\n'
        '## Summary\n'
        f'- submitted instances: {_n_instances}\n'
        f'- empty patch instances: {_n_empty_patch}\n'
        f'- resolved instances: {_n_resolved}\n'
        f'- unresolved instances: {_n_unresolved}\n'
        f'- error instances: {_n_error}\n'
    )
    def _instance_id_to_log_path(instance_id) -> str:
        """Return the per-instance eval log path, relative to the input dir."""
        path = f"{args.input_file.replace('.jsonl', '.swebench_eval.logs')}/instance_{instance_id}.log"
        # make it relative path
        path = os.path.relpath(path, start=dirname)
        return path
    output_md += '\n## Resolved Instances\n'
    # instance_id to status
    for instance_id in sorted(
        df[df['report'].apply(lambda x: x.get('resolved', False))][
            'instance_id'
        ].unique()
    ):
        # NOTE(review): df['report'] was already built above, so this
        # status mutation no longer affects the dataframe.
        instance_id_to_status[instance_id]['resolved'] = True
        output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n'
    output_md += '\n## Unresolved Instances\n'
    for instance_id in sorted(
        df[~df['report'].apply(lambda x: x.get('resolved', False))][
            'instance_id'
        ].unique()
    ):
        output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n'
    output_md += '\n## Error Instances\n'
    for instance_id in sorted(
        df[df['report'].apply(lambda x: x.get('error_eval', False))][
            'instance_id'
        ].unique()
    ):
        instance_id_to_status[instance_id]['error_eval'] = True
        output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n'
    output_md += '\n## Empty Patch Instances\n'
    for instance_id in sorted(
        df[df['report'].apply(lambda x: x.get('empty_generation', False))][
            'instance_id'
        ].unique()
    ):
        instance_id_to_status[instance_id]['empty_generation'] = True
        output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n'
    output_md += '\n## Incomplete Instances\n'
    for instance_id in sorted(
        df[df['report'].apply(lambda x: x.get('test_timeout', False))][
            'instance_id'
        ].unique()
    ):
        output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n'
    with open(output_md_filepath, 'w') as f:
        f.write(output_md)
else:
    # Neither report exists: nothing to merge, bail out before touching files.
    print(
        f'No report file found: Both {swebench_official_report_json} and {openhands_remote_report_jsonl} do not exist.'
    )
    exit()
  159. if os.path.exists(args.input_file + '.bak'):
  160. conf = input('Existing backup file found. Do you want to overwrite it? (y/n)')
  161. if conf != 'y':
  162. exit()
  163. os.remove(args.input_file + '.bak')
  164. # backup the original file
  165. os.rename(args.input_file, args.input_file + '.bak')
  166. df.to_json(args.input_file, orient='records', lines=True)