"""Update a SWE-bench output file with evaluation results.

Reads an agent output .jsonl file, merges per-instance evaluation statuses from
either the official SWE-bench `report.json` or an OpenHands remote-eval
`.swebench_eval.jsonl`, writes a markdown summary next to the input, and
rewrites the input file with a `report` field (backing up the original first).
"""

import argparse
import json
import os
from collections import defaultdict

import pandas as pd

parser = argparse.ArgumentParser()
parser.add_argument('input_file', type=str)
args = parser.parse_args()

dirname = os.path.dirname(args.input_file)
df = pd.read_json(args.input_file, lines=True)

# Default status for any instance that is missing from the eval report.
instance_id_to_status = defaultdict(
    lambda: {
        'empty_generation': False,
        'resolved': False,
        'failed_apply_patch': False,
        'error_eval': False,
        'test_timeout': False,
    }
)

# Merge the eval status into each row's `report` field; rows whose instance_id
# the eval did not cover keep their existing report (or get an empty dict).
def apply_report(row):
    instance_id = row['instance_id']
    if instance_id in instance_id_to_status:
        return dict(instance_id_to_status[instance_id])
    return row.get('report', {})
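
# Illustrative shape of a merged report value (keys mirror the defaultdict
# above; the values below are made up for the example):
#   {'empty_generation': False, 'resolved': True, 'failed_apply_patch': False,
#    'error_eval': False, 'test_timeout': False}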

swebench_official_report_json = os.path.join(dirname, 'report.json')
openhands_remote_report_jsonl = args.input_file.replace(
    '.jsonl', '.swebench_eval.jsonl'
)
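
# For example (hypothetical layout): with input_file='evals/output.jsonl', the
# script first looks for 'evals/report.json' (official harness report) and then
# for 'evals/output.swebench_eval.jsonl' (OpenHands remote-eval report).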

if os.path.exists(swebench_official_report_json):
    output_md_filepath = os.path.join(dirname, 'README.md')
    with open(swebench_official_report_json, 'r') as f:
        report = json.load(f)

    output_md = (
        '# SWE-bench Report\n'
        'This folder contains the evaluation results of SWE-bench using the [official evaluation docker containerization](https://github.com/princeton-nlp/SWE-bench/blob/main/docs/20240627_docker/README.md#choosing-the-right-cache_level).\n\n'
        '## Summary\n'
        f"- total instances: {report['total_instances']}\n"
        f"- submitted instances: {report['submitted_instances']}\n"
        f"- completed instances: {report['completed_instances']}\n"
        f"- empty patch instances: {report['empty_patch_instances']}\n"
        f"- resolved instances: {report['resolved_instances']}\n"
        f"- unresolved instances: {report['unresolved_instances']}\n"
        f"- error instances: {report['error_instances']}\n"
        f"- unstopped instances: {report['unstopped_instances']}\n"
    )

    # Build one markdown list per outcome and record each instance's status.
    output_md += '\n## Resolved Instances\n'
    for instance_id in report['resolved_ids']:
        instance_id_to_status[instance_id]['resolved'] = True
        output_md += (
            f'- [{instance_id}](./eval_outputs/{instance_id}/run_instance.log)\n'
        )

    output_md += '\n## Unresolved Instances\n'
    for instance_id in report['unresolved_ids']:
        output_md += (
            f'- [{instance_id}](./eval_outputs/{instance_id}/run_instance.log)\n'
        )

    output_md += '\n## Error Instances\n'
    for instance_id in report['error_ids']:
        instance_id_to_status[instance_id]['error_eval'] = True
        output_md += (
            f'- [{instance_id}](./eval_outputs/{instance_id}/run_instance.log)\n'
        )

    output_md += '\n## Empty Patch Instances\n'
    for instance_id in report['empty_patch_ids']:
        instance_id_to_status[instance_id]['empty_generation'] = True
        output_md += (
            f'- [{instance_id}](./eval_outputs/{instance_id}/run_instance.log)\n'
        )

    output_md += '\n## Incomplete Instances\n'
    for instance_id in report['incomplete_ids']:
        output_md += (
            f'- [{instance_id}](./eval_outputs/{instance_id}/run_instance.log)\n'
        )

    df['report'] = df.apply(apply_report, axis=1)

    with open(output_md_filepath, 'w') as f:
        f.write(output_md)
elif os.path.exists(openhands_remote_report_jsonl):
    output_md_filepath = args.input_file.replace('.jsonl', '.swebench_eval.md')
    df_eval = pd.read_json(
        openhands_remote_report_jsonl, lines=True, orient='records'
    )

    assert len(df['instance_id'].unique()) == len(
        df
    ), 'There are duplicate instance ids in the original output, which is not allowed'
    assert len(df_eval['instance_id'].unique()) == len(
        df_eval
    ), 'There are duplicate instance ids in the eval report, which is not allowed'

    for _, row in df_eval.iterrows():
        instance_id_to_status[row['instance_id']] = row['test_result']['report']
    df['report'] = df.apply(apply_report, axis=1)

    _n_instances = len(df)
    _n_resolved = len(df[df['report'].apply(lambda x: x.get('resolved', False))])
    _n_unresolved = _n_instances - _n_resolved
    _n_empty_patch = len(
        df[df['report'].apply(lambda x: x.get('empty_generation', False))]
    )
    _n_error = len(df[df['report'].apply(lambda x: x.get('error_eval', False))])

    output_md = (
        '# SWE-bench Report\n'
        'This folder contains the evaluation results of SWE-bench using the [official evaluation docker containerization](https://github.com/princeton-nlp/SWE-bench/blob/main/docs/20240627_docker/README.md#choosing-the-right-cache_level).\n\n'
        '## Summary\n'
        f'- submitted instances: {_n_instances}\n'
        f'- empty patch instances: {_n_empty_patch}\n'
        f'- resolved instances: {_n_resolved}\n'
        f'- unresolved instances: {_n_unresolved}\n'
        f'- error instances: {_n_error}\n'
    )

    def _instance_id_to_log_path(instance_id):
        path = f"{args.input_file.replace('.jsonl', '.swebench_eval.logs')}/instance_{instance_id}.log"
        # Make the path relative so links in the markdown report resolve.
        path = os.path.relpath(path, start=dirname)
        return path
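
    # For example (hypothetical values): with input_file='evals/output.jsonl'
    # and instance_id='astropy__astropy-12907', this returns
    # 'output.swebench_eval.logs/instance_astropy__astropy-12907.log'.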

    # One markdown list per outcome; also mirror the flags back into
    # instance_id_to_status for consistency with the official-report path.
    output_md += '\n## Resolved Instances\n'
    for instance_id in sorted(
        df[df['report'].apply(lambda x: x.get('resolved', False))][
            'instance_id'
        ].unique()
    ):
        instance_id_to_status[instance_id]['resolved'] = True
        output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n'

    output_md += '\n## Unresolved Instances\n'
    for instance_id in sorted(
        df[~df['report'].apply(lambda x: x.get('resolved', False))][
            'instance_id'
        ].unique()
    ):
        output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n'

    output_md += '\n## Error Instances\n'
    for instance_id in sorted(
        df[df['report'].apply(lambda x: x.get('error_eval', False))][
            'instance_id'
        ].unique()
    ):
        instance_id_to_status[instance_id]['error_eval'] = True
        output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n'

    output_md += '\n## Empty Patch Instances\n'
    for instance_id in sorted(
        df[df['report'].apply(lambda x: x.get('empty_generation', False))][
            'instance_id'
        ].unique()
    ):
        instance_id_to_status[instance_id]['empty_generation'] = True
        output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n'

    output_md += '\n## Incomplete Instances\n'
    for instance_id in sorted(
        df[df['report'].apply(lambda x: x.get('test_timeout', False))][
            'instance_id'
        ].unique()
    ):
        output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n'

    with open(output_md_filepath, 'w') as f:
        f.write(output_md)
else:
    print(
        f'No report file found: neither {swebench_official_report_json} nor {openhands_remote_report_jsonl} exists.'
    )
    exit(1)

if os.path.exists(args.input_file + '.bak'):
    conf = input('Existing backup file found. Do you want to overwrite it? (y/n) ')
    if conf != 'y':
        exit()
    os.remove(args.input_file + '.bak')

# Back up the original file, then rewrite it with the updated `report` field.
os.rename(args.input_file, args.input_file + '.bak')
df.to_json(args.input_file, orient='records', lines=True)
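
# Example usage (hypothetical file name for this script):
#   python update_output_with_eval.py evals/output.jsonl
# Writes README.md (official report) or output.swebench_eval.md (remote eval)
# next to the input, backs up the original as output.jsonl.bak, and rewrites
# output.jsonl with a per-instance `report` field.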