@@ -10,16 +10,36 @@ parser.add_argument('input_file', type=str)
args = parser.parse_args()

dirname = os.path.dirname(args.input_file)
-report_json = os.path.join(dirname, 'report.json')

df = pd.read_json(args.input_file, lines=True)

-output_md_filepath = os.path.join(dirname, 'README.md')
instance_id_to_status = defaultdict(
-    lambda: {'resolved': False, 'empty_generation': False}
+    lambda: {
+        'empty_generation': False,
+        'resolved': False,
+        'failed_apply_patch': False,
+        'error_eval': False,
+        'test_timeout': False,
+    }
)
-if os.path.exists(report_json):
-    with open(report_json, 'r') as f:
+
+
+# Apply the status to the dataframe
+def apply_report(row):
+    instance_id = row['instance_id']
+    if instance_id in instance_id_to_status:
+        return dict(instance_id_to_status[instance_id])
+    return row.get('report', {})
+
+
+swebench_official_report_json = os.path.join(dirname, 'report.json')
+openhands_remote_report_jsonl = args.input_file.replace(
+    '.jsonl', '.swebench_eval.jsonl'
+)
+
+if os.path.exists(swebench_official_report_json):
+    output_md_filepath = os.path.join(dirname, 'README.md')
+    with open(swebench_official_report_json, 'r') as f:
        report = json.load(f)

    output_md = (
@@ -70,15 +90,101 @@ if os.path.exists(report_json):
            f'- [{instance_id}](./eval_outputs/{instance_id}/run_instance.log)\n'
        )

-    # Apply the status to the dataframe
-    def apply_report(row):
-        instance_id = row['instance_id']
-        if instance_id in instance_id_to_status:
-            return dict(instance_id_to_status[instance_id])
-        return row.get('report', {})
+    df['report'] = df.apply(apply_report, axis=1)
+
+    with open(output_md_filepath, 'w') as f:
+        f.write(output_md)

+elif os.path.exists(openhands_remote_report_jsonl):
+    output_md_filepath = args.input_file.replace('.jsonl', '.swebench_eval.md')
+
+    df_eval = pd.read_json(openhands_remote_report_jsonl, lines=True, orient='records')
+
+    assert len(df['instance_id'].unique()) == len(
+        df
+    ), 'There are duplicate instance ids in the original output which is not allowed'
+    assert len(df_eval['instance_id'].unique()) == len(
+        df_eval
+    ), 'There are duplicate instance ids in the eval report which is not allowed'
+
+    for _, row in df_eval.iterrows():
+        instance_id_to_status[row['instance_id']] = row['test_result']['report']
    df['report'] = df.apply(apply_report, axis=1)

+    _n_instances = len(df)
+    _n_resolved = len(df[df['report'].apply(lambda x: x.get('resolved', False))])
+    _n_unresolved = _n_instances - _n_resolved
+    _n_empty_patch = len(
+        df[df['report'].apply(lambda x: x.get('empty_generation', False))]
+    )
+    _n_error = len(df[df['report'].apply(lambda x: x.get('error_eval', False))])
+    output_md = (
+        '# SWE-bench Report\n'
+        'This folder contains the evaluation results of SWE-bench using the [official evaluation docker containerization](https://github.com/princeton-nlp/SWE-bench/blob/main/docs/20240627_docker/README.md#choosing-the-right-cache_level).\n\n'
+        '## Summary\n'
+        f'- submitted instances: {_n_instances}\n'
+        f'- empty patch instances: {_n_empty_patch}\n'
+        f'- resolved instances: {_n_resolved}\n'
+        f'- unresolved instances: {_n_unresolved}\n'
+        f'- error instances: {_n_error}\n'
+    )
+
+    def _instance_id_to_log_path(instance_id):
+        path = f"{args.input_file.replace('.jsonl', '.swebench_eval.logs')}/instance_{instance_id}.log"
+        # make it a relative path
+        path = os.path.relpath(path, start=dirname)
+        return path
+
+    output_md += '\n## Resolved Instances\n'
+    # instance_id to status
+    for instance_id in sorted(
+        df[df['report'].apply(lambda x: x.get('resolved', False))][
+            'instance_id'
+        ].unique()
+    ):
+        instance_id_to_status[instance_id]['resolved'] = True
+        output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n'
+
+    output_md += '\n## Unresolved Instances\n'
+    for instance_id in sorted(
+        df[~df['report'].apply(lambda x: x.get('resolved', False))][
+            'instance_id'
+        ].unique()
+    ):
+        output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n'
+
+    output_md += '\n## Error Instances\n'
+    for instance_id in sorted(
+        df[df['report'].apply(lambda x: x.get('error_eval', False))][
+            'instance_id'
+        ].unique()
+    ):
+        instance_id_to_status[instance_id]['error_eval'] = True
+        output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n'
+
+    output_md += '\n## Empty Patch Instances\n'
+    for instance_id in sorted(
+        df[df['report'].apply(lambda x: x.get('empty_generation', False))][
+            'instance_id'
+        ].unique()
+    ):
+        instance_id_to_status[instance_id]['empty_generation'] = True
+        output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n'
+
+    output_md += '\n## Incomplete Instances\n'
+    for instance_id in sorted(
+        df[df['report'].apply(lambda x: x.get('test_timeout', False))][
+            'instance_id'
+        ].unique()
+    ):
+        output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n'
+    with open(output_md_filepath, 'w') as f:
+        f.write(output_md)
+else:
+    print(
+        f'No report file found: Both {swebench_official_report_json} and {openhands_remote_report_jsonl} do not exist.'
+    )
+    exit()

if os.path.exists(args.input_file + '.bak'):
    conf = input('Existing backup file found. Do you want to overwrite it? (y/n)')
@@ -89,6 +195,3 @@ if os.path.exists(args.input_file + '.bak'):
# backup the original file
os.rename(args.input_file, args.input_file + '.bak')
df.to_json(args.input_file, orient='records', lines=True)
-
-with open(output_md_filepath, 'w') as f:
-    f.write(output_md)