Просмотр исходного кода

[Evaluation] Log empty patch stats for SWE-Bench (#2776)

* bump swebench version since the fix PR is merged

* add empty generation stats from latest pr

* delete eval_outputs if it already exists

* handle non string patch
Xingyao Wang 1 год назад
Родитель
Commit
e6cdf18d3b

+ 3 - 0
evaluation/swe_bench/scripts/eval/convert_od_output_to_swe_json.py

@@ -15,6 +15,9 @@ model_name = os.path.basename(os.path.dirname(args.od_output_file))
 
 
 def process_git_patch(patch):
+    if not isinstance(patch, str):
+        return ''
+
     if not patch.strip():
         # skip empty patches
         return ''

+ 18 - 1
evaluation/swe_bench/scripts/eval/update_output_with_eval.py

@@ -15,7 +15,9 @@ report_json = os.path.join(dirname, 'report.json')
 df = pd.read_json(args.input_file, lines=True)
 
 output_md_filepath = os.path.join(dirname, 'README.md')
-instance_id_to_status = defaultdict(lambda: {'resolved': False})
+instance_id_to_status = defaultdict(
+    lambda: {'resolved': False, 'empty_generation': False}
+)
 if os.path.exists(report_json):
     with open(report_json, 'r') as f:
         report = json.load(f)
@@ -25,7 +27,9 @@ if os.path.exists(report_json):
         "This folder contains the evaluation results of the SWE-bench using the [official evaluation docker containerization](https://github.com/princeton-nlp/SWE-bench/blob/main/docs/20240627_docker/README.md#choosing-the-right-cache_level).\n\n"
         "## Summary\n"
         f"- total instances: {report['total_instances']}\n"
+        f"- submitted instances: {report['submitted_instances']}\n"
         f"- completed instances: {report['completed_instances']}\n"
+        f"- empty patch instances: {report['empty_patch_instances']}\n"
         f"- resolved instances: {report['resolved_instances']}\n"
         f"- unresolved instances: {report['unresolved_instances']}\n"
         f"- error instances: {report['error_instances']}\n"
@@ -53,6 +57,19 @@ if os.path.exists(report_json):
             f'- [{instance_id}](./eval_outputs/{instance_id}/run_instance.log)\n'
         )
 
+    output_md += '\n## Empty Patch Instances\n'
+    for instance_id in report['empty_patch_ids']:
+        instance_id_to_status[instance_id]['empty_generation'] = True
+        output_md += (
+            f'- [{instance_id}](./eval_outputs/{instance_id}/run_instance.log)\n'
+        )
+
+    output_md += '\n## Incomplete Instances\n'
+    for instance_id in report['incomplete_ids']:
+        output_md += (
+            f'- [{instance_id}](./eval_outputs/{instance_id}/run_instance.log)\n'
+        )
+
     # Apply the status to the dataframe
     def apply_report(row):
         instance_id = row['instance_id']

+ 5 - 0
evaluation/swe_bench/scripts/eval_infer.sh

@@ -97,6 +97,11 @@ if [ -z "$INSTANCE_ID" ]; then
 
     # move the eval results to the target directory
     mkdir -p $RESULT_OUTPUT_DIR
+    # rm eval_outputs directory if it exists
+    if [ -d $RESULT_OUTPUT_DIR/eval_outputs ]; then
+        rm -rf $RESULT_OUTPUT_DIR/eval_outputs
+    fi
+
     mv run_instance_logs/$RUN_ID/$MODEL_NAME_OR_PATH $RESULT_OUTPUT_DIR
     mv $RESULT_OUTPUT_DIR/$MODEL_NAME_OR_PATH $RESULT_OUTPUT_DIR/eval_outputs
     echo "RUN_ID: $RUN_ID" > $RESULT_OUTPUT_DIR/run_id.txt