
SWE-bench: Add summarise utility script to view passed/failed task IDs (#2137)

* SWE-bench: Add summarise utility script to view passed/failed task IDs

* Fix typos

* Move file

* Prettify

* Use merged jsonl file
Boxuan Li 1 year ago
parent
commit
4d14b44a9a
2 changed files with 51 additions and 1 deletion
  1. evaluation/swe_bench/README.md: 12 additions, 1 deletion
  2. evaluation/swe_bench/scripts/summarise_results.py: 39 additions, 0 deletions

+ 12 - 1
evaluation/swe_bench/README.md

@@ -116,9 +116,11 @@ selected_ids = ['sphinx-doc__sphinx-8721', 'sympy__sympy-14774', 'scikit-learn__
 Then only these tasks (rows whose `instance_id` is in the above list) will be evaluated.
 In this case, `eval_limit` option applies to tasks that are in the `selected_ids` list.
 
+After running the inference, you will obtain an `output.jsonl` file (by default it will be saved to `evaluation/evaluation_outputs`).
+
 ## Evaluate Generated Patches
 
-After running the inference described in the previous section, you will obtain a `output.jsonl` (by default it will save to `evaluation/evaluation_outputs`). Then you can run this one line script to evaluate generated patches, and produce a fine-grained report:
+With the `output.jsonl` file, you can run `eval_infer.sh` to evaluate the generated patches and produce a fine-grained report.
 
 If you want to evaluate existing results, you should first run this to clone existing outputs
 
@@ -185,6 +187,15 @@ It will contain an additional field `fine_grained_report` (see example below) c
 
 Please refer to [EVAL_PATCH.md](./EVAL_PATCH.md) if you want to learn more about how to evaluate patches that are already generated (e.g., not by OpenDevin).
 
+## View Result Summary
+
+If you just want to know the resolve rate, and/or a summary of which tasks passed and which failed, you can run
+
+```bash
+poetry run python ./evaluation/swe_bench/scripts/summarise_results.py <path_to_output_merged_jsonl_file>
+# e.g. poetry run python ./evaluation/swe_bench/scripts/summarise_results.py ./evaluation/evaluation_outputs/outputs/swe_bench_lite/CodeActSWEAgent/gpt-4o-2024-05-13_maxiter_50_N_v1.5-no-hint/output.merged.jsonl
+```
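+
+The script prints the resolve rate followed by the lists of passed and failed instance IDs. Using the illustrative IDs from the `selected_ids` example above, the output looks roughly like this (counts and IDs here are hypothetical):
+
+```
+Passed 2 tests, failed 1 tests, resolve rate = 0.6666666666666666
+PASSED TESTS:
+['sympy__sympy-14774', 'scikit-learn__scikit-learn-10508']
+FAILED TESTS:
+['sphinx-doc__sphinx-8721']
+```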
+
 ## Submit your evaluation results
 
 You can start your own fork of [our huggingface evaluation outputs](https://huggingface.co/spaces/OpenDevin/evaluation) and submit a PR of your evaluation results following the guide [here](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions#pull-requests-and-discussions).

+ 39 - 0
evaluation/swe_bench/scripts/summarise_results.py

@@ -0,0 +1,39 @@
+import json
+import sys
+
+
+def extract_test_results(json_file_path):
+    """Read a merged output JSONL file and split instance IDs into passed and failed lists."""
+    passed_tests = []
+    failed_tests = []
+    with open(json_file_path, 'r') as file:
+        for line in file:
+            data = json.loads(line.strip())
+            instance_id = data['instance_id']
+            # Prefer the fine-grained report when present; otherwise fall back to `test_result`.
+            if 'fine_grained_report' in data:
+                resolved = data['fine_grained_report']['resolved']
+            else:
+                resolved = data['test_result']['result']['resolved']
+            if resolved:
+                passed_tests.append(instance_id)
+            else:
+                failed_tests.append(instance_id)
+    return passed_tests, failed_tests
+
+
+if __name__ == '__main__':
+    if len(sys.argv) != 2:
+        print(
+            'Usage: poetry run python summarise_results.py <path_to_output_merged_jsonl_file>'
+        )
+        sys.exit(1)
+    json_file_path = sys.argv[1]
+    passed_tests, failed_tests = extract_test_results(json_file_path)
+    succ_rate = len(passed_tests) / (len(passed_tests) + len(failed_tests))
+    print(
+        f'\nPassed {len(passed_tests)} tests, failed {len(failed_tests)} tests, resolve rate = {succ_rate}'
+    )
+    print('PASSED TESTS:')
+    print(passed_tests)
+    print('FAILED TESTS:')
+    print(failed_tests)
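
For reference, each JSONL line must carry an `instance_id` plus either a `fine_grained_report` or a `test_result` entry with a boolean `resolved` flag. Below is a minimal sketch of both record shapes and of calling `extract_test_results` on them; the instance IDs and values are illustrative, and the function is assumed to be in scope from the script above.

```python
import json
import tempfile

# Two hypothetical records covering the shapes handled by extract_test_results:
records = [
    # evaluated record (carries `fine_grained_report`)
    {'instance_id': 'sympy__sympy-14774', 'fine_grained_report': {'resolved': True}},
    # record without a fine-grained report (only `test_result`)
    {
        'instance_id': 'sphinx-doc__sphinx-8721',
        'test_result': {'result': {'resolved': False}},
    },
]

# Write the records to a temporary .jsonl file, one JSON object per line.
with tempfile.NamedTemporaryFile('w', suffix='.jsonl', delete=False) as f:
    f.write('\n'.join(json.dumps(r) for r in records) + '\n')

passed, failed = extract_test_results(f.name)  # function defined in the script above
print(passed)  # ['sympy__sympy-14774']
print(failed)  # ['sphinx-doc__sphinx-8721']
```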