
SWE-bench: Add summarise utility script to view passed/failed task IDs (#2137)

* SWE-bench: Add summarise utility script to view passed/failed task IDs

* Fix typos

* Move file

* Prettify

* Use merged jsonl file
Boxuan Li, 1 year ago
Parent
Commit 4d14b44a9a
2 changed files with 51 additions and 1 deletion
1. evaluation/swe_bench/README.md (+12 -1)
2. evaluation/swe_bench/scripts/summarise_results.py (+39 -0)

+ 12 - 1
evaluation/swe_bench/README.md

@@ -116,9 +116,11 @@ selected_ids = ['sphinx-doc__sphinx-8721', 'sympy__sympy-14774', 'scikit-learn__
 Then only these tasks (rows whose `instance_id` is in the above list) will be evaluated.
 In this case, `eval_limit` option applies to tasks that are in the `selected_ids` list.
 
+After running the inference, you will obtain an `output.jsonl` (by default it will be saved to `evaluation/evaluation_outputs`).
+
 ## Evaluate Generated Patches
 
-After running the inference described in the previous section, you will obtain a `output.jsonl` (by default it will save to `evaluation/evaluation_outputs`). Then you can run this one line script to evaluate generated patches, and produce a fine-grained report:
+With the `output.jsonl` file, you can run `eval_infer.sh` to evaluate the generated patches and produce a fine-grained report.
 
 If you want to evaluate existing results, you should first run this to clone existing outputs
 
@@ -185,6 +187,15 @@ It will contains an additional field `fine_grained_report` (see example below) c
 
 Please refer to [EVAL_PATCH.md](./EVAL_PATCH.md) if you want to learn more about how to evaluate patches that are already generated (e.g., not by OpenDevin).
 
+## View Result Summary
+
+If you just want to know the resolve rate and a summary of which tasks passed and which failed, you can run
+
+```bash
+poetry run python ./evaluation/swe_bench/scripts/summarise_results.py <path_to_output_merged_jsonl_file>
+# e.g. poetry run python ./evaluation/swe_bench/scripts/summarise_results.py ./evaluation/evaluation_outputs/outputs/swe_bench_lite/CodeActSWEAgent/gpt-4o-2024-05-13_maxiter_50_N_v1.5-no-hint/output.merged.jsonl
+```
+
 ## Submit your evaluation results
 
 You can start your own fork of [our huggingface evaluation outputs](https://huggingface.co/spaces/OpenDevin/evaluation) and submit a PR of your evaluation results following the guide [here](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions#pull-requests-and-discussions).
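
For reference, the summary printed by `summarise_results.py` (the script added below) looks like the following. The counts and instance IDs in this sketch are made up for illustration; only the output format follows the script's print statements.

```
Passed 1 tests, failed 1 tests, resolve rate = 0.5
PASSED TESTS:
['sphinx-doc__sphinx-8721']
FAILED TESTS:
['sympy__sympy-14774']
```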

+ 39 - 0
evaluation/swe_bench/scripts/summarise_results.py

@@ -0,0 +1,39 @@
+import json
+import sys
+
+
+def extract_test_results(json_file_path):
+    passed_tests = []
+    failed_tests = []
+    with open(json_file_path, 'r') as file:
+        for line in file:
+            data = json.loads(line.strip())
+            instance_id = data['instance_id']
+            resolved = False
+            if 'fine_grained_report' in data:
+                resolved = data['fine_grained_report']['resolved']
+            else:
+                resolved = data['test_result']['result']['resolved']
+            if resolved:
+                passed_tests.append(instance_id)
+            else:
+                failed_tests.append(instance_id)
+    return passed_tests, failed_tests
+
+
+if __name__ == '__main__':
+    if len(sys.argv) != 2:
+        print(
+            'Usage: poetry run python summarise_results.py <path_to_output_merged_jsonl_file>'
+        )
+        sys.exit(1)
+    json_file_path = sys.argv[1]
+    passed_tests, failed_tests = extract_test_results(json_file_path)
+    succ_rate = len(passed_tests) / (len(passed_tests) + len(failed_tests))
+    print(
+        f'\nPassed {len(passed_tests)} tests, failed {len(failed_tests)} tests, resolve rate = {succ_rate}'
+    )
+    print('PASSED TESTS:')
+    print(passed_tests)
+    print('FAILED TESTS:')
+    print(failed_tests)
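
A minimal sketch of how to sanity-check `extract_test_results`, assuming merged JSONL records carry either a `fine_grained_report.resolved` flag or a `test_result.result.resolved` flag, as the function above expects. The instance IDs and the temporary file below are hypothetical and for illustration only.

```python
import json
import tempfile

# Hypothetical records mimicking the two shapes the function handles:
# one with `fine_grained_report`, one with only `test_result.result.resolved`.
sample_records = [
    {'instance_id': 'sphinx-doc__sphinx-8721', 'fine_grained_report': {'resolved': True}},
    {'instance_id': 'sympy__sympy-14774', 'test_result': {'result': {'resolved': False}}},
]

# Write them to a throwaway .jsonl file, one JSON object per line.
with tempfile.NamedTemporaryFile('w', suffix='.jsonl', delete=False) as f:
    for record in sample_records:
        f.write(json.dumps(record) + '\n')
    sample_path = f.name

# Reuse extract_test_results as defined in the script above
# (e.g. paste this snippet below it, or import the function).
passed, failed = extract_test_results(sample_path)
print(passed)  # ['sphinx-doc__sphinx-8721']
print(failed)  # ['sympy__sympy-14774']
```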