|
|
@@ -9,6 +9,7 @@ import time
|
|
|
from concurrent.futures import ProcessPoolExecutor
|
|
|
|
|
|
import pandas as pd
|
|
|
+import toml
|
|
|
import whatthepatch
|
|
|
from datasets import load_dataset
|
|
|
from tqdm import tqdm
|
|
|
@@ -296,11 +297,27 @@ def process_instance(
|
|
|
return output
|
|
|
|
|
|
|
|
|
def filter_dataset(dataset: pd.DataFrame, filter_column: str) -> pd.DataFrame:
    """Optionally restrict ``dataset`` to ids listed in a sibling ``config.toml``.

    Looks for a ``config.toml`` file next to this script. When the file
    exists and contains a ``selected_ids`` list, only the rows whose
    ``filter_column`` value appears in that list are kept. In every other
    case the dataset is returned unchanged.

    Args:
        dataset: The full task table to (possibly) subset.
        filter_column: Name of the column matched against ``selected_ids``
            (e.g. ``'instance_id'``).

    Returns:
        The filtered DataFrame, or the original one when no filter applies.
    """
    config_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'config.toml')
    # No config file next to the script -> nothing to filter.
    if not os.path.exists(config_path):
        return dataset
    with open(config_path, 'r') as config_file:
        config = toml.load(config_file)
    # Config present but without a selection list -> keep everything.
    if 'selected_ids' not in config:
        return dataset
    selected_ids = config['selected_ids']
    logger.info(
        f'Filtering {len(selected_ids)} tasks from "selected_ids"...'
    )
    filtered = dataset[dataset[filter_column].isin(selected_ids)]
    logger.info(f'Retained {filtered.shape[0]} tasks after filtering')
    return filtered
|
|
|
+
|
|
|
+
|
|
|
if __name__ == '__main__':
|
|
|
# NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
|
|
|
# so we don't need to manage file uploading to OpenDevin's repo
|
|
|
dataset = load_dataset('princeton-nlp/SWE-bench_Lite')
|
|
|
- swe_bench_tests = dataset['test'].to_pandas()
|
|
|
+ swe_bench_tests = filter_dataset(dataset['test'].to_pandas(), 'instance_id')
|
|
|
|
|
|
# Check https://github.com/OpenDevin/OpenDevin/blob/main/evaluation/swe_bench/README.md#configure-opendevin-and-your-llm
|
|
|
# for details of how to set `llm_config`
|