
(eval) Aider_bench: add eval_ids arg to run specific instance IDs (#3592)

* add eval_ids arg to run specific instance IDs; fix/extend README

* fix description in parser for --eval-ids

* fix test_arg_parser.py to account for added arg

* fix typo in README to say "summarize" instead of "summarise" for script
tobitege 1 year ago
parent
commit
8fcf0817d4

+ 26 - 19
evaluation/aider_bench/README.md

@@ -16,42 +16,49 @@ development environment and LLM.
 ## Start the evaluation
 
 ```bash
-./evaluation/agent_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit]
+./evaluation/aider_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [eval-num-workers] [eval_ids]
 ```
 
--   `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for
+- `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for
    your LLM settings, as defined in your `config.toml`.
--   `git-version`, e.g. `HEAD`, is the git commit hash of the OpenHands version
-    you would like to evaluate. It could also be a release tag like `0.6.2`.
--   `agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks,
+- `git-version`, e.g. `HEAD`, is the git commit hash of the OpenHands version
+    you would like to evaluate. It could also be a release tag like `0.9.0`.
+- `agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks,
    defaulting to `CodeActAgent`.
--   `eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit`
+- `eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit`
    instances. By default, the script evaluates the entire Exercism test set
    (133 issues). Note: in order to use `eval_limit`, you must also set `agent`.
+- `eval-num-workers`: the number of workers to use for evaluation. Default: `1`.
+- `eval_ids`, e.g. `"1,3,10"`, limits the evaluation to instances with the
+    given IDs (comma separated).
 
 Following is the basic command to start the evaluation.
 
 You can update the arguments in the script
-`evaluation/agent_bench/scripts/run_infer.sh`, such as `--max-iterations`,
-`--eval-num-workers` and so on.
-
--   `--agent-cls`, the agent to use. For example, `CodeActAgent`.
--   `--llm-config`: the LLM configuration to use. For example,
-    `eval_gpt4_1106_preview`.
--   `--max-iterations`: the number of iterations to run the evaluation. For
-    example, `30`.
--   `--eval-num-workers`: the number of workers to use for evaluation. For
-    example, `5`.
--   `--eval-n-limit`: the number of examples to evaluate. For example, `100`.
+`evaluation/aider_bench/scripts/run_infer.sh`, such as `--max-iterations`,
+`--eval-num-workers` and so on:
+
+- `--agent-cls`, the agent to use. For example, `CodeActAgent`.
+- `--llm-config`: the LLM configuration to use. For example, `eval_gpt4_1106_preview`.
+- `--max-iterations`: the max allowed number of iterations to run the evaluation. Default: `30`.
+- `--eval-num-workers`: the number of workers to use for evaluation. Default: `1`.
+- `--eval-n-limit`: the number of examples to evaluate. For example, `100`.
+- `--eval-ids`: the IDs of the examples to evaluate (comma separated). For example, `"1,3,10"`.
 
 ```bash
-./evaluation/aider_bench/scripts/run_infer.sh eval_gpt35_turbo HEAD CodeActAgent 1
+./evaluation/aider_bench/scripts/run_infer.sh eval_gpt35_turbo HEAD CodeActAgent 100 1 "1,3,10"
 ```
 
 ## Summarize Results
 
 ```bash
-poetry run python ./evaluation/agent_bench/scripts/summarise_results.py [path_to_output_jsonl_file]
+poetry run python ./evaluation/aider_bench/scripts/summarize_results.py [path_to_output_jsonl_file]
+```
+
+Full example:
+
+```bash
+poetry run python ./evaluation/aider_bench/scripts/summarize_results.py evaluation/evaluation_outputs/outputs/AiderBench/CodeActAgent/claude-3-5-sonnet@20240620_maxiter_30_N_v1.9/output.jsonl
 ```
 
 This will list the instances that passed and the instances that failed. For each

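For orientation, the positional arguments in the updated example invocation above map onto the documented parameters in order: `eval_gpt35_turbo` is the `model_config`, `HEAD` the `git-version`, `CodeActAgent` the `agent`, `100` the `eval_limit`, `1` the `eval-num-workers`, and `"1,3,10"` the `eval_ids` string that the code changes below split and filter on.
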
+ 10 - 1
evaluation/aider_bench/run_infer.py

@@ -245,7 +245,16 @@ if __name__ == '__main__':
         args.eval_output_dir,
     )
     output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
-    instances = prepare_dataset(aider_bench_tests, output_file, args.eval_n_limit)
+
+    # Parse dataset IDs if provided
+    eval_ids = None
+    if args.eval_ids:
+        eval_ids = str(args.eval_ids).split(',')
+        logger.info(f'Using specific dataset IDs: {eval_ids}')
+
+    instances = prepare_dataset(
+        aider_bench_tests, output_file, args.eval_n_limit, eval_ids=eval_ids
+    )
 
     asyncio.run(
         run_evaluation(

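Taken together with the `--eval-ids` option added to `openhands/core/config.py` further down, the parsing above is straightforward. A minimal, runnable sketch of just these two pieces (the standalone parser is an illustrative stand-in, not the project's real `get_parser()`; the sample value comes from the README example):

```python
import argparse

# Illustrative stand-in for the --eval-ids option this commit adds to
# get_parser() in openhands/core/config.py: string-typed, default None.
parser = argparse.ArgumentParser()
parser.add_argument('--eval-ids', default=None, type=str)

args = parser.parse_args(['--eval-ids', '1,3,10'])

# Same parsing as the run_infer.py hunk above: split on commas and keep
# the IDs as strings; conversion to the dataset's dtype happens later,
# inside prepare_dataset in evaluation/utils/shared.py.
eval_ids = None
if args.eval_ids:
    eval_ids = str(args.eval_ids).split(',')

print(eval_ids)  # ['1', '3', '10']
```
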
+ 6 - 0
evaluation/aider_bench/scripts/run_infer.sh

@@ -8,6 +8,7 @@ COMMIT_HASH=$2
 AGENT=$3
 EVAL_LIMIT=$4
 NUM_WORKERS=$5
+EVAL_IDS=$6
 
 if [ -z "$NUM_WORKERS" ]; then
   NUM_WORKERS=1
@@ -39,5 +40,10 @@ if [ -n "$EVAL_LIMIT" ]; then
   COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
 fi
 
+if [ -n "$EVAL_IDS" ]; then
+  echo "EVAL_IDS: $EVAL_IDS"
+  COMMAND="$COMMAND --eval-ids $EVAL_IDS"
+fi
+
 # Run the command
 eval $COMMAND

+ 11 - 2
evaluation/utils/shared.py

@@ -164,7 +164,12 @@ def make_metadata(
     return metadata
 
 
-def prepare_dataset(dataset: pd.DataFrame, output_file: str, eval_n_limit: int):
+def prepare_dataset(
+    dataset: pd.DataFrame,
+    output_file: str,
+    eval_n_limit: int,
+    eval_ids: list[str] | None = None,
+):
     assert (
         'instance_id' in dataset.columns
     ), "Expected 'instance_id' column in the dataset. You should define your own unique identifier for each instance and use it as the 'instance_id' column."
@@ -180,7 +185,11 @@ def prepare_dataset(dataset: pd.DataFrame, output_file: str, eval_n_limit: int):
             f'Output file {output_file} already exists. Loaded {len(finished_ids)} finished instances.'
         )
 
-    if eval_n_limit:
+    if eval_ids:
+        eval_ids_converted = [dataset[id_column].dtype.type(id) for id in eval_ids]
+        dataset = dataset[dataset[id_column].isin(eval_ids_converted)]
+        logger.info(f'Limiting evaluation to {len(eval_ids)} specific instances.')
+    elif eval_n_limit:
         dataset = dataset.head(eval_n_limit)
         logger.info(f'Limiting evaluation to first {eval_n_limit} instances.')
 

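The subtle part of this hunk is the type conversion: the IDs arrive as strings from the command line, while the dataset's `instance_id` column may be numeric, so each ID is cast with the column dtype's scalar constructor before the `isin` filter. A self-contained sketch of that behavior, with a made-up DataFrame standing in for the benchmark data (an integer-typed `instance_id` is assumed here):

```python
import pandas as pd

# Toy stand-in for the benchmark dataset; values are illustrative only.
dataset = pd.DataFrame({'instance_id': [1, 2, 3, 10],
                        'task': ['a', 'b', 'c', 'd']})
id_column = 'instance_id'

eval_ids = ['1', '3', '10']  # strings, exactly as split from --eval-ids

# Without conversion, isin() would match nothing: '1' != 1 in an int64
# column. dtype.type is the column's scalar constructor (numpy.int64
# here), so each string ID is cast to the column's own type first.
eval_ids_converted = [dataset[id_column].dtype.type(i) for i in eval_ids]
dataset = dataset[dataset[id_column].isin(eval_ids_converted)]

print(dataset)  # keeps the rows with instance_id 1, 3 and 10
```

Note also that `eval_ids` takes precedence over `eval_n_limit` (the `elif`): when explicit IDs are given, the `head()` truncation is skipped entirely.
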
+ 6 - 0
openhands/core/config.py

@@ -740,6 +740,12 @@ def get_parser() -> argparse.ArgumentParser:
         type=str,
         help='Name for the session',
     )
+    parser.add_argument(
+        '--eval-ids',
+        default=None,
+        type=str,
+        help='The comma-separated list (in quotes) of IDs of the instances to evaluate',
+    )
     return parser
 
 

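The updated unit test further down asserts that the help text contains the literal `--eval-ids EVAL_IDS`; that metavar is what argparse derives automatically from the option name. A quick illustrative check (standalone parser, not the project's `get_parser()`):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    '--eval-ids',
    default=None,
    type=str,
    help='The comma-separated list (in quotes) of IDs of the instances to evaluate',
)

# argparse uppercases the dest ('eval_ids') to build the default metavar,
# so the help output contains the line test_arg_parser.py looks for.
help_output = parser.format_help()
assert '--eval-ids EVAL_IDS' in help_output
```
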
+ 1 - 1
poetry.lock

@@ -9457,4 +9457,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.11"
-content-hash = "ea650e78171ccd3088112c232ca9b09b180db502bc45a22feaf313acfeaf83b6"
+content-hash = "f6abf770480dfd3a739d3d0b4499b601df44f130b27684f34b5f6791950e99d8"

+ 1 - 2
pyproject.toml

@@ -52,6 +52,7 @@ PyPDF2 = "*"
 python-pptx = "*"
 pylatexenc = "*"
 tornado = "*"
+python-dotenv = "*"
 
 [tool.poetry.group.llama-index.dependencies]
 llama-index = "*"
@@ -82,7 +83,6 @@ reportlab = "*"
 [tool.coverage.run]
 concurrency = ["gevent"]
 
-
 [tool.poetry.group.runtime.dependencies]
 jupyterlab = "*"
 notebook = "*"
@@ -113,7 +113,6 @@ ignore = ["D1"]
 [tool.ruff.lint.pydocstyle]
 convention = "google"
 
-
 [tool.poetry.group.evaluation.dependencies]
 streamlit = "*"
 whatthepatch = "*"

+ 3 - 2
tests/unit/test_arg_parser.py

@@ -104,7 +104,7 @@ def test_help_message(capsys):
         parser.parse_args(['--help'])
     captured = capsys.readouterr()
     help_output = captured.out
-
+    print(help_output)
     expected_elements = [
         'usage:',
         'Run an agent with a specific task',
@@ -120,6 +120,7 @@ def test_help_message(capsys):
         '--eval-n-limit EVAL_N_LIMIT',
         '--eval-num-workers EVAL_NUM_WORKERS',
         '--eval-note EVAL_NOTE',
+        '--eval-ids EVAL_IDS',
         '-l LLM_CONFIG, --llm-config LLM_CONFIG',
         '-n NAME, --name NAME',
     ]
@@ -128,4 +129,4 @@ def test_help_message(capsys):
         assert element in help_output, f"Expected '{element}' to be in the help message"
 
     option_count = help_output.count('  -')
-    assert option_count == 13, f'Expected 13 options, found {option_count}'
+    assert option_count == 14, f'Expected 14 options, found {option_count}'