
[eval] update aider bench scripts (#4203)

Xingyao Wang 1 year ago
parent
commit
0c2a35b256

+ 11 - 2
evaluation/aider_bench/run_infer.py

@@ -1,4 +1,5 @@
 import asyncio
+import copy
 import os
 import tempfile
 from typing import Any
@@ -24,6 +25,7 @@ from openhands.core.config import (
     AppConfig,
     SandboxConfig,
     get_llm_config_arg,
+    load_from_toml,
     parse_arguments,
 )
 from openhands.core.logger import openhands_logger as logger
@@ -49,7 +51,7 @@ def get_config(
         runtime='eventstream',
         max_iterations=metadata.max_iterations,
         sandbox=SandboxConfig(
-            base_container_image='python:3.12-bookworm',
+            base_container_image='python:3.11-bookworm',
             enable_auto_lint=True,
             use_host_network=False,
             timeout=100,
@@ -59,6 +61,13 @@ def get_config(
         workspace_mount_path=None,
     )
     config.set_llm_config(metadata.llm_config)
+
+    # copy the 'draft_editor' LLM config if it exists
+    config_copy = copy.deepcopy(config)
+    load_from_toml(config_copy)
+    if 'draft_editor' in config_copy.llms:
+        config.set_llm_config(config_copy.llms['draft_editor'], 'draft_editor')
+
     return config
 
 
@@ -129,7 +138,7 @@ def complete_runtime(
         logger.info(f'Running test file: {script_name}')
 
     action = CmdRunAction(
-        command=f'python -m unittest {script_name}',
+        command=f'python3 -m unittest {script_name}',
         keep_prompt=False,
     )
     logger.info(action, extra={'msg_type': 'ACTION'})
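
Reviewer note: to see what the new draft_editor plumbing does in isolation, here is a minimal, runnable sketch of the pattern — the TOML is loaded into a throwaway deep copy so that only the 'draft_editor' LLM entry is copied back into the evaluation config. The Config dataclass and load_from_toml stub below are simplified stand-ins for openhands' AppConfig and openhands.core.config.load_from_toml, not the real implementations.

    import copy
    from dataclasses import dataclass, field

    @dataclass
    class Config:
        # simplified stand-in for openhands' AppConfig
        llms: dict = field(default_factory=dict)

        def set_llm_config(self, llm_config, name='llm'):
            self.llms[name] = llm_config

    def load_from_toml(config):
        # stub for openhands.core.config.load_from_toml, which reads
        # config.toml and (among other things) populates config.llms
        config.llms.setdefault('draft_editor', {'model': 'example-model'})

    config = Config()
    config.set_llm_config({'model': 'eval-model'})  # metadata.llm_config in run_infer.py

    # load the TOML into a throwaway copy so only 'draft_editor' leaks back
    # into the evaluation config, leaving the eval LLM untouched
    config_copy = copy.deepcopy(config)
    load_from_toml(config_copy)
    if 'draft_editor' in config_copy.llms:
        config.set_llm_config(config_copy.llms['draft_editor'], 'draft_editor')

    print(config.llms)  # {'llm': {...}, 'draft_editor': {...}}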

+ 13 - 7
evaluation/aider_bench/scripts/run_infer.sh

@@ -27,19 +27,25 @@ echo "AGENT: $AGENT"
 echo "AGENT_VERSION: $AGENT_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"
 
-COMMAND="export PYTHONPATH=evaluation/aider_bench:\$PYTHONPATH && poetry run python evaluation/aider_bench/run_infer.py \
-  --agent-cls $AGENT \
-  --llm-config $MODEL_CONFIG \
-  --max-iterations 30 \
-  --max-chars 10000000 \
-  --eval-num-workers $NUM_WORKERS \
-  --eval-note $AGENT_VERSION"
+EVAL_NOTE=$AGENT_VERSION
 
 # Default to NOT using unit tests.
 if [ -z "$USE_UNIT_TESTS" ]; then
   export USE_UNIT_TESTS=false
 fi
 echo "USE_UNIT_TESTS: $USE_UNIT_TESTS"
+# If unit tests are enabled, append '-w-test' to EVAL_NOTE
+if [ "$USE_UNIT_TESTS" = true ]; then
+  EVAL_NOTE=$EVAL_NOTE-w-test
+fi
+
+COMMAND="export PYTHONPATH=evaluation/aider_bench:\$PYTHONPATH && poetry run python evaluation/aider_bench/run_infer.py \
+  --agent-cls $AGENT \
+  --llm-config $MODEL_CONFIG \
+  --max-iterations 30 \
+  --max-chars 10000000 \
+  --eval-num-workers $NUM_WORKERS \
+  --eval-note $EVAL_NOTE"
 
 if [ -n "$EVAL_LIMIT" ]; then
   echo "EVAL_LIMIT: $EVAL_LIMIT"

+ 32 - 97
evaluation/aider_bench/scripts/summarize_results.py

@@ -1,61 +1,25 @@
-import json
-import os
-import sys
+import argparse
 
 import numpy as np
 import pandas as pd
 
-# Try to import visualization libraries
-visualization_available = False
-try:
-    import matplotlib.pyplot as plt
-    import seaborn as sns
 
-    visualization_available = True
-except ImportError:
-    print(
-        '\n*** WARNING: libraries matplotlib and/or seaborn are not installed.\n*** Visualization will not be available!\n'
-    )
-
-
-def show_usage():
-    print(
-        'Usage: poetry run python summarize_results.py <path_to_output_jsonl_file> <model_name>'
-    )
-    print(
-        'Example:\npoetry run python summarize_results.py evaluation/evaluation_outputs/outputs/AiderBench/CodeActAgent/claude-3-5-sonnet@20240620_maxiter_30_N_v1.9/output.jsonl claude-3-5-sonnet@20240620\n'
-    )
-
-
-def print_error(message: str):
-    print(f'\n***\n*** ERROR: {message}\n***\n')
-    show_usage()
-
-
-def extract_test_results(res_file_path: str) -> tuple[list[str], list[str]]:
+def extract_test_results(df: pd.DataFrame) -> tuple[list[str], list[str]]:
     passed = []
     failed = []
-    with open(res_file_path, 'r') as file:
-        for line in file:
-            data = json.loads(line.strip())
-            instance_id = data['instance_id']
-            resolved = False
-            if 'test_result' in data and 'exit_code' in data['test_result']:
-                resolved = data['test_result']['exit_code'] == 0
-            if resolved:
-                passed.append(instance_id)
-            else:
-                failed.append(instance_id)
+    for _, row in df.iterrows():
+        instance_id = row['instance_id']
+        resolved = False
+        if 'test_result' in row and 'exit_code' in row['test_result']:
+            resolved = row['test_result']['exit_code'] == 0
+        if resolved:
+            passed.append(instance_id)
+        else:
+            failed.append(instance_id)
     return passed, failed
 
 
-def visualize_results(json_file_path: str, model: str, output_dir: str):
-    # based on a Colab notebook by RajMaheshwari
-    with open(json_file_path, 'r') as f:
-        data = [json.loads(line) for line in f]
-
-    df = pd.DataFrame.from_records(data)
-
+def visualize_results(df: pd.DataFrame):
     df1 = pd.DataFrame()
     df1['cost'] = df['metrics'].apply(pd.Series)['accumulated_cost']
     df1['result'] = (
@@ -67,60 +31,35 @@ def visualize_results(json_file_path: str, model: str, output_dir: str):
     total = df.shape[0]
     resolve_rate = round((passed / total) * 100, 2)
 
-    print('Number of passed tests:', f'{passed}/{total}')
-
-    if not visualization_available:
-        return resolve_rate
+    print('Number of passed tests:', f'{passed}/{total} {resolve_rate:.2f}%')
+    print('\nDescriptive statistics for number of actions:')
+    print(df1['actions'].describe())
+    print('\nDescriptive statistics for costs:')
+    print(df1['cost'].describe())
 
-    # Cost histogram
-    plt.figure(figsize=(10, 6))
-    bins = 10
-    mx = pd.Series.max(df1['cost'])
-    g = sns.histplot(df1, x='cost', bins=bins, hue='result', multiple='stack')
-    x_ticks = np.around(np.linspace(0, mx, bins + 1), 3)
-    g.set_xticks(x_ticks)
-    g.set_xlabel('Cost in $')
-    g.set_title(f'MODEL: {model}, RESOLVE_RATE: {resolve_rate}%', size=9)
-    plt.tight_layout()
-    plt.savefig(os.path.join(output_dir, 'cost_histogram.png'))
-    plt.close()
+    # Bin counts for actions
+    action_bins = pd.cut(df1['actions'], bins=range(0, 32, 2))
+    print('\nAction bin counts:')
+    print(action_bins.value_counts().sort_index())
 
-    # Actions histogram
-    plt.figure(figsize=(10, 6))
-    bins = np.arange(0, 31, 2)
-    g = sns.histplot(df1, x='actions', bins=bins, hue='result', multiple='stack')
-    g.set_xticks(bins)
-    g.set_xlabel('# of actions')
-    g.set_title(f'MODEL: {model}, RESOLVE_RATE: {resolve_rate}%', size=9)
-    plt.tight_layout()
-    plt.savefig(os.path.join(output_dir, 'actions_histogram.png'))
-    plt.close()
+    # Bin counts for costs
+    cost_bins = pd.cut(df1['cost'], bins=10)
+    print('\nCost bin counts:')
+    print(cost_bins.value_counts().sort_index())
 
     return resolve_rate
 
 
 if __name__ == '__main__':
-    if len(sys.argv) != 3:
-        print_error('Argument(s) missing!')
-        sys.exit(1)
+    parser = argparse.ArgumentParser(description='Summarize AiderBench results')
+    parser.add_argument('input_filepath', type=str, help='Path to the JSONL file')
+    args = parser.parse_args()
 
-    json_file_path = sys.argv[1]
-    model_name = sys.argv[2]
+    # Create DataFrame from JSONL file
+    df = pd.read_json(args.input_filepath, lines=True)
 
-    if not os.path.exists(json_file_path):
-        print_error('Output file does not exist!')
-        sys.exit(1)
-    if not os.path.isfile(json_file_path):
-        print_error('Path-to-output-file is not a file!')
-        sys.exit(1)
-
-    output_dir = os.path.dirname(json_file_path)
-    if not os.access(output_dir, os.W_OK):
-        print_error('Output folder is not writable!')
-        sys.exit(1)
-
-    passed_tests, failed_tests = extract_test_results(json_file_path)
-    resolve_rate = visualize_results(json_file_path, model_name, output_dir)
+    passed_tests, failed_tests = extract_test_results(df)
+    resolve_rate = visualize_results(df)
 
     print(
         f'\nPassed {len(passed_tests)} tests, failed {len(failed_tests)} tests, resolve rate = {resolve_rate:.2f}%'
@@ -129,7 +68,3 @@ if __name__ == '__main__':
     print(passed_tests)
     print('FAILED TESTS:')
     print(failed_tests)
-    print(
-        '\nVisualization results were saved as cost_histogram.png and actions_histogram.png'
-    )
-    print('in folder: ', output_dir)
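
Reviewer note: the rewritten summarizer reads the whole JSONL into a DataFrame and prints text statistics instead of saving seaborn plots. Below is a hedged, self-contained sketch of that flow on synthetic rows; the field values are illustrative, and the real script loads data with pd.read_json(args.input_filepath, lines=True).

    import pandas as pd

    # synthetic rows in the shape run_infer.py emits (values illustrative)
    rows = [
        {'instance_id': 'a', 'test_result': {'exit_code': 0},
         'metrics': {'accumulated_cost': 0.12}},
        {'instance_id': 'b', 'test_result': {'exit_code': 1},
         'metrics': {'accumulated_cost': 0.40}},
    ]
    df = pd.DataFrame.from_records(rows)

    # exit_code == 0 marks an instance as resolved, as in extract_test_results
    resolved = df['test_result'].apply(lambda r: r.get('exit_code') == 0)
    print('passed:', list(df.loc[resolved, 'instance_id']))
    print('failed:', list(df.loc[~resolved, 'instance_id']))

    # pd.cut replaces the old seaborn histograms with plain-text bin counts
    cost = df['metrics'].apply(pd.Series)['accumulated_cost']
    print(pd.cut(cost, bins=10).value_counts().sort_index())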