@@ -0,0 +1,162 @@
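+"""Summarize OpenDevin evaluation results and classify failed cases into
+error categories (E1-E5) with an LLM, falling back to manual labeling."""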
+import json
+import os
+import pprint
+
+import tqdm
+
+from opendevin.core.config import config, get_llm_config_arg, get_parser
+from opendevin.core.logger import opendevin_logger as logger
+from opendevin.llm.llm import LLM
+
+
+def extract_test_results(res_file_path: str) -> tuple[list[dict], list[dict], list[float]]:
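+    """Split JSONL evaluation results into passed and failed cases.
+
+    Skips duplicate instance_ids, collects the accumulated cost of each
+    instance, and rewrites the results file deduplicated and sorted by
+    instance_id. Returns (passed, failed, costs).
+    """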
+    passed = []
+    failed = []
+    costs = []
+    instance_ids = set()
+    instances = []
+    with open(res_file_path, 'r') as file:
+        for line in file:
+            data = json.loads(line.strip())
+            success = data['metrics']['success']
+            if data['instance_id'] in instance_ids:
+                print(f'WARNING: Duplicate instance_id found: {data["instance_id"]}')
+                continue
+            instance_ids.add(data['instance_id'])
+            instances.append(data)
+            if success:
+                passed.append(
+                    {
+                        'instance_id': data['instance_id'],
+                        'repo': data['repo'],
+                        'instruction': data['instruction'],
+                        'eval_script': data['eval_script'],
+                        'eval_exit_code': data['eval_exit_code'],
+                        'eval_output': data['eval_output'],
+                        'accumulated_cost': data['metrics']['accumulated_cost'],
+                    }
+                )
+            else:
+                failed.append(
+                    {
+                        'instance_id': data['instance_id'],
+                        'repo': data['repo'],
+                        'instruction': data['instruction'],
+                        'metadata': data['metadata'],
+                        'history': data['history'],
+                        'eval_script': data['eval_script'],
+                        'eval_exit_code': data['eval_exit_code'],
+                        'eval_output': data['eval_output'],
+                        'accumulated_cost': data['metrics']['accumulated_cost'],
+                    }
+                )
+            costs.append(data['metrics']['accumulated_cost'])
+
+    # sort by instance_id
+    instances.sort(key=lambda x: x['instance_id'])
+    with open(res_file_path, 'w') as file:
+        for instance in instances:
+            file.write(json.dumps(instance) + '\n')
+    return passed, failed, costs
+
+
+def classify_error(llm: LLM, failed_case: dict) -> str:
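+    """Ask the LLM to classify a failed case into one of the error
+    categories E1-E5; falls back to manual input if the LLM call fails.
+    """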
+    prompt = f"""
+    Please classify the error for the following failed case based on the history and eval_output:
+
+    Instruction:
+    {failed_case['instruction']}
+
+    Eval Script:
+    {failed_case['eval_script']}
+
+    History:
+    {failed_case['history']}
+
+    Eval Output:
+    {failed_case['eval_output']}
+
+    The error categories are:
+    E1: Hallucination Errors - The model misinterpreted the user's intention, misplaced Python code and bash script, or generated random or irrelevant code.
+    E2: Lack of Knowledge or Information - The model lacks sufficient information or domain-specific knowledge to satisfy the user's requirements.
+    E3: Knowledge Manipulation - The model failed to integrate or manipulate information properly.
+    E4: Syntax Errors - The model generated code with syntax errors.
+    E5: Operational Error - The model gave up easily or exited without finishing the task.
+
+    Please provide only the error category (E1, E2, E3, E4, or E5) without any explanation.
+    """
+
+    try:
+        response = llm.completion(messages=[{'content': prompt, 'role': 'user'}])
+        error_category = response.choices[0].message['content'].strip()
+    except Exception as e:
+        logger.error(
+            f"Failed to classify the error for the failed case: {failed_case['instance_id']}"
+        )
+        logger.error(e)
+        # Fall back to manual classification: show the case and read the
+        # category from stdin.
+        error_category = input(
+            failed_case['instruction']
+            + ': '
+            + failed_case['eval_script']
+            + ' - '
+            + failed_case['eval_output']
+        ).strip()
+
+    if error_category not in ['E1', 'E2', 'E3', 'E4', 'E5']:
+        raise ValueError(f'Invalid error category: {error_category}')
+
+    return error_category
+
+
+if __name__ == '__main__':
+    parser = get_parser()
+    parser.add_argument(
+        '--json_file_path',
+        type=str,
+        required=True,
+        help='Path to the jsonl file containing the evaluation results',
+    )
+    args, _ = parser.parse_known_args()
+
+    # Check https://github.com/OpenDevin/OpenDevin/blob/main/evaluation/swe_bench/README.md#configure-opendevin-and-your-llm
+    # for details of how to set `llm_config`
+    if args.llm_config:
+        specified_llm_config = get_llm_config_arg(args.llm_config)
+        if specified_llm_config:
+            config.llm = specified_llm_config
+    logger.info(f'Config for evaluation: {config}')
+    # Use config.llm so `llm` is always defined, even when no --llm_config
+    # override is given.
+    llm = LLM(llm_config=config.llm)
+
+    passed, new_failed, costs = extract_test_results(args.json_file_path)
+
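+    # Resume support: previously classified failures live in
+    # <results>_failed.jsonl; load them so they are not re-classified.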
+    failed = []
+    failed_file_path = args.json_file_path.replace('.jsonl', '_failed.jsonl')
+    if os.path.exists(failed_file_path):
+        with open(failed_file_path, 'r') as file:
+            for line in file:
+                failed.append(json.loads(line.strip()))
+        print(f'Loaded {len(failed)} failed cases from {failed_file_path}')
+
+    for failed_case in tqdm.tqdm(new_failed):
+        if failed_case['instance_id'] in [case['instance_id'] for case in failed]:
+            continue
+        error_category = classify_error(llm, failed_case)
+        failed_case['error_category'] = error_category
+        failed.append(failed_case)
+        with open(failed_file_path, 'a') as file:
+            file.write(json.dumps(failed_case) + '\n')
+
+    # Print the summary
+    print('Summary:')
+    print(f'Passed: {len(passed)}')
+    print(f'Failed: {len(failed)}')
+    print(f'Costs: {costs}')
+    print('Failed cases by error category:')
+    error_categories = {}
+    for case in failed:
+        error_category = case['error_category']
+        if error_category not in error_categories:
+            error_categories[error_category] = 0
+        error_categories[error_category] += 1
+    pprint.pprint(error_categories)