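"""Summarize OpenHands evaluation results and classify failures with an LLM.

Reads a .jsonl results file produced by an evaluation run, splits its
instances into passed and failed cases, asks the configured LLM to assign
each failure one of five error categories (E1-E5), and prints a summary.
Classified failures are cached in a sibling `*_failed.jsonl` file so that
reruns skip already-classified cases.
"""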
import json
import os
import pprint

import tqdm

from openhands.core.config import get_llm_config_arg, get_parser, load_app_config
from openhands.core.logger import openhands_logger as logger
from openhands.llm.llm import LLM

config = load_app_config()

def extract_test_results(
    res_file_path: str,
) -> tuple[list[dict], list[dict], list[float]]:
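    """Split evaluation results into passed and failed cases.

    Returns `(passed, failed, costs)`, where `costs` holds the accumulated
    cost of every unique instance. As a side effect, rewrites `res_file_path`
    with duplicate instance_ids dropped and instances sorted by instance_id.
    """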
    passed = []
    failed = []
    costs = []
    instance_ids = set()
    instances = []
    with open(res_file_path, 'r') as file:
        for line in file:
            data = json.loads(line.strip())
            success = data['metrics']['success']
            if data['instance_id'] in instance_ids:
                print(f'WARNING: Duplicate instance_id found: {data["instance_id"]}')
                continue
            instance_ids.add(data['instance_id'])
            instances.append(data)
            if success:
                passed.append(
                    {
                        'instance_id': data['instance_id'],
                        'repo': data['repo'],
                        'instruction': data['instruction'],
                        'eval_script': data['eval_script'],
                        'eval_exit_code': data['eval_exit_code'],
                        'eval_output': data['eval_output'],
                        'accumulated_cost': data['metrics']['accumulated_cost'],
                    }
                )
            else:
                failed.append(
                    {
                        'instance_id': data['instance_id'],
                        'repo': data['repo'],
                        'instruction': data['instruction'],
                        'metadata': data['metadata'],
                        'history': data['history'],
                        'eval_script': data['eval_script'],
                        'eval_exit_code': data['eval_exit_code'],
                        'eval_output': data['eval_output'],
                        'accumulated_cost': data['metrics']['accumulated_cost'],
                    }
                )
            costs.append(data['metrics']['accumulated_cost'])

    # sort by instance_id
    instances.sort(key=lambda x: x['instance_id'])
    with open(res_file_path, 'w') as file:
        for instance in instances:
            file.write(json.dumps(instance) + '\n')
    return passed, failed, costs

def classify_error(llm: LLM, failed_case: dict) -> str:
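    """Ask the LLM to assign a failed case one of the categories E1-E5.

    If the LLM call fails, falls back to prompting a human on stdin;
    raises `ValueError` if the resulting label is not a valid category.
    """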
    prompt = f"""
Please classify the error for the following failed case based on the history and eval_output:

Instruction:
{failed_case['instruction']}

Eval Script:
{failed_case['eval_script']}

History:
{failed_case['history']}

Eval Output:
{failed_case['eval_output']}

The error categories are:
E1: Hallucination Errors - The model misinterpreted the user's intention, misplaced Python code and bash script, or generated random or irrelevant code.
E2: Lack of Knowledge or Information - The model lacks sufficient information or domain-specific knowledge to satisfy the user's requirements.
E3: Knowledge Manipulation - The model failed to integrate or manipulate information properly.
E4: Syntax Errors - The model generated code with syntax errors.
E5: Operational Error - The model gave up easily or exited without finishing the tasks.

Please provide only the error category (E1, E2, E3, E4, or E5) without any explanation.
"""
    try:
        response = llm.completion(messages=[{'content': prompt, 'role': 'user'}])
        # Strip whitespace so a response like 'E2\n' still matches a category.
        error_category = response.choices[0].message['content'].strip()
    except Exception as e:
        logger.error(
            f"Failed to classify the error for the failed case: {failed_case['instance_id']}"
        )
        logger.error(e)
        # Fall back to asking a human to classify the case on stdin.
        error_category = input(
            failed_case['instruction']
            + ': '
            + failed_case['eval_script']
            + ' - '
            + failed_case['eval_output']
        )
    if error_category not in ['E1', 'E2', 'E3', 'E4', 'E5']:
        raise ValueError(f'Invalid error category: {error_category}')
    return error_category

if __name__ == '__main__':
    parser = get_parser()
    parser.add_argument(
        '--json_file_path',
        type=str,
        required=True,
        help='Path to the jsonl file containing the evaluation results',
    )
    args, _ = parser.parse_known_args()

    # Check https://github.com/All-Hands-AI/OpenHands/blob/main/evaluation/swe_bench/README.md#configure-openhands-and-your-llm
    # for details of how to set `llm_config`
    if args.llm_config:
        specified_llm_config = get_llm_config_arg(args.llm_config)
        if specified_llm_config:
            config.llm = specified_llm_config
    logger.info(f'Config for evaluation: {config}')
    # Use config.llm (updated above when an LLM config is specified) so the
    # script also runs when no override is passed on the command line.
    llm = LLM(llm_config=config.llm)
    passed, new_failed, costs = extract_test_results(args.json_file_path)

    # Previously classified failures are cached in a sibling *_failed.jsonl
    # file, so reruns only classify cases that are new since the last run.
    failed_file_path = args.json_file_path.replace('.jsonl', '_failed.jsonl')
    failed = []
    if os.path.exists(failed_file_path):
        with open(failed_file_path, 'r') as file:
            for line in file:
                failed.append(json.loads(line.strip()))
        print(f'Loaded {len(failed)} failed cases from {failed_file_path}')

    classified_ids = {case['instance_id'] for case in failed}
    for failed_case in tqdm.tqdm(new_failed):
        if failed_case['instance_id'] in classified_ids:
            continue
        error_category = classify_error(llm, failed_case)
        failed_case['error_category'] = error_category
        failed.append(failed_case)
        classified_ids.add(failed_case['instance_id'])
        # Append immediately so progress survives an interrupted run.
        with open(failed_file_path, 'a') as file:
            file.write(json.dumps(failed_case) + '\n')
    # Print the summary
    print('Summary:')
    print(f'Passed: {len(passed)}')
    print(f'Failed: {len(failed)}')
    print(f'Costs: {costs}')
    print('Failed cases:')
    error_categories = {}
    for case in failed:
        error_category = case['error_category']
        if error_category not in error_categories:
            error_categories[error_category] = 0
        error_categories[error_category] += 1
    pprint.pprint(error_categories)
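
# Example invocation (paths, script name, and config group are illustrative;
# the LLM flag is whichever option get_parser() binds to `args.llm_config`):
#   poetry run python summarize_results.py \
#       --json_file_path evaluation/evaluation_outputs/output.jsonl \
#       --llm-config eval_gpt4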