@@ -0,0 +1,316 @@
+###########################################################################################################
+# Adapted from https://github.com/TheAgentCompany/TheAgentCompany/blob/main/evaluation/summarise_results.py
+###########################################################################################################
+
+
+import glob
+import json
+import os
+import re
+import sys
+from typing import Dict, Tuple
+
+
+def calculate_cost(model: str, prompt_tokens: int, completion_tokens: int) -> float:
+    """
+    Calculate the cost of the model call.
+    """
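+    # For example, with the rates below a claude-3-5-sonnet call using 10,000 prompt tokens
+    # and 1,000 completion tokens would be estimated at 0.000003 * 10_000 + 0.000015 * 1_000 = $0.045.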
+    if 'claude-3-5-sonnet' in model.lower():
+        # https://www.anthropic.com/pricing#anthropic-api, accessed 12/11/2024
+        return 0.000003 * prompt_tokens + 0.000015 * completion_tokens
+    elif 'gpt-4o' in model.lower():
+        # https://openai.com/api/pricing/, accessed 12/11/2024
+        return 0.0000025 * prompt_tokens + 0.00001 * completion_tokens
+    elif 'gemini-1.5-pro' in model.lower():
+        # https://ai.google.dev/pricing#1_5pro, accessed 12/11/2024
+        # assuming prompts up to 128k tokens
+        cost = 0.00000125 * prompt_tokens + 0.000005 * completion_tokens
+        if prompt_tokens > 128000:
+            cost *= 2
+        return cost
+    elif 'gemini-2.0-flash-exp' in model.lower():
+        # price unknown for gemini-2.0-flash-exp, assuming same price as gemini-1.5-flash
+        cost = 0.000000075 * prompt_tokens + 0.0000003 * completion_tokens
+        if prompt_tokens > 128000:
+            cost *= 2
+        return cost
+    elif 'qwen2-72b' in model.lower():
+        # assuming hosted on Together
+        # https://www.together.ai/pricing, accessed 12/11/2024
+        return 0.0000009 * (prompt_tokens + completion_tokens)
+    elif 'qwen2p5-72b' in model.lower():
+        # assuming hosted on Together
+        # https://www.together.ai/pricing, accessed 12/14/2024
+        return 0.0000012 * (prompt_tokens + completion_tokens)
+    elif 'llama-v3p1-405b-instruct' in model.lower():
+        # assuming hosted on Fireworks AI
+        # https://fireworks.ai/pricing, accessed 12/11/2024
+        return 0.000003 * (prompt_tokens + completion_tokens)
+    elif 'llama-v3p1-70b-instruct' in model.lower():
+        # assuming hosted on Fireworks AI
+        return 0.0000009 * (prompt_tokens + completion_tokens)
+    elif 'llama-v3p3-70b-instruct' in model.lower():
+        # assuming hosted on Fireworks AI
+        return 0.0000009 * (prompt_tokens + completion_tokens)
+    elif 'amazon.nova-pro-v1:0' in model.lower():
+        # assuming hosted on Amazon Bedrock
+        # https://aws.amazon.com/bedrock/pricing/, accessed 12/11/2024
+        return 0.0000008 * prompt_tokens + 0.0000032 * completion_tokens
+    else:
+        raise ValueError(f'Unknown model: {model}')
+
+
+def analyze_eval_json_file(filepath: str) -> Tuple[int, int]:
+    """
+    Analyze a single eval JSON file and extract the total and result from final_score.
+
+    Args:
+        filepath: Path to the JSON file
+
+    Returns:
+        Tuple containing (total, result) from final_score
+    """
+    try:
+        with open(filepath, 'r') as f:
+            data = json.load(f)
+
+        final_score = data.get('final_score', {})
+        return (final_score.get('total', 0), final_score.get('result', 0))
+    except json.JSONDecodeError as e:
+        print(f'Error decoding JSON in {filepath}: {e}')
+        return (0, 0)
+    except Exception as e:
+        print(f'Error processing {filepath}: {e}')
+        return (0, 0)
+
+
+def analyze_traj_json_file(filepath: str) -> Tuple[int, float]:
+    """
+    Analyze a single trajectory JSON file and extract the steps and tokens
+    for each step. Then estimate the cost based on the tokens and the model type.
+    Note: this assumes there is no prompt caching at all.
+    """
+    steps: int = 0
+    cost: float = 0.0
+    with open(filepath, 'r') as f:
+        data = json.load(f)
+    response_id = None
+    for action in data:
+        if 'tool_call_metadata' in action:
+            if action['tool_call_metadata']['model_response']['id'] != response_id:
+                response_id = action['tool_call_metadata']['model_response']['id']
+            else:
+                # OpenHands displays the same model response metadata multiple times when
+                # a single LLM call leads to multiple actions and observations.
+                continue
+            steps += 1
+            usage = action['tool_call_metadata']['model_response']['usage']
+            model: str = action['tool_call_metadata']['model_response']['model']
+            prompt_tokens = usage['prompt_tokens']
+            completion_tokens = usage['completion_tokens']
+            cost += calculate_cost(model, prompt_tokens, completion_tokens)
+
+    return (steps, cost)
+
+
+def analyze_folder(
+    folder_path: str,
+) -> Tuple[Dict[str, Tuple[int, int]], Dict[str, Tuple[int, float]]]:
+    """
+    Analyze all eval_*.json & traj_*.json files in the specified folder.
+
+    Args:
+        folder_path: Path to the folder containing JSON files
+
+    Returns:
+        Two dictionaries:
+        - eval_results: Dictionary with task name as key and (total, result) tuple as value
+        - traj_results: Dictionary with task name as key and (steps, cost) tuple as value
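+
+        For example, a file named 'eval_<task_name>.json' contributes the key '<task_name>'
+        to eval_results, and 'traj_<task_name>.json' contributes the same key to traj_results.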
+ """
|
|
|
+ eval_results = {}
|
|
|
+ traj_results = {}
|
|
|
+
|
|
|
+ eval_pattern = os.path.join(folder_path, 'eval_*.json')
|
|
|
+ traj_pattern = os.path.join(folder_path, 'traj_*.json')
|
|
|
+
|
|
|
+ for filepath in glob.glob(eval_pattern):
|
|
|
+ filename = os.path.basename(filepath)
|
|
|
+ total, result = analyze_eval_json_file(filepath)
|
|
|
+ key = re.search(r'eval_(.+)\.json', filename).group(1)
|
|
|
+ eval_results[key] = (total, result)
|
|
|
+
|
|
|
+ for filepath in glob.glob(traj_pattern):
|
|
|
+ filename = os.path.basename(filepath)
|
|
|
+ steps, cost = analyze_traj_json_file(filepath)
|
|
|
+ key = re.search(r'traj_(.+)\.json', filename).group(1)
|
|
|
+ traj_results[key] = (steps, cost)
|
|
|
+
|
|
|
+ return eval_results, traj_results
|
|
|
+
|
|
|
+
|
|
|
+def get_task_nature_category(task_name: str) -> str:
+    """
+    Get the nature category of the task.
+    """
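+    # e.g. a task name beginning with 'hr-' keeps 'hr' as its category; unrecognized prefixes map to 'other'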
+    task_nature = task_name.split('-')[0]
+    if task_nature.lower() in ['sde', 'pm', 'ds', 'admin', 'hr', 'finance']:
+        return task_nature
+    else:
+        return 'other'
+
+
+def calculate_score(total: int, result: int) -> float:
+    """
+    Calculate the score as a number between 0 and 1.
+
+    Formula: score = (result / total) * 0.5 + (result // total) * 0.5
+    Explanation:
+    - (result / total) * 0.5: This is the completion ratio, scaled down to a 0-0.5 range.
+    - (result // total) * 0.5: This is a binary score indicating whether the task was fully completed or not.
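+    - Example: total=4, result=3 gives 3/4 * 0.5 + 3//4 * 0.5 = 0.375, while total=4, result=4
+      gives 0.5 + 0.5 = 1.0.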
+
+    Args:
+        total: Total possible points
+        result: Actual points achieved
+
+    Returns:
+        Score as a number between 0 and 1
+    """
+    if total == 0:
+        # guard against malformed eval files, for which analyze_eval_json_file returns (0, 0)
+        return 0.0
+    return (result / total * 0.5) + (result // total * 0.5)
+
+
+def is_perfect_completion(total: int, result: int) -> bool:
+    """
+    Check if the task achieved perfect completion.
+
+    Args:
+        total: Total possible points
+        result: Actual points achieved
+
+    Returns:
+        True if result equals total, False otherwise
+    """
+    return total > 0 and total == result
+
+
+def main():
+    if len(sys.argv) != 2:
+        print('Usage: poetry run python summarise_results.py <folder_path>')
+        sys.exit(1)
+
+    folder_path = sys.argv[1]
+
+    if not os.path.isdir(folder_path):
+        print(f"Error: '{folder_path}' is not a valid directory")
+        sys.exit(1)
+
+    eval_results, traj_results = analyze_folder(folder_path)
+
+    if not eval_results:
+        print(f'No eval_*.json files found in {folder_path}')
+        return
+
+    # Create list of results with completion ratios for sorting
+    detailed_results = [
+        (
+            task_name,
+            total,
+            result,
+            calculate_score(total, result),
+            is_perfect_completion(total, result),
+            get_task_nature_category(task_name),
+        )
+        for task_name, (total, result) in eval_results.items()
+    ]
+
+    # Sort by score in descending order
+    detailed_results.sort(key=lambda x: (-x[3], x[0]))
+
+    # Calculate perfect completion stats
+    perfect_completions = sum(
+        1 for _, _, _, _, is_perfect, _ in detailed_results if is_perfect
+    )
+
+    # Print header
+    print('\n# Evaluation Results Report')
+    print('\n## Results per File')
+    print('\n*Sorted by score (⭐ indicates perfect completion)*\n')
+
+    # Print table header
+    print(
+        '| Filename | Total | Result | Score | Steps | Cost (assuming no prompt caching)|'
+    )
+    print('|----------|--------|---------|-------|-------|------|')
+
+    # Print individual file results
+    for task_name, total, result, score, is_perfect, task_nature in detailed_results:
+        perfect_marker = ' ⭐' if is_perfect else ''
+        print(
+            f'| {task_name} | {total:,} | {result:,} | {score:.2f}{perfect_marker} | {traj_results[task_name][0]} | {traj_results[task_name][1]:.2f} |'
+        )
+
+    # Print summary section
+    print('\n## Summary\n')
+    print(f'**Tasks Evaluated:** {len(eval_results)}\n')
+    print(
+        f'**Perfect Completions:** {perfect_completions}/{len(eval_results)} ({(perfect_completions/len(eval_results)*100):.2f}%)\n'
+    )
+
+    overall_score = (
+        sum(score for _, _, _, score, _, _ in detailed_results)
+        / len(detailed_results)
+        * 100
+    )
+    avg_steps = sum(steps for steps, _ in traj_results.values()) / len(traj_results)
+    avg_cost = sum(cost for _, cost in traj_results.values()) / len(traj_results)
+    print(f'**Overall Score:** {overall_score:.2f}%\n')
+    print(f'**Average Steps:** {avg_steps:.2f}\n')
+    print(f'**Average Cost (USD):** {avg_cost:.2f}\n')
+
+    # Additional statistics
+    if detailed_results:
+        highest_score = max(score for _, _, _, score, _, _ in detailed_results)
+        lowest_score = min(score for _, _, _, score, _, _ in detailed_results)
+        median_score = detailed_results[len(detailed_results) // 2][3]
+        avg_score = sum(score for _, _, _, score, _, _ in detailed_results) / len(
+            detailed_results
+        )
+
+        print('\n## Statistics\n')
+        print('| Metric | Value |')
+        print('|---------|--------|')
+        print(f'| Highest Task Score | {highest_score*100:.2f}% |')
+        print(f'| Lowest Task Score | {lowest_score*100:.2f}% |')
+        print(f'| Median Task Score | {median_score*100:.2f}% |')
+        print(f'| Average Task Score | {avg_score*100:.2f}% |')
+
+        # compute avg score per nature category
+        print('\n## Statistics per Nature Category\n')
+        print('| Metric | Value |')
+        print('|---------|--------|')
+        for task_nature in ['sde', 'pm', 'ds', 'admin', 'hr', 'finance', 'other']:
+            num_of_tasks = sum(
+                1
+                for _, _, _, _, _, nature_category in detailed_results
+                if nature_category == task_nature
+            )
+            if num_of_tasks == 0:
+                # skip categories with no tasks to avoid dividing by zero
+                continue
+            task_nature_score = (
+                sum(
+                    score
+                    for _, _, _, score, _, nature_category in detailed_results
+                    if nature_category == task_nature
+                )
+                / num_of_tasks
+            )
+            perfect_completions = sum(
+                1
+                for _, _, _, _, is_perfect, nature_category in detailed_results
+                if nature_category == task_nature and is_perfect
+            )
+            print(
+                f'| Perfect Completions for {task_nature} | {perfect_completions}/{num_of_tasks} ({perfect_completions/num_of_tasks*100:.2f}%) |'
+            )
+            print(f'| Average Score for {task_nature} | {task_nature_score*100:.2f}% |')
+
+
+if __name__ == '__main__':
+    main()