1 jaar geleden · b11e905988
--- a/evaluation/benchmarks/swe_bench/scripts/eval/verify_costs.py
+++ b/evaluation/benchmarks/swe_bench/scripts/eval/verify_costs.py
@@ -0,0 +1,104 @@
 
				+import argparse
			
 
				+
			
 
				+import pandas as pd
			
 
				+
			
 
				+from openhands.core.logger import openhands_logger as logger
			
 
				+
			
 
				+
			
 
				+def verify_instance_costs(row: pd.Series) -> float:
			
 
				+    """
			
 
				+    Verifies that the accumulated_cost matches the sum of individual costs in metrics.
			
 
				+    Also checks for duplicate consecutive costs which might indicate buggy counting.
			
 
				+    If the consecutive costs are identical, the file is affected by this bug:
			
 
				+    https://github.com/All-Hands-AI/OpenHands/issues/5383
			
 
				+
			
 
				+    Args:
			
 
				+        row: DataFrame row containing instance data with metrics
			
 
				+    Returns:
			
 
				+        float: The verified total cost for this instance (corrected if needed)
			
 
				+    """
			
 
				+    try:
			
 
				+        metrics = row.get('metrics')
			
 
				+        if not metrics:
			
 
				+            logger.warning(f"Instance {row['instance_id']}: No metrics found")
			
 
				+            return 0.0
			
 
				+
			
 
				+        accumulated = metrics.get('accumulated_cost')
			
 
				+        costs = metrics.get('costs', [])
			
 
				+
			
 
				+        if accumulated is None:
			
 
				+            logger.warning(
			
 
				+                f"Instance {row['instance_id']}: No accumulated_cost in metrics"
			
 
				+            )
			
 
				+            return 0.0
			
 
				+
			
 
				+        # Check for duplicate consecutive costs and systematic even-odd pairs
			
 
				+        has_duplicate = False
			
 
				+        all_pairs_match = True
			
 
				+
			
 
				+        # Check each even-odd pair (0-1, 2-3, etc.)
			
 
				+        for i in range(0, len(costs) - 1, 2):
			
 
				+            if abs(costs[i]['cost'] - costs[i + 1]['cost']) < 1e-6:
			
 
				+                has_duplicate = True
			
 
				+                logger.debug(
			
 
				+                    f"Instance {row['instance_id']}: Possible buggy double-counting detected! "
			
 
				+                    f"Steps {i} and {i+1} have identical costs: {costs[i]['cost']:.2f}"
			
 
				+                )
			
 
				+            else:
			
 
				+                all_pairs_match = False
			
 
				+                break
			
 
				+
			
 
				+        # Calculate total cost, accounting for buggy double counting if detected
			
 
				+        if len(costs) >= 2 and has_duplicate and all_pairs_match:
			
 
				+            paired_steps_cost = sum(
			
 
				+                cost_entry['cost']
			
 
				+                for cost_entry in costs[: -1 if len(costs) % 2 else None]
			
 
				+            )
			
 
				+            real_paired_cost = paired_steps_cost / 2
			
 
				+
			
 
				+            unpaired_cost = costs[-1]['cost'] if len(costs) % 2 else 0
			
 
				+            total_cost = real_paired_cost + unpaired_cost
			
 
				+
			
 
				+        else:
			
 
				+            total_cost = sum(cost_entry['cost'] for cost_entry in costs)
			
 
				+
			
 
				+        if not abs(total_cost - accumulated) < 1e-6:
			
 
				+            logger.warning(
			
 
				+                f"Instance {row['instance_id']}: Cost mismatch: "
			
 
				+                f"accumulated: {accumulated:.2f}, sum of costs: {total_cost:.2f}, "
			
 
				+            )
			
 
				+
			
 
				+        return total_cost
			
 
				+
			
 
				+    except Exception as e:
			
 
				+        logger.error(
			
 
				+            f"Error verifying costs for instance {row.get('instance_id', 'UNKNOWN')}: {e}"
			
 
				+        )
			
 
				+        return 0.0
			
 
				+
			
 
				+
			
 
				+def main():
			
 
				+    parser = argparse.ArgumentParser(
			
 
				+        description='Verify costs in SWE-bench output file'
			
 
				+    )
			
 
				+    parser.add_argument(
			
 
				+        'input_filepath', type=str, help='Path to the output.jsonl file'
			
 
				+    )
			
 
				+    args = parser.parse_args()
			
 
				+
			
 
				+    try:
			
 
				+        # Load and verify the JSONL file
			
 
				+        df = pd.read_json(args.input_filepath, lines=True)
			
 
				+        logger.info(f'Loaded {len(df)} instances from {args.input_filepath}')
			
 
				+
			
 
				+        # Verify costs for each instance and sum up total
			
 
				+        total_cost = df.apply(verify_instance_costs, axis=1).sum()
			
 
				+        logger.info(f'Total verified cost across all instances: ${total_cost:.2f}')
			
 
				+
			
 
				+    except Exception as e:
			
 
				+        logger.error(f'Failed to process file: {e}')
			
 
				+        raise
			
 
				+
			
 
				+
			
 
				+if __name__ == '__main__':
			
 
				+    main()