Selaa lähdekoodia

misc: Support folder-level exp analysis for SWE-Bench `summarize_outputs.py`; Handle CrashLoopBackoff for RemoteRuntime (#5385)

Xingyao Wang 1 vuosi sitten
vanhempi
sitoutus
990f277132

+ 186 - 31
evaluation/benchmarks/swe_bench/scripts/eval/summarize_outputs.py

@@ -1,8 +1,12 @@
 #!/usr/bin/env python3
 import argparse
+import glob
 import json
+import os
 from collections import Counter
 
+import pandas as pd
+
 from openhands.events.serialization import event_from_dict
 from openhands.events.utils import get_pairs_from_events
 
@@ -10,25 +14,21 @@ ERROR_KEYWORDS = [
     'Agent encountered an error while processing the last action',
     'APIError',
     'Action execution failed',
+    'litellm.Timeout: APITimeoutError',
 ]
 
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument('output_file', type=str, help='The file to summarize')
-    args = parser.parse_args()
 
-    with open(args.output_file, 'r') as file:
+def process_file(file_path):
+    with open(file_path, 'r') as file:
         lines = file.readlines()
 
     num_lines = len(lines)
     num_error_lines = 0
     num_agent_stuck_in_loop = 0
-
     num_resolved = 0
     num_empty_patch = 0
-
+    num_unfinished_runs = 0
     error_counter = Counter()
-
     main_agent_cost = []
     editor_cost = []
     num_turns = []
@@ -36,6 +36,11 @@ if __name__ == '__main__':
     for line in lines:
         _d = json.loads(line)
 
+        if 'metrics' not in _d or _d['metrics'] is None:
+            # no metrics recorded — this run did not finish; count it as unfinished
+            num_unfinished_runs += 1
+            continue
+
         # Cost
         costs = _d['metrics'].get('costs', [])
         _cur_main_agent_cost = 0
@@ -89,30 +94,180 @@ if __name__ == '__main__':
                 num_error_lines += 1
                 break
 
-    # print the error counter (with percentage)
-    print(
-        f'Number of resolved: {num_resolved} / {num_lines} ({num_resolved / num_lines * 100:.2f}%)'
-    )
-    print(
-        f'Number of empty patch: {num_empty_patch} / {num_lines} ({num_empty_patch / num_lines * 100:.2f}%)'
-    )
-    print(
-        f'Number of error lines: {num_error_lines} / {num_lines} ({num_error_lines / num_lines * 100:.2f}%)'
+    return {
+        'file_path': file_path,
+        'total_instances': num_lines,
+        'resolved': {
+            'count': num_resolved,
+            'percentage': (num_resolved / num_lines * 100) if num_lines > 0 else 0,
+        },
+        'empty_patches': {
+            'count': num_empty_patch,
+            'percentage': (num_empty_patch / num_lines * 100) if num_lines > 0 else 0,
+        },
+        'unfinished_runs': {
+            'count': num_unfinished_runs,
+            'percentage': (num_unfinished_runs / num_lines * 100)
+            if num_lines > 0
+            else 0,
+        },
+        'errors': {
+            'total': num_error_lines,
+            'percentage': (num_error_lines / num_lines * 100) if num_lines > 0 else 0,
+            'stuck_in_loop': {
+                'count': num_agent_stuck_in_loop,
+                'percentage': (num_agent_stuck_in_loop / num_lines * 100)
+                if num_lines > 0
+                else 0,
+            },
+            'breakdown': {
+                str(error): {
+                    'count': count,
+                    'percentage': (count / num_lines * 100) if num_lines > 0 else 0,
+                }
+                for error, count in error_counter.items()
+            },
+        },
+        'statistics': {
+            'avg_turns': sum(num_turns) / num_lines if num_lines > 0 else 0,
+            'costs': {
+                'main_agent': sum(main_agent_cost) / num_lines if num_lines > 0 else 0,
+                'editor': sum(editor_cost) / num_lines if num_lines > 0 else 0,
+                'total': (sum(main_agent_cost) + sum(editor_cost)) / num_lines
+                if num_lines > 0
+                else 0,
+            },
+        },
+    }
+
+
+def aggregate_directory(input_path) -> pd.DataFrame:
+    # Process all output.jsonl files in subdirectories
+    pattern = os.path.join(input_path, '**/output.jsonl')
+    files = glob.glob(pattern, recursive=True)
+    print(f'Processing {len(files)} files from directory {input_path}')
+
+    # Process each file, collecting results; failures are reported and skipped
+    results = []
+    for file_path in files:
+        try:
+            result = process_file(file_path)
+            results.append(result)
+        except Exception as e:
+            print(f'Error processing {file_path}: {str(e)}')
+            import traceback
+
+            traceback.print_exc()
+            continue
+
+    # Convert results to pandas DataFrame and sort by resolve rate
+    df = pd.DataFrame(results)
+
+    # Extract directory name from file path
+    df['directory'] = df['file_path'].apply(
+        lambda x: os.path.basename(os.path.dirname(x))
     )
-    print(
-        f'Number of agent stuck in loop: {num_agent_stuck_in_loop} / {num_lines} ({num_agent_stuck_in_loop / num_lines * 100:.2f}%)'
+
+    df['resolve_rate'] = df['resolved'].apply(lambda x: x['percentage'])
+    df['empty_patch_rate'] = df['empty_patches'].apply(lambda x: x['percentage'])
+    df['unfinished_rate'] = df['unfinished_runs'].apply(lambda x: x['percentage'])
+    df['avg_turns'] = df['statistics'].apply(lambda x: x['avg_turns'])
+    df['error_rate'] = df['errors'].apply(lambda x: x['percentage'])
+    df['avg_cost'] = df['statistics'].apply(lambda x: x['costs']['total'])
+
+    df = df.sort_values('resolve_rate', ascending=False)
+
+    return df
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        'input_path', type=str, help='The file or directory to summarize'
     )
-    assert len(num_turns) == num_lines
-    assert len(main_agent_cost) == num_lines
-    assert len(editor_cost) == num_lines
-    print('## Statistics')
-    print(f'Avg. num of turns per instance: {sum(num_turns) / num_lines:.2f}')
-    print(f'Avg. agent cost per instance: {sum(main_agent_cost) / num_lines:.2f} USD')
-    print(f'Avg. editor cost per instance: {sum(editor_cost) / num_lines:.2f} USD')
-    print(
-        f'Avg. total cost per instance: {(sum(main_agent_cost) + sum(editor_cost)) / num_lines:.2f} USD'
+    parser.add_argument(
+        '--output',
+        type=str,
+        help='Output JSONL file for results',
+        default='summary_results.jsonl',
     )
+    args = parser.parse_args()
+
+    if os.path.isdir(args.input_path):
+        df = aggregate_directory(args.input_path)
+        # Create the summary string
+        columns = [
+            'directory',
+            'resolve_rate',
+            'empty_patch_rate',
+            'unfinished_rate',
+            'error_rate',
+            'avg_turns',
+            'avg_cost',
+            'total_instances',
+        ]
+        summary_str = df[columns].to_string(
+            float_format=lambda x: '{:.2f}'.format(x),
+            formatters={
+                'directory': lambda x: x[:90]
+            },  # Truncate directory names to 90 chars
+            index=False,
+        )
+
+        # Print to console
+        print('\nResults summary (sorted by resolve rate):')
+        print(summary_str)
+
+        # Save to text file
+        txt_output = args.output.rsplit('.', 1)[0] + '.txt'
+        with open(txt_output, 'w') as f:
+            f.write('Results summary (sorted by resolve rate):\n')
+            f.write(summary_str)
+
+        # Save
+        df.to_json(args.output, lines=True, orient='records')
+        df[columns].to_csv(args.output.rsplit('.', 1)[0] + '.csv', index=False)
+    else:
+        # Process single file with detailed output
+        results = []
+        try:
+            result = process_file(args.input_path)
+            results.append(result)
+
+            # Print detailed results for single file
+            print(f'\nResults for {args.input_path}:')
+            print(
+                f"Number of resolved: {result['resolved']['count']} / {result['total_instances']} ({result['resolved']['percentage']:.2f}%)"
+            )
+            print(
+                f"Number of empty patch: {result['empty_patches']['count']} / {result['total_instances']} ({result['empty_patches']['percentage']:.2f}%)"
+            )
+            print(
+                f"Number of error lines: {result['errors']['total']} / {result['total_instances']} ({result['errors']['percentage']:.2f}%)"
+            )
+            print(
+                f"Number of agent stuck in loop: {result['errors']['stuck_in_loop']['count']} / {result['total_instances']} ({result['errors']['stuck_in_loop']['percentage']:.2f}%)"
+            )
+            print(
+                f"Number of unfinished runs: {result['unfinished_runs']['count']} / {result['total_instances']} ({result['unfinished_runs']['percentage']:.2f}%)"
+            )
+            print('## Statistics')
+            print(
+                f"Avg. num of turns per instance: {result['statistics']['avg_turns']:.2f}"
+            )
+            print(
+                f"Avg. agent cost per instance: {result['statistics']['costs']['main_agent']:.2f} USD"
+            )
+            print(
+                f"Avg. editor cost per instance: {result['statistics']['costs']['editor']:.2f} USD"
+            )
+            print(
+                f"Avg. total cost per instance: {result['statistics']['costs']['total']:.2f} USD"
+            )
+
+            print('## Detailed error breakdown:')
+            for error, data in result['errors']['breakdown'].items():
+                print(f"{error}: {data['count']} ({data['percentage']:.2f}%)")
 
-    print('## Detailed error breakdown:')
-    for error, count in error_counter.items():
-        print(f'{error}: {count} ({count / num_lines * 100:.2f}%)')
+        except Exception as e:
+            print(f'Error processing {args.input_path}: {str(e)}')

+ 1 - 1
openhands/llm/fn_call_converter.py

@@ -431,7 +431,7 @@ def convert_fncall_messages_to_non_fncall_messages(
                     tool_content = convert_tool_call_to_string(message['tool_calls'][0])
                 except FunctionCallConversionError as e:
                     raise FunctionCallConversionError(
-                        f'Failed to convert tool call to string. Raw messages: {json.dumps(messages, indent=2)}'
+                        f'Failed to convert tool call to string.\nCurrent tool call: {message["tool_calls"][0]}.\nRaw messages: {json.dumps(messages, indent=2)}'
                     ) from e
                 if isinstance(content, str):
                     content += '\n\n' + tool_content

+ 6 - 6
openhands/runtime/impl/remote/remote_runtime.py

@@ -336,13 +336,13 @@ class RemoteRuntime(Runtime):
         assert 'runtime_id' in runtime_data
         assert runtime_data['runtime_id'] == self.runtime_id
         assert 'pod_status' in runtime_data
-        pod_status = runtime_data['pod_status']
+        pod_status = runtime_data['pod_status'].lower()
         self.log('debug', f'Pod status: {pod_status}')
 
         # FIXME: We should fix it at the backend of /start endpoint, make sure
         # the pod is created before returning the response.
         # Retry a period of time to give the cluster time to start the pod
-        if pod_status == 'Ready':
+        if pod_status == 'ready':
             try:
                 with self._send_request(
                     'GET',
@@ -358,14 +358,14 @@ class RemoteRuntime(Runtime):
                 )
             return
         elif (
-            pod_status == 'Not Found'
-            or pod_status == 'Pending'
-            or pod_status == 'Running'
+            pod_status == 'not found'
+            or pod_status == 'pending'
+            or pod_status == 'running'
         ):  # nb: Running is not yet Ready
             raise RuntimeNotReadyError(
                 f'Runtime (ID={self.runtime_id}) is not yet ready. Status: {pod_status}'
             )
-        elif pod_status in ('Failed', 'Unknown'):
+        elif pod_status in ('failed', 'unknown', 'crashloopbackoff'):
             # clean up the runtime
             self.close()
             raise RuntimeError(

+ 1 - 1
openhands/server/session/manager.py

@@ -63,7 +63,7 @@ class SessionManager:
                     await self._process_message(message)
             except asyncio.CancelledError:
                 return
-            except:
+            except Exception:
                 try:
                     asyncio.get_running_loop()
                     logger.warning(