1 year ago · 990f277132
--- a/evaluation/benchmarks/swe_bench/scripts/eval/summarize_outputs.py
+++ b/evaluation/benchmarks/swe_bench/scripts/eval/summarize_outputs.py
@@ -1,8 +1,12 @@
 
															 #!/usr/bin/env python3
														
 
															 import argparse
														
 
															+import glob
														
 
															 import json
														
 
															+import os
														
 
															 from collections import Counter
														
 
															+import pandas as pd
														
 
															+
														
 
															 from openhands.events.serialization import event_from_dict
														
 
															 from openhands.events.utils import get_pairs_from_events
														
@@ -10,25 +14,21 @@ ERROR_KEYWORDS = [
 
															     'Agent encountered an error while processing the last action',
														
 
															     'APIError',
														
 
															     'Action execution failed',
														
 
															+    'litellm.Timeout: APITimeoutError',
														
 
															 ]
														
 
															-if __name__ == '__main__':
														
 
															-    parser = argparse.ArgumentParser()
														
 
															-    parser.add_argument('output_file', type=str, help='The file to summarize')
														
 
															-    args = parser.parse_args()
														
 
															-    with open(args.output_file, 'r') as file:
														
 
															+def process_file(file_path):
														
 
															+    with open(file_path, 'r') as file:
														
 
															         lines = file.readlines()
														
 
															     num_lines = len(lines)
														
 
															     num_error_lines = 0
														
 
															     num_agent_stuck_in_loop = 0
														
 
															-
														
 
															     num_resolved = 0
														
 
															     num_empty_patch = 0
														
 
															-
														
 
															+    num_unfinished_runs = 0
														
 
															     error_counter = Counter()
														
 
															-
														
 
															     main_agent_cost = []
														
 
															     editor_cost = []
														
 
															     num_turns = []
														
@@ -36,6 +36,11 @@ if __name__ == '__main__':
 
															     for line in lines:
														
 
															         _d = json.loads(line)
														
 
															+        if 'metrics' not in _d or _d['metrics'] is None:
														
 
															+            # no metrics recorded for this instance: count it as an unfinished run
														
 
															+            num_unfinished_runs += 1
														
 
															+            continue
														
 
															+
														
 
															         # Cost
														
 
															         costs = _d['metrics'].get('costs', [])
														
 
															         _cur_main_agent_cost = 0
														
@@ -89,30 +94,180 @@ if __name__ == '__main__':
 
															                 num_error_lines += 1
														
 
															                 break
														
 
															-    # print the error counter (with percentage)
														
 
															-    print(
														
 
															-        f'Number of resolved: {num_resolved} / {num_lines} ({num_resolved / num_lines * 100:.2f}%)'
														
 
															-    )
														
 
															-    print(
														
 
															-        f'Number of empty patch: {num_empty_patch} / {num_lines} ({num_empty_patch / num_lines * 100:.2f}%)'
														
 
															-    )
														
 
															-    print(
														
 
															-        f'Number of error lines: {num_error_lines} / {num_lines} ({num_error_lines / num_lines * 100:.2f}%)'
														
 
															+    return {
														
 
															+        'file_path': file_path,
														
 
															+        'total_instances': num_lines,
														
 
															+        'resolved': {
														
 
															+            'count': num_resolved,
														
 
															+            'percentage': (num_resolved / num_lines * 100) if num_lines > 0 else 0,
														
 
															+        },
														
 
															+        'empty_patches': {
														
 
															+            'count': num_empty_patch,
														
 
															+            'percentage': (num_empty_patch / num_lines * 100) if num_lines > 0 else 0,
														
 
															+        },
														
 
															+        'unfinished_runs': {
														
 
															+            'count': num_unfinished_runs,
														
 
															+            'percentage': (num_unfinished_runs / num_lines * 100)
														
 
															+            if num_lines > 0
														
 
															+            else 0,
														
 
															+        },
														
 
															+        'errors': {
														
 
															+            'total': num_error_lines,
														
 
															+            'percentage': (num_error_lines / num_lines * 100) if num_lines > 0 else 0,
														
 
															+            'stuck_in_loop': {
														
 
															+                'count': num_agent_stuck_in_loop,
														
 
															+                'percentage': (num_agent_stuck_in_loop / num_lines * 100)
														
 
															+                if num_lines > 0
														
 
															+                else 0,
														
 
															+            },
														
 
															+            'breakdown': {
														
 
															+                str(error): {
														
 
															+                    'count': count,
														
 
															+                    'percentage': (count / num_lines * 100) if num_lines > 0 else 0,
														
 
															+                }
														
 
															+                for error, count in error_counter.items()
														
 
															+            },
														
 
															+        },
														
 
															+        'statistics': {
														
 
															+            'avg_turns': sum(num_turns) / num_lines if num_lines > 0 else 0,
														
 
															+            'costs': {
														
 
															+                'main_agent': sum(main_agent_cost) / num_lines if num_lines > 0 else 0,
														
 
															+                'editor': sum(editor_cost) / num_lines if num_lines > 0 else 0,
														
 
															+                'total': (sum(main_agent_cost) + sum(editor_cost)) / num_lines
														
 
															+                if num_lines > 0
														
 
															+                else 0,
														
 
															+            },
														
 
															+        },
														
 
															+    }
														
 
															+
														
 
															+
														
 
															+def aggregate_directory(input_path) -> pd.DataFrame:
														
 
															+    # Process all output.jsonl files in subdirectories
														
 
															+    pattern = os.path.join(input_path, '**/output.jsonl')
														
 
															+    files = glob.glob(pattern, recursive=True)
														
 
															+    print(f'Processing {len(files)} files from directory {input_path}')
														
 
															+
														
 
															+    # Process each file and collect results; files that fail are reported and skipped
														
 
															+    results = []
														
 
															+    for file_path in files:
														
 
															+        try:
														
 
															+            result = process_file(file_path)
														
 
															+            results.append(result)
														
 
															+        except Exception as e:
														
 
															+            print(f'Error processing {file_path}: {str(e)}')
														
 
															+            import traceback
														
 
															+
														
 
															+            traceback.print_exc()
														
 
															+            continue
														
 
															+
														
 
															+    # Convert results to pandas DataFrame and sort by resolve rate
														
 
															+    df = pd.DataFrame(results)
														
 
															+
														
 
															+    # Extract directory name from file path
														
 
															+    df['directory'] = df['file_path'].apply(
														
 
															+        lambda x: os.path.basename(os.path.dirname(x))
														
 
															     )
														
 
															-    print(
														
 
															-        f'Number of agent stuck in loop: {num_agent_stuck_in_loop} / {num_lines} ({num_agent_stuck_in_loop / num_lines * 100:.2f}%)'
														
 
															+
														
 
															+    df['resolve_rate'] = df['resolved'].apply(lambda x: x['percentage'])
														
 
															+    df['empty_patch_rate'] = df['empty_patches'].apply(lambda x: x['percentage'])
														
 
															+    df['unfinished_rate'] = df['unfinished_runs'].apply(lambda x: x['percentage'])
														
 
															+    df['avg_turns'] = df['statistics'].apply(lambda x: x['avg_turns'])
														
 
															+    df['error_rate'] = df['errors'].apply(lambda x: x['percentage'])
														
 
															+    df['avg_cost'] = df['statistics'].apply(lambda x: x['costs']['total'])
														
 
															+
														
 
															+    df = df.sort_values('resolve_rate', ascending=False)
														
 
															+
														
 
															+    return df
														
 
															+
														
 
															+
														
 
															+if __name__ == '__main__':
														
 
															+    parser = argparse.ArgumentParser()
														
 
															+    parser.add_argument(
														
 
															+        'input_path', type=str, help='The file or directory to summarize'
														
 
															     )
														
 
															-    assert len(num_turns) == num_lines
														
 
															-    assert len(main_agent_cost) == num_lines
														
 
															-    assert len(editor_cost) == num_lines
														
 
															-    print('## Statistics')
														
 
															-    print(f'Avg. num of turns per instance: {sum(num_turns) / num_lines:.2f}')
														
 
															-    print(f'Avg. agent cost per instance: {sum(main_agent_cost) / num_lines:.2f} USD')
														
 
															-    print(f'Avg. editor cost per instance: {sum(editor_cost) / num_lines:.2f} USD')
														
 
															-    print(
														
 
															-        f'Avg. total cost per instance: {(sum(main_agent_cost) + sum(editor_cost)) / num_lines:.2f} USD'
														
 
															+    parser.add_argument(
														
 
															+        '--output',
														
 
															+        type=str,
														
 
															+        help='Output JSONL file for results',
														
 
															+        default='summary_results.jsonl',
														
 
															     )
														
 
															+    args = parser.parse_args()
														
 
															+
														
 
															+    if os.path.isdir(args.input_path):
														
 
															+        df = aggregate_directory(args.input_path)
														
 
															+        # Create the summary string
														
 
															+        columns = [
														
 
															+            'directory',
														
 
															+            'resolve_rate',
														
 
															+            'empty_patch_rate',
														
 
															+            'unfinished_rate',
														
 
															+            'error_rate',
														
 
															+            'avg_turns',
														
 
															+            'avg_cost',
														
 
															+            'total_instances',
														
 
															+        ]
														
 
															+        summary_str = df[columns].to_string(
														
 
															+            float_format=lambda x: '{:.2f}'.format(x),
														
 
															+            formatters={
														
 
															+                'directory': lambda x: x[:90]
														
 
															+            },  # Truncate directory names to 90 chars
														
 
															+            index=False,
														
 
															+        )
														
 
															+
														
 
															+        # Print to console
														
 
															+        print('\nResults summary (sorted by resolve rate):')
														
 
															+        print(summary_str)
														
 
															+
														
 
															+        # Save to text file
														
 
															+        txt_output = args.output.rsplit('.', 1)[0] + '.txt'
														
 
															+        with open(txt_output, 'w') as f:
														
 
															+            f.write('Results summary (sorted by resolve rate):\n')
														
 
															+            f.write(summary_str)
														
 
															+
														
 
															+        # Save the full results as JSONL and the summary columns as CSV
														
 
															+        df.to_json(args.output, lines=True, orient='records')
														
 
															+        df[columns].to_csv(args.output.rsplit('.', 1)[0] + '.csv', index=False)
														
 
															+    else:
														
 
															+        # Process single file with detailed output
														
 
															+        results = []
														
 
															+        try:
														
 
															+            result = process_file(args.input_path)
														
 
															+            results.append(result)
														
 
															+
														
 
															+            # Print detailed results for single file
														
 
															+            print(f'\nResults for {args.input_path}:')
														
 
															+            print(
														
 
															+                f"Number of resolved: {result['resolved']['count']} / {result['total_instances']} ({result['resolved']['percentage']:.2f}%)"
														
 
															+            )
														
 
															+            print(
														
 
															+                f"Number of empty patch: {result['empty_patches']['count']} / {result['total_instances']} ({result['empty_patches']['percentage']:.2f}%)"
														
 
															+            )
														
 
															+            print(
														
 
															+                f"Number of error lines: {result['errors']['total']} / {result['total_instances']} ({result['errors']['percentage']:.2f}%)"
														
 
															+            )
														
 
															+            print(
														
 
															+                f"Number of agent stuck in loop: {result['errors']['stuck_in_loop']['count']} / {result['total_instances']} ({result['errors']['stuck_in_loop']['percentage']:.2f}%)"
														
 
															+            )
														
 
															+            print(
														
 
															+                f"Number of unfinished runs: {result['unfinished_runs']['count']} / {result['total_instances']} ({result['unfinished_runs']['percentage']:.2f}%)"
														
 
															+            )
														
 
															+            print('## Statistics')
														
 
															+            print(
														
 
															+                f"Avg. num of turns per instance: {result['statistics']['avg_turns']:.2f}"
														
 
															+            )
														
 
															+            print(
														
 
															+                f"Avg. agent cost per instance: {result['statistics']['costs']['main_agent']:.2f} USD"
														
 
															+            )
														
 
															+            print(
														
 
															+                f"Avg. editor cost per instance: {result['statistics']['costs']['editor']:.2f} USD"
														
 
															+            )
														
 
															+            print(
														
 
															+                f"Avg. total cost per instance: {result['statistics']['costs']['total']:.2f} USD"
														
 
															+            )
														
 
															+
														
 
															+            print('## Detailed error breakdown:')
														
 
															+            for error, data in result['errors']['breakdown'].items():
														
 
															+                print(f"{error}: {data['count']} ({data['percentage']:.2f}%)")
														
 
															-    print('## Detailed error breakdown:')
														
 
															-    for error, count in error_counter.items():
														
 
															-        print(f'{error}: {count} ({count / num_lines * 100:.2f}%)')
														
 
															+        except Exception as e:
														
 
															+            print(f'Error processing {args.input_path}: {str(e)}')
														
--- a/openhands/llm/fn_call_converter.py
+++ b/openhands/llm/fn_call_converter.py
@@ -431,7 +431,7 @@ def convert_fncall_messages_to_non_fncall_messages(
 
															                     tool_content = convert_tool_call_to_string(message['tool_calls'][0])
														
 
															                 except FunctionCallConversionError as e:
														
 
															                     raise FunctionCallConversionError(
														
 
															-                        f'Failed to convert tool call to string. Raw messages: {json.dumps(messages, indent=2)}'
														
 
															+                        f'Failed to convert tool call to string.\nCurrent tool call: {message["tool_calls"][0]}.\nRaw messages: {json.dumps(messages, indent=2)}'
														
 
															                     ) from e
														
 
															                 if isinstance(content, str):
														
 
															                     content += '\n\n' + tool_content
														
--- a/openhands/runtime/impl/remote/remote_runtime.py
+++ b/openhands/runtime/impl/remote/remote_runtime.py
@@ -336,13 +336,13 @@ class RemoteRuntime(Runtime):
 
															         assert 'runtime_id' in runtime_data
														
 
															         assert runtime_data['runtime_id'] == self.runtime_id
														
 
															         assert 'pod_status' in runtime_data
														
 
															-        pod_status = runtime_data['pod_status']
														
 
															+        pod_status = runtime_data['pod_status'].lower()
														
 
															         self.log('debug', f'Pod status: {pod_status}')
														
 
															         # FIXME: We should fix it at the backend of /start endpoint, make sure
														
 
															         # the pod is created before returning the response.
														
 
															         # Retry a period of time to give the cluster time to start the pod
														
 
															-        if pod_status == 'Ready':
														
 
															+        if pod_status == 'ready':
														
 
															             try:
														
 
															                 with self._send_request(
														
 
															                     'GET',
														
@@ -358,14 +358,14 @@ class RemoteRuntime(Runtime):
 
															                 )
														
 
															             return
														
 
															         elif (
														
 
															-            pod_status == 'Not Found'
														
 
															-            or pod_status == 'Pending'
														
 
															-            or pod_status == 'Running'
														
 
															+            pod_status == 'not found'
														
 
															+            or pod_status == 'pending'
														
 
															+            or pod_status == 'running'
														
 
															         ):  # nb: Running is not yet Ready
														
 
															             raise RuntimeNotReadyError(
														
 
															                 f'Runtime (ID={self.runtime_id}) is not yet ready. Status: {pod_status}'
														
 
															             )
														
 
															-        elif pod_status in ('Failed', 'Unknown'):
														
 
															+        elif pod_status in ('failed', 'unknown', 'crashloopbackoff'):
														
 
															             # clean up the runtime
														
 
															             self.close()
														
 
															             raise RuntimeError(
														
--- a/openhands/server/session/manager.py
+++ b/openhands/server/session/manager.py
@@ -63,7 +63,7 @@ class SessionManager:
 
															                     await self._process_message(message)
														
 
															             except asyncio.CancelledError:
														
 
															                 return
														
 
															-            except:
														
 
															+            except Exception:
														
 
															                 try:
														
 
															                     asyncio.get_running_loop()
														
 
															                     logger.warning(