Selaa lähdekoodia

misc: Support folder-level exp analysis for SWE-Bench `summarize_outputs.py`; Handle CrashLoopBackoff for RemoteRuntime (#5385)

Xingyao Wang 1 vuosi sitten
vanhempi
sitoutus
990f277132

+ 186 - 31
evaluation/benchmarks/swe_bench/scripts/eval/summarize_outputs.py

@@ -1,8 +1,12 @@
 #!/usr/bin/env python3
 import argparse
+import glob
 import json
+import os
 from collections import Counter
 
+import pandas as pd
+
 from openhands.events.serialization import event_from_dict
 from openhands.events.utils import get_pairs_from_events
 
@@ -10,25 +14,21 @@ ERROR_KEYWORDS = [
     'Agent encountered an error while processing the last action',
     'APIError',
     'Action execution failed',
+    'litellm.Timeout: APITimeoutError',
 ]
 
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument('output_file', type=str, help='The file to summarize')
-    args = parser.parse_args()
 
-    with open(args.output_file, 'r') as file:
+def process_file(file_path):
+    with open(file_path, 'r') as file:
         lines = file.readlines()
 
     num_lines = len(lines)
     num_error_lines = 0
     num_agent_stuck_in_loop = 0
-
     num_resolved = 0
     num_empty_patch = 0
-
+    num_unfinished_runs = 0
     error_counter = Counter()
-
     main_agent_cost = []
     editor_cost = []
     num_turns = []
@@ -36,6 +36,11 @@ if __name__ == '__main__':
     for line in lines:
         _d = json.loads(line)
 
+        if 'metrics' not in _d or _d['metrics'] is None:
+            # no metrics recorded — this run did not finish; count it as unfinished
+            num_unfinished_runs += 1
+            continue
+
         # Cost
         costs = _d['metrics'].get('costs', [])
         _cur_main_agent_cost = 0
@@ -89,30 +94,180 @@ if __name__ == '__main__':
                 num_error_lines += 1
                 break
 
-    # print the error counter (with percentage)
-    print(
-        f'Number of resolved: {num_resolved} / {num_lines} ({num_resolved / num_lines * 100:.2f}%)'
-    )
-    print(
-        f'Number of empty patch: {num_empty_patch} / {num_lines} ({num_empty_patch / num_lines * 100:.2f}%)'
-    )
-    print(
-        f'Number of error lines: {num_error_lines} / {num_lines} ({num_error_lines / num_lines * 100:.2f}%)'
+    return {
+        'file_path': file_path,
+        'total_instances': num_lines,
+        'resolved': {
+            'count': num_resolved,
+            'percentage': (num_resolved / num_lines * 100) if num_lines > 0 else 0,
+        },
+        'empty_patches': {
+            'count': num_empty_patch,
+            'percentage': (num_empty_patch / num_lines * 100) if num_lines > 0 else 0,
+        },
+        'unfinished_runs': {
+            'count': num_unfinished_runs,
+            'percentage': (num_unfinished_runs / num_lines * 100)
+            if num_lines > 0
+            else 0,
+        },
+        'errors': {
+            'total': num_error_lines,
+            'percentage': (num_error_lines / num_lines * 100) if num_lines > 0 else 0,
+            'stuck_in_loop': {
+                'count': num_agent_stuck_in_loop,
+                'percentage': (num_agent_stuck_in_loop / num_lines * 100)
+                if num_lines > 0
+                else 0,
+            },
+            'breakdown': {
+                str(error): {
+                    'count': count,
+                    'percentage': (count / num_lines * 100) if num_lines > 0 else 0,
+                }
+                for error, count in error_counter.items()
+            },
+        },
+        'statistics': {
+            'avg_turns': sum(num_turns) / num_lines if num_lines > 0 else 0,
+            'costs': {
+                'main_agent': sum(main_agent_cost) / num_lines if num_lines > 0 else 0,
+                'editor': sum(editor_cost) / num_lines if num_lines > 0 else 0,
+                'total': (sum(main_agent_cost) + sum(editor_cost)) / num_lines
+                if num_lines > 0
+                else 0,
+            },
+        },
+    }
+
+
+def aggregate_directory(input_path) -> pd.DataFrame:
+    # Process all output.jsonl files in subdirectories
+    pattern = os.path.join(input_path, '**/output.jsonl')
+    files = glob.glob(pattern, recursive=True)
+    print(f'Processing {len(files)} files from directory {input_path}')
+
+    # Process each file, collecting results; failures are reported and skipped
+    results = []
+    for file_path in files:
+        try:
+            result = process_file(file_path)
+            results.append(result)
+        except Exception as e:
+            print(f'Error processing {file_path}: {str(e)}')
+            import traceback
+
+            traceback.print_exc()
+            continue
+
+    # Convert results to pandas DataFrame and sort by resolve rate
+    df = pd.DataFrame(results)
+
+    # Extract directory name from file path
+    df['directory'] = df['file_path'].apply(
+        lambda x: os.path.basename(os.path.dirname(x))
     )
-    print(
-        f'Number of agent stuck in loop: {num_agent_stuck_in_loop} / {num_lines} ({num_agent_stuck_in_loop / num_lines * 100:.2f}%)'
+
+    df['resolve_rate'] = df['resolved'].apply(lambda x: x['percentage'])
+    df['empty_patch_rate'] = df['empty_patches'].apply(lambda x: x['percentage'])
+    df['unfinished_rate'] = df['unfinished_runs'].apply(lambda x: x['percentage'])
+    df['avg_turns'] = df['statistics'].apply(lambda x: x['avg_turns'])
+    df['error_rate'] = df['errors'].apply(lambda x: x['percentage'])
+    df['avg_cost'] = df['statistics'].apply(lambda x: x['costs']['total'])
+
+    df = df.sort_values('resolve_rate', ascending=False)
+
+    return df
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        'input_path', type=str, help='The file or directory to summarize'
     )
-    assert len(num_turns) == num_lines
-    assert len(main_agent_cost) == num_lines
-    assert len(editor_cost) == num_lines
-    print('## Statistics')
-    print(f'Avg. num of turns per instance: {sum(num_turns) / num_lines:.2f}')
-    print(f'Avg. agent cost per instance: {sum(main_agent_cost) / num_lines:.2f} USD')
-    print(f'Avg. editor cost per instance: {sum(editor_cost) / num_lines:.2f} USD')
-    print(
-        f'Avg. total cost per instance: {(sum(main_agent_cost) + sum(editor_cost)) / num_lines:.2f} USD'
+    parser.add_argument(
+        '--output',
+        type=str,
+        help='Output JSONL file for results',
+        default='summary_results.jsonl',
     )
+    args = parser.parse_args()
+
+    if os.path.isdir(args.input_path):
+        df = aggregate_directory(args.input_path)
+        # Create the summary string
+        columns = [
+            'directory',
+            'resolve_rate',
+            'empty_patch_rate',
+            'unfinished_rate',
+            'error_rate',
+            'avg_turns',
+            'avg_cost',
+            'total_instances',
+        ]
+        summary_str = df[columns].to_string(
+            float_format=lambda x: '{:.2f}'.format(x),
+            formatters={
+                'directory': lambda x: x[:90]
+            },  # Truncate directory names to 90 chars
+            index=False,
+        )
+
+        # Print to console
+        print('\nResults summary (sorted by resolve rate):')
+        print(summary_str)
+
+        # Save to text file
+        txt_output = args.output.rsplit('.', 1)[0] + '.txt'
+        with open(txt_output, 'w') as f:
+            f.write('Results summary (sorted by resolve rate):\n')
+            f.write(summary_str)
+
+        # Save
+        df.to_json(args.output, lines=True, orient='records')
+        df[columns].to_csv(args.output.rsplit('.', 1)[0] + '.csv', index=False)
+    else:
+        # Process single file with detailed output
+        results = []
+        try:
+            result = process_file(args.input_path)
+            results.append(result)
+
+            # Print detailed results for single file
+            print(f'\nResults for {args.input_path}:')
+            print(
+                f"Number of resolved: {result['resolved']['count']} / {result['total_instances']} ({result['resolved']['percentage']:.2f}%)"
+            )
+            print(
+                f"Number of empty patch: {result['empty_patches']['count']} / {result['total_instances']} ({result['empty_patches']['percentage']:.2f}%)"
+            )
+            print(
+                f"Number of error lines: {result['errors']['total']} / {result['total_instances']} ({result['errors']['percentage']:.2f}%)"
+            )
+            print(
+                f"Number of agent stuck in loop: {result['errors']['stuck_in_loop']['count']} / {result['total_instances']} ({result['errors']['stuck_in_loop']['percentage']:.2f}%)"
+            )
+            print(
+                f"Number of unfinished runs: {result['unfinished_runs']['count']} / {result['total_instances']} ({result['unfinished_runs']['percentage']:.2f}%)"
+            )
+            print('## Statistics')
+            print(
+                f"Avg. num of turns per instance: {result['statistics']['avg_turns']:.2f}"
+            )
+            print(
+                f"Avg. agent cost per instance: {result['statistics']['costs']['main_agent']:.2f} USD"
+            )
+            print(
+                f"Avg. editor cost per instance: {result['statistics']['costs']['editor']:.2f} USD"
+            )
+            print(
+                f"Avg. total cost per instance: {result['statistics']['costs']['total']:.2f} USD"
+            )
+
+            print('## Detailed error breakdown:')
+            for error, data in result['errors']['breakdown'].items():
+                print(f"{error}: {data['count']} ({data['percentage']:.2f}%)")
 
-    print('## Detailed error breakdown:')
-    for error, count in error_counter.items():
-        print(f'{error}: {count} ({count / num_lines * 100:.2f}%)')
+        except Exception as e:
+            print(f'Error processing {args.input_path}: {str(e)}')

+ 1 - 1
openhands/llm/fn_call_converter.py

@@ -431,7 +431,7 @@ def convert_fncall_messages_to_non_fncall_messages(
                     tool_content = convert_tool_call_to_string(message['tool_calls'][0])
                 except FunctionCallConversionError as e:
                     raise FunctionCallConversionError(
-                        f'Failed to convert tool call to string. Raw messages: {json.dumps(messages, indent=2)}'
+                        f'Failed to convert tool call to string.\nCurrent tool call: {message["tool_calls"][0]}.\nRaw messages: {json.dumps(messages, indent=2)}'
                     ) from e
                 if isinstance(content, str):
                     content += '\n\n' + tool_content

+ 6 - 6
openhands/runtime/impl/remote/remote_runtime.py

@@ -336,13 +336,13 @@ class RemoteRuntime(Runtime):
         assert 'runtime_id' in runtime_data
         assert runtime_data['runtime_id'] == self.runtime_id
         assert 'pod_status' in runtime_data
-        pod_status = runtime_data['pod_status']
+        pod_status = runtime_data['pod_status'].lower()
         self.log('debug', f'Pod status: {pod_status}')
 
         # FIXME: We should fix it at the backend of /start endpoint, make sure
         # the pod is created before returning the response.
         # Retry a period of time to give the cluster time to start the pod
-        if pod_status == 'Ready':
+        if pod_status == 'ready':
             try:
                 with self._send_request(
                     'GET',
@@ -358,14 +358,14 @@ class RemoteRuntime(Runtime):
                 )
             return
         elif (
-            pod_status == 'Not Found'
-            or pod_status == 'Pending'
-            or pod_status == 'Running'
+            pod_status == 'not found'
+            or pod_status == 'pending'
+            or pod_status == 'running'
         ):  # nb: Running is not yet Ready
             raise RuntimeNotReadyError(
                 f'Runtime (ID={self.runtime_id}) is not yet ready. Status: {pod_status}'
             )
-        elif pod_status in ('Failed', 'Unknown'):
+        elif pod_status in ('failed', 'unknown', 'crashloopbackoff'):
             # clean up the runtime
             self.close()
             raise RuntimeError(

+ 1 - 1
openhands/server/session/manager.py

@@ -63,7 +63,7 @@ class SessionManager:
                     await self._process_message(message)
             except asyncio.CancelledError:
                 return
-            except:
+            except Exception:
                 try:
                     asyncio.get_running_loop()
                     logger.warning(