1 жил өмнө · 1f23dc89b6
--- a/docs/modules/usage/how-to/evaluation-harness.md
+++ b/docs/modules/usage/how-to/evaluation-harness.md
@@ -134,9 +134,11 @@ To create an evaluation workflow for your benchmark, follow these steps:
 
				 
			
 
				 4. Create a function to process each instance:
			
 
				    ```python
			
 
				+   from openhands.utils.async_utils import call_async_from_sync
			
 
				    def process_instance(instance: pd.Series, metadata: EvalMetadata) -> EvalOutput:
			
 
				        config = get_config(instance, metadata)
			
 
				        runtime = create_runtime(config)
			
 
				+       call_async_from_sync(runtime.connect)
			
 
				        initialize_runtime(runtime, instance)
			
 
				 
			
 
				        instruction = get_instruction(instance, metadata)
			
--- a/evaluation/EDA/run_infer.py
+++ b/evaluation/EDA/run_infer.py
@@ -23,6 +23,7 @@ from openhands.core.config import (
 
				 from openhands.core.logger import openhands_logger as logger
			
 
				 from openhands.core.main import create_runtime, run_controller
			
 
				 from openhands.events.action import MessageAction
			
 
				+from openhands.utils.async_utils import call_async_from_sync
			
 
				 
			
 
				 game = None
			
 
				 
			
@@ -119,6 +120,7 @@ def process_instance(
 
				 
			
 
				     # Here's how you can run the agent (similar to the `main` function) and get the final task state
			
 
				     runtime = create_runtime(config)
			
 
				+    call_async_from_sync(runtime.connect)
			
 
				 
			
 
				     state: State | None = asyncio.run(
			
 
				         run_controller(
			
--- a/evaluation/agent_bench/run_infer.py
+++ b/evaluation/agent_bench/run_infer.py
@@ -33,6 +33,7 @@ from openhands.core.main import create_runtime, run_controller
 
				 from openhands.events.action import AgentFinishAction, CmdRunAction, MessageAction
			
 
				 from openhands.events.observation import CmdOutputObservation
			
 
				 from openhands.runtime.base import Runtime
			
 
				+from openhands.utils.async_utils import call_async_from_sync
			
 
				 
			
 
				 
			
 
				 def get_config(
			
@@ -210,6 +211,7 @@ def process_instance(
 
				     # =============================================
			
 
				 
			
 
				     runtime: Runtime = create_runtime(config)
			
 
				+    call_async_from_sync(runtime.connect)
			
 
				 
			
 
				     initialize_runtime(runtime, instance=instance)
			
 
				 
			
--- a/evaluation/aider_bench/run_infer.py
+++ b/evaluation/aider_bench/run_infer.py
@@ -33,6 +33,7 @@ from openhands.core.main import create_runtime, run_controller
 
				 from openhands.events.action import CmdRunAction, MessageAction
			
 
				 from openhands.events.observation import CmdOutputObservation
			
 
				 from openhands.runtime.base import Runtime
			
 
				+from openhands.utils.async_utils import call_async_from_sync
			
 
				 
			
 
				 # Configure visibility of unit tests to the Agent.
			
 
				 USE_UNIT_TESTS = os.environ.get('USE_UNIT_TESTS', 'false').lower() == 'true'
			
@@ -207,6 +208,7 @@ def process_instance(
 
				     # =============================================
			
 
				 
			
 
				     runtime: Runtime = create_runtime(config)
			
 
				+    call_async_from_sync(runtime.connect)
			
 
				 
			
 
				     initialize_runtime(runtime, instance=instance)
			
 
				 
			
--- a/evaluation/biocoder/run_infer.py
+++ b/evaluation/biocoder/run_infer.py
@@ -30,6 +30,7 @@ from openhands.core.main import create_runtime, run_controller
 
				 from openhands.events.action import CmdRunAction, MessageAction
			
 
				 from openhands.events.observation import CmdOutputObservation
			
 
				 from openhands.runtime.base import Runtime
			
 
				+from openhands.utils.async_utils import call_async_from_sync
			
 
				 
			
 
				 AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
			
 
				     'CodeActAgent': functools.partial(
			
@@ -275,7 +276,7 @@ def process_instance(
 
				     instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]
			
 
				 
			
 
				     runtime = create_runtime(config)
			
 
				-
			
 
				+    call_async_from_sync(runtime.connect)
			
 
				     initialize_runtime(runtime, instance)
			
 
				 
			
 
				     # Here's how you can run the agent (similar to the `main` function) and get the final task state
			
--- a/evaluation/bird/run_infer.py
+++ b/evaluation/bird/run_infer.py
@@ -33,6 +33,7 @@ from openhands.core.main import create_runtime, run_controller
 
				 from openhands.events.action import CmdRunAction, MessageAction
			
 
				 from openhands.events.observation import CmdOutputObservation
			
 
				 from openhands.runtime.base import Runtime
			
 
				+from openhands.utils.async_utils import call_async_from_sync
			
 
				 
			
 
				 
			
 
				 def codeact_user_response(state: State) -> str:
			
@@ -403,6 +404,7 @@ def process_instance(
 
				     instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]
			
 
				 
			
 
				     runtime = create_runtime(config)
			
 
				+    call_async_from_sync(runtime.connect)
			
 
				     initialize_runtime(runtime, instance)
			
 
				 
			
 
				     # Here's how you can run the agent (similar to the `main` function) and get the final task state
			
--- a/evaluation/gaia/run_infer.py
+++ b/evaluation/gaia/run_infer.py
@@ -29,6 +29,7 @@ from openhands.core.main import create_runtime, run_controller
 
				 from openhands.events.action import AgentFinishAction, CmdRunAction, MessageAction
			
 
				 from openhands.events.observation import CmdOutputObservation
			
 
				 from openhands.runtime.base import Runtime
			
 
				+from openhands.utils.async_utils import call_async_from_sync
			
 
				 
			
 
				 DATASET_CACHE_DIR = os.path.join(os.path.dirname(__file__), 'data')
			
 
				 
			
@@ -142,6 +143,7 @@ def process_instance(
 
				     logger.info(f'Instruction:\n{instruction}', extra={'msg_type': 'OBSERVATION'})
			
 
				 
			
 
				     runtime = create_runtime(config)
			
 
				+    call_async_from_sync(runtime.connect)
			
 
				     initialize_runtime(runtime, instance)
			
 
				 
			
 
				     # Here's how you can run the agent (similar to the `main` function) and get the final task state
			
--- a/evaluation/gorilla/run_infer.py
+++ b/evaluation/gorilla/run_infer.py
@@ -25,6 +25,7 @@ from openhands.core.config import (
 
				 from openhands.core.logger import openhands_logger as logger
			
 
				 from openhands.core.main import create_runtime, run_controller
			
 
				 from openhands.events.action import MessageAction
			
 
				+from openhands.utils.async_utils import call_async_from_sync
			
 
				 
			
 
				 AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
			
 
				     'CodeActAgent': codeact_user_response,
			
@@ -81,6 +82,7 @@ def process_instance(
 
				 
			
 
				     # Here's how you can run the agent (similar to the `main` function) and get the final task state
			
 
				     runtime = create_runtime(config)
			
 
				+    call_async_from_sync(runtime.connect)
			
 
				     state: State | None = asyncio.run(
			
 
				         run_controller(
			
 
				             config=config,
			
--- a/evaluation/gpqa/run_infer.py
+++ b/evaluation/gpqa/run_infer.py
@@ -48,6 +48,7 @@ from openhands.events.action import (
 
				     MessageAction,
			
 
				 )
			
 
				 from openhands.events.observation import Observation
			
 
				+from openhands.utils.async_utils import call_async_from_sync
			
 
				 
			
 
				 ACTION_FORMAT = """
			
 
				 <<FINAL_ANSWER||
			
@@ -215,7 +216,7 @@ Ok now its time to start solving the question. Good luck!
 
				 """
			
 
				 
			
 
				     runtime = create_runtime(config)
			
 
				-
			
 
				+    call_async_from_sync(runtime.connect)
			
 
				     state: State | None = asyncio.run(
			
 
				         run_controller(
			
 
				             config=config,
			
--- a/evaluation/humanevalfix/run_infer.py
+++ b/evaluation/humanevalfix/run_infer.py
@@ -38,6 +38,7 @@ from openhands.core.main import create_runtime, run_controller
 
				 from openhands.events.action import CmdRunAction, MessageAction
			
 
				 from openhands.events.observation import CmdOutputObservation
			
 
				 from openhands.runtime.base import Runtime
			
 
				+from openhands.utils.async_utils import call_async_from_sync
			
 
				 
			
 
				 IMPORT_HELPER = {
			
 
				     'python': [
			
@@ -233,6 +234,7 @@ def process_instance(
 
				 
			
 
				     # Here's how you can run the agent (similar to the `main` function) and get the final task state
			
 
				     runtime = create_runtime(config)
			
 
				+    call_async_from_sync(runtime.connect)
			
 
				     initialize_runtime(runtime, instance)
			
 
				     state: State | None = asyncio.run(
			
 
				         run_controller(
			
--- a/evaluation/integration_tests/run_infer.py
+++ b/evaluation/integration_tests/run_infer.py
@@ -25,6 +25,7 @@ from openhands.core.logger import openhands_logger as logger
 
				 from openhands.core.main import create_runtime, run_controller
			
 
				 from openhands.events.action import MessageAction
			
 
				 from openhands.runtime.base import Runtime
			
 
				+from openhands.utils.async_utils import call_async_from_sync
			
 
				 
			
 
				 FAKE_RESPONSES = {
			
 
				     'CodeActAgent': codeact_user_response,
			
@@ -101,6 +102,7 @@ def process_instance(
 
				     # =============================================
			
 
				 
			
 
				     runtime: Runtime = create_runtime(config)
			
 
				+    call_async_from_sync(runtime.connect)
			
 
				 
			
 
				     test_class.initialize_runtime(runtime)
			
 
				 
			
--- a/evaluation/logic_reasoning/run_infer.py
+++ b/evaluation/logic_reasoning/run_infer.py
@@ -30,6 +30,7 @@ from openhands.events.action import (
 
				 )
			
 
				 from openhands.events.observation import CmdOutputObservation
			
 
				 from openhands.runtime.base import Runtime
			
 
				+from openhands.utils.async_utils import call_async_from_sync
			
 
				 
			
 
				 AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
			
 
				     'CodeActAgent': codeact_user_response,
			
@@ -202,6 +203,7 @@ def process_instance(
 
				     instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]
			
 
				 
			
 
				     runtime = create_runtime(config)
			
 
				+    call_async_from_sync(runtime.connect)
			
 
				     initialize_runtime(runtime, instance)
			
 
				 
			
 
				     # Here's how you can run the agent (similar to the `main` function) and get the final task state
			
--- a/evaluation/miniwob/run_infer.py
+++ b/evaluation/miniwob/run_infer.py
@@ -35,6 +35,7 @@ from openhands.runtime.browser.browser_env import (
 
				     BROWSER_EVAL_GET_GOAL_ACTION,
			
 
				     BROWSER_EVAL_GET_REWARDS_ACTION,
			
 
				 )
			
 
				+from openhands.utils.async_utils import call_async_from_sync
			
 
				 
			
 
				 SUPPORTED_AGENT_CLS = {'BrowsingAgent'}
			
 
				 
			
@@ -127,6 +128,7 @@ def process_instance(
 
				         logger.info(f'Starting evaluation for instance {env_id}.')
			
 
				 
			
 
				     runtime = create_runtime(config)
			
 
				+    call_async_from_sync(runtime.connect)
			
 
				     task_str = initialize_runtime(runtime)
			
 
				     state: State | None = asyncio.run(
			
 
				         run_controller(
			
--- a/evaluation/mint/run_infer.py
+++ b/evaluation/mint/run_infer.py
@@ -33,6 +33,7 @@ from openhands.events.action import (
 
				 )
			
 
				 from openhands.events.observation import CmdOutputObservation
			
 
				 from openhands.runtime.base import Runtime
			
 
				+from openhands.utils.async_utils import call_async_from_sync
			
 
				 
			
 
				 
			
 
				 def codeact_user_response_mint(state: State, task: Task, task_config: dict[str, int]):
			
@@ -176,6 +177,7 @@ def process_instance(
 
				     )
			
 
				 
			
 
				     runtime = create_runtime(config)
			
 
				+    call_async_from_sync(runtime.connect)
			
 
				     initialize_runtime(runtime)
			
 
				 
			
 
				     state: State | None = asyncio.run(
			
--- a/evaluation/ml_bench/run_infer.py
+++ b/evaluation/ml_bench/run_infer.py
@@ -42,6 +42,7 @@ from openhands.core.main import create_runtime, run_controller
 
				 from openhands.events.action import CmdRunAction, MessageAction
			
 
				 from openhands.events.observation import CmdOutputObservation
			
 
				 from openhands.runtime.base import Runtime
			
 
				+from openhands.utils.async_utils import call_async_from_sync
			
 
				 
			
 
				 config = load_app_config()
			
 
				 
			
@@ -233,6 +234,7 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =
 
				     instruction += AGENT_CLS_TO_INST_SUFFIX[metadata.agent_class]
			
 
				 
			
 
				     runtime = create_runtime(config)
			
 
				+    call_async_from_sync(runtime.connect)
			
 
				     initialize_runtime(runtime, instance)
			
 
				 
			
 
				     # Run the agent
			
--- a/evaluation/swe_bench/eval_infer.py
+++ b/evaluation/swe_bench/eval_infer.py
@@ -28,6 +28,7 @@ from openhands.core.logger import openhands_logger as logger
 
				 from openhands.core.main import create_runtime
			
 
				 from openhands.events.action import CmdRunAction
			
 
				 from openhands.events.observation import CmdOutputObservation
			
 
				+from openhands.utils.async_utils import call_async_from_sync
			
 
				 
			
 
				 # TODO: migrate all swe-bench docker to ghcr.io/openhands
			
 
				 DOCKER_IMAGE_PREFIX = os.environ.get('EVAL_DOCKER_IMAGE_PREFIX', 'docker.io/xingyaoww/')
			
@@ -128,7 +129,7 @@ def process_instance(
 
				         )
			
 
				 
			
 
				     runtime = create_runtime(config)
			
 
				-
			
 
				+    call_async_from_sync(runtime.connect)
			
 
				     # Get patch and save it to /tmp/patch.diff
			
 
				     with tempfile.TemporaryDirectory() as temp_dir:
			
 
				         # Patch file
			
--- a/evaluation/swe_bench/run_infer.py
+++ b/evaluation/swe_bench/run_infer.py
@@ -35,6 +35,7 @@ from openhands.events.observation import CmdOutputObservation, ErrorObservation
 
				 from openhands.events.serialization.event import event_to_dict
			
 
				 from openhands.runtime.base import Runtime
			
 
				 from openhands.runtime.utils.shutdown_listener import sleep_if_should_continue
			
 
				+from openhands.utils.async_utils import call_async_from_sync
			
 
				 
			
 
				 USE_HINT_TEXT = os.environ.get('USE_HINT_TEXT', 'false').lower() == 'true'
			
 
				 USE_INSTANCE_IMAGE = os.environ.get('USE_INSTANCE_IMAGE', 'false').lower() == 'true'
			
@@ -380,6 +381,7 @@ def process_instance(
 
				         logger.info(f'Starting evaluation for instance {instance.instance_id}.')
			
 
				 
			
 
				     runtime = create_runtime(config)
			
 
				+    call_async_from_sync(runtime.connect)
			
 
				 
			
 
				     try:
			
 
				         initialize_runtime(runtime, instance)
			
--- a/evaluation/toolqa/run_infer.py
+++ b/evaluation/toolqa/run_infer.py
@@ -26,6 +26,7 @@ from openhands.core.main import create_runtime, run_controller
 
				 from openhands.events.action import CmdRunAction, MessageAction
			
 
				 from openhands.events.observation import CmdOutputObservation
			
 
				 from openhands.runtime.base import Runtime
			
 
				+from openhands.utils.async_utils import call_async_from_sync
			
 
				 
			
 
				 AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
			
 
				     'CodeActAgent': codeact_user_response,
			
@@ -103,6 +104,7 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =
 
				     logger.info(f'Instruction:\n{instruction}', extra={'msg_type': 'OBSERVATION'})
			
 
				 
			
 
				     runtime = create_runtime(config)
			
 
				+    call_async_from_sync(runtime.connect)
			
 
				     initialize_runtime(runtime)
			
 
				 
			
 
				     # Here's how you can run the agent (similar to the `main` function) and get the final task state
			
--- a/evaluation/webarena/run_infer.py
+++ b/evaluation/webarena/run_infer.py
@@ -35,6 +35,7 @@ from openhands.runtime.browser.browser_env import (
 
				     BROWSER_EVAL_GET_GOAL_ACTION,
			
 
				     BROWSER_EVAL_GET_REWARDS_ACTION,
			
 
				 )
			
 
				+from openhands.utils.async_utils import call_async_from_sync
			
 
				 
			
 
				 SUPPORTED_AGENT_CLS = {'BrowsingAgent'}
			
 
				 
			
@@ -143,6 +144,7 @@ def process_instance(
 
				         logger.info(f'Starting evaluation for instance {env_id}.')
			
 
				 
			
 
				     runtime = create_runtime(config)
			
 
				+    call_async_from_sync(runtime.connect)
			
 
				     task_str = initialize_runtime(runtime)
			
 
				 
			
 
				     state: State | None = asyncio.run(
			
--- a/openhands/core/main.py
+++ b/openhands/core/main.py
@@ -122,7 +122,7 @@ async def run_controller(
 
				 
			
 
				     if runtime is None:
			
 
				         runtime = create_runtime(config, sid=sid)
			
 
				-    await runtime.connect()
			
 
				+        await runtime.connect()
			
 
				 
			
 
				     event_stream = runtime.event_stream
			
 
				     # restore cli session if enabled
			
--- a/openhands/runtime/utils/shutdown_listener.py
+++ b/openhands/runtime/utils/shutdown_listener.py
@@ -4,6 +4,7 @@ This module monitors the app for shutdown signals
 
				 
			
 
				 import asyncio
			
 
				 import signal
			
 
				+import threading
			
 
				 import time
			
 
				 from types import FrameType
			
 
				 
			
@@ -29,8 +30,11 @@ def _register_signal_handlers():
 
				     if _should_exit is not None:
			
 
				         return
			
 
				     _should_exit = False
			
 
				-    for sig in HANDLED_SIGNALS:
			
 
				-        _register_signal_handler(sig)
			
 
				+
			
 
				+    # Check if we're in the main thread of the main interpreter
			
 
				+    if threading.current_thread() is threading.main_thread():
			
 
				+        for sig in HANDLED_SIGNALS:
			
 
				+            _register_signal_handler(sig)
			
 
				 
			
 
				 
			
 
				 def should_exit() -> bool: