Răsfoiți Sursa

[eval] increase timeout for SWEBench eval init/complete (#3829)

* [eval] increase timeout for swebench eval init/complete

* allow CmdRunAction to optionally block when .timeout is set

* fix unit test for serialization

* fix unit tests for security analyzer

* fix integration tests

* add more timeout
Xingyao Wang 1 an în urmă
părinte
comite
2fe2f4c530
42 fișiere modificate cu 38 adăugări și 157 ștergeri
  1. 10 0
      evaluation/swe_bench/run_infer.py
  2. 4 0
      openhands/events/action/commands.py
  3. 5 0
      openhands/events/event.py
  4. 4 2
      openhands/runtime/client/client.py
  5. 0 4
      tests/integration/mock/eventstream_runtime/DelegatorAgent/test_edits/prompt_001.log
  6. 1 5
      tests/integration/mock/eventstream_runtime/DelegatorAgent/test_edits/prompt_002.log
  7. 1 5
      tests/integration/mock/eventstream_runtime/DelegatorAgent/test_edits/prompt_003.log
  8. 0 4
      tests/integration/mock/eventstream_runtime/DelegatorAgent/test_edits/prompt_004.log
  9. 0 4
      tests/integration/mock/eventstream_runtime/DelegatorAgent/test_edits/prompt_005.log
  10. 0 4
      tests/integration/mock/eventstream_runtime/DelegatorAgent/test_edits/prompt_006.log
  11. 0 4
      tests/integration/mock/eventstream_runtime/DelegatorAgent/test_edits/prompt_007.log
  12. 0 4
      tests/integration/mock/eventstream_runtime/DelegatorAgent/test_edits/prompt_008.log
  13. 0 4
      tests/integration/mock/eventstream_runtime/DelegatorAgent/test_write_simple_script/prompt_001.log
  14. 1 5
      tests/integration/mock/eventstream_runtime/DelegatorAgent/test_write_simple_script/prompt_002.log
  15. 0 4
      tests/integration/mock/eventstream_runtime/DelegatorAgent/test_write_simple_script/prompt_003.log
  16. 0 4
      tests/integration/mock/eventstream_runtime/DelegatorAgent/test_write_simple_script/prompt_004.log
  17. 1 5
      tests/integration/mock/eventstream_runtime/DelegatorAgent/test_write_simple_script/prompt_005.log
  18. 1 5
      tests/integration/mock/eventstream_runtime/DelegatorAgent/test_write_simple_script/prompt_006.log
  19. 0 4
      tests/integration/mock/eventstream_runtime/DelegatorAgent/test_write_simple_script/prompt_007.log
  20. 0 4
      tests/integration/mock/eventstream_runtime/DelegatorAgent/test_write_simple_script/prompt_008.log
  21. 1 5
      tests/integration/mock/eventstream_runtime/DelegatorAgent/test_write_simple_script/prompt_009.log
  22. 0 4
      tests/integration/mock/eventstream_runtime/ManagerAgent/test_simple_task_rejection/prompt_001.log
  23. 0 4
      tests/integration/mock/eventstream_runtime/ManagerAgent/test_simple_task_rejection/prompt_002.log
  24. 1 5
      tests/integration/mock/eventstream_runtime/ManagerAgent/test_simple_task_rejection/prompt_003.log
  25. 1 5
      tests/integration/mock/eventstream_runtime/ManagerAgent/test_simple_task_rejection/prompt_004.log
  26. 1 5
      tests/integration/mock/eventstream_runtime/ManagerAgent/test_simple_task_rejection/prompt_005.log
  27. 1 5
      tests/integration/mock/eventstream_runtime/ManagerAgent/test_simple_task_rejection/prompt_006.log
  28. 1 5
      tests/integration/mock/eventstream_runtime/ManagerAgent/test_simple_task_rejection/prompt_007.log
  29. 0 4
      tests/integration/mock/eventstream_runtime/ManagerAgent/test_simple_task_rejection/prompt_008.log
  30. 0 4
      tests/integration/mock/eventstream_runtime/PlannerAgent/test_write_simple_script/prompt_001.log
  31. 0 4
      tests/integration/mock/eventstream_runtime/PlannerAgent/test_write_simple_script/prompt_002.log
  32. 0 4
      tests/integration/mock/eventstream_runtime/PlannerAgent/test_write_simple_script/prompt_003.log
  33. 0 4
      tests/integration/mock/eventstream_runtime/PlannerAgent/test_write_simple_script/prompt_004.log
  34. 0 4
      tests/integration/mock/eventstream_runtime/PlannerAgent/test_write_simple_script/prompt_005.log
  35. 0 4
      tests/integration/mock/eventstream_runtime/PlannerAgent/test_write_simple_script/prompt_006.log
  36. 0 4
      tests/integration/mock/eventstream_runtime/PlannerAgent/test_write_simple_script/prompt_007.log
  37. 0 4
      tests/integration/mock/eventstream_runtime/PlannerAgent/test_write_simple_script/prompt_008.log
  38. 0 4
      tests/integration/mock/eventstream_runtime/PlannerAgent/test_write_simple_script/prompt_009.log
  39. 1 4
      tests/integration/mock/eventstream_runtime/PlannerAgent/test_write_simple_script/prompt_010.log
  40. 1 4
      tests/integration/mock/eventstream_runtime/PlannerAgent/test_write_simple_script/prompt_011.log
  41. 1 0
      tests/unit/test_action_serialization.py
  42. 1 0
      tests/unit/test_security.py

+ 10 - 0
evaluation/swe_bench/run_infer.py

@@ -157,12 +157,14 @@ def initialize_runtime(
     action = CmdRunAction(
         command=f"""echo 'export SWE_INSTANCE_ID={instance['instance_id']}' >> ~/.bashrc && echo 'export PIP_CACHE_DIR=~/.cache/pip' >> ~/.bashrc && echo "alias git='git --no-pager'" >> ~/.bashrc"""
     )
+    action.timeout = 600
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
     assert obs.exit_code == 0
 
     action = CmdRunAction(command="""export USER=$(whoami); echo USER=${USER} """)
+    action.timeout = 600
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -201,18 +203,21 @@ def initialize_runtime(
             '/swe_util/',
         )
         action = CmdRunAction(command='cat ~/.bashrc')
+        action.timeout = 600
         logger.info(action, extra={'msg_type': 'ACTION'})
         obs = runtime.run_action(action)
         logger.info(obs, extra={'msg_type': 'OBSERVATION'})
         assert obs.exit_code == 0
 
         action = CmdRunAction(command='source ~/.bashrc')
+        action.timeout = 600
         logger.info(action, extra={'msg_type': 'ACTION'})
         obs = runtime.run_action(action)
         logger.info(obs, extra={'msg_type': 'OBSERVATION'})
         assert obs.exit_code == 0
 
         action = CmdRunAction(command='source /swe_util/instance_swe_entry.sh')
+        action.timeout = 600
         logger.info(action, extra={'msg_type': 'ACTION'})
         obs = runtime.run_action(action)
         logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -234,6 +239,7 @@ def initialize_runtime(
     assert obs.exit_code == 0
 
     action = CmdRunAction(command='git reset --hard')
+    action.timeout = 600
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -242,6 +248,7 @@ def initialize_runtime(
     action = CmdRunAction(
         command='for remote_name in $(git remote); do git remote remove "${remote_name}"; done'
     )
+    action.timeout = 600
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -269,18 +276,21 @@ def complete_runtime(
     workspace_dir_name = _get_swebench_workspace_dir_name(instance)
 
     action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
+    action.timeout = 600
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
     assert obs.exit_code == 0
 
     action = CmdRunAction(command='git config --global core.pager ""')
+    action.timeout = 600
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
     assert obs.exit_code == 0
 
     action = CmdRunAction(command='git add -A')
+    action.timeout = 600
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})

+ 4 - 0
openhands/events/action/commands.py

@@ -13,6 +13,10 @@ from openhands.events.action.action import (
 class CmdRunAction(Action):
     command: str
     thought: str = ''
+    blocking: bool = False
+    # If False, the command will be run in a non-blocking / interactive way:
+    # partial command output will be returned as the output observation.
+    # If True, the command will be run for at most .timeout seconds.
     keep_prompt: bool = True
     # if True, the command prompt will be kept in the command output observation
     # Example of command output:

+ 5 - 0
openhands/events/event.py

@@ -49,3 +49,8 @@ class Event:
     @timeout.setter
     def timeout(self, value: int | None) -> None:
         self._timeout = value
+
+        # Check if .blocking is an attribute of the event
+        if hasattr(self, 'blocking'):
+            # .blocking needs to be set to True if .timeout is set
+            self.blocking = True

+ 4 - 2
openhands/runtime/client/client.py

@@ -326,9 +326,11 @@ class RuntimeClient:
                 else:
                     output, exit_code = self._execute_bash(
                         command,
-                        timeout=SOFT_TIMEOUT_SECONDS,
+                        timeout=SOFT_TIMEOUT_SECONDS
+                        if not action.blocking
+                        else action.timeout,
                         keep_prompt=action.keep_prompt,
-                        kill_on_timeout=False,
+                        kill_on_timeout=False if not action.blocking else True,
                     )
                 if all_output:
                     # previous output already exists with prompt "user@hostname:working_dir #""

+ 0 - 4
tests/integration/mock/eventstream_runtime/DelegatorAgent/test_edits/prompt_001.log

@@ -1,7 +1,3 @@
-
-
-----------
-
 # Task
 You are a software architect. Your team has inherited an existing codebase, and
 need to finish a project:

+ 1 - 5
tests/integration/mock/eventstream_runtime/DelegatorAgent/test_edits/prompt_002.log

@@ -1,7 +1,3 @@
-
-
-----------
-
 # Task
 You are a software architect. Your team has inherited an existing codebase, and
 need to finish a project:
@@ -41,7 +37,7 @@ as well as observations you've made. This only includes the MOST RECENT
 actions and observations--more may have happened before that.
 They are time-ordered, with your most recent action at the bottom.
 
-[{"source": "agent", "action": "run", "args": {"command": "ls", "thought": "", "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "bad.txt\r\n\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "ls", "exit_code": 0}}]
+[{"source": "agent", "action": "run", "args": {"command": "ls", "thought": "", "blocking": false, "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "bad.txt\r\n\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "ls", "exit_code": 0}}]
 
 ## Format
 Your response MUST be in JSON format. It must be an object, and it must contain two fields:

+ 1 - 5
tests/integration/mock/eventstream_runtime/DelegatorAgent/test_edits/prompt_003.log

@@ -1,7 +1,3 @@
-
-
-----------
-
 # Task
 You are a software architect. Your team has inherited an existing codebase, and
 need to finish a project:
@@ -41,7 +37,7 @@ as well as observations you've made. This only includes the MOST RECENT
 actions and observations--more may have happened before that.
 They are time-ordered, with your most recent action at the bottom.
 
-[{"source": "agent", "action": "run", "args": {"command": "ls", "thought": "", "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "bad.txt\r\n\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "ls", "exit_code": 0}}, {"source": "agent", "action": "read", "args": {"path": "bad.txt", "start": 0, "end": -1, "thought": ""}}, {"source": "agent", "observation": "read", "content": "This is a stupid typoo.\nReally?\nNo mor typos!\nEnjoy!\n", "extras": {"path": "/workspace/bad.txt"}}]
+[{"source": "agent", "action": "run", "args": {"command": "ls", "thought": "", "blocking": false, "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "bad.txt\r\n\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "ls", "exit_code": 0}}, {"source": "agent", "action": "read", "args": {"path": "bad.txt", "start": 0, "end": -1, "thought": ""}}, {"source": "agent", "observation": "read", "content": "This is a stupid typoo.\nReally?\nNo mor typos!\nEnjoy!\n", "extras": {"path": "/workspace/bad.txt"}}]
 
 ## Format
 Your response MUST be in JSON format. It must be an object, and it must contain two fields:

+ 0 - 4
tests/integration/mock/eventstream_runtime/DelegatorAgent/test_edits/prompt_004.log

@@ -1,7 +1,3 @@
-
-
-----------
-
 # Task
 You are a software engineer. You've inherited an existing codebase, which you
 need to modify to complete this task:

+ 0 - 4
tests/integration/mock/eventstream_runtime/DelegatorAgent/test_edits/prompt_005.log

@@ -1,7 +1,3 @@
-
-
-----------
-
 # Task
 You are a software engineer. You've inherited an existing codebase, which you
 need to modify to complete this task:

+ 0 - 4
tests/integration/mock/eventstream_runtime/DelegatorAgent/test_edits/prompt_006.log

@@ -1,7 +1,3 @@
-
-
-----------
-
 # Task
 You are a software engineer. You've inherited an existing codebase, which you
 need to modify to complete this task:

+ 0 - 4
tests/integration/mock/eventstream_runtime/DelegatorAgent/test_edits/prompt_007.log

@@ -1,7 +1,3 @@
-
-
-----------
-
 # Task
 You are a quality assurance engineer. Another engineer has made changes to the
 codebase which are supposed to solve this task:

+ 0 - 4
tests/integration/mock/eventstream_runtime/DelegatorAgent/test_edits/prompt_008.log

@@ -1,7 +1,3 @@
-
-
-----------
-
 # Task
 You are a quality assurance engineer. Another engineer has made changes to the
 codebase which are supposed to solve this task:

+ 0 - 4
tests/integration/mock/eventstream_runtime/DelegatorAgent/test_write_simple_script/prompt_001.log

@@ -1,7 +1,3 @@
-
-
-----------
-
 # Task
 You are a software architect. Your team has inherited an existing codebase, and
 need to finish a project:

+ 1 - 5
tests/integration/mock/eventstream_runtime/DelegatorAgent/test_write_simple_script/prompt_002.log

@@ -1,7 +1,3 @@
-
-
-----------
-
 # Task
 You are a software architect. Your team has inherited an existing codebase, and
 need to finish a project:
@@ -41,7 +37,7 @@ as well as observations you've made. This only includes the MOST RECENT
 actions and observations--more may have happened before that.
 They are time-ordered, with your most recent action at the bottom.
 
-[{"source": "agent", "action": "run", "args": {"command": "ls", "thought": "", "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "ls", "exit_code": 0}}]
+[{"source": "agent", "action": "run", "args": {"command": "ls", "thought": "", "blocking": false, "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "ls", "exit_code": 0}}]
 
 ## Format
 Your response MUST be in JSON format. It must be an object, and it must contain two fields:

+ 0 - 4
tests/integration/mock/eventstream_runtime/DelegatorAgent/test_write_simple_script/prompt_003.log

@@ -1,7 +1,3 @@
-
-
-----------
-
 # Task
 You are a software engineer. You've inherited an existing codebase, which you
 need to modify to complete this task:

+ 0 - 4
tests/integration/mock/eventstream_runtime/DelegatorAgent/test_write_simple_script/prompt_004.log

@@ -1,7 +1,3 @@
-
-
-----------
-
 # Task
 You are a software engineer. You've inherited an existing codebase, which you
 need to modify to complete this task:

+ 1 - 5
tests/integration/mock/eventstream_runtime/DelegatorAgent/test_write_simple_script/prompt_005.log

@@ -1,7 +1,3 @@
-
-
-----------
-
 # Task
 You are a software engineer. You've inherited an existing codebase, which you
 need to modify to complete this task:
@@ -41,7 +37,7 @@ as well as observations you've made. This only includes the MOST RECENT
 actions and observations--more may have happened before that.
 They are time-ordered, with your most recent action at the bottom.
 
-[{"source": "agent", "action": "write", "args": {"path": "hello.sh", "content": "#!/bin/bash\n\necho 'hello'", "start": 0, "end": -1, "thought": ""}}, {"source": "agent", "observation": "write", "content": "", "extras": {"path": "/workspace/hello.sh"}}, {"source": "agent", "action": "run", "args": {"command": "chmod +x hello.sh", "thought": "", "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "chmod +x hello.sh", "exit_code": 0}}]
+[{"source": "agent", "action": "write", "args": {"path": "hello.sh", "content": "#!/bin/bash\n\necho 'hello'", "start": 0, "end": -1, "thought": ""}}, {"source": "agent", "observation": "write", "content": "", "extras": {"path": "/workspace/hello.sh"}}, {"source": "agent", "action": "run", "args": {"command": "chmod +x hello.sh", "thought": "", "blocking": false, "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "chmod +x hello.sh", "exit_code": 0}}]
 
 ## Format
 Your response MUST be in JSON format. It must be an object, and it must contain two fields:

+ 1 - 5
tests/integration/mock/eventstream_runtime/DelegatorAgent/test_write_simple_script/prompt_006.log

@@ -1,7 +1,3 @@
-
-
-----------
-
 # Task
 You are a software engineer. You've inherited an existing codebase, which you
 need to modify to complete this task:
@@ -41,7 +37,7 @@ as well as observations you've made. This only includes the MOST RECENT
 actions and observations--more may have happened before that.
 They are time-ordered, with your most recent action at the bottom.
 
-[{"source": "agent", "action": "write", "args": {"path": "hello.sh", "content": "#!/bin/bash\n\necho 'hello'", "start": 0, "end": -1, "thought": ""}}, {"source": "agent", "observation": "write", "content": "", "extras": {"path": "/workspace/hello.sh"}}, {"source": "agent", "action": "run", "args": {"command": "chmod +x hello.sh", "thought": "", "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "chmod +x hello.sh", "exit_code": 0}}, {"source": "agent", "action": "run", "args": {"command": "./hello.sh", "thought": "", "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "hello\r\n\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "./hello.sh", "exit_code": 0}}]
+[{"source": "agent", "action": "write", "args": {"path": "hello.sh", "content": "#!/bin/bash\n\necho 'hello'", "start": 0, "end": -1, "thought": ""}}, {"source": "agent", "observation": "write", "content": "", "extras": {"path": "/workspace/hello.sh"}}, {"source": "agent", "action": "run", "args": {"command": "chmod +x hello.sh", "thought": "", "blocking": false, "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "chmod +x hello.sh", "exit_code": 0}}, {"source": "agent", "action": "run", "args": {"command": "./hello.sh", "thought": "", "blocking": false, "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "hello\r\n\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "./hello.sh", "exit_code": 0}}]
 
 ## Format
 Your response MUST be in JSON format. It must be an object, and it must contain two fields:

+ 0 - 4
tests/integration/mock/eventstream_runtime/DelegatorAgent/test_write_simple_script/prompt_007.log

@@ -1,7 +1,3 @@
-
-
-----------
-
 # Task
 You are a quality assurance engineer. Another engineer has made changes to the
 codebase which are supposed to solve this task:

+ 0 - 4
tests/integration/mock/eventstream_runtime/DelegatorAgent/test_write_simple_script/prompt_008.log

@@ -1,7 +1,3 @@
-
-
-----------
-
 # Task
 You are a quality assurance engineer. Another engineer has made changes to the
 codebase which are supposed to solve this task:

+ 1 - 5
tests/integration/mock/eventstream_runtime/DelegatorAgent/test_write_simple_script/prompt_009.log

@@ -1,7 +1,3 @@
-
-
-----------
-
 # Task
 You are a quality assurance engineer. Another engineer has made changes to the
 codebase which are supposed to solve this task:
@@ -39,7 +35,7 @@ as well as observations you've made. This only includes the MOST RECENT
 actions and observations--more may have happened before that.
 They are time-ordered, with your most recent action at the bottom.
 
-[{"source": "agent", "action": "read", "args": {"path": "hello.sh", "start": 0, "end": -1, "thought": ""}}, {"source": "agent", "observation": "read", "content": "#!/bin/bash\n\necho 'hello'\n", "extras": {"path": "/workspace/hello.sh"}}, {"source": "agent", "action": "run", "args": {"command": "bash hello.sh", "thought": "", "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "hello\r\n\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "bash hello.sh", "exit_code": 0}}]
+[{"source": "agent", "action": "read", "args": {"path": "hello.sh", "start": 0, "end": -1, "thought": ""}}, {"source": "agent", "observation": "read", "content": "#!/bin/bash\n\necho 'hello'\n", "extras": {"path": "/workspace/hello.sh"}}, {"source": "agent", "action": "run", "args": {"command": "bash hello.sh", "thought": "", "blocking": false, "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "hello\r\n\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "bash hello.sh", "exit_code": 0}}]
 
 ## Format
 Your response MUST be in JSON format. It must be an object, and it must contain two fields:

+ 0 - 4
tests/integration/mock/eventstream_runtime/ManagerAgent/test_simple_task_rejection/prompt_001.log

@@ -1,7 +1,3 @@
-
-
-----------
-
 # Task
 You are in charge of accomplishing the following task:
 Write a git commit message for the current staging area. Do not ask me for confirmation at any point.

+ 0 - 4
tests/integration/mock/eventstream_runtime/ManagerAgent/test_simple_task_rejection/prompt_002.log

@@ -1,7 +1,3 @@
-
-
-----------
-
 # Task
 You are a responsible software engineer and always write good commit messages.
 

+ 1 - 5
tests/integration/mock/eventstream_runtime/ManagerAgent/test_simple_task_rejection/prompt_003.log

@@ -1,7 +1,3 @@
-
-
-----------
-
 # Task
 You are a responsible software engineer and always write good commit messages.
 
@@ -28,7 +24,7 @@ as well as observations you've made. This only includes the MOST RECENT
 actions and observations--more may have happened before that.
 They are time-ordered, with your most recent action at the bottom.
 
-[{"source": "agent", "action": "run", "args": {"command": "git status", "thought": "", "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "fatal: not a git repository (or any parent up to mount point /)\r\nStopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).\r\n\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "git status", "exit_code": 128}}]
+[{"source": "agent", "action": "run", "args": {"command": "git status", "thought": "", "blocking": false, "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "fatal: not a git repository (or any parent up to mount point /)\r\nStopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).\r\n\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "git status", "exit_code": 128}}]
 
 If the last item in the history is an error, you should try to fix it.
 

+ 1 - 5
tests/integration/mock/eventstream_runtime/ManagerAgent/test_simple_task_rejection/prompt_004.log

@@ -1,7 +1,3 @@
-
-
-----------
-
 # Task
 You are a responsible software engineer and always write good commit messages.
 
@@ -28,7 +24,7 @@ as well as observations you've made. This only includes the MOST RECENT
 actions and observations--more may have happened before that.
 They are time-ordered, with your most recent action at the bottom.
 
-[{"source": "agent", "action": "run", "args": {"command": "git status", "thought": "", "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "fatal: not a git repository (or any parent up to mount point /)\r\nStopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).\r\n\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "git status", "exit_code": 128}}, {"source": "agent", "observation": "error", "content": "action={'action': 'reject', 'args': {'reason': 'Not a valid git repository.'}} has the wrong arguments", "extras": {}}]
+[{"source": "agent", "action": "run", "args": {"command": "git status", "thought": "", "blocking": false, "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "fatal: not a git repository (or any parent up to mount point /)\r\nStopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).\r\n\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "git status", "exit_code": 128}}, {"source": "agent", "observation": "error", "content": "action={'action': 'reject', 'args': {'reason': 'Not a valid git repository.'}} has the wrong arguments", "extras": {}}]
 
 If the last item in the history is an error, you should try to fix it.
 

+ 1 - 5
tests/integration/mock/eventstream_runtime/ManagerAgent/test_simple_task_rejection/prompt_005.log

@@ -1,7 +1,3 @@
-
-
-----------
-
 # Task
 You are a responsible software engineer and always write good commit messages.
 
@@ -28,7 +24,7 @@ as well as observations you've made. This only includes the MOST RECENT
 actions and observations--more may have happened before that.
 They are time-ordered, with your most recent action at the bottom.
 
-[{"source": "agent", "action": "run", "args": {"command": "git status", "thought": "", "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "fatal: not a git repository (or any parent up to mount point /)\r\nStopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).\r\n\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "git status", "exit_code": 128}}, {"source": "agent", "observation": "error", "content": "action={'action': 'reject', 'args': {'reason': 'Not a valid git repository.'}} has the wrong arguments", "extras": {}}, {"source": "agent", "observation": "error", "content": "action={'action': 'reject', 'args': {'reason': 'Not a valid git repository.'}} has the wrong arguments", "extras": {}}]
+[{"source": "agent", "action": "run", "args": {"command": "git status", "thought": "", "blocking": false, "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "fatal: not a git repository (or any parent up to mount point /)\r\nStopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).\r\n\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "git status", "exit_code": 128}}, {"source": "agent", "observation": "error", "content": "action={'action': 'reject', 'args': {'reason': 'Not a valid git repository.'}} has the wrong arguments", "extras": {}}, {"source": "agent", "observation": "error", "content": "action={'action': 'reject', 'args': {'reason': 'Not a valid git repository.'}} has the wrong arguments", "extras": {}}]
 
 If the last item in the history is an error, you should try to fix it.
 

+ 1 - 5
tests/integration/mock/eventstream_runtime/ManagerAgent/test_simple_task_rejection/prompt_006.log

@@ -1,7 +1,3 @@
-
-
-----------
-
 # Task
 You are a responsible software engineer and always write good commit messages.
 
@@ -28,7 +24,7 @@ as well as observations you've made. This only includes the MOST RECENT
 actions and observations--more may have happened before that.
 They are time-ordered, with your most recent action at the bottom.
 
-[{"source": "agent", "action": "run", "args": {"command": "git status", "thought": "", "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "fatal: not a git repository (or any parent up to mount point /)\r\nStopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).\r\n\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "git status", "exit_code": 128}}, {"source": "agent", "observation": "error", "content": "action={'action': 'reject', 'args': {'reason': 'Not a valid git repository.'}} has the wrong arguments", "extras": {}}, {"source": "agent", "observation": "error", "content": "action={'action': 'reject', 'args': {'reason': 'Not a valid git repository.'}} has the wrong arguments", "extras": {}}, {"source": "agent", "observation": "error", "content": "action={'action': 'reject', 'args': {'reason': 'Not a valid git repository.'}} has the wrong arguments", "extras": {}}]
+[{"source": "agent", "action": "run", "args": {"command": "git status", "thought": "", "blocking": false, "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "fatal: not a git repository (or any parent up to mount point /)\r\nStopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).\r\n\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "git status", "exit_code": 128}}, {"source": "agent", "observation": "error", "content": "action={'action': 'reject', 'args': {'reason': 'Not a valid git repository.'}} has the wrong arguments", "extras": {}}, {"source": "agent", "observation": "error", "content": "action={'action': 'reject', 'args': {'reason': 'Not a valid git repository.'}} has the wrong arguments", "extras": {}}, {"source": "agent", "observation": "error", "content": "action={'action': 'reject', 'args': {'reason': 'Not a valid git repository.'}} has the wrong arguments", "extras": {}}]
 
 If the last item in the history is an error, you should try to fix it.
 

+ 1 - 5
tests/integration/mock/eventstream_runtime/ManagerAgent/test_simple_task_rejection/prompt_007.log

@@ -1,7 +1,3 @@
-
-
-----------
-
 # Task
 You are a responsible software engineer and always write good commit messages.
 
@@ -28,7 +24,7 @@ as well as observations you've made. This only includes the MOST RECENT
 actions and observations--more may have happened before that.
 They are time-ordered, with your most recent action at the bottom.
 
-[{"source": "agent", "action": "run", "args": {"command": "git status", "thought": "", "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "fatal: not a git repository (or any parent up to mount point /)\r\nStopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).\r\n\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "git status", "exit_code": 128}}, {"source": "agent", "observation": "error", "content": "action={'action': 'reject', 'args': {'reason': 'Not a valid git repository.'}} has the wrong arguments", "extras": {}}, {"source": "agent", "observation": "error", "content": "action={'action': 'reject', 'args': {'reason': 'Not a valid git repository.'}} has the wrong arguments", "extras": {}}, {"source": "agent", "observation": "error", "content": "action={'action': 'reject', 'args': {'reason': 'Not a valid git repository.'}} has the wrong arguments", "extras": {}}, {"source": "agent", "observation": "error", "content": "action={'action': 'reject', 'args': {'reason': 'Not a valid git repository.'}} has the wrong arguments", "extras": {}}]
+[{"source": "agent", "action": "run", "args": {"command": "git status", "thought": "", "blocking": false, "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "fatal: not a git repository (or any parent up to mount point /)\r\nStopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).\r\n\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "git status", "exit_code": 128}}, {"source": "agent", "observation": "error", "content": "action={'action': 'reject', 'args': {'reason': 'Not a valid git repository.'}} has the wrong arguments", "extras": {}}, {"source": "agent", "observation": "error", "content": "action={'action': 'reject', 'args': {'reason': 'Not a valid git repository.'}} has the wrong arguments", "extras": {}}, {"source": "agent", "observation": "error", "content": "action={'action': 'reject', 'args': {'reason': 'Not a valid git repository.'}} has the wrong arguments", "extras": {}}, {"source": "agent", "observation": "error", "content": "action={'action': 'reject', 'args': {'reason': 'Not a valid git repository.'}} has the wrong arguments", "extras": {}}]
 
 If the last item in the history is an error, you should try to fix it.
 

+ 0 - 4
tests/integration/mock/eventstream_runtime/ManagerAgent/test_simple_task_rejection/prompt_008.log

@@ -1,7 +1,3 @@
-
-
-----------
-
 # Task
 You are in charge of accomplishing the following task:
 Write a git commit message for the current staging area. Do not ask me for confirmation at any point.

+ 0 - 4
tests/integration/mock/eventstream_runtime/PlannerAgent/test_write_simple_script/prompt_001.log

@@ -1,8 +1,4 @@
 
-
-----------
-
-
 # Task
 You're a diligent software engineer AI. You can't see, draw, or interact with a
 browser, but you can read and write files, and you can run commands, and you can think.

+ 0 - 4
tests/integration/mock/eventstream_runtime/PlannerAgent/test_write_simple_script/prompt_002.log

@@ -1,8 +1,4 @@
 
-
-----------
-
-
 # Task
 You're a diligent software engineer AI. You can't see, draw, or interact with a
 browser, but you can read and write files, and you can run commands, and you can think.

+ 0 - 4
tests/integration/mock/eventstream_runtime/PlannerAgent/test_write_simple_script/prompt_003.log

@@ -1,8 +1,4 @@
 
-
-----------
-
-
 # Task
 You're a diligent software engineer AI. You can't see, draw, or interact with a
 browser, but you can read and write files, and you can run commands, and you can think.

+ 0 - 4
tests/integration/mock/eventstream_runtime/PlannerAgent/test_write_simple_script/prompt_004.log

@@ -1,8 +1,4 @@
 
-
-----------
-
-
 # Task
 You're a diligent software engineer AI. You can't see, draw, or interact with a
 browser, but you can read and write files, and you can run commands, and you can think.

+ 0 - 4
tests/integration/mock/eventstream_runtime/PlannerAgent/test_write_simple_script/prompt_005.log

@@ -1,8 +1,4 @@
 
-
-----------
-
-
 # Task
 You're a diligent software engineer AI. You can't see, draw, or interact with a
 browser, but you can read and write files, and you can run commands, and you can think.

+ 0 - 4
tests/integration/mock/eventstream_runtime/PlannerAgent/test_write_simple_script/prompt_006.log

@@ -1,8 +1,4 @@
 
-
-----------
-
-
 # Task
 You're a diligent software engineer AI. You can't see, draw, or interact with a
 browser, but you can read and write files, and you can run commands, and you can think.

+ 0 - 4
tests/integration/mock/eventstream_runtime/PlannerAgent/test_write_simple_script/prompt_007.log

@@ -1,8 +1,4 @@
 
-
-----------
-
-
 # Task
 You're a diligent software engineer AI. You can't see, draw, or interact with a
 browser, but you can read and write files, and you can run commands, and you can think.

+ 0 - 4
tests/integration/mock/eventstream_runtime/PlannerAgent/test_write_simple_script/prompt_008.log

@@ -1,8 +1,4 @@
 
-
-----------
-
-
 # Task
 You're a diligent software engineer AI. You can't see, draw, or interact with a
 browser, but you can read and write files, and you can run commands, and you can think.

+ 0 - 4
tests/integration/mock/eventstream_runtime/PlannerAgent/test_write_simple_script/prompt_009.log

@@ -1,8 +1,4 @@
 
-
-----------
-
-
 # Task
 You're a diligent software engineer AI. You can't see, draw, or interact with a
 browser, but you can read and write files, and you can run commands, and you can think.

+ 1 - 4
tests/integration/mock/eventstream_runtime/PlannerAgent/test_write_simple_script/prompt_010.log

@@ -1,8 +1,4 @@
 
-
-----------
-
-
 # Task
 You're a diligent software engineer AI. You can't see, draw, or interact with a
 browser, but you can read and write files, and you can run commands, and you can think.
@@ -198,6 +194,7 @@ ten actions--more happened before that.
     "args": {
       "command": "bash hello.sh",
       "thought": "",
+      "blocking": false,
       "keep_prompt": true,
       "is_confirmed": "confirmed"
     }

+ 1 - 4
tests/integration/mock/eventstream_runtime/PlannerAgent/test_write_simple_script/prompt_011.log

@@ -1,8 +1,4 @@
 
-
-----------
-
-
 # Task
 You're a diligent software engineer AI. You can't see, draw, or interact with a
 browser, but you can read and write files, and you can run commands, and you can think.
@@ -197,6 +193,7 @@ ten actions--more happened before that.
     "args": {
       "command": "bash hello.sh",
       "thought": "",
+      "blocking": false,
       "keep_prompt": true,
       "is_confirmed": "confirmed"
     }

+ 1 - 0
tests/unit/test_action_serialization.py

@@ -84,6 +84,7 @@ def test_cmd_run_action_serialization_deserialization():
     original_action_dict = {
         'action': 'run',
         'args': {
+            'blocking': False,
             'command': 'echo "Hello world"',
             'thought': '',
             'keep_prompt': True,

+ 1 - 0
tests/unit/test_security.py

@@ -219,6 +219,7 @@ def test_unsafe_bash_command(temp_dir: str):
                     function=Function(
                         name=ActionType.RUN,
                         arguments={
+                            'blocking': False,
                             'command': 'ls',
                             'keep_prompt': True,
                             'is_confirmed': ActionConfirmationStatus.CONFIRMED,