Просмотр исходного кода

[eval] increase timeout for SWEBench eval init/complete (#3829)

* [eval] increase timeout for swebench eval init/complete

* allow CmdRunAction to optionally block when .timeout is set

* fix unit test for serialization

* fix unit tests for security analyzer

* fix integration tests

* add more timeout
Xingyao Wang 1 год назад
Родитель
Коммит
2fe2f4c530
42 изменённых файла: 38 добавлений и 157 удалений
  1. 10 0
      evaluation/swe_bench/run_infer.py
  2. 4 0
      openhands/events/action/commands.py
  3. 5 0
      openhands/events/event.py
  4. 4 2
      openhands/runtime/client/client.py
  5. 0 4
      tests/integration/mock/eventstream_runtime/DelegatorAgent/test_edits/prompt_001.log
  6. 1 5
      tests/integration/mock/eventstream_runtime/DelegatorAgent/test_edits/prompt_002.log
  7. 1 5
      tests/integration/mock/eventstream_runtime/DelegatorAgent/test_edits/prompt_003.log
  8. 0 4
      tests/integration/mock/eventstream_runtime/DelegatorAgent/test_edits/prompt_004.log
  9. 0 4
      tests/integration/mock/eventstream_runtime/DelegatorAgent/test_edits/prompt_005.log
  10. 0 4
      tests/integration/mock/eventstream_runtime/DelegatorAgent/test_edits/prompt_006.log
  11. 0 4
      tests/integration/mock/eventstream_runtime/DelegatorAgent/test_edits/prompt_007.log
  12. 0 4
      tests/integration/mock/eventstream_runtime/DelegatorAgent/test_edits/prompt_008.log
  13. 0 4
      tests/integration/mock/eventstream_runtime/DelegatorAgent/test_write_simple_script/prompt_001.log
  14. 1 5
      tests/integration/mock/eventstream_runtime/DelegatorAgent/test_write_simple_script/prompt_002.log
  15. 0 4
      tests/integration/mock/eventstream_runtime/DelegatorAgent/test_write_simple_script/prompt_003.log
  16. 0 4
      tests/integration/mock/eventstream_runtime/DelegatorAgent/test_write_simple_script/prompt_004.log
  17. 1 5
      tests/integration/mock/eventstream_runtime/DelegatorAgent/test_write_simple_script/prompt_005.log
  18. 1 5
      tests/integration/mock/eventstream_runtime/DelegatorAgent/test_write_simple_script/prompt_006.log
  19. 0 4
      tests/integration/mock/eventstream_runtime/DelegatorAgent/test_write_simple_script/prompt_007.log
  20. 0 4
      tests/integration/mock/eventstream_runtime/DelegatorAgent/test_write_simple_script/prompt_008.log
  21. 1 5
      tests/integration/mock/eventstream_runtime/DelegatorAgent/test_write_simple_script/prompt_009.log
  22. 0 4
      tests/integration/mock/eventstream_runtime/ManagerAgent/test_simple_task_rejection/prompt_001.log
  23. 0 4
      tests/integration/mock/eventstream_runtime/ManagerAgent/test_simple_task_rejection/prompt_002.log
  24. 1 5
      tests/integration/mock/eventstream_runtime/ManagerAgent/test_simple_task_rejection/prompt_003.log
  25. 1 5
      tests/integration/mock/eventstream_runtime/ManagerAgent/test_simple_task_rejection/prompt_004.log
  26. 1 5
      tests/integration/mock/eventstream_runtime/ManagerAgent/test_simple_task_rejection/prompt_005.log
  27. 1 5
      tests/integration/mock/eventstream_runtime/ManagerAgent/test_simple_task_rejection/prompt_006.log
  28. 1 5
      tests/integration/mock/eventstream_runtime/ManagerAgent/test_simple_task_rejection/prompt_007.log
  29. 0 4
      tests/integration/mock/eventstream_runtime/ManagerAgent/test_simple_task_rejection/prompt_008.log
  30. 0 4
      tests/integration/mock/eventstream_runtime/PlannerAgent/test_write_simple_script/prompt_001.log
  31. 0 4
      tests/integration/mock/eventstream_runtime/PlannerAgent/test_write_simple_script/prompt_002.log
  32. 0 4
      tests/integration/mock/eventstream_runtime/PlannerAgent/test_write_simple_script/prompt_003.log
  33. 0 4
      tests/integration/mock/eventstream_runtime/PlannerAgent/test_write_simple_script/prompt_004.log
  34. 0 4
      tests/integration/mock/eventstream_runtime/PlannerAgent/test_write_simple_script/prompt_005.log
  35. 0 4
      tests/integration/mock/eventstream_runtime/PlannerAgent/test_write_simple_script/prompt_006.log
  36. 0 4
      tests/integration/mock/eventstream_runtime/PlannerAgent/test_write_simple_script/prompt_007.log
  37. 0 4
      tests/integration/mock/eventstream_runtime/PlannerAgent/test_write_simple_script/prompt_008.log
  38. 0 4
      tests/integration/mock/eventstream_runtime/PlannerAgent/test_write_simple_script/prompt_009.log
  39. 1 4
      tests/integration/mock/eventstream_runtime/PlannerAgent/test_write_simple_script/prompt_010.log
  40. 1 4
      tests/integration/mock/eventstream_runtime/PlannerAgent/test_write_simple_script/prompt_011.log
  41. 1 0
      tests/unit/test_action_serialization.py
  42. 1 0
      tests/unit/test_security.py

+ 10 - 0
evaluation/swe_bench/run_infer.py

@@ -157,12 +157,14 @@ def initialize_runtime(
     action = CmdRunAction(
     action = CmdRunAction(
         command=f"""echo 'export SWE_INSTANCE_ID={instance['instance_id']}' >> ~/.bashrc && echo 'export PIP_CACHE_DIR=~/.cache/pip' >> ~/.bashrc && echo "alias git='git --no-pager'" >> ~/.bashrc"""
         command=f"""echo 'export SWE_INSTANCE_ID={instance['instance_id']}' >> ~/.bashrc && echo 'export PIP_CACHE_DIR=~/.cache/pip' >> ~/.bashrc && echo "alias git='git --no-pager'" >> ~/.bashrc"""
     )
     )
+    action.timeout = 600
     logger.info(action, extra={'msg_type': 'ACTION'})
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     obs = runtime.run_action(action)
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
     assert obs.exit_code == 0
     assert obs.exit_code == 0
 
 
     action = CmdRunAction(command="""export USER=$(whoami); echo USER=${USER} """)
     action = CmdRunAction(command="""export USER=$(whoami); echo USER=${USER} """)
+    action.timeout = 600
     logger.info(action, extra={'msg_type': 'ACTION'})
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     obs = runtime.run_action(action)
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -201,18 +203,21 @@ def initialize_runtime(
             '/swe_util/',
             '/swe_util/',
         )
         )
         action = CmdRunAction(command='cat ~/.bashrc')
         action = CmdRunAction(command='cat ~/.bashrc')
+        action.timeout = 600
         logger.info(action, extra={'msg_type': 'ACTION'})
         logger.info(action, extra={'msg_type': 'ACTION'})
         obs = runtime.run_action(action)
         obs = runtime.run_action(action)
         logger.info(obs, extra={'msg_type': 'OBSERVATION'})
         logger.info(obs, extra={'msg_type': 'OBSERVATION'})
         assert obs.exit_code == 0
         assert obs.exit_code == 0
 
 
         action = CmdRunAction(command='source ~/.bashrc')
         action = CmdRunAction(command='source ~/.bashrc')
+        action.timeout = 600
         logger.info(action, extra={'msg_type': 'ACTION'})
         logger.info(action, extra={'msg_type': 'ACTION'})
         obs = runtime.run_action(action)
         obs = runtime.run_action(action)
         logger.info(obs, extra={'msg_type': 'OBSERVATION'})
         logger.info(obs, extra={'msg_type': 'OBSERVATION'})
         assert obs.exit_code == 0
         assert obs.exit_code == 0
 
 
         action = CmdRunAction(command='source /swe_util/instance_swe_entry.sh')
         action = CmdRunAction(command='source /swe_util/instance_swe_entry.sh')
+        action.timeout = 600
         logger.info(action, extra={'msg_type': 'ACTION'})
         logger.info(action, extra={'msg_type': 'ACTION'})
         obs = runtime.run_action(action)
         obs = runtime.run_action(action)
         logger.info(obs, extra={'msg_type': 'OBSERVATION'})
         logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -234,6 +239,7 @@ def initialize_runtime(
     assert obs.exit_code == 0
     assert obs.exit_code == 0
 
 
     action = CmdRunAction(command='git reset --hard')
     action = CmdRunAction(command='git reset --hard')
+    action.timeout = 600
     logger.info(action, extra={'msg_type': 'ACTION'})
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     obs = runtime.run_action(action)
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -242,6 +248,7 @@ def initialize_runtime(
     action = CmdRunAction(
     action = CmdRunAction(
         command='for remote_name in $(git remote); do git remote remove "${remote_name}"; done'
         command='for remote_name in $(git remote); do git remote remove "${remote_name}"; done'
     )
     )
+    action.timeout = 600
     logger.info(action, extra={'msg_type': 'ACTION'})
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     obs = runtime.run_action(action)
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
@@ -269,18 +276,21 @@ def complete_runtime(
     workspace_dir_name = _get_swebench_workspace_dir_name(instance)
     workspace_dir_name = _get_swebench_workspace_dir_name(instance)
 
 
     action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
     action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
+    action.timeout = 600
     logger.info(action, extra={'msg_type': 'ACTION'})
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     obs = runtime.run_action(action)
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
     assert obs.exit_code == 0
     assert obs.exit_code == 0
 
 
     action = CmdRunAction(command='git config --global core.pager ""')
     action = CmdRunAction(command='git config --global core.pager ""')
+    action.timeout = 600
     logger.info(action, extra={'msg_type': 'ACTION'})
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     obs = runtime.run_action(action)
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
     assert obs.exit_code == 0
     assert obs.exit_code == 0
 
 
     action = CmdRunAction(command='git add -A')
     action = CmdRunAction(command='git add -A')
+    action.timeout = 600
     logger.info(action, extra={'msg_type': 'ACTION'})
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
     obs = runtime.run_action(action)
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})

+ 4 - 0
openhands/events/action/commands.py

@@ -13,6 +13,10 @@ from openhands.events.action.action import (
 class CmdRunAction(Action):
 class CmdRunAction(Action):
     command: str
     command: str
     thought: str = ''
     thought: str = ''
+    blocking: bool = False
+    # If False, the command will be run in a non-blocking / interactive way
+    # The partial command outputs will be returned as output observation.
+    # If True, the command will be run for max .timeout seconds.
     keep_prompt: bool = True
     keep_prompt: bool = True
     # if True, the command prompt will be kept in the command output observation
     # if True, the command prompt will be kept in the command output observation
     # Example of command output:
     # Example of command output:

+ 5 - 0
openhands/events/event.py

@@ -49,3 +49,8 @@ class Event:
     @timeout.setter
     @timeout.setter
     def timeout(self, value: int | None) -> None:
     def timeout(self, value: int | None) -> None:
         self._timeout = value
         self._timeout = value
+
+        # Check if .blocking is an attribute of the event
+        if hasattr(self, 'blocking'):
+            # .blocking needs to be set to True if .timeout is set
+            self.blocking = True

+ 4 - 2
openhands/runtime/client/client.py

@@ -326,9 +326,11 @@ class RuntimeClient:
                 else:
                 else:
                     output, exit_code = self._execute_bash(
                     output, exit_code = self._execute_bash(
                         command,
                         command,
-                        timeout=SOFT_TIMEOUT_SECONDS,
+                        timeout=SOFT_TIMEOUT_SECONDS
+                        if not action.blocking
+                        else action.timeout,
                         keep_prompt=action.keep_prompt,
                         keep_prompt=action.keep_prompt,
-                        kill_on_timeout=False,
+                        kill_on_timeout=False if not action.blocking else True,
                     )
                     )
                 if all_output:
                 if all_output:
                     # previous output already exists with prompt "user@hostname:working_dir #""
                     # previous output already exists with prompt "user@hostname:working_dir #""

+ 0 - 4
tests/integration/mock/eventstream_runtime/DelegatorAgent/test_edits/prompt_001.log

@@ -1,7 +1,3 @@
-
-
-----------
-
 # Task
 # Task
 You are a software architect. Your team has inherited an existing codebase, and
 You are a software architect. Your team has inherited an existing codebase, and
 need to finish a project:
 need to finish a project:

+ 1 - 5
tests/integration/mock/eventstream_runtime/DelegatorAgent/test_edits/prompt_002.log

@@ -1,7 +1,3 @@
-
-
-----------
-
 # Task
 # Task
 You are a software architect. Your team has inherited an existing codebase, and
 You are a software architect. Your team has inherited an existing codebase, and
 need to finish a project:
 need to finish a project:
@@ -41,7 +37,7 @@ as well as observations you've made. This only includes the MOST RECENT
 actions and observations--more may have happened before that.
 actions and observations--more may have happened before that.
 They are time-ordered, with your most recent action at the bottom.
 They are time-ordered, with your most recent action at the bottom.
 
 
-[{"source": "agent", "action": "run", "args": {"command": "ls", "thought": "", "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "bad.txt\r\n\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "ls", "exit_code": 0}}]
+[{"source": "agent", "action": "run", "args": {"command": "ls", "thought": "", "blocking": false, "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "bad.txt\r\n\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "ls", "exit_code": 0}}]
 
 
 ## Format
 ## Format
 Your response MUST be in JSON format. It must be an object, and it must contain two fields:
 Your response MUST be in JSON format. It must be an object, and it must contain two fields:

+ 1 - 5
tests/integration/mock/eventstream_runtime/DelegatorAgent/test_edits/prompt_003.log

@@ -1,7 +1,3 @@
-
-
-----------
-
 # Task
 # Task
 You are a software architect. Your team has inherited an existing codebase, and
 You are a software architect. Your team has inherited an existing codebase, and
 need to finish a project:
 need to finish a project:
@@ -41,7 +37,7 @@ as well as observations you've made. This only includes the MOST RECENT
 actions and observations--more may have happened before that.
 actions and observations--more may have happened before that.
 They are time-ordered, with your most recent action at the bottom.
 They are time-ordered, with your most recent action at the bottom.
 
 
-[{"source": "agent", "action": "run", "args": {"command": "ls", "thought": "", "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "bad.txt\r\n\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "ls", "exit_code": 0}}, {"source": "agent", "action": "read", "args": {"path": "bad.txt", "start": 0, "end": -1, "thought": ""}}, {"source": "agent", "observation": "read", "content": "This is a stupid typoo.\nReally?\nNo mor typos!\nEnjoy!\n", "extras": {"path": "/workspace/bad.txt"}}]
+[{"source": "agent", "action": "run", "args": {"command": "ls", "thought": "", "blocking": false, "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "bad.txt\r\n\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "ls", "exit_code": 0}}, {"source": "agent", "action": "read", "args": {"path": "bad.txt", "start": 0, "end": -1, "thought": ""}}, {"source": "agent", "observation": "read", "content": "This is a stupid typoo.\nReally?\nNo mor typos!\nEnjoy!\n", "extras": {"path": "/workspace/bad.txt"}}]
 
 
 ## Format
 ## Format
 Your response MUST be in JSON format. It must be an object, and it must contain two fields:
 Your response MUST be in JSON format. It must be an object, and it must contain two fields:

+ 0 - 4
tests/integration/mock/eventstream_runtime/DelegatorAgent/test_edits/prompt_004.log

@@ -1,7 +1,3 @@
-
-
-----------
-
 # Task
 # Task
 You are a software engineer. You've inherited an existing codebase, which you
 You are a software engineer. You've inherited an existing codebase, which you
 need to modify to complete this task:
 need to modify to complete this task:

+ 0 - 4
tests/integration/mock/eventstream_runtime/DelegatorAgent/test_edits/prompt_005.log

@@ -1,7 +1,3 @@
-
-
-----------
-
 # Task
 # Task
 You are a software engineer. You've inherited an existing codebase, which you
 You are a software engineer. You've inherited an existing codebase, which you
 need to modify to complete this task:
 need to modify to complete this task:

+ 0 - 4
tests/integration/mock/eventstream_runtime/DelegatorAgent/test_edits/prompt_006.log

@@ -1,7 +1,3 @@
-
-
-----------
-
 # Task
 # Task
 You are a software engineer. You've inherited an existing codebase, which you
 You are a software engineer. You've inherited an existing codebase, which you
 need to modify to complete this task:
 need to modify to complete this task:

+ 0 - 4
tests/integration/mock/eventstream_runtime/DelegatorAgent/test_edits/prompt_007.log

@@ -1,7 +1,3 @@
-
-
-----------
-
 # Task
 # Task
 You are a quality assurance engineer. Another engineer has made changes to the
 You are a quality assurance engineer. Another engineer has made changes to the
 codebase which are supposed to solve this task:
 codebase which are supposed to solve this task:

+ 0 - 4
tests/integration/mock/eventstream_runtime/DelegatorAgent/test_edits/prompt_008.log

@@ -1,7 +1,3 @@
-
-
-----------
-
 # Task
 # Task
 You are a quality assurance engineer. Another engineer has made changes to the
 You are a quality assurance engineer. Another engineer has made changes to the
 codebase which are supposed to solve this task:
 codebase which are supposed to solve this task:

+ 0 - 4
tests/integration/mock/eventstream_runtime/DelegatorAgent/test_write_simple_script/prompt_001.log

@@ -1,7 +1,3 @@
-
-
-----------
-
 # Task
 # Task
 You are a software architect. Your team has inherited an existing codebase, and
 You are a software architect. Your team has inherited an existing codebase, and
 need to finish a project:
 need to finish a project:

+ 1 - 5
tests/integration/mock/eventstream_runtime/DelegatorAgent/test_write_simple_script/prompt_002.log

@@ -1,7 +1,3 @@
-
-
-----------
-
 # Task
 # Task
 You are a software architect. Your team has inherited an existing codebase, and
 You are a software architect. Your team has inherited an existing codebase, and
 need to finish a project:
 need to finish a project:
@@ -41,7 +37,7 @@ as well as observations you've made. This only includes the MOST RECENT
 actions and observations--more may have happened before that.
 actions and observations--more may have happened before that.
 They are time-ordered, with your most recent action at the bottom.
 They are time-ordered, with your most recent action at the bottom.
 
 
-[{"source": "agent", "action": "run", "args": {"command": "ls", "thought": "", "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "ls", "exit_code": 0}}]
+[{"source": "agent", "action": "run", "args": {"command": "ls", "thought": "", "blocking": false, "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "ls", "exit_code": 0}}]
 
 
 ## Format
 ## Format
 Your response MUST be in JSON format. It must be an object, and it must contain two fields:
 Your response MUST be in JSON format. It must be an object, and it must contain two fields:

+ 0 - 4
tests/integration/mock/eventstream_runtime/DelegatorAgent/test_write_simple_script/prompt_003.log

@@ -1,7 +1,3 @@
-
-
-----------
-
 # Task
 # Task
 You are a software engineer. You've inherited an existing codebase, which you
 You are a software engineer. You've inherited an existing codebase, which you
 need to modify to complete this task:
 need to modify to complete this task:

+ 0 - 4
tests/integration/mock/eventstream_runtime/DelegatorAgent/test_write_simple_script/prompt_004.log

@@ -1,7 +1,3 @@
-
-
-----------
-
 # Task
 # Task
 You are a software engineer. You've inherited an existing codebase, which you
 You are a software engineer. You've inherited an existing codebase, which you
 need to modify to complete this task:
 need to modify to complete this task:

+ 1 - 5
tests/integration/mock/eventstream_runtime/DelegatorAgent/test_write_simple_script/prompt_005.log

@@ -1,7 +1,3 @@
-
-
-----------
-
 # Task
 # Task
 You are a software engineer. You've inherited an existing codebase, which you
 You are a software engineer. You've inherited an existing codebase, which you
 need to modify to complete this task:
 need to modify to complete this task:
@@ -41,7 +37,7 @@ as well as observations you've made. This only includes the MOST RECENT
 actions and observations--more may have happened before that.
 actions and observations--more may have happened before that.
 They are time-ordered, with your most recent action at the bottom.
 They are time-ordered, with your most recent action at the bottom.
 
 
-[{"source": "agent", "action": "write", "args": {"path": "hello.sh", "content": "#!/bin/bash\n\necho 'hello'", "start": 0, "end": -1, "thought": ""}}, {"source": "agent", "observation": "write", "content": "", "extras": {"path": "/workspace/hello.sh"}}, {"source": "agent", "action": "run", "args": {"command": "chmod +x hello.sh", "thought": "", "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "chmod +x hello.sh", "exit_code": 0}}]
+[{"source": "agent", "action": "write", "args": {"path": "hello.sh", "content": "#!/bin/bash\n\necho 'hello'", "start": 0, "end": -1, "thought": ""}}, {"source": "agent", "observation": "write", "content": "", "extras": {"path": "/workspace/hello.sh"}}, {"source": "agent", "action": "run", "args": {"command": "chmod +x hello.sh", "thought": "", "blocking": false, "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "chmod +x hello.sh", "exit_code": 0}}]
 
 
 ## Format
 ## Format
 Your response MUST be in JSON format. It must be an object, and it must contain two fields:
 Your response MUST be in JSON format. It must be an object, and it must contain two fields:

+ 1 - 5
tests/integration/mock/eventstream_runtime/DelegatorAgent/test_write_simple_script/prompt_006.log

@@ -1,7 +1,3 @@
-
-
-----------
-
 # Task
 # Task
 You are a software engineer. You've inherited an existing codebase, which you
 You are a software engineer. You've inherited an existing codebase, which you
 need to modify to complete this task:
 need to modify to complete this task:
@@ -41,7 +37,7 @@ as well as observations you've made. This only includes the MOST RECENT
 actions and observations--more may have happened before that.
 actions and observations--more may have happened before that.
 They are time-ordered, with your most recent action at the bottom.
 They are time-ordered, with your most recent action at the bottom.
 
 
-[{"source": "agent", "action": "write", "args": {"path": "hello.sh", "content": "#!/bin/bash\n\necho 'hello'", "start": 0, "end": -1, "thought": ""}}, {"source": "agent", "observation": "write", "content": "", "extras": {"path": "/workspace/hello.sh"}}, {"source": "agent", "action": "run", "args": {"command": "chmod +x hello.sh", "thought": "", "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "chmod +x hello.sh", "exit_code": 0}}, {"source": "agent", "action": "run", "args": {"command": "./hello.sh", "thought": "", "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "hello\r\n\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "./hello.sh", "exit_code": 0}}]
+[{"source": "agent", "action": "write", "args": {"path": "hello.sh", "content": "#!/bin/bash\n\necho 'hello'", "start": 0, "end": -1, "thought": ""}}, {"source": "agent", "observation": "write", "content": "", "extras": {"path": "/workspace/hello.sh"}}, {"source": "agent", "action": "run", "args": {"command": "chmod +x hello.sh", "thought": "", "blocking": false, "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "chmod +x hello.sh", "exit_code": 0}}, {"source": "agent", "action": "run", "args": {"command": "./hello.sh", "thought": "", "blocking": false, "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "hello\r\n\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "./hello.sh", "exit_code": 0}}]
 
 
 ## Format
 ## Format
 Your response MUST be in JSON format. It must be an object, and it must contain two fields:
 Your response MUST be in JSON format. It must be an object, and it must contain two fields:

+ 0 - 4
tests/integration/mock/eventstream_runtime/DelegatorAgent/test_write_simple_script/prompt_007.log

@@ -1,7 +1,3 @@
-
-
-----------
-
 # Task
 # Task
 You are a quality assurance engineer. Another engineer has made changes to the
 You are a quality assurance engineer. Another engineer has made changes to the
 codebase which are supposed to solve this task:
 codebase which are supposed to solve this task:

+ 0 - 4
tests/integration/mock/eventstream_runtime/DelegatorAgent/test_write_simple_script/prompt_008.log

@@ -1,7 +1,3 @@
-
-
-----------
-
 # Task
 # Task
 You are a quality assurance engineer. Another engineer has made changes to the
 You are a quality assurance engineer. Another engineer has made changes to the
 codebase which are supposed to solve this task:
 codebase which are supposed to solve this task:

+ 1 - 5
tests/integration/mock/eventstream_runtime/DelegatorAgent/test_write_simple_script/prompt_009.log

@@ -1,7 +1,3 @@
-
-
-----------
-
 # Task
 # Task
 You are a quality assurance engineer. Another engineer has made changes to the
 You are a quality assurance engineer. Another engineer has made changes to the
 codebase which are supposed to solve this task:
 codebase which are supposed to solve this task:
@@ -39,7 +35,7 @@ as well as observations you've made. This only includes the MOST RECENT
 actions and observations--more may have happened before that.
 actions and observations--more may have happened before that.
 They are time-ordered, with your most recent action at the bottom.
 They are time-ordered, with your most recent action at the bottom.
 
 
-[{"source": "agent", "action": "read", "args": {"path": "hello.sh", "start": 0, "end": -1, "thought": ""}}, {"source": "agent", "observation": "read", "content": "#!/bin/bash\n\necho 'hello'\n", "extras": {"path": "/workspace/hello.sh"}}, {"source": "agent", "action": "run", "args": {"command": "bash hello.sh", "thought": "", "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "hello\r\n\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "bash hello.sh", "exit_code": 0}}]
+[{"source": "agent", "action": "read", "args": {"path": "hello.sh", "start": 0, "end": -1, "thought": ""}}, {"source": "agent", "observation": "read", "content": "#!/bin/bash\n\necho 'hello'\n", "extras": {"path": "/workspace/hello.sh"}}, {"source": "agent", "action": "run", "args": {"command": "bash hello.sh", "thought": "", "blocking": false, "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "hello\r\n\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "bash hello.sh", "exit_code": 0}}]
 
 
 ## Format
 ## Format
 Your response MUST be in JSON format. It must be an object, and it must contain two fields:
 Your response MUST be in JSON format. It must be an object, and it must contain two fields:

+ 0 - 4
tests/integration/mock/eventstream_runtime/ManagerAgent/test_simple_task_rejection/prompt_001.log

@@ -1,7 +1,3 @@
-
-
-----------
-
 # Task
 # Task
 You are in charge of accomplishing the following task:
 You are in charge of accomplishing the following task:
 Write a git commit message for the current staging area. Do not ask me for confirmation at any point.
 Write a git commit message for the current staging area. Do not ask me for confirmation at any point.

+ 0 - 4
tests/integration/mock/eventstream_runtime/ManagerAgent/test_simple_task_rejection/prompt_002.log

@@ -1,7 +1,3 @@
-
-
-----------
-
 # Task
 # Task
 You are a responsible software engineer and always write good commit messages.
 You are a responsible software engineer and always write good commit messages.
 
 

+ 1 - 5
tests/integration/mock/eventstream_runtime/ManagerAgent/test_simple_task_rejection/prompt_003.log

@@ -1,7 +1,3 @@
-
-
-----------
-
 # Task
 # Task
 You are a responsible software engineer and always write good commit messages.
 You are a responsible software engineer and always write good commit messages.
 
 
@@ -28,7 +24,7 @@ as well as observations you've made. This only includes the MOST RECENT
 actions and observations--more may have happened before that.
 actions and observations--more may have happened before that.
 They are time-ordered, with your most recent action at the bottom.
 They are time-ordered, with your most recent action at the bottom.
 
 
-[{"source": "agent", "action": "run", "args": {"command": "git status", "thought": "", "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "fatal: not a git repository (or any parent up to mount point /)\r\nStopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).\r\n\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "git status", "exit_code": 128}}]
+[{"source": "agent", "action": "run", "args": {"command": "git status", "thought": "", "blocking": false, "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "fatal: not a git repository (or any parent up to mount point /)\r\nStopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).\r\n\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "git status", "exit_code": 128}}]
 
 
 If the last item in the history is an error, you should try to fix it.
 If the last item in the history is an error, you should try to fix it.
 
 

+ 1 - 5
tests/integration/mock/eventstream_runtime/ManagerAgent/test_simple_task_rejection/prompt_004.log

@@ -1,7 +1,3 @@
-
-
-----------
-
 # Task
 # Task
 You are a responsible software engineer and always write good commit messages.
 You are a responsible software engineer and always write good commit messages.
 
 
@@ -28,7 +24,7 @@ as well as observations you've made. This only includes the MOST RECENT
 actions and observations--more may have happened before that.
 actions and observations--more may have happened before that.
 They are time-ordered, with your most recent action at the bottom.
 They are time-ordered, with your most recent action at the bottom.
 
 
-[{"source": "agent", "action": "run", "args": {"command": "git status", "thought": "", "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "fatal: not a git repository (or any parent up to mount point /)\r\nStopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).\r\n\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "git status", "exit_code": 128}}, {"source": "agent", "observation": "error", "content": "action={'action': 'reject', 'args': {'reason': 'Not a valid git repository.'}} has the wrong arguments", "extras": {}}]
+[{"source": "agent", "action": "run", "args": {"command": "git status", "thought": "", "blocking": false, "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "fatal: not a git repository (or any parent up to mount point /)\r\nStopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).\r\n\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "git status", "exit_code": 128}}, {"source": "agent", "observation": "error", "content": "action={'action': 'reject', 'args': {'reason': 'Not a valid git repository.'}} has the wrong arguments", "extras": {}}]
 
 
 If the last item in the history is an error, you should try to fix it.
 If the last item in the history is an error, you should try to fix it.
 
 

+ 1 - 5
tests/integration/mock/eventstream_runtime/ManagerAgent/test_simple_task_rejection/prompt_005.log

@@ -1,7 +1,3 @@
-
-
-----------
-
 # Task
 # Task
 You are a responsible software engineer and always write good commit messages.
 You are a responsible software engineer and always write good commit messages.
 
 
@@ -28,7 +24,7 @@ as well as observations you've made. This only includes the MOST RECENT
 actions and observations--more may have happened before that.
 actions and observations--more may have happened before that.
 They are time-ordered, with your most recent action at the bottom.
 They are time-ordered, with your most recent action at the bottom.
 
 
-[{"source": "agent", "action": "run", "args": {"command": "git status", "thought": "", "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "fatal: not a git repository (or any parent up to mount point /)\r\nStopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).\r\n\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "git status", "exit_code": 128}}, {"source": "agent", "observation": "error", "content": "action={'action': 'reject', 'args': {'reason': 'Not a valid git repository.'}} has the wrong arguments", "extras": {}}, {"source": "agent", "observation": "error", "content": "action={'action': 'reject', 'args': {'reason': 'Not a valid git repository.'}} has the wrong arguments", "extras": {}}]
+[{"source": "agent", "action": "run", "args": {"command": "git status", "thought": "", "blocking": false, "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "fatal: not a git repository (or any parent up to mount point /)\r\nStopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).\r\n\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "git status", "exit_code": 128}}, {"source": "agent", "observation": "error", "content": "action={'action': 'reject', 'args': {'reason': 'Not a valid git repository.'}} has the wrong arguments", "extras": {}}, {"source": "agent", "observation": "error", "content": "action={'action': 'reject', 'args': {'reason': 'Not a valid git repository.'}} has the wrong arguments", "extras": {}}]
 
 
 If the last item in the history is an error, you should try to fix it.
 If the last item in the history is an error, you should try to fix it.
 
 

+ 1 - 5
tests/integration/mock/eventstream_runtime/ManagerAgent/test_simple_task_rejection/prompt_006.log

@@ -1,7 +1,3 @@
-
-
-----------
-
 # Task
 # Task
 You are a responsible software engineer and always write good commit messages.
 You are a responsible software engineer and always write good commit messages.
 
 
@@ -28,7 +24,7 @@ as well as observations you've made. This only includes the MOST RECENT
 actions and observations--more may have happened before that.
 actions and observations--more may have happened before that.
 They are time-ordered, with your most recent action at the bottom.
 They are time-ordered, with your most recent action at the bottom.
 
 
-[{"source": "agent", "action": "run", "args": {"command": "git status", "thought": "", "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "fatal: not a git repository (or any parent up to mount point /)\r\nStopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).\r\n\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "git status", "exit_code": 128}}, {"source": "agent", "observation": "error", "content": "action={'action': 'reject', 'args': {'reason': 'Not a valid git repository.'}} has the wrong arguments", "extras": {}}, {"source": "agent", "observation": "error", "content": "action={'action': 'reject', 'args': {'reason': 'Not a valid git repository.'}} has the wrong arguments", "extras": {}}, {"source": "agent", "observation": "error", "content": "action={'action': 'reject', 'args': {'reason': 'Not a valid git repository.'}} has the wrong arguments", "extras": {}}]
+[{"source": "agent", "action": "run", "args": {"command": "git status", "thought": "", "blocking": false, "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "fatal: not a git repository (or any parent up to mount point /)\r\nStopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).\r\n\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "git status", "exit_code": 128}}, {"source": "agent", "observation": "error", "content": "action={'action': 'reject', 'args': {'reason': 'Not a valid git repository.'}} has the wrong arguments", "extras": {}}, {"source": "agent", "observation": "error", "content": "action={'action': 'reject', 'args': {'reason': 'Not a valid git repository.'}} has the wrong arguments", "extras": {}}, {"source": "agent", "observation": "error", "content": "action={'action': 'reject', 'args': {'reason': 'Not a valid git repository.'}} has the wrong arguments", "extras": {}}]
 
 
 If the last item in the history is an error, you should try to fix it.
 If the last item in the history is an error, you should try to fix it.
 
 

+ 1 - 5
tests/integration/mock/eventstream_runtime/ManagerAgent/test_simple_task_rejection/prompt_007.log

@@ -1,7 +1,3 @@
-
-
-----------
-
 # Task
 # Task
 You are a responsible software engineer and always write good commit messages.
 You are a responsible software engineer and always write good commit messages.
 
 
@@ -28,7 +24,7 @@ as well as observations you've made. This only includes the MOST RECENT
 actions and observations--more may have happened before that.
 actions and observations--more may have happened before that.
 They are time-ordered, with your most recent action at the bottom.
 They are time-ordered, with your most recent action at the bottom.
 
 
-[{"source": "agent", "action": "run", "args": {"command": "git status", "thought": "", "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "fatal: not a git repository (or any parent up to mount point /)\r\nStopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).\r\n\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "git status", "exit_code": 128}}, {"source": "agent", "observation": "error", "content": "action={'action': 'reject', 'args': {'reason': 'Not a valid git repository.'}} has the wrong arguments", "extras": {}}, {"source": "agent", "observation": "error", "content": "action={'action': 'reject', 'args': {'reason': 'Not a valid git repository.'}} has the wrong arguments", "extras": {}}, {"source": "agent", "observation": "error", "content": "action={'action': 'reject', 'args': {'reason': 'Not a valid git repository.'}} has the wrong arguments", "extras": {}}, {"source": "agent", "observation": "error", "content": "action={'action': 'reject', 'args': {'reason': 'Not a valid git repository.'}} has the wrong arguments", "extras": {}}]
+[{"source": "agent", "action": "run", "args": {"command": "git status", "thought": "", "blocking": false, "keep_prompt": true, "is_confirmed": "confirmed"}}, {"source": "agent", "observation": "run", "content": "fatal: not a git repository (or any parent up to mount point /)\r\nStopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).\r\n\r\n[Python Interpreter: /openhands/poetry/openhands-ai-5O4_aCHf-py3.11/bin/python]\nopenhands@docker-desktop:/workspace $ ", "extras": {"command_id": -1, "command": "git status", "exit_code": 128}}, {"source": "agent", "observation": "error", "content": "action={'action': 'reject', 'args': {'reason': 'Not a valid git repository.'}} has the wrong arguments", "extras": {}}, {"source": "agent", "observation": "error", "content": "action={'action': 'reject', 'args': {'reason': 'Not a valid git repository.'}} has the wrong arguments", "extras": {}}, {"source": "agent", "observation": "error", "content": "action={'action': 'reject', 'args': {'reason': 'Not a valid git repository.'}} has the wrong arguments", "extras": {}}, {"source": "agent", "observation": "error", "content": "action={'action': 'reject', 'args': {'reason': 'Not a valid git repository.'}} has the wrong arguments", "extras": {}}]
 
 
 If the last item in the history is an error, you should try to fix it.
 If the last item in the history is an error, you should try to fix it.
 
 

+ 0 - 4
tests/integration/mock/eventstream_runtime/ManagerAgent/test_simple_task_rejection/prompt_008.log

@@ -1,7 +1,3 @@
-
-
-----------
-
 # Task
 # Task
 You are in charge of accomplishing the following task:
 You are in charge of accomplishing the following task:
 Write a git commit message for the current staging area. Do not ask me for confirmation at any point.
 Write a git commit message for the current staging area. Do not ask me for confirmation at any point.

+ 0 - 4
tests/integration/mock/eventstream_runtime/PlannerAgent/test_write_simple_script/prompt_001.log

@@ -1,8 +1,4 @@
 
 
-
-----------
-
-
 # Task
 # Task
 You're a diligent software engineer AI. You can't see, draw, or interact with a
 You're a diligent software engineer AI. You can't see, draw, or interact with a
 browser, but you can read and write files, and you can run commands, and you can think.
 browser, but you can read and write files, and you can run commands, and you can think.

+ 0 - 4
tests/integration/mock/eventstream_runtime/PlannerAgent/test_write_simple_script/prompt_002.log

@@ -1,8 +1,4 @@
 
 
-
-----------
-
-
 # Task
 # Task
 You're a diligent software engineer AI. You can't see, draw, or interact with a
 You're a diligent software engineer AI. You can't see, draw, or interact with a
 browser, but you can read and write files, and you can run commands, and you can think.
 browser, but you can read and write files, and you can run commands, and you can think.

+ 0 - 4
tests/integration/mock/eventstream_runtime/PlannerAgent/test_write_simple_script/prompt_003.log

@@ -1,8 +1,4 @@
 
 
-
-----------
-
-
 # Task
 # Task
 You're a diligent software engineer AI. You can't see, draw, or interact with a
 You're a diligent software engineer AI. You can't see, draw, or interact with a
 browser, but you can read and write files, and you can run commands, and you can think.
 browser, but you can read and write files, and you can run commands, and you can think.

+ 0 - 4
tests/integration/mock/eventstream_runtime/PlannerAgent/test_write_simple_script/prompt_004.log

@@ -1,8 +1,4 @@
 
 
-
-----------
-
-
 # Task
 # Task
 You're a diligent software engineer AI. You can't see, draw, or interact with a
 You're a diligent software engineer AI. You can't see, draw, or interact with a
 browser, but you can read and write files, and you can run commands, and you can think.
 browser, but you can read and write files, and you can run commands, and you can think.

+ 0 - 4
tests/integration/mock/eventstream_runtime/PlannerAgent/test_write_simple_script/prompt_005.log

@@ -1,8 +1,4 @@
 
 
-
-----------
-
-
 # Task
 # Task
 You're a diligent software engineer AI. You can't see, draw, or interact with a
 You're a diligent software engineer AI. You can't see, draw, or interact with a
 browser, but you can read and write files, and you can run commands, and you can think.
 browser, but you can read and write files, and you can run commands, and you can think.

+ 0 - 4
tests/integration/mock/eventstream_runtime/PlannerAgent/test_write_simple_script/prompt_006.log

@@ -1,8 +1,4 @@
 
 
-
-----------
-
-
 # Task
 # Task
 You're a diligent software engineer AI. You can't see, draw, or interact with a
 You're a diligent software engineer AI. You can't see, draw, or interact with a
 browser, but you can read and write files, and you can run commands, and you can think.
 browser, but you can read and write files, and you can run commands, and you can think.

+ 0 - 4
tests/integration/mock/eventstream_runtime/PlannerAgent/test_write_simple_script/prompt_007.log

@@ -1,8 +1,4 @@
 
 
-
-----------
-
-
 # Task
 # Task
 You're a diligent software engineer AI. You can't see, draw, or interact with a
 You're a diligent software engineer AI. You can't see, draw, or interact with a
 browser, but you can read and write files, and you can run commands, and you can think.
 browser, but you can read and write files, and you can run commands, and you can think.

+ 0 - 4
tests/integration/mock/eventstream_runtime/PlannerAgent/test_write_simple_script/prompt_008.log

@@ -1,8 +1,4 @@
 
 
-
-----------
-
-
 # Task
 # Task
 You're a diligent software engineer AI. You can't see, draw, or interact with a
 You're a diligent software engineer AI. You can't see, draw, or interact with a
 browser, but you can read and write files, and you can run commands, and you can think.
 browser, but you can read and write files, and you can run commands, and you can think.

+ 0 - 4
tests/integration/mock/eventstream_runtime/PlannerAgent/test_write_simple_script/prompt_009.log

@@ -1,8 +1,4 @@
 
 
-
-----------
-
-
 # Task
 # Task
 You're a diligent software engineer AI. You can't see, draw, or interact with a
 You're a diligent software engineer AI. You can't see, draw, or interact with a
 browser, but you can read and write files, and you can run commands, and you can think.
 browser, but you can read and write files, and you can run commands, and you can think.

+ 1 - 4
tests/integration/mock/eventstream_runtime/PlannerAgent/test_write_simple_script/prompt_010.log

@@ -1,8 +1,4 @@
 
 
-
-----------
-
-
 # Task
 # Task
 You're a diligent software engineer AI. You can't see, draw, or interact with a
 You're a diligent software engineer AI. You can't see, draw, or interact with a
 browser, but you can read and write files, and you can run commands, and you can think.
 browser, but you can read and write files, and you can run commands, and you can think.
@@ -198,6 +194,7 @@ ten actions--more happened before that.
     "args": {
     "args": {
       "command": "bash hello.sh",
       "command": "bash hello.sh",
       "thought": "",
       "thought": "",
+      "blocking": false,
       "keep_prompt": true,
       "keep_prompt": true,
       "is_confirmed": "confirmed"
       "is_confirmed": "confirmed"
     }
     }

+ 1 - 4
tests/integration/mock/eventstream_runtime/PlannerAgent/test_write_simple_script/prompt_011.log

@@ -1,8 +1,4 @@
 
 
-
-----------
-
-
 # Task
 # Task
 You're a diligent software engineer AI. You can't see, draw, or interact with a
 You're a diligent software engineer AI. You can't see, draw, or interact with a
 browser, but you can read and write files, and you can run commands, and you can think.
 browser, but you can read and write files, and you can run commands, and you can think.
@@ -197,6 +193,7 @@ ten actions--more happened before that.
     "args": {
     "args": {
       "command": "bash hello.sh",
       "command": "bash hello.sh",
       "thought": "",
       "thought": "",
+      "blocking": false,
       "keep_prompt": true,
       "keep_prompt": true,
       "is_confirmed": "confirmed"
       "is_confirmed": "confirmed"
     }
     }

+ 1 - 0
tests/unit/test_action_serialization.py

@@ -84,6 +84,7 @@ def test_cmd_run_action_serialization_deserialization():
     original_action_dict = {
     original_action_dict = {
         'action': 'run',
         'action': 'run',
         'args': {
         'args': {
+            'blocking': False,
             'command': 'echo "Hello world"',
             'command': 'echo "Hello world"',
             'thought': '',
             'thought': '',
             'keep_prompt': True,
             'keep_prompt': True,

+ 1 - 0
tests/unit/test_security.py

@@ -219,6 +219,7 @@ def test_unsafe_bash_command(temp_dir: str):
                     function=Function(
                     function=Function(
                         name=ActionType.RUN,
                         name=ActionType.RUN,
                         arguments={
                         arguments={
+                            'blocking': False,
                             'command': 'ls',
                             'command': 'ls',
                             'keep_prompt': True,
                             'keep_prompt': True,
                             'is_confirmed': ActionConfirmationStatus.CONFIRMED,
                             'is_confirmed': ActionConfirmationStatus.CONFIRMED,