Просмотр исходного кода

fix: runtime test for mac (#3005)

* move use_host_network to sandbox config

* fix runtime tests

* fix kwargs to make it clearer
Xingyao Wang 1 год назад
Родитель
Commit
ff6ddc831f

+ 6 - 5
config.template.toml

@@ -25,9 +25,6 @@ workspace_base = "./workspace"
 # Disable color in terminal output
 #disable_color = false
 
-# Enable auto linting after editing
-#enable_auto_lint = false
-
 # Enable saving and restoring the session when run from CLI
 #enable_cli_session = false
 
@@ -76,8 +73,6 @@ persist_sandbox = false
 # SSH port for the sandbox
 #ssh_port = 63710
 
-# Use host network
-#use_host_network = false
 
 # Name of the default agent
 #default_agent = "CodeActAgent"
@@ -197,6 +192,12 @@ llm_config = 'gpt3'
 # Container image to use for the sandbox
 #container_image = "ghcr.io/opendevin/sandbox:main"
 
+# Use host network
+#use_host_network = false
+
+# Enable auto linting after editing
+#enable_auto_lint = false
+
 #################################### Eval ####################################
 # Configuration for the evaluation, please refer to the specific evaluation
 # plugin for the available options

+ 4 - 2
evaluation/TUTORIAL.md

@@ -33,13 +33,15 @@ workspace_mount_path = "/path/to/your/workspace"
 
 ssh_hostname = "localhost"
 
+run_as_devin = false
+
+[sandbox]
 # SWEBench eval specific - but you can tweak it to your needs
 use_host_network = false
-run_as_devin = false
 # linting python after editing helps LLM fix indentations
 enable_auto_lint = true
 
-[sandbox]
+
 box_type = "ssh"
 timeout = 120
 

+ 2 - 2
evaluation/agent_bench/README.md

@@ -20,12 +20,12 @@ workspace_mount_path = "/path/to/workspace"
 
 ssh_hostname = "localhost"
 
-use_host_network = false
 # AgentBench specific
 run_as_devin = true
-enable_auto_lint = true
 
 [sandbox]
+use_host_network = false
+enable_auto_lint = true
 box_type = "ssh"
 timeout = 120
 

+ 2 - 2
evaluation/biocoder/biocoder_env_box.py

@@ -217,7 +217,7 @@ class BiocoderSSHBox(DockerSSHBox):
             config.workspace_mount_path = workspace_base
 
             # linting python after editing helps LLM fix indentations
-            config.enable_auto_lint = True
+            config.sandbox.enable_auto_lint = True
 
             # create folder for transferring files back/forth
             biocoder_cache_folder = 'biocoder_cache'
@@ -268,7 +268,7 @@ class BiocoderSSHBox(DockerSSHBox):
                 f.write(json.dumps(testcase_json, indent=4))
 
             # linting python after editing helps LLM fix indentations
-            config.enable_auto_lint = True
+            config.sandbox.enable_auto_lint = True
 
             sandbox = cls(
                 container_image=BIOCODER_BENCH_CONTAINER_IMAGE,

+ 2 - 0
evaluation/bird/README.md

@@ -18,6 +18,8 @@ Add the following configurations:
 max_iterations = 100
 cache_dir = "/tmp/cache"
 ssh_hostname = "localhost"
+
+[sandbox]
 enable_auto_lint = true
 
 # TODO: Change these to the model you want to evaluate

+ 2 - 0
evaluation/gpqa/README.md

@@ -36,6 +36,8 @@ Add the following configurations:
 max_iterations = 100
 cache_dir = "/tmp/cache"
 ssh_hostname = "localhost"
+
+[sandbox]
 enable_auto_lint = true
 
 # TODO: Change these to the model you want to evaluate

+ 2 - 0
evaluation/humanevalfix/README.md

@@ -18,6 +18,8 @@ Add the following configurations:
 max_iterations = 100
 cache_dir = "/tmp/cache"
 ssh_hostname = "localhost"
+
+[sandbox]
 enable_auto_lint = true
 
 # TODO: Change these to the model you want to evaluate

+ 2 - 0
evaluation/logic_reasoning/README.md

@@ -13,6 +13,8 @@ Add the following configurations:
 max_iterations = 100
 cache_dir = "/tmp/cache"
 ssh_hostname = "localhost"
+
+[sandbox]
 enable_auto_lint = true
 
 # TODO: Change these to the model you want to evaluate

+ 4 - 1
evaluation/ml_bench/README.md

@@ -25,10 +25,13 @@ Add the following configurations:
 max_iterations = 100
 cache_dir = "/tmp/cache"
 ssh_hostname = "localhost"
-enable_auto_lint = true
 run_as_devin = false
 sandbox_container_image = "public.ecr.aws/i5g0m1f6/ml-bench" # Use the latest image from the ML-Bench repository
 
+[sandbox]
+enable_auto_lint = true
+
+
 # TODO: Change these to the model you want to evaluate
 [llm.eval_gpt4_1106_preview]
 model = "gpt-4-1106-preview"

+ 4 - 2
evaluation/swe_bench/README.md

@@ -50,11 +50,13 @@ ssh_hostname = "localhost"
 box_type = "ssh"
 timeout = 120
 
+run_as_devin = false
+max_budget_per_task = 4 # 4 USD
+
+[sandbox]
 # SWEBench eval specific
 use_host_network = false
-run_as_devin = false
 enable_auto_lint = true
-max_budget_per_task = 4 # 4 USD
 
 # TODO: Change these to the model you want to evaluate
 [llm.eval_gpt4_1106_preview_llm]

+ 5 - 4
opendevin/core/config.py

@@ -139,7 +139,9 @@ class SandboxConfig(metaclass=Singleton):
         container_image: The container image to use for the sandbox.
         user_id: The user ID for the sandbox.
         timeout: The timeout for the sandbox.
-
+        enable_auto_lint: Whether to enable auto-lint.
+        use_host_network: Whether to use the host network.
+        initialize_plugins: Whether to initialize plugins.
     """
 
     box_type: str = 'ssh'
@@ -153,6 +155,7 @@ class SandboxConfig(metaclass=Singleton):
     enable_auto_lint: bool = (
         False  # once enabled, OpenDevin would lint files after editing
     )
+    use_host_network: bool = False
     initialize_plugins: bool = True
 
     def defaults_to_dict(self) -> dict:
@@ -201,7 +204,6 @@ class AppConfig(metaclass=Singleton):
         max_iterations: The maximum number of iterations.
         max_budget_per_task: The maximum budget allowed per task, beyond which the agent will stop.
         e2b_api_key: The E2B API key.
-        use_host_network: Whether to use the host network.
         ssh_hostname: The SSH hostname.
         disable_color: Whether to disable color. For terminals that don't support color.
         debug: Whether to enable debugging.
@@ -230,7 +232,6 @@ class AppConfig(metaclass=Singleton):
     max_iterations: int = 100
     max_budget_per_task: float | None = None
     e2b_api_key: str = ''
-    use_host_network: bool = False
     ssh_hostname: str = 'localhost'
     disable_color: bool = False
     persist_sandbox: bool = False
@@ -531,7 +532,7 @@ def finalize_config(cfg: AppConfig):
         if llm.embedding_base_url is None:
             llm.embedding_base_url = llm.base_url
 
-    if cfg.use_host_network and platform.system() == 'Darwin':
+    if cfg.sandbox.use_host_network and platform.system() == 'Darwin':
         logger.opendevin_logger.warning(
             'Please upgrade to Docker Desktop 4.29.0 or later to use host network mode on macOS. '
             'See https://github.com/docker/roadmap/issues/238#issuecomment-2044688144 for more information.'

+ 3 - 1
opendevin/runtime/client/client.py

@@ -64,7 +64,9 @@ class RuntimeClient:
         self.__bash_PS1 = r'[PEXPECT_BEGIN] \u@\h:\w [PEXPECT_END]'
 
         # This should NOT match "PS1=\u@\h:\w [PEXPECT]$" when `env` is executed
-        self.__bash_expect_regex = r'\[PEXPECT_BEGIN\] ([a-z_][a-z0-9_-]*)@([a-zA-Z][a-zA-Z0-9.-]*):(.+) \[PEXPECT_END\]'
+        self.__bash_expect_regex = (
+            r'\[PEXPECT_BEGIN\] ([a-z0-9_-]*)@([a-zA-Z0-9.-]*):(.+) \[PEXPECT_END\]'
+        )
 
         self.shell.sendline(f'export PS1="{self.__bash_PS1}"')
         self.shell.expect(self.__bash_expect_regex)

+ 14 - 2
opendevin/runtime/client/runtime.py

@@ -118,6 +118,17 @@ class EventStreamRuntime(Runtime):
             if plugins is None:
                 plugins = []
             plugin_names = ' '.join([plugin.name for plugin in plugins])
+
+            network_mode: str | None = None
+            port_mapping: dict[str, int] | None = None
+            if self.sandbox_config.use_host_network:
+                network_mode = 'host'
+                logger.warn(
+                    'Using host network mode. If you are using MacOS, please make sure you have the latest version of Docker Desktop and enabled host network feature: https://docs.docker.com/network/drivers/host/#docker-desktop'
+                )
+            else:
+                port_mapping = {f'{self._port}/tcp': self._port}
+
             container = self.docker_client.containers.run(
                 self.container_image,
                 command=(
@@ -127,7 +138,8 @@ class EventStreamRuntime(Runtime):
                     f'--working-dir {sandbox_workspace_dir} '
                     f'--plugins {plugin_names}'
                 ),
-                network_mode='host',
+                network_mode=network_mode,
+                ports=port_mapping,
                 working_dir='/opendevin/code/',
                 name=self.container_name,
                 detach=True,
@@ -148,7 +160,7 @@ class EventStreamRuntime(Runtime):
         return self.session
 
     @tenacity.retry(
-        stop=tenacity.stop_after_attempt(5),
+        stop=tenacity.stop_after_attempt(10),
         wait=tenacity.wait_exponential(multiplier=2, min=4, max=600),
     )
     async def _wait_until_alive(self):

+ 1 - 3
opendevin/runtime/docker/ssh_box.py

@@ -120,7 +120,6 @@ class DockerSSHBox(Sandbox):
         workspace_mount_path: str,
         sandbox_workspace_dir: str,
         cache_dir: str,
-        use_host_network: bool,
         run_as_devin: bool,
         ssh_hostname: str = 'host.docker.internal',
         ssh_password: str | None = None,
@@ -131,7 +130,7 @@ class DockerSSHBox(Sandbox):
         self.workspace_mount_path = workspace_mount_path
         self.sandbox_workspace_dir = sandbox_workspace_dir
         self.cache_dir = cache_dir
-        self.use_host_network = use_host_network
+        self.use_host_network = config.use_host_network
         self.run_as_devin = run_as_devin
         logger.info(
             f'SSHBox is running as {"opendevin" if self.run_as_devin else "root"} user with USER_ID={config.user_id} in the sandbox'
@@ -641,7 +640,6 @@ if __name__ == '__main__':
             workspace_mount_path='/path/to/workspace',
             cache_dir='/path/to/cache',
             sandbox_workspace_dir='/sandbox',
-            use_host_network=False,
             persist_sandbox=False,
         )
     except Exception as e:

+ 0 - 1
opendevin/runtime/server/runtime.py

@@ -44,7 +44,6 @@ def create_sandbox(sid: str = 'default', box_type: str = 'ssh') -> Sandbox:
             workspace_mount_path=config.workspace_mount_path,
             sandbox_workspace_dir=config.workspace_mount_path_in_sandbox,
             cache_dir=config.cache_dir,
-            use_host_network=config.use_host_network,
             run_as_devin=config.run_as_devin,
             ssh_hostname=config.ssh_hostname,
             ssh_password=config.ssh_password,

+ 9 - 6
opendevin/runtime/utils/runtime_build.py

@@ -63,8 +63,10 @@ def _generate_dockerfile(
         dockerfile_content = (
             f'FROM {base_image}\n'
             # FIXME: make this more generic / cross-platform
-            'RUN apt update && apt install -y wget sudo\n'
-            'RUN apt-get update && apt-get install -y libgl1-mesa-glx\n'  # Extra dependency for OpenCV
+            # Install necessary packages
+            # libgl1-mesa-glx is extra dependency for OpenCV
+            'RUN apt-get update && apt-get install -y wget sudo libgl1-mesa-glx\n'
+            'RUN apt-get clean && rm -rf /var/lib/apt/lists/*\n'  # Clean up the apt cache to reduce image size
             'RUN mkdir -p /opendevin && mkdir -p /opendevin/logs && chmod 777 /opendevin/logs\n'
             'RUN echo "" > /opendevin/bash.bashrc\n'
             'RUN if [ ! -d /opendevin/miniforge3 ]; then \\\n'
@@ -150,13 +152,14 @@ def _build_sandbox_image(
                 else:
                     logger.info(str(log))
 
+        # check if the image is built successfully
+        image = docker_client.images.get(target_image_name)
+        if image is None:
+            raise RuntimeError(f'Build failed: Image {target_image_name} not found')
         logger.info(f'Image {target_image_name} built successfully')
     except docker.errors.BuildError as e:
         logger.error(f'Sandbox image build failed: {e}')
         raise e
-    except Exception as e:
-        logger.error(f'An error occurred during sandbox image build: {e}')
-        raise e
 
 
 def _get_new_image_name(base_image: str, dev_mode: bool = False) -> str:
@@ -200,7 +203,7 @@ def build_runtime_image(
         docker_client.images.pull(new_image_name)
     except Exception as e:
         logger.info(f'Error pulling image {new_image_name}, building it from scratch')
-        logger.error(f'Error: {e}')
+        logger.info(f'Non-fatal error: {e}')
 
     # Detect if the sandbox image is built
     image_exists = _check_image_exists(new_image_name, docker_client)

+ 0 - 1
tests/unit/test_ipython.py

@@ -90,7 +90,6 @@ def test_sandbox_jupyter_plugin_backticks(temp_dir):
             workspace_mount_path=config.workspace_mount_path,
             sandbox_workspace_dir=config.workspace_mount_path_in_sandbox,
             cache_dir=config.cache_dir,
-            use_host_network=config.use_host_network,
             run_as_devin=config.run_as_devin,
             ssh_hostname=config.ssh_hostname,
             ssh_password=config.ssh_password,

+ 8 - 1
tests/unit/test_runtime.py

@@ -29,7 +29,9 @@ def temp_dir(monkeypatch):
 
 
 async def _load_runtime(box_class, event_stream, plugins, sid):
-    sandbox_config = SandboxConfig()
+    sandbox_config = SandboxConfig(
+        use_host_network=False,
+    )
     if box_class == EventStreamRuntime:
         runtime = EventStreamRuntime(
             sandbox_config=sandbox_config,
@@ -85,6 +87,8 @@ async def test_env_vars_os_environ():
                 obs.content.strip().split('\n\r')[0].strip() == 'BAZ'
             ), f'Output: [{obs.content}] for {box_class}'
 
+            await runtime.close()
+
 
 @pytest.mark.asyncio
 async def test_env_vars_runtime_add_env_var():
@@ -105,6 +109,7 @@ async def test_env_vars_runtime_add_env_var():
         assert (
             obs.content.strip().split('\r\n')[0].strip() == 'abc"def'
         ), f'Output: [{obs.content}] for {box_class}'
+        await runtime.close()
 
 
 @pytest.mark.asyncio
@@ -126,6 +131,7 @@ async def test_env_vars_runtime_add_multiple_env_vars():
         assert (
             obs.content.strip().split('\r\n')[0].strip() == 'abc"def xyz'
         ), f'Output: [{obs.content}] for {box_class}'
+        await runtime.close()
 
 
 @pytest.mark.asyncio
@@ -148,6 +154,7 @@ async def test_env_vars_runtime_add_env_var_overwrite():
             assert (
                 obs.content.strip().split('\r\n')[0].strip() == 'xyz'
             ), f'Output: [{obs.content}] for {box_class}'
+            await runtime.close()
 
 
 @pytest.mark.asyncio

+ 0 - 1
tests/unit/test_sandbox.py

@@ -26,7 +26,6 @@ def create_docker_box_from_app_config(
         workspace_mount_path=path,
         sandbox_workspace_dir=config.workspace_mount_path_in_sandbox,
         cache_dir=config.cache_dir,
-        use_host_network=config.use_host_network,
         run_as_devin=True,
         ssh_hostname=config.ssh_hostname,
         ssh_password=config.ssh_password,