před 1 rokem · b7061f4497
--- a/opendevin/core/config.py
+++ b/opendevin/core/config.py
@@ -154,6 +154,9 @@ class SandboxConfig(metaclass=Singleton):
 
				         initialize_plugins: Whether to initialize plugins.
			
 
				         update_source_code: Whether to update the source code in the EventStreamRuntime.
			
 
				             Used for development of EventStreamRuntime.
			
 
				+        browsergym_eval_env: The BrowserGym environment to use for evaluation.
			
 
				+            Default is None for general purpose browsing. Check evaluation/miniwob and evaluation/webarena for examples.
			
 
				+
			
 
				     """
			
 
				 
			
 
				     box_type: str = 'ssh'
			
@@ -170,6 +173,7 @@ class SandboxConfig(metaclass=Singleton):
 
				     use_host_network: bool = False
			
 
				     initialize_plugins: bool = True
			
 
				     update_source_code: bool = False
			
 
				+    browsergym_eval_env: str | None = None
			
 
				 
			
 
				     def defaults_to_dict(self) -> dict:
			
 
				         """Serialize fields to a dict for the frontend, including type hints, defaults, and whether it's optional."""
			
--- a/opendevin/events/observation/browse.py
+++ b/opendevin/events/observation/browse.py
@@ -11,7 +11,6 @@ class BrowserOutputObservation(Observation):
 
				 
			
 
				     url: str
			
 
				     screenshot: str = field(repr=False)  # don't show in repr
			
 
				-    status_code: int = 200
			
 
				     error: bool = False
			
 
				     observation: str = ObservationType.BROWSE
			
 
				     # do not include in the memory
			
@@ -34,12 +33,12 @@ class BrowserOutputObservation(Observation):
 
				         return (
			
 
				             '**BrowserOutputObservation**\n'
			
 
				             f'URL: {self.url}\n'
			
 
				-            f'Status code: {self.status_code}\n'
			
 
				             f'Error: {self.error}\n'
			
 
				             f'Open pages: {self.open_pages_urls}\n'
			
 
				             f'Active page index: {self.active_page_index}\n'
			
 
				             f'Last browser action: {self.last_browser_action}\n'
			
 
				             f'Last browser action error: {self.last_browser_action_error}\n'
			
 
				             f'Focused element bid: {self.focused_element_bid}\n'
			
 
				+            f'axTree: {self.axtree_object}\n'
			
 
				             f'CONTENT: {self.content}\n'
			
 
				         )
			
--- a/opendevin/runtime/browser/browser_env.py
+++ b/opendevin/runtime/browser/browser_env.py
@@ -3,7 +3,6 @@ import base64
 
				 import io
			
 
				 import json
			
 
				 import multiprocessing
			
 
				-import os
			
 
				 import time
			
 
				 import uuid
			
 
				 
			
@@ -18,41 +17,24 @@ from PIL import Image
 
				 from opendevin.core.exceptions import BrowserInitException
			
 
				 from opendevin.core.logger import opendevin_logger as logger
			
 
				 
			
 
				+BROWSER_EVAL_GET_GOAL_ACTION = 'GET_EVAL_GOAL'
			
 
				+BROWSER_EVAL_GET_REWARDS_ACTION = 'GET_EVAL_REWARDS'
			
 
				+
			
 
				 
			
 
				 class BrowserEnv:
			
 
				-    def __init__(
			
 
				-        self,
			
 
				-        browsergym_eval: str = '',
			
 
				-        browsergym_eval_save_dir: str = '',
			
 
				-    ):
			
 
				+    def __init__(self, browsergym_eval_env: str | None = None):
			
 
				         self.html_text_converter = self.get_html_text_converter()
			
 
				         self.eval_mode = False
			
 
				         self.eval_dir = ''
			
 
				-        # EVAL only: browsergym_eval and browsergym_eval_save_dir must be provided for evaluation
			
 
				-        self.browsergym_eval = browsergym_eval
			
 
				-        self.browsergym_eval_save_dir = browsergym_eval_save_dir
			
 
				-        if self.browsergym_eval:
			
 
				-            assert (
			
 
				-                self.browsergym_eval_save_dir
			
 
				-            ), 'browsergym_eval_save_dir must be provided for evaluation.'
			
 
				-            self.eval_mode = True
			
 
				-            self.eval_dir = os.path.join(
			
 
				-                self.browsergym_eval_save_dir, self.browsergym_eval.split('/')[1]
			
 
				-            )
			
 
				-            os.makedirs(self.eval_dir, exist_ok=True)
			
 
				+
			
 
				+        # EVAL only: browsergym_eval_env must be provided for evaluation
			
 
				+        self.browsergym_eval_env = browsergym_eval_env
			
 
				+        self.eval_mode = bool(browsergym_eval_env)
			
 
				+
			
 
				         # Initialize browser environment process
			
 
				         multiprocessing.set_start_method('spawn', force=True)
			
 
				         self.browser_side, self.agent_side = multiprocessing.Pipe()
			
 
				 
			
 
				-        try:
			
 
				-            self.original_cwd = os.getcwd()
			
 
				-        except FileNotFoundError:
			
 
				-            logger.warning(
			
 
				-                'Current working directory does not exist. Using /tmp as fallback.'
			
 
				-            )
			
 
				-            self.original_cwd = '/tmp'
			
 
				-            os.chdir('/tmp')
			
 
				-
			
 
				         self.init_browser()
			
 
				         atexit.register(self.close)
			
 
				 
			
@@ -74,17 +56,6 @@ class BrowserEnv:
 
				     )
			
 
				     def init_browser(self):
			
 
				         logger.info('Starting browser env...')
			
 
				-
			
 
				-        # Ensure we're in a valid directory before starting the process
			
 
				-        try:
			
 
				-            os.chdir(self.original_cwd)
			
 
				-            logger.debug(f'Changed back to original directory: {self.original_cwd}')
			
 
				-        except Exception as e:
			
 
				-            logger.error(f'Failed to change to original directory: {e}')
			
 
				-            # If we can't change to the original directory, try to use a known valid directory
			
 
				-            os.chdir('/tmp')
			
 
				-            logger.debug('Changed to /tmp directory as fallback')
			
 
				-
			
 
				         try:
			
 
				             self.process = multiprocessing.Process(target=self.browser_process)
			
 
				             self.process.start()
			
@@ -98,8 +69,17 @@ class BrowserEnv:
 
				 
			
 
				     def browser_process(self):
			
 
				         if self.eval_mode:
			
 
				-            logger.info('Creating browser env for evaluation purpose.')
			
 
				-            env = gym.make(self.browsergym_eval)
			
 
				+            assert self.browsergym_eval_env is not None
			
 
				+            logger.info('Initializing browser env for web browsing evaluation.')
			
 
				+            if 'webarena' in self.browsergym_eval_env:
			
 
				+                import browsergym.webarena  # noqa F401 register webarena tasks as gym environments
			
 
				+            elif 'miniwob' in self.browsergym_eval_env:
			
 
				+                import browsergym.miniwob  # noqa F401 register miniwob tasks as gym environments
			
 
				+            else:
			
 
				+                raise ValueError(
			
 
				+                    f'Unsupported browsergym eval env: {self.browsergym_eval_env}'
			
 
				+                )
			
 
				+            env = gym.make(self.browsergym_eval_env)
			
 
				         else:
			
 
				             env = gym.make(
			
 
				                 'browsergym/openended',
			
@@ -108,20 +88,22 @@ class BrowserEnv:
 
				                 headless=True,
			
 
				                 disable_env_checker=True,
			
 
				             )
			
 
				+
			
 
				         obs, info = env.reset()
			
 
				-        # EVAL only: save the goal into file for evaluation
			
 
				+
			
 
				+        # EVAL ONLY: keep the goal in memory so it can be returned on request
			
 
				+        self.eval_goal = None
			
 
				+        self.eval_rewards: list[float] = []
			
 
				         if self.eval_mode:
			
 
				-            rewards = []  # store rewards if in eval mode
			
 
				-            logger.info(obs['goal'])
			
 
				-            with open(
			
 
				-                os.path.join(self.eval_dir, 'goal.txt'), 'w', encoding='utf-8'
			
 
				-            ) as f:
			
 
				-                f.write(obs['goal'])
			
 
				+            logger.info(f"Browsing goal: {obs['goal']}")
			
 
				+            self.eval_goal = obs['goal']
			
 
				+
			
 
				         logger.info('Browser env started.')
			
 
				         while True:
			
 
				             try:
			
 
				                 if self.browser_side.poll(timeout=0.01):
			
 
				                     unique_request_id, action_data = self.browser_side.recv()
			
 
				+
			
 
				                     # shutdown the browser environment
			
 
				                     if unique_request_id == 'SHUTDOWN':
			
 
				                         logger.info('SHUTDOWN recv, shutting down browser env...')
			
@@ -130,17 +112,29 @@ class BrowserEnv:
 
				                     elif unique_request_id == 'IS_ALIVE':
			
 
				                         self.browser_side.send(('ALIVE', None))
			
 
				                         continue
			
 
				+
			
 
				+                    # EVAL ONLY: Get evaluation info
			
 
				+                    if action_data['action'] == BROWSER_EVAL_GET_GOAL_ACTION:
			
 
				+                        self.browser_side.send(
			
 
				+                            (unique_request_id, {'text_content': self.eval_goal})
			
 
				+                        )
			
 
				+                        continue
			
 
				+                    elif action_data['action'] == BROWSER_EVAL_GET_REWARDS_ACTION:
			
 
				+                        self.browser_side.send(
			
 
				+                            (
			
 
				+                                unique_request_id,
			
 
				+                                {'text_content': json.dumps(self.eval_rewards)},
			
 
				+                            )
			
 
				+                        )
			
 
				+                        continue
			
 
				+
			
 
				                     action = action_data['action']
			
 
				                     obs, reward, terminated, truncated, info = env.step(action)
			
 
				-                    # EVAL only: save the rewards into file for evaluation
			
 
				+
			
 
				+                    # EVAL ONLY: Record the reward in memory so it can be returned on request
			
 
				                     if self.eval_mode:
			
 
				-                        rewards.append(reward)
			
 
				-                        with open(
			
 
				-                            os.path.join(self.eval_dir, 'rewards.json'),
			
 
				-                            'w',
			
 
				-                            encoding='utf-8',
			
 
				-                        ) as f:
			
 
				-                            f.write(json.dumps(rewards))
			
 
				+                        self.eval_rewards.append(reward)
			
 
				+
			
 
				                     # add text content of the page
			
 
				                     html_str = flatten_dom_to_str(obs['dom_object'])
			
 
				                     obs['text_content'] = self.html_text_converter.handle(html_str)
			
@@ -158,6 +152,7 @@ class BrowserEnv:
 
				                 return
			
 
				 
			
 
				     def step(self, action_str: str, timeout: float = 30) -> dict:
			
 
				+        """Execute an action in the browser environment and return the observation."""
			
 
				         unique_request_id = str(uuid.uuid4())
			
 
				         self.agent_side.send((unique_request_id, {'action': action_str}))
			
 
				         start_time = time.time()
			
--- a/opendevin/runtime/browser/utils.py
+++ b/opendevin/runtime/browser/utils.py
@@ -32,21 +32,23 @@ async def browse(
 
				         obs = browser.step(action_str)
			
 
				         return BrowserOutputObservation(
			
 
				             content=obs['text_content'],  # text content of the page
			
 
				-            open_pages_urls=obs['open_pages_urls'],  # list of open pages
			
 
				-            active_page_index=obs['active_page_index'],  # index of the active page
			
 
				-            dom_object=obs['dom_object'],  # DOM object
			
 
				-            axtree_object=obs['axtree_object'],  # accessibility tree object
			
 
				-            extra_element_properties=obs[
			
 
				-                'extra_element_properties'
			
 
				-            ],  # extra element properties
			
 
				-            last_browser_action=obs['last_action'],  # last browser env action performed
			
 
				-            focused_element_bid=obs['focused_element_bid'],  # focused element bid
			
 
				-            screenshot=obs['screenshot'],  # base64-encoded screenshot, png
			
 
				-            url=obs['url'],  # URL of the page
			
 
				-            error=True if obs['last_action_error'] else False,  # error flag
			
 
				-            last_browser_action_error=obs[
			
 
				-                'last_action_error'
			
 
				-            ],  # last browser env action error
			
 
				+            url=obs.get('url', ''),  # URL of the page
			
 
				+            screenshot=obs.get('screenshot', None),  # base64-encoded screenshot, png
			
 
				+            open_pages_urls=obs.get('open_pages_urls', []),  # list of open pages
			
 
				+            active_page_index=obs.get(
			
 
				+                'active_page_index', -1
			
 
				+            ),  # index of the active page
			
 
				+            dom_object=obs.get('dom_object', {}),  # DOM object
			
 
				+            axtree_object=obs.get('axtree_object', {}),  # accessibility tree object
			
 
				+            extra_element_properties=obs.get('extra_element_properties', {}),
			
 
				+            focused_element_bid=obs.get(
			
 
				+                'focused_element_bid', None
			
 
				+            ),  # focused element bid
			
 
				+            last_browser_action=obs.get(
			
 
				+                'last_action', ''
			
 
				+            ),  # last browser env action performed
			
 
				+            last_browser_action_error=obs.get('last_action_error', ''),
			
 
				+            error=True if obs.get('last_action_error', '') else False,  # error flag
			
 
				         )
			
 
				     except Exception as e:
			
 
				         return BrowserOutputObservation(
			
--- a/opendevin/runtime/client/client.py
+++ b/opendevin/runtime/client/client.py
@@ -64,7 +64,12 @@ class RuntimeClient:
 
				     """
			
 
				 
			
 
				     def __init__(
			
 
				-        self, plugins_to_load: list[Plugin], work_dir: str, username: str, user_id: int
			
 
				+        self,
			
 
				+        plugins_to_load: list[Plugin],
			
 
				+        work_dir: str,
			
 
				+        username: str,
			
 
				+        user_id: int,
			
 
				+        browsergym_eval_env: str | None,
			
 
				     ) -> None:
			
 
				         self.plugins_to_load = plugins_to_load
			
 
				         self.username = username
			
@@ -74,7 +79,7 @@ class RuntimeClient:
 
				         self._init_bash_shell(self.pwd, self.username)
			
 
				         self.lock = asyncio.Lock()
			
 
				         self.plugins: dict[str, Plugin] = {}
			
 
				-        self.browser = BrowserEnv()
			
 
				+        self.browser = BrowserEnv(browsergym_eval_env)
			
 
				 
			
 
				     async def ainit(self):
			
 
				         for plugin in self.plugins_to_load:
			
@@ -362,6 +367,12 @@ if __name__ == '__main__':
 
				         '--username', type=str, help='User to run as', default='opendevin'
			
 
				     )
			
 
				     parser.add_argument('--user-id', type=int, help='User ID to run as', default=1000)
			
 
				+    parser.add_argument(
			
 
				+        '--browsergym-eval-env',
			
 
				+        type=str,
			
 
				+        help='BrowserGym environment used for browser evaluation',
			
 
				+        default=None,
			
 
				+    )
			
 
				     # example: python client.py 8000 --working-dir /workspace --plugins JupyterRequirement
			
 
				     args = parser.parse_args()
			
 
				 
			
@@ -382,6 +393,7 @@ if __name__ == '__main__':
 
				             work_dir=args.working_dir,
			
 
				             username=args.username,
			
 
				             user_id=args.user_id,
			
 
				+            browsergym_eval_env=args.browsergym_eval_env,
			
 
				         )
			
 
				         await client.ainit()
			
 
				         yield
			
--- a/opendevin/runtime/client/runtime.py
+++ b/opendevin/runtime/client/runtime.py
@@ -147,6 +147,12 @@ class EventStreamRuntime(Runtime):
 
				 
			
 
				             logger.info(f'run_as_devin: `{self.config.run_as_devin}`')
			
 
				 
			
 
				+            if self.config.sandbox.browsergym_eval_env is not None:
			
 
				+                browsergym_arg = (
			
 
				+                    f'--browsergym-eval-env {self.config.sandbox.browsergym_eval_env}'
			
 
				+                )
			
 
				+            else:
			
 
				+                browsergym_arg = ''
			
 
				             container = self.docker_client.containers.run(
			
 
				                 self.container_image,
			
 
				                 command=(
			
@@ -156,7 +162,8 @@ class EventStreamRuntime(Runtime):
 
				                     f'--working-dir {sandbox_workspace_dir} '
			
 
				                     f'{plugin_arg}'
			
 
				                     f'--username {"opendevin" if self.config.run_as_devin else "root"} '
			
 
				-                    f'--user-id {self.config.sandbox.user_id}'
			
 
				+                    f'--user-id {self.config.sandbox.user_id} '
			
 
				+                    f'{browsergym_arg}'
			
 
				                 ),
			
 
				                 network_mode=network_mode,
			
 
				                 ports=port_mapping,
			
--- a/tests/unit/test_runtime.py
+++ b/tests/unit/test_runtime.py
@@ -1,6 +1,7 @@
 
				 """Test the EventStreamRuntime, which connects to the RuntimeClient running in the sandbox."""
			
 
				 
			
 
				 import asyncio
			
 
				+import json
			
 
				 import os
			
 
				 import tempfile
			
 
				 import time
			
@@ -13,6 +14,7 @@ from opendevin.core.config import AppConfig, SandboxConfig, load_from_env
 
				 from opendevin.core.logger import opendevin_logger as logger
			
 
				 from opendevin.events import EventStream
			
 
				 from opendevin.events.action import (
			
 
				+    BrowseInteractiveAction,
			
 
				     BrowseURLAction,
			
 
				     CmdRunAction,
			
 
				     FileReadAction,
			
@@ -29,6 +31,7 @@ from opendevin.events.observation import (
 
				 )
			
 
				 from opendevin.runtime.client.runtime import EventStreamRuntime
			
 
				 from opendevin.runtime.plugins import AgentSkillsRequirement, JupyterRequirement
			
 
				+from opendevin.runtime.runtime import Runtime
			
 
				 from opendevin.runtime.server.runtime import ServerRuntime
			
 
				 from opendevin.storage import get_file_store
			
 
				 
			
@@ -95,7 +98,8 @@ async def _load_runtime(
 
				     run_as_devin: bool = True,
			
 
				     enable_auto_lint: bool = False,
			
 
				     container_image: str | None = None,
			
 
				-):
			
 
				+    browsergym_eval_env: str | None = None,
			
 
				+) -> Runtime:
			
 
				     sid = 'test'
			
 
				     cli_session = 'main_test'
			
 
				     # AgentSkills need to be initialized **before** Jupyter
			
@@ -104,7 +108,10 @@ async def _load_runtime(
 
				     config = AppConfig(
			
 
				         workspace_base=temp_dir,
			
 
				         workspace_mount_path=temp_dir,
			
 
				-        sandbox=SandboxConfig(use_host_network=True),
			
 
				+        sandbox=SandboxConfig(
			
 
				+            use_host_network=True,
			
 
				+            browsergym_eval_env=browsergym_eval_env,
			
 
				+        ),
			
 
				     )
			
 
				     load_from_env(config, os.environ)
			
 
				     config.run_as_devin = run_as_devin
			
@@ -120,7 +127,9 @@ async def _load_runtime(
 
				         # NOTE: we will use the default container image specified in the config.sandbox
			
 
				         # if it is an official od_runtime image.
			
 
				         cur_container_image = config.sandbox.container_image
			
 
				-        if 'od_runtime' not in cur_container_image:
			
 
				+        if 'od_runtime' not in cur_container_image and cur_container_image not in {
			
 
				+            'xingyaoww/od-eval-miniwob:v1.0'
			
 
				+        }:  # a special exception list
			
 
				             cur_container_image = 'ubuntu:22.04'
			
 
				             logger.warning(
			
 
				                 f'`{config.sandbox.container_image}` is not an od_runtime image. Will use `{cur_container_image}` as the container image for testing.'
			
@@ -387,7 +396,6 @@ async def test_simple_browse(temp_dir, box_class, run_as_devin):
 
				 
			
 
				     assert isinstance(obs, BrowserOutputObservation)
			
 
				     assert 'http://localhost:8000' in obs.url
			
 
				-    assert obs.status_code == 200
			
 
				     assert not obs.error
			
 
				     assert obs.open_pages_urls == ['http://localhost:8000/']
			
 
				     assert obs.active_page_index == 0
			
@@ -407,6 +415,53 @@ async def test_simple_browse(temp_dir, box_class, run_as_devin):
 
				     await asyncio.sleep(1)
			
 
				 
			
 
				 
			
 
				+@pytest.mark.asyncio
			
 
				+async def test_browsergym_eval_env(temp_dir):
			
 
				+    runtime = await _load_runtime(
			
 
				+        temp_dir,
			
 
				+        # only supported in event stream runtime
			
 
				+        box_class=EventStreamRuntime,
			
 
				+        run_as_devin=False,  # need root permission to access file
			
 
				+        container_image='xingyaoww/od-eval-miniwob:v1.0',
			
 
				+        browsergym_eval_env='browsergym/miniwob.choose-list',
			
 
				+    )
			
 
				+    from opendevin.runtime.browser.browser_env import (
			
 
				+        BROWSER_EVAL_GET_GOAL_ACTION,
			
 
				+        BROWSER_EVAL_GET_REWARDS_ACTION,
			
 
				+    )
			
 
				+
			
 
				+    # Test browse
			
 
				+    action = BrowseInteractiveAction(browser_actions=BROWSER_EVAL_GET_GOAL_ACTION)
			
 
				+    logger.info(action, extra={'msg_type': 'ACTION'})
			
 
				+    obs = await runtime.run_action(action)
			
 
				+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
			
 
				+
			
 
				+    assert isinstance(obs, BrowserOutputObservation)
			
 
				+    assert not obs.error
			
 
				+    assert 'Select' in obs.content
			
 
				+    assert 'from the list and click Submit' in obs.content
			
 
				+
			
 
				+    # Make sure the browser can produce observations in eval mode
			
 
				+    action = BrowseInteractiveAction(browser_actions='noop()')
			
 
				+    logger.info(action, extra={'msg_type': 'ACTION'})
			
 
				+    obs = await runtime.run_action(action)
			
 
				+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
			
 
				+    assert (
			
 
				+        obs.url.strip()
			
 
				+        == 'file:///miniwob-plusplus/miniwob/html/miniwob/choose-list.html'
			
 
				+    )
			
 
				+
			
 
				+    # Make sure the rewards are working
			
 
				+    action = BrowseInteractiveAction(browser_actions=BROWSER_EVAL_GET_REWARDS_ACTION)
			
 
				+    logger.info(action, extra={'msg_type': 'ACTION'})
			
 
				+    obs = await runtime.run_action(action)
			
 
				+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
			
 
				+    assert json.loads(obs.content) == [0.0]
			
 
				+
			
 
				+    await runtime.close()
			
 
				+    await asyncio.sleep(1)
			
 
				+
			
 
				+
			
 
				 @pytest.mark.asyncio
			
 
				 async def test_single_multiline_command(temp_dir, box_class):
			
 
				     runtime = await _load_runtime(temp_dir, box_class)