Ver código fonte

[feat] Integrate BrowserGym (#1452)

* add a single-threaded server serving browsergym

* update poetry

* update browser page content

* add import to make sure browsergym environments are registered properly

* remove flask server, use multiprocess impl and Pipe

* fix

* refactor BrowserEnv

* update browser action and obs to include more complete info

* fix screenshot

* update poetry lock

* add playwright install to workflow

* update

* add better html to text conversion

* update for better text conversion to maintain parity with the current handling of browseurlaction

* update

* update poetry

* update multiprocessing mp

* fix multiprocessing

* update

* update github workflow

---------

Co-authored-by: Xingyao Wang <xingyao6@illinois.edu>
Frank Xu 1 ano atrás
pai
commit
836864fa88

+ 1 - 0
.github/workflows/dummy-agent-test.yml

@@ -15,6 +15,7 @@ jobs:
         run: |
           curl -sSL https://install.python-poetry.org | python3 -
           poetry install --without evaluation
+          poetry run playwright install --with-deps chromium
           wget https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/1_Pooling/config.json -P /tmp/llama_index/models--BAAI--bge-small-en-v1.5/snapshots/5c38ec7c405ec4b44b94cc5a9bb96e735b38267a/1_Pooling/
       - name: Run tests
         run: |

+ 1 - 1
Makefile

@@ -159,7 +159,7 @@ build-frontend:
 # Start backend
 start-backend:
 	@echo "$(YELLOW)Starting backend...$(RESET)"
-	@poetry run uvicorn opendevin.server.listen:app --port $(BACKEND_PORT) --reload --reload-exclude workspace/*
+	@poetry run uvicorn opendevin.server.listen:app --port $(BACKEND_PORT) --reload --reload-exclude "workspace/*"
 
 # Start frontend
 start-frontend:

+ 20 - 0
docs/modules/python/opendevin/browser/browser_env.md

@@ -0,0 +1,20 @@
+---
+sidebar_label: browser_env
+title: opendevin.browser.browser_env
+---
+
+## BrowserEnv Objects
+
+```python
+class BrowserEnv()
+```
+
+#### image\_to\_png\_base64\_url
+
+```python
+@staticmethod
+def image_to_png_base64_url(image: np.ndarray | Image.Image)
+```
+
+Convert a numpy array or PIL image to a base64-encoded PNG string (the bare base64 payload; no `data:` URL prefix is added).
+

+ 7 - 0
docs/modules/python/sidebar.json

@@ -80,6 +80,13 @@
           "label": "opendevin.action",
           "type": "category"
         },
+        {
+          "items": [
+            "python/opendevin/browser/browser_env"
+          ],
+          "label": "opendevin.browser",
+          "type": "category"
+        },
         {
           "items": [
             "python/opendevin/controller/agent_controller"

+ 15 - 26
opendevin/action/browse.py

@@ -1,10 +1,7 @@
-import base64
 import os
 from dataclasses import dataclass
 from typing import TYPE_CHECKING
 
-from playwright.async_api import async_playwright
-
 from opendevin.observation import BrowserOutputObservation
 from opendevin.schema import ActionType
 
@@ -25,29 +22,21 @@ class BrowseURLAction(ExecutableAction):
         if not asked_url.startswith('http'):
             asked_url = os.path.abspath(os.curdir) + self.url
         try:
-            async with async_playwright() as p:
-                browser = await p.chromium.launch()
-                page = await browser.new_page()
-                response = await page.goto(asked_url)
-                try:
-                    # domcontentloaded: Wait for the DOMContentLoaded event to be fired.
-                    # load: Wait for the load event to be fired.
-                    # networkidle: Wait until there are no more network connections
-                    await page.wait_for_load_state('networkidle', timeout=3000)
-                except TimeoutError:
-                    pass
-                # content = await page.content()
-                inner_text = await page.evaluate('() => document.body.innerText')
-                screenshot_bytes = await page.screenshot(full_page=True)
-                await browser.close()
-
-                screenshot_base64 = base64.b64encode(screenshot_bytes).decode('utf-8')
-                return BrowserOutputObservation(
-                    content=inner_text,  # HTML content of the page
-                    screenshot=screenshot_base64,  # Base64-encoded screenshot
-                    url=asked_url,
-                    status_code=response.status if response else 0,  # HTTP status code
-                )
+            # action in BrowserGym: see https://github.com/ServiceNow/BrowserGym/blob/main/core/src/browsergym/core/action/functions.py
+            action_str = f'goto("{asked_url}")'
+            # obs provided by BrowserGym: see https://github.com/ServiceNow/BrowserGym/blob/main/core/src/browsergym/core/env.py#L396
+            obs = controller.browser.step(action_str)
+            return BrowserOutputObservation(
+                content=obs['text_content'],  # text content of the page
+                open_pages_urls=obs['open_pages_urls'],  # list of open pages
+                active_page_index=obs['active_page_index'],  # index of the active page
+                dom_object=obs['dom_object'],  # DOM object
+                axtree_object=obs['axtree_object'],  # accessibility tree object
+                last_browser_action=obs['last_action'],  # last browser env action performed
+                focused_element_bid=obs['focused_element_bid'],  # focused element bid
+                screenshot=obs['screenshot'],  # base64-encoded screenshot, png
+                url=asked_url,
+            )
         except Exception as e:
             return BrowserOutputObservation(
                 content=str(e), screenshot='', error=True, url=asked_url

+ 0 - 0
opendevin/browser/__init__.py


+ 102 - 0
opendevin/browser/browser_env.py

@@ -0,0 +1,102 @@
+import atexit
+import base64
+import io
+import multiprocessing
+import time
+import uuid
+
+import browsergym.core  # noqa F401 (we register the openended task as a gym environment)
+import gymnasium as gym
+import html2text
+import numpy as np
+from browsergym.utils.obs import flatten_dom_to_str
+from PIL import Image
+
+from opendevin.logger import opendevin_logger as logger
+
+
+class BrowserException(Exception):
+    # Raised by BrowserEnv.step when the observation returned by the browser
+    # process carries a truthy 'last_action_error'.
+    pass
+
+class BrowserEnv:
+    # Runs a BrowserGym 'browsergym/openended' gym environment in a separate
+    # process and talks to it over a multiprocessing Pipe. Requests are
+    # (request_id, {'action': <str>}) tuples; replies are (request_id, obs) tuples.

+    def __init__(self):
+        # Converter used in the browser process to turn the flattened DOM into text.
+        self.html_text_converter = html2text.HTML2Text()
+        # ignore links and images
+        self.html_text_converter.ignore_links = True
+        self.html_text_converter.ignore_images = True
+        # use alt text for images
+        self.html_text_converter.images_to_alt = True
+        # disable auto text wrapping
+        self.html_text_converter.body_width = 0
+        # Initialize browser environment process
+        # NOTE: set_start_method is process-wide; force=True overrides any
+        # start method that was set earlier.
+        multiprocessing.set_start_method('spawn', force=True)
+        # browser_side is used inside browser_process, agent_side by step()/close().
+        self.browser_side, self.agent_side = multiprocessing.Pipe()
+        self.process = multiprocessing.Process(target=self.browser_process,)
+        logger.info('Starting browser env...')
+        self.process.start()
+        # Ask the browser process to shut down when the interpreter exits.
+        atexit.register(self.close)

+    def browser_process(self):
+        # Entry point of the child process: create and reset the environment,
+        # then serve requests from the pipe until a 'SHUTDOWN' request arrives.
+        env = gym.make(
+            'browsergym/openended',
+            start_url='about:blank',
+            wait_for_user_message=False,
+            headless=True,
+            disable_env_checker=True,
+        )
+        obs, info = env.reset()
+        logger.info('Browser env started.')
+        while True:
+            try:
+                # Poll with a short timeout so the loop does not block forever on recv().
+                if self.browser_side.poll(timeout=0.01):
+                    unique_request_id , action_data = self.browser_side.recv()
+                    # shutdown the browser environment
+                    if unique_request_id == 'SHUTDOWN':
+                        env.close()
+                        return
+                    action = action_data['action']
+                    obs, reward, terminated, truncated, info = env.step(action)
+                    # add text content of the page
+                    html_str = flatten_dom_to_str(obs['dom_object'])
+                    obs['text_content'] = self.html_text_converter.handle(html_str)
+                    # make observation serializable
+                    obs['screenshot'] = self.image_to_png_base64_url(obs['screenshot'])
+                    obs['active_page_index'] = obs['active_page_index'].item()
+                    obs['elapsed_time'] = obs['elapsed_time'].item()
+                    # Echo the request id back so step() can match the reply to its request.
+                    self.browser_side.send((unique_request_id, obs))
+            except KeyboardInterrupt:
+                # Only KeyboardInterrupt is handled here; any other exception
+                # propagates out of the loop.
+                logger.info('Browser env process interrupted by user.')
+                return

+    def step(self, action_str: str, timeout: float = 10) -> dict:
+        # Send `action_str` to the browser process and wait for the matching observation.
+        # Returns the observation dict produced by browser_process.
+        # Raises TimeoutError when no matching reply arrives within `timeout`
+        # seconds (measured with time.time()), and BrowserException when the
+        # observation's 'last_action_error' is truthy.
+        unique_request_id = str(uuid.uuid4())
+        self.agent_side.send((unique_request_id, {'action': action_str}))
+        start_time = time.time()
+        while True:
+            if time.time() - start_time > timeout:
+                raise TimeoutError('Browser environment took too long to respond.')
+            if self.agent_side.poll(timeout=0.01):
+                response_id, obs = self.agent_side.recv()
+                # Replies whose id does not match this request are read and dropped.
+                if response_id == unique_request_id:
+                    if obs['last_action_error']:
+                        raise BrowserException(obs['last_action_error'])
+                    return obs

+    def close(self):
+        # Send the 'SHUTDOWN' request and wait for the browser process to exit.
+        # NOTE(review): join() has no timeout, so this blocks until the child
+        # exits; send() is not guarded against an already-closed pipe.
+        self.agent_side.send(('SHUTDOWN', None))
+        self.process.join()

+    @staticmethod
+    def image_to_png_base64_url(image: np.ndarray | Image.Image):
+        """Convert a numpy array to a base64 encoded png image url."""
+        # NOTE(review): despite the name and docstring, a PIL image is accepted
+        # too, and the return value is the bare base64 string; no
+        # 'data:image/png;base64,' prefix is added.

+        if isinstance(image, np.ndarray):
+            image = Image.fromarray(image)
+        # Convert images with an alpha channel (RGBA / LA) to RGB before encoding.
+        if image.mode in ('RGBA', 'LA'):
+            image = image.convert('RGB')
+        buffered = io.BytesIO()
+        image.save(buffered, format='PNG')

+        image_base64 = base64.b64encode(buffered.getvalue()).decode()
+        return f'{image_base64}'

+ 5 - 0
opendevin/controller/agent_controller.py

@@ -12,6 +12,7 @@ from opendevin.action import (
 )
 from opendevin.action.tasks import TaskStateChangedAction
 from opendevin.agent import Agent
+from opendevin.browser.browser_env import BrowserEnv
 from opendevin.controller.action_manager import ActionManager
 from opendevin.exceptions import (
     AgentMalformedActionError,
@@ -43,6 +44,7 @@ class AgentController:
     max_iterations: int
     action_manager: ActionManager
     callbacks: List[Callable]
+    browser: BrowserEnv
 
     delegate: 'AgentController | None' = None
     state: State | None = None
@@ -67,6 +69,9 @@ class AgentController:
         self.callbacks = callbacks
         # Initialize agent-required plugins for sandbox (if any)
         self.action_manager.init_sandbox_plugins(agent.sandbox_plugins)
+        # Initialize browser environment
+        self.browser = BrowserEnv()
+
 
         if isinstance(agent, CodeActAgent) and not isinstance(self.action_manager.sandbox, DockerSSHBox):
             logger.warning('CodeActAgent requires DockerSSHBox as sandbox! Using other sandbox that are not stateful (LocalBox, DockerExecBox) will not work properly.')

+ 20 - 1
opendevin/observation/browse.py

@@ -1,4 +1,4 @@
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 
 from opendevin.schema import ObservationType
 
@@ -16,6 +16,25 @@ class BrowserOutputObservation(Observation):
     status_code: int = 200
     error: bool = False
     observation: str = ObservationType.BROWSE
+    # do not include in the memory
+    open_pages_urls: list = field(default_factory=list)
+    active_page_index: int = -1
+    dom_object: dict = field(default_factory=dict)
+    axtree_object: dict = field(default_factory=dict)
+    last_browser_action: str = ''
+    focused_element_bid: str = ''
+
+    def to_memory(self) -> dict:
+        # Build the memory dict via the parent class, then drop the
+        # browser-specific entries from memory_dict['extras'] before returning it.
+        memory_dict = super().to_memory()
+        # remove some fields from the memory, as currently they are too big for LLMs
+        # TODO: find a more elegant way to handle this
+        # pop(..., None) tolerates keys that are absent from 'extras'.
+        memory_dict['extras'].pop('dom_object', None)
+        memory_dict['extras'].pop('axtree_object', None)
+        memory_dict['extras'].pop('open_pages_urls', None)
+        memory_dict['extras'].pop('active_page_index', None)
+        memory_dict['extras'].pop('last_browser_action', None)
+        memory_dict['extras'].pop('focused_element_bid', None)
+        return memory_dict
 
     @property
     def message(self) -> str:

Diferenças do arquivo suprimidas por serem muito extensas
+ 645 - 194
poetry.lock


+ 2 - 1
pyproject.toml

@@ -22,7 +22,8 @@ uvicorn = "*"
 types-toml = "*"
 numpy = "*"
 json-repair = "*"
-playwright = "*"
+browsergym = "*" # integrate browsergym as the browsing interface
+html2text = "*"
 e2b = "^0.14.13"
 pexpect = "*"
 jinja2 = "^3.1.3"

Alguns arquivos não foram mostrados porque muitos arquivos mudaram nesse diff