Explorar el Código

[Feat] A competitive Web Browsing agent (#1856)

* initial attempt at a browsing only agent

* add browsing agent

* update

* implement agent

* update

* fix comments

* remove unnecessary things from memory extras

* update image processing

---------

Co-authored-by: Yufan Song <33971064+yufansong@users.noreply.github.com>
Frank Xu hace 1 año
padre
commit
1fe290adf9

+ 2 - 0
agenthub/__init__.py

@@ -10,6 +10,7 @@ load_dotenv()
 
 from . import (  # noqa: E402
     SWE_agent,
+    browsing_agent,
     codeact_agent,
     delegator_agent,
     dummy_agent,
@@ -24,6 +25,7 @@ __all__ = [
     'SWE_agent',
     'delegator_agent',
     'dummy_agent',
+    'browsing_agent',
 ]
 
 for agent in all_microagents.values():

+ 16 - 0
agenthub/browsing_agent/README.md

@@ -0,0 +1,16 @@
+# Browsing Agent Framework
+
+This folder implements the basic BrowserGym [demo agent](https://github.com/ServiceNow/BrowserGym/tree/main/demo_agent) that enables full-featured web browsing.
+
+
+## Test run
+
+Note that for browsing tasks, GPT-4 is usually a requirement to get reasonable results, due to the complexity of the web page structures.
+
+```
+poetry run python ./opendevin/core/main.py \
+           -i 5 \
+           -t "tell me the usa's president using google search" \
+           -c BrowsingAgent \
+           -m gpt-4o-2024-05-13
+```

+ 5 - 0
agenthub/browsing_agent/__init__.py

@@ -0,0 +1,5 @@
+from opendevin.controller.agent import Agent
+
+from .browsing_agent import BrowsingAgent
+
+Agent.register('BrowsingAgent', BrowsingAgent)

+ 155 - 0
agenthub/browsing_agent/browsing_agent.py

@@ -0,0 +1,155 @@
+import ast
+
+from browsergym.core.action.highlevel import HighLevelActionSet
+from browsergym.utils.obs import flatten_axtree_to_str
+
+from opendevin.controller.agent import Agent
+from opendevin.controller.state.state import State
+from opendevin.core.logger import opendevin_logger as logger
+from opendevin.events.action import (
+    Action,
+    BrowseInteractiveAction,
+    MessageAction,
+)
+from opendevin.events.observation import BrowserOutputObservation
+from opendevin.llm.llm import LLM
+from opendevin.runtime.plugins import (
+    PluginRequirement,
+)
+
+
+def parse_response(response: str) -> Action:
+    thought = response.split('```')[0].strip()
+    action_str = response.split('```')[1].strip()
+    if 'send_msg_to_user(' in action_str:
+        tree = ast.parse(action_str)
+        args = tree.body[0].value.args  # type: ignore
+        return MessageAction(args[0].value)
+
+    return BrowseInteractiveAction(browser_actions=action_str, thought=thought)
+
+
+class BrowsingAgent(Agent):
+    VERSION = '1.0'
+    """
+    An agent that interacts with the browser.
+    """
+
+    sandbox_plugins: list[PluginRequirement] = []
+
+    def __init__(
+        self,
+        llm: LLM,
+    ) -> None:
+        """
+        Initializes a new instance of the BrowsingAgent class.
+
+        Parameters:
+        - llm (LLM): The llm to be used by this agent
+        """
+        super().__init__(llm)
+        self.action_space = HighLevelActionSet(
+            # see https://github.com/ServiceNow/BrowserGym/blob/main/core/src/browsergym/core/action/highlevel.py for more details
+            subsets=[
+                'chat',
+                'bid',
+                'nav',
+            ],  # define a configurable action space, with chat functionality, web navigation, and webpage grounding using accessibility tree and HTML.
+            strict=False,  # less strict on the parsing of the actions
+            multiaction=True,  # enable the agent to take multiple actions at once
+        )
+
+        self.reset()
+
+    def reset(self) -> None:
+        """
+        Resets the Browsing Agent.
+        """
+        super().reset()
+        self.cost_accumulator = 0
+
+    def step(self, state: State) -> Action:
+        """
+        Performs one step using the Browsing Agent.
+        This includes gathering information on previous steps and prompting the model to make a browsing command to execute.
+
+        Parameters:
+        - state (State): used to get updated info
+
+        Returns:
+        - BrowseInteractiveAction(browsergym_command) - BrowserGym commands to run
+        - MessageAction(content) - Message action to run (e.g. ask for clarification)
+        - AgentFinishAction() - end the interaction
+        """
+        goal = state.get_current_user_intent()
+        messages = []
+
+        system_msg = f"""\
+# Instructions
+Review the current state of the page and all other information to find the best
+possible next action to accomplish your goal. Your answer will be interpreted
+and executed by a program, make sure to follow the formatting instructions.
+
+# Goal:
+{goal}
+
+# Action Space
+{self.action_space.describe(with_long_description=False, with_examples=True)}
+"""
+        messages.append({'role': 'system', 'content': system_msg})
+        prev_actions = ''
+        cur_axtree_txt = ''
+        error_prefix = ''
+        last_obs = None
+        for prev_action, obs in state.history:
+            if isinstance(prev_action, BrowseInteractiveAction):
+                prev_actions += f'{prev_action.browser_actions}\n'
+                last_obs = obs
+
+        if isinstance(last_obs, BrowserOutputObservation):
+            if last_obs.error:
+                # add error recovery prompt prefix
+                error_prefix = f'Last action failed:\n{last_obs.last_browser_action}\nTry again with the current state of the page.\n'
+            cur_axtree_txt = flatten_axtree_to_str(last_obs.axtree_object)
+
+        prompt = f"""\
+{error_prefix}
+
+# Current Accessibility Tree:
+{cur_axtree_txt}
+
+# Previous Actions
+{prev_actions}
+
+Here is an example with chain of thought of a valid action when clicking on a button:
+"
+In order to accomplish my goal I need to click on the button with bid 12
+```click("12")```
+"
+"""
+        messages.append({'role': 'user', 'content': prompt})
+        response = self.llm.completion(
+            messages=messages,
+            temperature=0.0,
+        )
+        self.log_cost(response)
+        action_resp = response['choices'][0]['message']['content']
+        logger.info(prompt)
+        logger.info(action_resp)
+        return parse_response(action_resp)
+
+    def search_memory(self, query: str) -> list[str]:
+        raise NotImplementedError('Implement this abstract method')
+
+    def log_cost(self, response):
+        # TODO: refactor to unified cost tracking
+        try:
+            cur_cost = self.llm.completion_cost(response)
+        except Exception:
+            cur_cost = 0
+        self.cost_accumulator += cur_cost
+        logger.info(
+            'Cost: %.2f USD | Accumulated Cost: %.2f USD',
+            cur_cost,
+            self.cost_accumulator,
+        )

+ 785 - 0
agenthub/browsing_agent/prompt.py

@@ -0,0 +1,785 @@
+import abc
+import difflib
+import logging
+import platform
+from copy import deepcopy
+from dataclasses import asdict, dataclass
+from textwrap import dedent
+from typing import Literal, Union
+from warnings import warn
+
+from browsergym.core.action.base import AbstractActionSet
+from browsergym.core.action.highlevel import HighLevelActionSet
+from browsergym.core.action.python import PythonActionSet
+
+from opendevin.runtime.browser.browser_env import BrowserEnv
+
+from .utils import (
+    ParseError,
+    parse_html_tags_raise,
+)
+
+
+@dataclass
+class Flags:
+    use_html: bool = True
+    use_ax_tree: bool = False
+    drop_ax_tree_first: bool = True  # This flag is no longer active TODO delete
+    use_thinking: bool = False
+    use_error_logs: bool = False
+    use_past_error_logs: bool = False
+    use_history: bool = False
+    use_action_history: bool = False
+    use_memory: bool = False
+    use_diff: bool = False
+    html_type: str = 'pruned_html'
+    use_concrete_example: bool = True
+    use_abstract_example: bool = False
+    multi_actions: bool = False
+    action_space: Literal[
+        'python', 'bid', 'coord', 'bid+coord', 'bid+nav', 'coord+nav', 'bid+coord+nav'
+    ] = 'bid'
+    is_strict: bool = False
+    # This flag will be automatically disabled `if not chat_model_args.has_vision()`
+    use_screenshot: bool = True
+    enable_chat: bool = False
+    max_prompt_tokens: int = 100_000
+    extract_visible_tag: bool = False
+    extract_coords: Literal['False', 'center', 'box'] = 'False'
+    extract_visible_elements_only: bool = False
+    demo_mode: Literal['off', 'default', 'only_visible_elements'] = 'off'
+
+    def copy(self):
+        return deepcopy(self)
+
+    def asdict(self):
+        """Helper for JSON serializble requirement."""
+        return asdict(self)
+
+    @classmethod
+    def from_dict(self, flags_dict):
+        """Helper for JSON serializble requirement."""
+        if isinstance(flags_dict, Flags):
+            return flags_dict
+
+        if not isinstance(flags_dict, dict):
+            raise ValueError(
+                f'Unrecognized type for flags_dict of type {type(flags_dict)}.'
+            )
+        return Flags(**flags_dict)
+
+
+class PromptElement:
+    """Base class for all prompt elements. Prompt elements can be hidden.
+
+    Prompt elements are used to build the prompt. Use flags to control which
+    prompt elements are visible. We use class attributes as a convenient way
+    to implement static prompts, but feel free to override them with instance
+    attributes or @property decorator."""
+
+    _prompt = ''
+    _abstract_ex = ''
+    _concrete_ex = ''
+
+    def __init__(self, visible: bool = True) -> None:
+        """Prompt element that can be hidden.
+
+        Parameters
+        ----------
+        visible : bool, optional
+            Whether the prompt element should be visible, by default True. Can
+            be a callable that returns a bool. This is useful when a specific
+            flag changes during a shrink iteration.
+        """
+        self._visible = visible
+
+    @property
+    def prompt(self):
+        """Avoid overriding this method. Override _prompt instead."""
+        return self._hide(self._prompt)
+
+    @property
+    def abstract_ex(self):
+        """Useful when this prompt element is requesting an answer from the llm.
+        Provide an abstract example of the answer here. See Memory for an
+        example.
+
+        Avoid overriding this method. Override _abstract_ex instead
+        """
+        return self._hide(self._abstract_ex)
+
+    @property
+    def concrete_ex(self):
+        """Useful when this prompt element is requesting an answer from the llm.
+        Provide a concrete example of the answer here. See Memory for an
+        example.
+
+        Avoid overriding this method. Override _concrete_ex instead
+        """
+        return self._hide(self._concrete_ex)
+
+    @property
+    def is_visible(self):
+        """Handle the case where visible is a callable."""
+        visible = self._visible
+        if callable(visible):
+            visible = visible()
+        return visible
+
+    def _hide(self, value):
+        """Return value if visible is True, else return empty string."""
+        if self.is_visible:
+            return value
+        else:
+            return ''
+
+    def _parse_answer(self, text_answer) -> dict:
+        if self.is_visible:
+            return self._parse_answer(text_answer)
+        else:
+            return {}
+
+
+class Shrinkable(PromptElement, abc.ABC):
+    @abc.abstractmethod
+    def shrink(self) -> None:
+        """Implement shrinking of this prompt element.
+
+        You need to recursively call all shrinkable elements that are part of
+        this prompt. You can also implement a shrinking strategy for this prompt.
+        Shrinking can be called multiple times to progressively shrink the
+        prompt until it fits max_tokens. Default max shrink iterations is 20.
+        """
+        pass
+
+
+class Truncater(Shrinkable):
+    """A prompt element that can be truncated to fit the context length of the LLM.
+    Of course, it will be great that we never have to use the functionality here to `shrink()` the prompt.
+    Extend this class for prompt elements that can be truncated. Usually long observations such as AxTree or HTML.
+    """
+
+    def __init__(self, visible, shrink_speed=0.3, start_truncate_iteration=10):
+        super().__init__(visible=visible)
+        self.shrink_speed = shrink_speed  # the fraction of lines removed in each shrink iteration
+        self.start_truncate_iteration = (
+            start_truncate_iteration  # the iteration to start truncating
+        )
+        self.shrink_calls = 0
+        self.deleted_lines = 0
+
+    def shrink(self) -> None:
+        if self.is_visible and self.shrink_calls >= self.start_truncate_iteration:
+            # remove the fraction of _prompt
+            lines = self._prompt.splitlines()
+            new_line_count = int(len(lines) * (1 - self.shrink_speed))
+            self.deleted_lines += len(lines) - new_line_count
+            self._prompt = '\n'.join(lines[:new_line_count])
+            self._prompt += (
+                f'\n... Deleted {self.deleted_lines} lines to reduce prompt size.'
+            )
+
+        self.shrink_calls += 1
+
+
+def fit_tokens(
+    shrinkable: Shrinkable,
+    max_prompt_chars=None,
+    max_iterations=20,
+):
+    """Shrink a prompt element until it fits max_tokens.
+
+    Parameters
+    ----------
+    shrinkable : Shrinkable
+        The prompt element to shrink.
+    max_prompt_chars : int
+        The maximum number of chars allowed.
+    max_iterations : int, optional
+        The maximum number of shrink iterations, by default 20.
+    model_name : str, optional
+        The name of the model used when tokenizing.
+
+    Returns
+    -------
+    str : the prompt after shrinking.
+    """
+
+    if max_prompt_chars is None:
+        return shrinkable.prompt
+
+    for _ in range(max_iterations):
+        prompt = shrinkable.prompt
+        if isinstance(prompt, str):
+            prompt_str = prompt
+        elif isinstance(prompt, list):
+            prompt_str = '\n'.join([p['text'] for p in prompt if p['type'] == 'text'])
+        else:
+            raise ValueError(f'Unrecognized type for prompt: {type(prompt)}')
+        n_chars = len(prompt_str)
+        if n_chars <= max_prompt_chars:
+            return prompt
+        shrinkable.shrink()
+
+    logging.info(
+        dedent(
+            f"""\
+            After {max_iterations} shrink iterations, the prompt is still
+            {len(prompt_str)} chars (greater than {max_prompt_chars}). Returning the prompt as is."""
+        )
+    )
+    return prompt
+
+
+class HTML(Truncater):
+    def __init__(self, html, visible: bool = True, prefix='') -> None:
+        super().__init__(visible=visible, start_truncate_iteration=5)
+        self._prompt = f'\n{prefix}HTML:\n{html}\n'
+
+
+class AXTree(Truncater):
+    def __init__(
+        self, ax_tree, visible: bool = True, coord_type=None, prefix=''
+    ) -> None:
+        super().__init__(visible=visible, start_truncate_iteration=10)
+        if coord_type == 'center':
+            coord_note = """\
+Note: center coordinates are provided in parenthesis and are
+  relative to the top left corner of the page.\n\n"""
+        elif coord_type == 'box':
+            coord_note = """\
+Note: bounding box of each object are provided in parenthesis and are
+  relative to the top left corner of the page.\n\n"""
+        else:
+            coord_note = ''
+        self._prompt = f'\n{prefix}AXTree:\n{coord_note}{ax_tree}\n'
+
+
+class Error(PromptElement):
+    def __init__(self, error, visible: bool = True, prefix='') -> None:
+        super().__init__(visible=visible)
+        self._prompt = f'\n{prefix}Error from previous action:\n{error}\n'
+
+
+class Observation(Shrinkable):
+    """Observation of the current step.
+
+    Contains the html, the accessibility tree and the error logs.
+    """
+
+    def __init__(self, obs, flags: Flags) -> None:
+        super().__init__()
+        self.flags = flags
+        self.obs = obs
+        self.html = HTML(obs[flags.html_type], visible=flags.use_html, prefix='## ')
+        self.ax_tree = AXTree(
+            obs['axtree_txt'],
+            visible=flags.use_ax_tree,
+            coord_type=flags.extract_coords,
+            prefix='## ',
+        )
+        self.error = Error(
+            obs['last_action_error'],
+            visible=flags.use_error_logs and obs['last_action_error'],
+            prefix='## ',
+        )
+
+    def shrink(self):
+        self.ax_tree.shrink()
+        self.html.shrink()
+
+    @property
+    def _prompt(self) -> str:  # type: ignore
+        return f'\n# Observation of current step:\n{self.html.prompt}{self.ax_tree.prompt}{self.error.prompt}\n\n'
+
+    def add_screenshot(self, prompt):
+        if self.flags.use_screenshot:
+            if isinstance(prompt, str):
+                prompt = [{'type': 'text', 'text': prompt}]
+            img_url = BrowserEnv.image_to_jpg_base64_url(
+                self.obs['screenshot'], add_data_prefix=True
+            )
+            prompt.append({'type': 'image_url', 'image_url': img_url})
+
+        return prompt
+
+
+class MacNote(PromptElement):
+    def __init__(self) -> None:
+        super().__init__(visible=platform.system() == 'Darwin')
+        self._prompt = '\nNote: you are on mac so you should use Meta instead of Control for Control+C etc.\n'
+
+
+class BeCautious(PromptElement):
+    def __init__(self, visible: bool = True) -> None:
+        super().__init__(visible=visible)
+        self._prompt = """\
+\nBe very cautious. Avoid submitting anything before verifying the effect of your
+actions. Take the time to explore the effect of safe actions first. For example
+you can fill a few elements of a form, but don't click submit before verifying
+that everything was filled correctly.\n"""
+
+
+class GoalInstructions(PromptElement):
+    def __init__(self, goal, visible: bool = True) -> None:
+        super().__init__(visible)
+        self._prompt = f"""\
+# Instructions
+Review the current state of the page and all other information to find the best
+possible next action to accomplish your goal. Your answer will be interpreted
+and executed by a program, make sure to follow the formatting instructions.
+
+## Goal:
+{goal}
+"""
+
+
+class ChatInstructions(PromptElement):
+    def __init__(self, chat_messages, visible: bool = True) -> None:
+        super().__init__(visible)
+        self._prompt = """\
+# Instructions
+
+You are a UI Assistant, your goal is to help the user perform tasks using a web browser. You can
+communicate with the user via a chat, in which the user gives you instructions and in which you
+can send back messages. You have access to a web browser that both you and the user can see,
+and with which only you can interact via specific commands.
+
+Review the instructions from the user, the current state of the page and all other information
+to find the best possible next action to accomplish your goal. Your answer will be interpreted
+and executed by a program, make sure to follow the formatting instructions.
+
+## Chat messages:
+
+"""
+        self._prompt += '\n'.join(
+            [
+                f"""\
+ - [{msg['role']}] {msg['message']}"""
+                for msg in chat_messages
+            ]
+        )
+
+
+class SystemPrompt(PromptElement):
+    _prompt = """\
+You are an agent trying to solve a web task based on the content of the page and
+user instructions. You can interact with the page and explore. Each time you
+submit an action it will be sent to the browser and you will receive a new page."""
+
+
+class MainPrompt(Shrinkable):
+    def __init__(
+        self,
+        obs_history,
+        actions,
+        memories,
+        thoughts,
+        flags: Flags,
+    ) -> None:
+        super().__init__()
+        self.flags = flags
+        self.history = History(obs_history, actions, memories, thoughts, flags)
+        if self.flags.enable_chat:
+            self.instructions: Union[ChatInstructions, GoalInstructions] = (
+                ChatInstructions(obs_history[-1]['chat_messages'])
+            )
+        else:
+            if (
+                'chat_messages' in obs_history[-1]
+                and sum(
+                    [msg['role'] == 'user' for msg in obs_history[-1]['chat_messages']]
+                )
+                > 1
+            ):
+                logging.warning(
+                    'Agent is in goal mode, but multiple user messages are present in the chat. Consider switching to `enable_chat=True`.'
+                )
+            self.instructions = GoalInstructions(obs_history[-1]['goal'])
+
+        self.obs = Observation(obs_history[-1], self.flags)
+        self.action_space = ActionSpace(self.flags)
+
+        self.think = Think(visible=flags.use_thinking)
+        self.memory = Memory(visible=flags.use_memory)
+
+    @property
+    def _prompt(self) -> str:  # type: ignore
+        prompt = f"""\
+{self.instructions.prompt}\
+{self.obs.prompt}\
+{self.history.prompt}\
+{self.action_space.prompt}\
+{self.think.prompt}\
+{self.memory.prompt}\
+"""
+
+        if self.flags.use_abstract_example:
+            prompt += f"""
+# Abstract Example
+
+Here is an abstract version of the answer with description of the content of
+each tag. Make sure you follow this structure, but replace the content with your
+answer:
+{self.think.abstract_ex}\
+{self.memory.abstract_ex}\
+{self.action_space.abstract_ex}\
+"""
+
+        if self.flags.use_concrete_example:
+            prompt += f"""
+# Concrete Example
+
+Here is a concrete example of how to format your answer.
+Make sure to follow the template with proper tags:
+{self.think.concrete_ex}\
+{self.memory.concrete_ex}\
+{self.action_space.concrete_ex}\
+"""
+        return self.obs.add_screenshot(prompt)
+
+    def shrink(self):
+        self.history.shrink()
+        self.obs.shrink()
+
+    def _parse_answer(self, text_answer):
+        ans_dict = {}
+        ans_dict.update(self.think._parse_answer(text_answer))
+        ans_dict.update(self.memory._parse_answer(text_answer))
+        ans_dict.update(self.action_space._parse_answer(text_answer))
+        return ans_dict
+
+
+class ActionSpace(PromptElement):
+    def __init__(self, flags: Flags) -> None:
+        super().__init__()
+        self.flags = flags
+        self.action_space = _get_action_space(flags)
+
+        self._prompt = (
+            f'# Action space:\n{self.action_space.describe()}{MacNote().prompt}\n'
+        )
+        self._abstract_ex = f"""
+<action>
+{self.action_space.example_action(abstract=True)}
+</action>
+"""
+        self._concrete_ex = f"""
+<action>
+{self.action_space.example_action(abstract=False)}
+</action>
+"""
+
+    def _parse_answer(self, text_answer):
+        ans_dict = parse_html_tags_raise(
+            text_answer, keys=['action'], merge_multiple=True
+        )
+
+        try:
+            # just check if action can be mapped to python code but keep action as is
+            # the environment will be responsible for mapping it to python
+            self.action_space.to_python_code(ans_dict['action'])
+        except Exception as e:
+            raise ParseError(
+                f'Error while parsing action\n: {e}\n'
+                'Make sure your answer is restricted to the allowed actions.'
+            )
+
+        return ans_dict
+
+
+def _get_action_space(flags: Flags) -> AbstractActionSet:
+    match flags.action_space:
+        case 'python':
+            action_space = PythonActionSet(strict=flags.is_strict)
+            if flags.multi_actions:
+                warn(
+                    f'Flag action_space={repr(flags.action_space)} incompatible with multi_actions={repr(flags.multi_actions)}.'
+                )
+            if flags.demo_mode != 'off':
+                warn(
+                    f'Flag action_space={repr(flags.action_space)} incompatible with demo_mode={repr(flags.demo_mode)}.'
+                )
+            return action_space
+        case 'bid':
+            action_subsets = ['chat', 'bid']
+        case 'coord':
+            action_subsets = ['chat', 'coord']
+        case 'bid+coord':
+            action_subsets = ['chat', 'bid', 'coord']
+        case 'bid+nav':
+            action_subsets = ['chat', 'bid', 'nav']
+        case 'coord+nav':
+            action_subsets = ['chat', 'coord', 'nav']
+        case 'bid+coord+nav':
+            action_subsets = ['chat', 'bid', 'coord', 'nav']
+        case _:
+            raise NotImplementedError(
+                f'Unknown action_space {repr(flags.action_space)}'
+            )
+
+    action_space = HighLevelActionSet(
+        subsets=action_subsets,
+        multiaction=flags.multi_actions,
+        strict=flags.is_strict,
+        demo_mode=flags.demo_mode,
+    )
+
+    return action_space
+
+
+class Memory(PromptElement):
+    _prompt = ''  # provided in the abstract and concrete examples
+
+    _abstract_ex = """
+<memory>
+Write down anything you need to remember for next steps. You will be presented
+with the list of previous memories and past actions.
+</memory>
+"""
+
+    _concrete_ex = """
+<memory>
+I clicked on bid 32 to activate tab 2. The accessibility tree should mention
+focusable for elements of the form at next step.
+</memory>
+"""
+
+    def _parse_answer(self, text_answer):
+        return parse_html_tags_raise(
+            text_answer, optional_keys=['memory'], merge_multiple=True
+        )
+
+
+class Think(PromptElement):
+    _prompt = ''
+
+    _abstract_ex = """
+<think>
+Think step by step. If you need to make calculations such as coordinates, write them here. Describe the effect
+that your previous action had on the current content of the page.
+</think>
+"""
+    _concrete_ex = """
+<think>
+My memory says that I filled the first name and last name, but I can't see any
+content in the form. I need to explore different ways to fill the form. Perhaps
+the form is not visible yet or some fields are disabled. I need to replan.
+</think>
+"""
+
+    def _parse_answer(self, text_answer):
+        return parse_html_tags_raise(
+            text_answer, optional_keys=['think'], merge_multiple=True
+        )
+
+
+def diff(previous, new):
+    """Return a string showing the difference between original and new.
+
+    If the difference is above diff_threshold, return the diff string."""
+
+    if previous == new:
+        return 'Identical', []
+
+    if len(previous) == 0 or previous is None:
+        return 'previous is empty', []
+
+    diff_gen = difflib.ndiff(previous.splitlines(), new.splitlines())
+
+    diff_lines = []
+    plus_count = 0
+    minus_count = 0
+    for line in diff_gen:
+        if line.strip().startswith('+'):
+            diff_lines.append(line)
+            plus_count += 1
+        elif line.strip().startswith('-'):
+            diff_lines.append(line)
+            minus_count += 1
+        else:
+            continue
+
+    header = f'{plus_count} lines added and {minus_count} lines removed:'
+
+    return header, diff_lines
+
+
+class Diff(Shrinkable):
+    def __init__(
+        self, previous, new, prefix='', max_line_diff=20, shrink_speed=2, visible=True
+    ) -> None:
+        super().__init__(visible=visible)
+        self.max_line_diff = max_line_diff
+        self.header, self.diff_lines = diff(previous, new)
+        self.shrink_speed = shrink_speed
+        self.prefix = prefix
+
+    def shrink(self):
+        self.max_line_diff -= self.shrink_speed
+        self.max_line_diff = max(1, self.max_line_diff)
+
+    @property
+    def _prompt(self) -> str:  # type: ignore
+        diff_str = '\n'.join(self.diff_lines[: self.max_line_diff])
+        if len(self.diff_lines) > self.max_line_diff:
+            original_count = len(self.diff_lines)
+            diff_str = f'{diff_str}\nDiff truncated, {original_count - self.max_line_diff} changes not shown.'
+        return f'{self.prefix}{self.header}\n{diff_str}\n'
+
+
+class HistoryStep(Shrinkable):
+    def __init__(
+        self, previous_obs, current_obs, action, memory, flags: Flags, shrink_speed=1
+    ) -> None:
+        super().__init__()
+        self.html_diff = Diff(
+            previous_obs[flags.html_type],
+            current_obs[flags.html_type],
+            prefix='\n### HTML diff:\n',
+            shrink_speed=shrink_speed,
+            visible=lambda: flags.use_html and flags.use_diff,
+        )
+        self.ax_tree_diff = Diff(
+            previous_obs['axtree_txt'],
+            current_obs['axtree_txt'],
+            prefix='\n### Accessibility tree diff:\n',
+            shrink_speed=shrink_speed,
+            visible=lambda: flags.use_ax_tree and flags.use_diff,
+        )
+        self.error = Error(
+            current_obs['last_action_error'],
+            visible=(
+                flags.use_error_logs
+                and current_obs['last_action_error']
+                and flags.use_past_error_logs
+            ),
+            prefix='### ',
+        )
+        self.shrink_speed = shrink_speed
+        self.action = action
+        self.memory = memory
+        self.flags = flags
+
+    def shrink(self):
+        super().shrink()
+        self.html_diff.shrink()
+        self.ax_tree_diff.shrink()
+
+    @property
+    def _prompt(self) -> str:  # type: ignore
+        prompt = ''
+
+        if self.flags.use_action_history:
+            prompt += f'\n### Action:\n{self.action}\n'
+
+        prompt += (
+            f'{self.error.prompt}{self.html_diff.prompt}{self.ax_tree_diff.prompt}'
+        )
+
+        if self.flags.use_memory and self.memory is not None:
+            prompt += f'\n### Memory:\n{self.memory}\n'
+
+        return prompt
+
+
+class History(Shrinkable):
+    def __init__(
+        self, history_obs, actions, memories, thoughts, flags: Flags, shrink_speed=1
+    ) -> None:
+        super().__init__(visible=flags.use_history)
+        assert len(history_obs) == len(actions) + 1
+        assert len(history_obs) == len(memories) + 1
+
+        self.shrink_speed = shrink_speed
+        self.history_steps: list[HistoryStep] = []
+
+        for i in range(1, len(history_obs)):
+            self.history_steps.append(
+                HistoryStep(
+                    history_obs[i - 1],
+                    history_obs[i],
+                    actions[i - 1],
+                    memories[i - 1],
+                    flags,
+                )
+            )
+
+    def shrink(self):
+        """Shrink individual steps"""
+        # TODO set the shrink speed of older steps to be higher
+        super().shrink()
+        for step in self.history_steps:
+            step.shrink()
+
+    @property
+    def _prompt(self):
+        prompts = ['# History of interaction with the task:\n']
+        for i, step in enumerate(self.history_steps):
+            prompts.append(f'## step {i}')
+            prompts.append(step.prompt)
+        return '\n'.join(prompts) + '\n'
+
+
+if __name__ == '__main__':
+    html_template = """
+    <html>
+    <body>
+    <div>
+    Hello World.
+    Step {}.
+    </div>
+    </body>
+    </html>
+    """
+
+    OBS_HISTORY = [
+        {
+            'goal': 'do this and that',
+            'pruned_html': html_template.format(1),
+            'axtree_txt': '[1] Click me',
+            'last_action_error': '',
+        },
+        {
+            'goal': 'do this and that',
+            'pruned_html': html_template.format(2),
+            'axtree_txt': '[1] Click me',
+            'last_action_error': '',
+        },
+        {
+            'goal': 'do this and that',
+            'pruned_html': html_template.format(3),
+            'axtree_txt': '[1] Click me',
+            'last_action_error': 'Hey, there is an error now',
+        },
+    ]
+    ACTIONS = ["click('41')", "click('42')"]
+    MEMORIES = ['memory A', 'memory B']
+    THOUGHTS = ['thought A', 'thought B']
+
+    flags = Flags(
+        use_html=True,
+        use_ax_tree=True,
+        use_thinking=True,
+        use_error_logs=True,
+        use_past_error_logs=True,
+        use_history=True,
+        use_action_history=True,
+        use_memory=True,
+        use_diff=True,
+        html_type='pruned_html',
+        use_concrete_example=True,
+        use_abstract_example=True,
+        use_screenshot=False,
+        multi_actions=True,
+    )
+
+    print(
+        MainPrompt(
+            obs_history=OBS_HISTORY,
+            actions=ACTIONS,
+            memories=MEMORIES,
+            thoughts=THOUGHTS,
+            flags=flags,
+        ).prompt
+    )

+ 160 - 0
agenthub/browsing_agent/utils.py

@@ -0,0 +1,160 @@
+import collections
+import re
+from warnings import warn
+
+import yaml
+
+
def yaml_parser(message):
    """Parse a yaml message for the retry function."""

    # Work around a common gpt-3.5 failure mode: a key followed directly by
    # a newline (no inline value) trips the yaml parser, so collapse it.
    cleaned = re.sub(r':\s*\n(?=\S|\n)', ': ', message)

    value, valid, retry_message = {}, False, ''
    try:
        value = yaml.safe_load(cleaned)
        valid = True
    except yaml.YAMLError as err:
        warn(str(err))
        retry_message = "Your response is not a valid yaml. Please try again and be careful to the format. Don't add any apology or comment, just the answer."
    return value, valid, retry_message
+
+
+def _compress_chunks(text, identifier, skip_list, split_regex='\n\n+'):
+    """Compress a string by replacing redundant chunks by identifiers. Chunks are defined by the split_regex."""
+    text_list = re.split(split_regex, text)
+    text_list = [chunk.strip() for chunk in text_list]
+    counter = collections.Counter(text_list)
+    def_dict = {}
+    id = 0
+
+    # Store items that occur more than once in a dictionary
+    for item, count in counter.items():
+        if count > 1 and item not in skip_list and len(item) > 10:
+            def_dict[f'{identifier}-{id}'] = item
+            id += 1
+
+    # Replace redundant items with their identifiers in the text
+    compressed_text = '\n'.join(text_list)
+    for key, value in def_dict.items():
+        compressed_text = compressed_text.replace(value, key)
+
+    return def_dict, compressed_text
+
+
def compress_string(text):
    """Compress a string by replacing redundant paragraphs and lines with identifiers."""

    # First pass: paragraph-level (blank-line-separated) compression.
    def_dict, compressed_text = _compress_chunks(
        text, identifier='§', skip_list=[], split_regex='\n\n+'
    )

    # Second pass: line-level compression, never touching the paragraph
    # identifiers generated above.
    line_dict, compressed_text = _compress_chunks(
        compressed_text, '¶', list(def_dict.keys()), split_regex='\n+'
    )
    def_dict.update(line_dict)

    # Prepend a definitions section mapping each identifier back to its text.
    definitions = '\n'.join(
        ['<definitions>']
        + [f'{key}:\n{value}' for key, value in def_dict.items()]
        + ['</definitions>']
    )

    return definitions + '\n' + compressed_text
+
+
def extract_html_tags(text, keys):
    """Extract the content within HTML tags for a list of keys.

    Matching is case-sensitive and spans newlines (DOTALL); each matched
    content is stripped of surrounding whitespace.

    Parameters
    ----------
    text : str
        The input string containing the HTML tags.
    keys : list of str
        The HTML tags to extract the content from.

    Returns
    -------
    dict
        A dictionary mapping each key to a list of subset in `text` that
        match the key. Keys with no match are omitted.
    """
    content_dict = {}
    for key in keys:
        # Non-greedy so back-to-back tags of the same key match separately.
        pattern = f'<{key}>(.*?)</{key}>'
        matches = re.findall(pattern, text, re.DOTALL)
        if matches:
            content_dict[key] = [match.strip() for match in matches]
    return content_dict
+
+
class ParseError(Exception):
    """Raised when a tagged answer cannot be parsed (see parse_html_tags_raise)."""

    pass
+
+
def parse_html_tags_raise(text, keys=(), optional_keys=(), merge_multiple=False):
    """A version of parse_html_tags that raises an exception if the parsing is not successful."""
    content_dict, valid, retry_message = parse_html_tags(
        text, keys, optional_keys, merge_multiple=merge_multiple
    )
    if valid:
        return content_dict
    # Surface the retry message so the caller can feed it back to the LLM.
    raise ParseError(retry_message)
+
+
def parse_html_tags(text, keys=(), optional_keys=(), merge_multiple=False):
    """Satisfy the parse api, extracts 1 match per key and validates that all keys are present

    Parameters
    ----------
    text : str
        The input string containing the HTML tags.
    keys : list of str
        The HTML tags to extract the content from.
    optional_keys : list of str
        The HTML tags to extract the content from, but are optional.

    Returns
    -------
    dict
        A dictionary mapping each key to subset of `text` that match the key.
    bool
        Whether the parsing was successful.
    str
        A message to be displayed to the agent if the parsing was not successful.
    """
    all_keys = tuple(keys) + tuple(optional_keys)
    content_dict = extract_html_tags(text, all_keys)
    retry_messages = []

    for key in all_keys:
        if key not in content_dict:
            # Only required keys produce a complaint when absent.
            if key not in optional_keys:
                retry_messages.append(f'Missing the key <{key}> in the answer.')
            continue
        matches = content_dict[key]
        if len(matches) > 1 and merge_multiple:
            # Merge the multiple instances into one newline-joined value.
            content_dict[key] = '\n'.join(matches)
        else:
            content_dict[key] = matches[0]
            if len(matches) > 1:
                retry_messages.append(
                    f'Found multiple instances of the key {key}. You should have only one of them.'
                )

    valid = not retry_messages
    return content_dict, valid, '\n'.join(retry_messages)

+ 1 - 0
opendevin/controller/agent_controller.py

@@ -112,6 +112,7 @@ class AgentController:
             except Exception as e:
                 traceback.print_exc()
                 logger.error(f'Error while running the agent: {e}')
+                logger.error(traceback.format_exc())
                 await self.report_error(
                     'There was an unexpected error while running the agent', exception=e
                 )

+ 1 - 0
opendevin/events/observation/browse.py

@@ -22,6 +22,7 @@ class BrowserOutputObservation(Observation):
     dom_object: dict = field(default_factory=dict, repr=False)  # don't show in repr
     axtree_object: dict = field(default_factory=dict, repr=False)  # don't show in repr
     last_browser_action: str = ''
+    last_browser_action_error: str = ''
     focused_element_bid: str = ''
 
     @property

+ 1 - 0
opendevin/events/serialization/event.py

@@ -27,6 +27,7 @@ DELETE_FROM_MEMORY_EXTRAS = {
     'open_pages_urls',
     'active_page_index',
     'last_browser_action',
+    'last_browser_action_error',
     'focused_element_bid',
 }
 

+ 40 - 4
opendevin/runtime/browser/browser_env.py

@@ -37,6 +37,8 @@ class BrowserEnv:
         )
         logger.info('Starting browser env...')
         self.process.start()
+        if not self.check_alive():
+            raise BrowserException('Failed to start browser environment.')
         atexit.register(self.close)
 
     def browser_process(self):
@@ -58,6 +60,9 @@ class BrowserEnv:
                         logger.info('SHUTDOWN recv, shutting down browser env...')
                         env.close()
                         return
+                    elif unique_request_id == 'IS_ALIVE':
+                        self.browser_side.send(('ALIVE', None))
+                        continue
                     action = action_data['action']
                     obs, reward, terminated, truncated, info = env.step(action)
                     # add text content of the page
@@ -86,10 +91,15 @@ class BrowserEnv:
             if self.agent_side.poll(timeout=0.01):
                 response_id, obs = self.agent_side.recv()
                 if response_id == unique_request_id:
-                    if obs['last_action_error']:
-                        raise BrowserException(obs['last_action_error'])
                     return obs
 
+    def check_alive(self, timeout: float = 10):
+        self.agent_side.send(('IS_ALIVE', None))
+        if self.agent_side.poll(timeout=timeout):
+            response_id, _ = self.agent_side.recv()
+            if response_id == 'ALIVE':
+                return True
+
     def close(self):
         if not self.process.is_alive():
             logger.info('BrowserEnv already closed, no need to close again')
@@ -112,7 +122,9 @@ class BrowserEnv:
             logger.error('Encountered an error when closing browser env', exc_info=True)
 
     @staticmethod
-    def image_to_png_base64_url(image: np.ndarray | Image.Image):
+    def image_to_png_base64_url(
+        image: np.ndarray | Image.Image, add_data_prefix: bool = False
+    ):
         """Convert a numpy array to a base64 encoded png image url."""
 
         if isinstance(image, np.ndarray):
@@ -123,4 +135,28 @@ class BrowserEnv:
         image.save(buffered, format='PNG')
 
         image_base64 = base64.b64encode(buffered.getvalue()).decode()
-        return f'{image_base64}'
+        return (
+            f'data:image/png;base64,{image_base64}'
+            if add_data_prefix
+            else f'{image_base64}'
+        )
+
+    @staticmethod
+    def image_to_jpg_base64_url(
+        image: np.ndarray | Image.Image, add_data_prefix: bool = False
+    ):
+        """Convert a numpy array to a base64 encoded jpeg image url."""
+
+        if isinstance(image, np.ndarray):
+            image = Image.fromarray(image)
+        if image.mode in ('RGBA', 'LA'):
+            image = image.convert('RGB')
+        buffered = io.BytesIO()
+        image.save(buffered, format='JPEG')
+
+        image_base64 = base64.b64encode(buffered.getvalue()).decode()
+        return (
+            f'data:image/jpeg;base64,{image_base64}'
+            if add_data_prefix
+            else f'{image_base64}'
+        )

+ 7 - 1
opendevin/runtime/server/browse.py

@@ -2,9 +2,10 @@ import os
 
 from opendevin.core.schema import ActionType
 from opendevin.events.observation import BrowserOutputObservation
+from opendevin.runtime.browser.browser_env import BrowserEnv
 
 
-async def browse(action, browser) -> BrowserOutputObservation:  # type: ignore
+async def browse(action, browser: BrowserEnv) -> BrowserOutputObservation:  # type: ignore
     if action.action == ActionType.BROWSE:
         # legacy BrowseURLAction
         asked_url = action.url
@@ -30,11 +31,16 @@ async def browse(action, browser) -> BrowserOutputObservation:  # type: ignore
             focused_element_bid=obs['focused_element_bid'],  # focused element bid
             screenshot=obs['screenshot'],  # base64-encoded screenshot, png
             url=obs['url'],  # URL of the page
+            error=True if obs['last_action_error'] else False,  # error flag
+            last_browser_action_error=obs[
+                'last_action_error'
+            ],  # last browser env action error
         )
     except Exception as e:
         return BrowserOutputObservation(
             content=str(e),
             screenshot='',
             error=True,
+            last_browser_action_error=str(e),
             url=asked_url if action.action == ActionType.BROWSE else '',
         )