# codeact_agent.py
  1. import json
  2. import os
  3. from collections import deque
  4. from litellm import ModelResponse
  5. import openhands.agenthub.codeact_agent.function_calling as codeact_function_calling
  6. from openhands.controller.agent import Agent
  7. from openhands.controller.state.state import State
  8. from openhands.core.config import AgentConfig
  9. from openhands.core.logger import openhands_logger as logger
  10. from openhands.core.message import ImageContent, Message, TextContent
  11. from openhands.events.action import (
  12. Action,
  13. AgentDelegateAction,
  14. AgentFinishAction,
  15. BrowseInteractiveAction,
  16. CmdRunAction,
  17. FileEditAction,
  18. IPythonRunCellAction,
  19. MessageAction,
  20. )
  21. from openhands.events.observation import (
  22. AgentDelegateObservation,
  23. BrowserOutputObservation,
  24. CmdOutputObservation,
  25. FileEditObservation,
  26. IPythonRunCellObservation,
  27. UserRejectObservation,
  28. )
  29. from openhands.events.observation.error import ErrorObservation
  30. from openhands.events.observation.observation import Observation
  31. from openhands.events.serialization.event import truncate_content
  32. from openhands.llm.llm import LLM
  33. from openhands.runtime.plugins import (
  34. AgentSkillsRequirement,
  35. JupyterRequirement,
  36. PluginRequirement,
  37. )
  38. from openhands.utils.prompt import PromptManager
  39. class CodeActAgent(Agent):
  40. VERSION = '2.2'
  41. """
  42. The Code Act Agent is a minimalist agent.
  43. The agent works by passing the model a list of action-observation pairs and prompting the model to take the next step.
  44. ### Overview
  45. This agent implements the CodeAct idea ([paper](https://arxiv.org/abs/2402.01030), [tweet](https://twitter.com/xingyaow_/status/1754556835703751087)) that consolidates LLM agents’ **act**ions into a unified **code** action space for both *simplicity* and *performance* (see paper for more details).
  46. The conceptual idea is illustrated below. At each turn, the agent can:
  47. 1. **Converse**: Communicate with humans in natural language to ask for clarification, confirmation, etc.
  48. 2. **CodeAct**: Choose to perform the task by executing code
  49. - Execute any valid Linux `bash` command
  50. - Execute any valid `Python` code with [an interactive Python interpreter](https://ipython.org/). This is simulated through `bash` command, see plugin system below for more details.
  51. ![image](https://github.com/All-Hands-AI/OpenHands/assets/38853559/92b622e3-72ad-4a61-8f41-8c040b6d5fb3)
  52. """
  53. sandbox_plugins: list[PluginRequirement] = [
  54. # NOTE: AgentSkillsRequirement need to go before JupyterRequirement, since
  55. # AgentSkillsRequirement provides a lot of Python functions,
  56. # and it needs to be initialized before Jupyter for Jupyter to use those functions.
  57. AgentSkillsRequirement(),
  58. JupyterRequirement(),
  59. ]
  60. def __init__(
  61. self,
  62. llm: LLM,
  63. config: AgentConfig,
  64. ) -> None:
  65. """Initializes a new instance of the CodeActAgent class.
  66. Parameters:
  67. - llm (LLM): The llm to be used by this agent
  68. """
  69. super().__init__(llm, config)
  70. self.reset()
  71. self.mock_function_calling = False
  72. if not self.llm.is_function_calling_active():
  73. logger.info(
  74. f'Function calling not enabled for model {self.llm.config.model}. '
  75. 'Mocking function calling via prompting.'
  76. )
  77. self.mock_function_calling = True
  78. # Function calling mode
  79. self.tools = codeact_function_calling.get_tools(
  80. codeact_enable_browsing=self.config.codeact_enable_browsing,
  81. codeact_enable_jupyter=self.config.codeact_enable_jupyter,
  82. codeact_enable_llm_editor=self.config.codeact_enable_llm_editor,
  83. )
  84. logger.debug(
  85. f'TOOLS loaded for CodeActAgent: {json.dumps(self.tools, indent=2)}'
  86. )
  87. self.prompt_manager = PromptManager(
  88. microagent_dir=os.path.join(os.path.dirname(__file__), 'micro')
  89. if self.config.use_microagents
  90. else None,
  91. prompt_dir=os.path.join(os.path.dirname(__file__), 'prompts'),
  92. disabled_microagents=self.config.disabled_microagents,
  93. )
  94. self.pending_actions: deque[Action] = deque()
  95. def get_action_message(
  96. self,
  97. action: Action,
  98. pending_tool_call_action_messages: dict[str, Message],
  99. ) -> list[Message]:
  100. """Converts an action into a message format that can be sent to the LLM.
  101. This method handles different types of actions and formats them appropriately:
  102. 1. For tool-based actions (AgentDelegate, CmdRun, IPythonRunCell, FileEdit) and agent-sourced AgentFinish:
  103. - In function calling mode: Stores the LLM's response in pending_tool_call_action_messages
  104. - In non-function calling mode: Creates a message with the action string
  105. 2. For MessageActions: Creates a message with the text content and optional image content
  106. Args:
  107. action (Action): The action to convert. Can be one of:
  108. - CmdRunAction: For executing bash commands
  109. - IPythonRunCellAction: For running IPython code
  110. - FileEditAction: For editing files
  111. - BrowseInteractiveAction: For browsing the web
  112. - AgentFinishAction: For ending the interaction
  113. - MessageAction: For sending messages
  114. pending_tool_call_action_messages (dict[str, Message]): Dictionary mapping response IDs
  115. to their corresponding messages. Used in function calling mode to track tool calls
  116. that are waiting for their results.
  117. Returns:
  118. list[Message]: A list containing the formatted message(s) for the action.
  119. May be empty if the action is handled as a tool call in function calling mode.
  120. Note:
  121. In function calling mode, tool-based actions are stored in pending_tool_call_action_messages
  122. rather than being returned immediately. They will be processed later when all corresponding
  123. tool call results are available.
  124. """
  125. # create a regular message from an event
  126. if isinstance(
  127. action,
  128. (
  129. AgentDelegateAction,
  130. IPythonRunCellAction,
  131. FileEditAction,
  132. BrowseInteractiveAction,
  133. ),
  134. ) or (
  135. isinstance(action, (AgentFinishAction, CmdRunAction))
  136. and action.source == 'agent'
  137. ):
  138. tool_metadata = action.tool_call_metadata
  139. assert tool_metadata is not None, (
  140. 'Tool call metadata should NOT be None when function calling is enabled. Action: '
  141. + str(action)
  142. )
  143. llm_response: ModelResponse = tool_metadata.model_response
  144. assistant_msg = llm_response.choices[0].message
  145. # Add the LLM message (assistant) that initiated the tool calls
  146. # (overwrites any previous message with the same response_id)
  147. pending_tool_call_action_messages[llm_response.id] = Message(
  148. role=assistant_msg.role,
  149. # tool call content SHOULD BE a string
  150. content=[TextContent(text=assistant_msg.content or '')]
  151. if assistant_msg.content is not None
  152. else [],
  153. tool_calls=assistant_msg.tool_calls,
  154. )
  155. return []
  156. elif isinstance(action, MessageAction):
  157. role = 'user' if action.source == 'user' else 'assistant'
  158. content = [TextContent(text=action.content or '')]
  159. if self.llm.vision_is_active() and action.image_urls:
  160. content.append(ImageContent(image_urls=action.image_urls))
  161. return [
  162. Message(
  163. role=role,
  164. content=content,
  165. )
  166. ]
  167. elif isinstance(action, CmdRunAction) and action.source == 'user':
  168. content = [TextContent(text=f'User executed the command:\n{action.command}')]
  169. return [
  170. Message(
  171. role='user',
  172. content=content,
  173. )
  174. ]
  175. return []
  176. def get_observation_message(
  177. self,
  178. obs: Observation,
  179. tool_call_id_to_message: dict[str, Message],
  180. ) -> list[Message]:
  181. """Converts an observation into a message format that can be sent to the LLM.
  182. This method handles different types of observations and formats them appropriately:
  183. - CmdOutputObservation: Formats command execution results with exit codes
  184. - IPythonRunCellObservation: Formats IPython cell execution results, replacing base64 images
  185. - FileEditObservation: Formats file editing results
  186. - AgentDelegateObservation: Formats results from delegated agent tasks
  187. - ErrorObservation: Formats error messages from failed actions
  188. - UserRejectObservation: Formats user rejection messages
  189. In function calling mode, observations with tool_call_metadata are stored in
  190. tool_call_id_to_message for later processing instead of being returned immediately.
  191. Args:
  192. obs (Observation): The observation to convert
  193. tool_call_id_to_message (dict[str, Message]): Dictionary mapping tool call IDs
  194. to their corresponding messages (used in function calling mode)
  195. Returns:
  196. list[Message]: A list containing the formatted message(s) for the observation.
  197. May be empty if the observation is handled as a tool response in function calling mode.
  198. Raises:
  199. ValueError: If the observation type is unknown
  200. """
  201. message: Message
  202. max_message_chars = self.llm.config.max_message_chars
  203. if isinstance(obs, CmdOutputObservation):
  204. # if it doesn't have tool call metadata, it was triggered by a user action
  205. if obs.tool_call_metadata is None:
  206. text = truncate_content(
  207. f'\nObserved result of command executed by user:\n{obs.content}',
  208. max_message_chars,
  209. )
  210. else:
  211. text = truncate_content(
  212. obs.content + obs.interpreter_details, max_message_chars
  213. )
  214. text += f'\n[Command finished with exit code {obs.exit_code}]'
  215. message = Message(role='user', content=[TextContent(text=text)])
  216. elif isinstance(obs, IPythonRunCellObservation):
  217. text = obs.content
  218. # replace base64 images with a placeholder
  219. splitted = text.split('\n')
  220. for i, line in enumerate(splitted):
  221. if '![image](data:image/png;base64,' in line:
  222. splitted[i] = (
  223. '![image](data:image/png;base64, ...) already displayed to user'
  224. )
  225. text = '\n'.join(splitted)
  226. text = truncate_content(text, max_message_chars)
  227. message = Message(role='user', content=[TextContent(text=text)])
  228. elif isinstance(obs, FileEditObservation):
  229. text = truncate_content(str(obs), max_message_chars)
  230. message = Message(role='user', content=[TextContent(text=text)])
  231. elif isinstance(obs, BrowserOutputObservation):
  232. text = obs.get_agent_obs_text()
  233. message = Message(
  234. role='user',
  235. content=[TextContent(text=text)],
  236. )
  237. elif isinstance(obs, AgentDelegateObservation):
  238. text = truncate_content(
  239. obs.outputs['content'] if 'content' in obs.outputs else '',
  240. max_message_chars,
  241. )
  242. message = Message(role='user', content=[TextContent(text=text)])
  243. elif isinstance(obs, ErrorObservation):
  244. text = truncate_content(obs.content, max_message_chars)
  245. text += '\n[Error occurred in processing last action]'
  246. message = Message(role='user', content=[TextContent(text=text)])
  247. elif isinstance(obs, UserRejectObservation):
  248. text = 'OBSERVATION:\n' + truncate_content(obs.content, max_message_chars)
  249. text += '\n[Last action has been rejected by the user]'
  250. message = Message(role='user', content=[TextContent(text=text)])
  251. else:
  252. # If an observation message is not returned, it will cause an error
  253. # when the LLM tries to return the next message
  254. raise ValueError(f'Unknown observation type: {type(obs)}')
  255. # Update the message as tool response properly
  256. if (tool_call_metadata := obs.tool_call_metadata) is not None:
  257. tool_call_id_to_message[tool_call_metadata.tool_call_id] = Message(
  258. role='tool',
  259. content=message.content,
  260. tool_call_id=tool_call_metadata.tool_call_id,
  261. name=tool_call_metadata.function_name,
  262. )
  263. # No need to return the observation message
  264. # because it will be added by get_action_message when all the corresponding
  265. # tool calls in the SAME request are processed
  266. return []
  267. return [message]
  268. def reset(self) -> None:
  269. """Resets the CodeAct Agent."""
  270. super().reset()
  271. def step(self, state: State) -> Action:
  272. """Performs one step using the CodeAct Agent.
  273. This includes gathering info on previous steps and prompting the model to make a command to execute.
  274. Parameters:
  275. - state (State): used to get updated info
  276. Returns:
  277. - CmdRunAction(command) - bash command to run
  278. - IPythonRunCellAction(code) - IPython code to run
  279. - AgentDelegateAction(agent, inputs) - delegate action for (sub)task
  280. - MessageAction(content) - Message action to run (e.g. ask for clarification)
  281. - AgentFinishAction() - end the interaction
  282. """
  283. # Continue with pending actions if any
  284. if self.pending_actions:
  285. return self.pending_actions.popleft()
  286. # if we're done, go back
  287. latest_user_message = state.get_last_user_message()
  288. if latest_user_message and latest_user_message.content.strip() == '/exit':
  289. return AgentFinishAction()
  290. # prepare what we want to send to the LLM
  291. messages = self._get_messages(state)
  292. params: dict = {
  293. 'messages': self.llm.format_messages_for_llm(messages),
  294. }
  295. params['tools'] = self.tools
  296. if self.mock_function_calling:
  297. params['mock_function_calling'] = True
  298. response = self.llm.completion(**params)
  299. actions = codeact_function_calling.response_to_actions(response)
  300. for action in actions:
  301. self.pending_actions.append(action)
  302. return self.pending_actions.popleft()
  303. def _get_messages(self, state: State) -> list[Message]:
  304. """Constructs the message history for the LLM conversation.
  305. This method builds a structured conversation history by processing events from the state
  306. and formatting them into messages that the LLM can understand. It handles both regular
  307. message flow and function-calling scenarios.
  308. The method performs the following steps:
  309. 1. Initializes with system prompt and optional initial user message
  310. 2. Processes events (Actions and Observations) into messages
  311. 3. Handles tool calls and their responses in function-calling mode
  312. 4. Manages message role alternation (user/assistant/tool)
  313. 5. Applies caching for specific LLM providers (e.g., Anthropic)
  314. 6. Adds environment reminders for non-function-calling mode
  315. Args:
  316. state (State): The current state object containing conversation history and other metadata
  317. Returns:
  318. list[Message]: A list of formatted messages ready for LLM consumption, including:
  319. - System message with prompt
  320. - Initial user message (if configured)
  321. - Action messages (from both user and assistant)
  322. - Observation messages (including tool responses)
  323. - Environment reminders (in non-function-calling mode)
  324. Note:
  325. - In function-calling mode, tool calls and their responses are carefully tracked
  326. to maintain proper conversation flow
  327. - Messages from the same role are combined to prevent consecutive same-role messages
  328. - For Anthropic models, specific messages are cached according to their documentation
  329. """
  330. messages: list[Message] = [
  331. Message(
  332. role='system',
  333. content=[
  334. TextContent(
  335. text=self.prompt_manager.get_system_message(),
  336. cache_prompt=self.llm.is_caching_prompt_active(),
  337. )
  338. ],
  339. )
  340. ]
  341. example_message = self.prompt_manager.get_example_user_message()
  342. if example_message:
  343. messages.append(
  344. Message(
  345. role='user',
  346. content=[TextContent(text=example_message)],
  347. cache_prompt=self.llm.is_caching_prompt_active(),
  348. )
  349. )
  350. pending_tool_call_action_messages: dict[str, Message] = {}
  351. tool_call_id_to_message: dict[str, Message] = {}
  352. events = list(state.history)
  353. for event in events:
  354. # create a regular message from an event
  355. if isinstance(event, Action):
  356. messages_to_add = self.get_action_message(
  357. action=event,
  358. pending_tool_call_action_messages=pending_tool_call_action_messages,
  359. )
  360. elif isinstance(event, Observation):
  361. messages_to_add = self.get_observation_message(
  362. obs=event,
  363. tool_call_id_to_message=tool_call_id_to_message,
  364. )
  365. else:
  366. raise ValueError(f'Unknown event type: {type(event)}')
  367. # Check pending tool call action messages and see if they are complete
  368. _response_ids_to_remove = []
  369. for (
  370. response_id,
  371. pending_message,
  372. ) in pending_tool_call_action_messages.items():
  373. assert pending_message.tool_calls is not None, (
  374. 'Tool calls should NOT be None when function calling is enabled & the message is considered pending tool call. '
  375. f'Pending message: {pending_message}'
  376. )
  377. if all(
  378. tool_call.id in tool_call_id_to_message
  379. for tool_call in pending_message.tool_calls
  380. ):
  381. # If complete:
  382. # -- 1. Add the message that **initiated** the tool calls
  383. messages_to_add.append(pending_message)
  384. # -- 2. Add the tool calls **results***
  385. for tool_call in pending_message.tool_calls:
  386. messages_to_add.append(tool_call_id_to_message[tool_call.id])
  387. tool_call_id_to_message.pop(tool_call.id)
  388. _response_ids_to_remove.append(response_id)
  389. # Cleanup the processed pending tool messages
  390. for response_id in _response_ids_to_remove:
  391. pending_tool_call_action_messages.pop(response_id)
  392. for message in messages_to_add:
  393. if message:
  394. if message.role == 'user':
  395. self.prompt_manager.enhance_message(message)
  396. # handle error if the message is the SAME role as the previous message
  397. # litellm.exceptions.BadRequestError: litellm.BadRequestError: OpenAIException - Error code: 400 - {'detail': 'Only supports u/a/u/a/u...'}
  398. # there shouldn't be two consecutive messages from the same role
  399. # NOTE: we shouldn't combine tool messages because each of them has a different tool_call_id
  400. if (
  401. messages
  402. and messages[-1].role == message.role
  403. and message.role != 'tool'
  404. ):
  405. messages[-1].content.extend(message.content)
  406. else:
  407. messages.append(message)
  408. if self.llm.is_caching_prompt_active():
  409. # NOTE: this is only needed for anthropic
  410. # following logic here:
  411. # https://github.com/anthropics/anthropic-quickstarts/blob/8f734fd08c425c6ec91ddd613af04ff87d70c5a0/computer-use-demo/computer_use_demo/loop.py#L241-L262
  412. breakpoints_remaining = 3 # remaining 1 for system/tool
  413. for message in reversed(messages):
  414. if message.role == 'user' or message.role == 'tool':
  415. if breakpoints_remaining > 0:
  416. message.content[
  417. -1
  418. ].cache_prompt = True # Last item inside the message content
  419. breakpoints_remaining -= 1
  420. else:
  421. break
  422. return messages