codeact_agent.py 23 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515
  1. import json
  2. import os
  3. from collections import deque
  4. from itertools import islice
  5. from litellm import ModelResponse
  6. import openhands.agenthub.codeact_agent.function_calling as codeact_function_calling
  7. from openhands.agenthub.codeact_agent.action_parser import CodeActResponseParser
  8. from openhands.controller.agent import Agent
  9. from openhands.controller.state.state import State
  10. from openhands.core.config import AgentConfig
  11. from openhands.core.logger import openhands_logger as logger
  12. from openhands.core.message import ImageContent, Message, TextContent
  13. from openhands.events.action import (
  14. Action,
  15. AgentDelegateAction,
  16. AgentFinishAction,
  17. BrowseInteractiveAction,
  18. CmdRunAction,
  19. FileEditAction,
  20. IPythonRunCellAction,
  21. MessageAction,
  22. )
  23. from openhands.events.observation import (
  24. AgentDelegateObservation,
  25. BrowserOutputObservation,
  26. CmdOutputObservation,
  27. FileEditObservation,
  28. IPythonRunCellObservation,
  29. UserRejectObservation,
  30. )
  31. from openhands.events.observation.error import ErrorObservation
  32. from openhands.events.observation.observation import Observation
  33. from openhands.events.serialization.event import truncate_content
  34. from openhands.llm.llm import LLM
  35. from openhands.runtime.plugins import (
  36. AgentSkillsRequirement,
  37. JupyterRequirement,
  38. PluginRequirement,
  39. )
  40. from openhands.utils.microagent import MicroAgent
  41. from openhands.utils.prompt import PromptManager
  42. class CodeActAgent(Agent):
  43. VERSION = '2.2'
  44. """
  45. The Code Act Agent is a minimalist agent.
  46. The agent works by passing the model a list of action-observation pairs and prompting the model to take the next step.
  47. ### Overview
  48. This agent implements the CodeAct idea ([paper](https://arxiv.org/abs/2402.01030), [tweet](https://twitter.com/xingyaow_/status/1754556835703751087)) that consolidates LLM agents’ **act**ions into a unified **code** action space for both *simplicity* and *performance* (see paper for more details).
  49. The conceptual idea is illustrated below. At each turn, the agent can:
  50. 1. **Converse**: Communicate with humans in natural language to ask for clarification, confirmation, etc.
  51. 2. **CodeAct**: Choose to perform the task by executing code
  52. - Execute any valid Linux `bash` command
  53. - Execute any valid `Python` code with [an interactive Python interpreter](https://ipython.org/). This is simulated through `bash` command, see plugin system below for more details.
  54. ![image](https://github.com/All-Hands-AI/OpenHands/assets/38853559/92b622e3-72ad-4a61-8f41-8c040b6d5fb3)
  55. """
  56. sandbox_plugins: list[PluginRequirement] = [
  57. # NOTE: AgentSkillsRequirement need to go before JupyterRequirement, since
  58. # AgentSkillsRequirement provides a lot of Python functions,
  59. # and it needs to be initialized before Jupyter for Jupyter to use those functions.
  60. AgentSkillsRequirement(),
  61. JupyterRequirement(),
  62. ]
  63. obs_prefix = 'OBSERVATION:\n'
  64. def __init__(
  65. self,
  66. llm: LLM,
  67. config: AgentConfig,
  68. ) -> None:
  69. """Initializes a new instance of the CodeActAgent class.
  70. Parameters:
  71. - llm (LLM): The llm to be used by this agent
  72. """
  73. super().__init__(llm, config)
  74. self.reset()
  75. self.micro_agent = (
  76. MicroAgent(
  77. os.path.join(
  78. os.path.dirname(__file__), 'micro', f'{config.micro_agent_name}.md'
  79. )
  80. )
  81. if config.micro_agent_name
  82. else None
  83. )
  84. self.function_calling_active = self.config.function_calling
  85. if self.function_calling_active and not self.llm.is_function_calling_active():
  86. logger.warning(
  87. f'Function calling not supported for model {self.llm.config.model}. '
  88. 'Disabling function calling.'
  89. )
  90. self.function_calling_active = False
  91. if self.function_calling_active:
  92. # Function calling mode
  93. self.tools = codeact_function_calling.get_tools(
  94. codeact_enable_browsing=self.config.codeact_enable_browsing,
  95. codeact_enable_jupyter=self.config.codeact_enable_jupyter,
  96. codeact_enable_llm_editor=self.config.codeact_enable_llm_editor,
  97. )
  98. logger.debug(
  99. f'TOOLS loaded for CodeActAgent: {json.dumps(self.tools, indent=2)}'
  100. )
  101. self.system_prompt = codeact_function_calling.SYSTEM_PROMPT
  102. self.initial_user_message = None
  103. else:
  104. # Non-function-calling mode
  105. self.action_parser = CodeActResponseParser()
  106. self.prompt_manager = PromptManager(
  107. prompt_dir=os.path.join(os.path.dirname(__file__)),
  108. agent_skills_docs=AgentSkillsRequirement.documentation,
  109. micro_agent=self.micro_agent,
  110. )
  111. self.system_prompt = self.prompt_manager.system_message
  112. self.initial_user_message = self.prompt_manager.initial_user_message
  113. self.pending_actions: deque[Action] = deque()
  114. def get_action_message(
  115. self,
  116. action: Action,
  117. pending_tool_call_action_messages: dict[str, Message],
  118. ) -> list[Message]:
  119. """Converts an action into a message format that can be sent to the LLM.
  120. This method handles different types of actions and formats them appropriately:
  121. 1. For tool-based actions (AgentDelegate, CmdRun, IPythonRunCell, FileEdit) and agent-sourced AgentFinish:
  122. - In function calling mode: Stores the LLM's response in pending_tool_call_action_messages
  123. - In non-function calling mode: Creates a message with the action string
  124. 2. For MessageActions: Creates a message with the text content and optional image content
  125. Args:
  126. action (Action): The action to convert. Can be one of:
  127. - CmdRunAction: For executing bash commands
  128. - IPythonRunCellAction: For running IPython code
  129. - FileEditAction: For editing files
  130. - BrowseInteractiveAction: For browsing the web
  131. - AgentFinishAction: For ending the interaction
  132. - MessageAction: For sending messages
  133. pending_tool_call_action_messages (dict[str, Message]): Dictionary mapping response IDs
  134. to their corresponding messages. Used in function calling mode to track tool calls
  135. that are waiting for their results.
  136. Returns:
  137. list[Message]: A list containing the formatted message(s) for the action.
  138. May be empty if the action is handled as a tool call in function calling mode.
  139. Note:
  140. In function calling mode, tool-based actions are stored in pending_tool_call_action_messages
  141. rather than being returned immediately. They will be processed later when all corresponding
  142. tool call results are available.
  143. """
  144. # create a regular message from an event
  145. if isinstance(
  146. action,
  147. (
  148. AgentDelegateAction,
  149. CmdRunAction,
  150. IPythonRunCellAction,
  151. FileEditAction,
  152. BrowseInteractiveAction,
  153. ),
  154. ) or (isinstance(action, AgentFinishAction) and action.source == 'agent'):
  155. if self.function_calling_active:
  156. tool_metadata = action.tool_call_metadata
  157. assert tool_metadata is not None, (
  158. 'Tool call metadata should NOT be None when function calling is enabled. Action: '
  159. + str(action)
  160. )
  161. llm_response: ModelResponse = tool_metadata.model_response
  162. assistant_msg = llm_response.choices[0].message
  163. # Add the LLM message (assistant) that initiated the tool calls
  164. # (overwrites any previous message with the same response_id)
  165. pending_tool_call_action_messages[llm_response.id] = Message(
  166. role=assistant_msg.role,
  167. # tool call content SHOULD BE a string
  168. content=[TextContent(text=assistant_msg.content or '')]
  169. if assistant_msg.content is not None
  170. else [],
  171. tool_calls=assistant_msg.tool_calls,
  172. )
  173. return []
  174. else:
  175. assert not isinstance(action, BrowseInteractiveAction), (
  176. 'BrowseInteractiveAction is not supported in non-function calling mode. Action: '
  177. + str(action)
  178. )
  179. content = [TextContent(text=self.action_parser.action_to_str(action))]
  180. return [
  181. Message(
  182. role='user' if action.source == 'user' else 'assistant',
  183. content=content,
  184. )
  185. ]
  186. elif isinstance(action, MessageAction):
  187. role = 'user' if action.source == 'user' else 'assistant'
  188. content = [TextContent(text=action.content or '')]
  189. if self.llm.vision_is_active() and action.images_urls:
  190. content.append(ImageContent(image_urls=action.images_urls))
  191. return [
  192. Message(
  193. role=role,
  194. content=content,
  195. )
  196. ]
  197. return []
  198. def get_observation_message(
  199. self,
  200. obs: Observation,
  201. tool_call_id_to_message: dict[str, Message],
  202. ) -> list[Message]:
  203. """Converts an observation into a message format that can be sent to the LLM.
  204. This method handles different types of observations and formats them appropriately:
  205. - CmdOutputObservation: Formats command execution results with exit codes
  206. - IPythonRunCellObservation: Formats IPython cell execution results, replacing base64 images
  207. - FileEditObservation: Formats file editing results
  208. - AgentDelegateObservation: Formats results from delegated agent tasks
  209. - ErrorObservation: Formats error messages from failed actions
  210. - UserRejectObservation: Formats user rejection messages
  211. In function calling mode, observations with tool_call_metadata are stored in
  212. tool_call_id_to_message for later processing instead of being returned immediately.
  213. Args:
  214. obs (Observation): The observation to convert
  215. tool_call_id_to_message (dict[str, Message]): Dictionary mapping tool call IDs
  216. to their corresponding messages (used in function calling mode)
  217. Returns:
  218. list[Message]: A list containing the formatted message(s) for the observation.
  219. May be empty if the observation is handled as a tool response in function calling mode.
  220. Raises:
  221. ValueError: If the observation type is unknown
  222. """
  223. message: Message
  224. max_message_chars = self.llm.config.max_message_chars
  225. obs_prefix = 'OBSERVATION:\n'
  226. if isinstance(obs, CmdOutputObservation):
  227. text = obs_prefix + truncate_content(
  228. obs.content + obs.interpreter_details, max_message_chars
  229. )
  230. text += f'\n[Command finished with exit code {obs.exit_code}]'
  231. message = Message(role='user', content=[TextContent(text=text)])
  232. elif isinstance(obs, IPythonRunCellObservation):
  233. text = obs_prefix + obs.content
  234. # replace base64 images with a placeholder
  235. splitted = text.split('\n')
  236. for i, line in enumerate(splitted):
  237. if '![image](data:image/png;base64,' in line:
  238. splitted[i] = (
  239. '![image](data:image/png;base64, ...) already displayed to user'
  240. )
  241. text = '\n'.join(splitted)
  242. text = truncate_content(text, max_message_chars)
  243. message = Message(role='user', content=[TextContent(text=text)])
  244. elif isinstance(obs, FileEditObservation):
  245. text = obs_prefix + truncate_content(str(obs), max_message_chars)
  246. message = Message(role='user', content=[TextContent(text=text)])
  247. elif isinstance(obs, BrowserOutputObservation):
  248. text = obs.get_agent_obs_text()
  249. message = Message(
  250. role='user',
  251. content=[TextContent(text=obs_prefix + text)],
  252. )
  253. elif isinstance(obs, AgentDelegateObservation):
  254. text = obs_prefix + truncate_content(
  255. obs.outputs['content'] if 'content' in obs.outputs else '',
  256. max_message_chars,
  257. )
  258. message = Message(role='user', content=[TextContent(text=text)])
  259. elif isinstance(obs, ErrorObservation):
  260. text = obs_prefix + truncate_content(obs.content, max_message_chars)
  261. text += '\n[Error occurred in processing last action]'
  262. message = Message(role='user', content=[TextContent(text=text)])
  263. elif isinstance(obs, UserRejectObservation):
  264. text = 'OBSERVATION:\n' + truncate_content(obs.content, max_message_chars)
  265. text += '\n[Last action has been rejected by the user]'
  266. message = Message(role='user', content=[TextContent(text=text)])
  267. else:
  268. # If an observation message is not returned, it will cause an error
  269. # when the LLM tries to return the next message
  270. raise ValueError(f'Unknown observation type: {type(obs)}')
  271. if self.function_calling_active:
  272. # Update the message as tool response properly
  273. if (tool_call_metadata := obs.tool_call_metadata) is not None:
  274. tool_call_id_to_message[tool_call_metadata.tool_call_id] = Message(
  275. role='tool',
  276. content=message.content,
  277. tool_call_id=tool_call_metadata.tool_call_id,
  278. name=tool_call_metadata.function_name,
  279. )
  280. # No need to return the observation message
  281. # because it will be added by get_action_message when all the corresponding
  282. # tool calls in the SAME request are processed
  283. return []
  284. return [message]
  285. def reset(self) -> None:
  286. """Resets the CodeAct Agent."""
  287. super().reset()
  288. def step(self, state: State) -> Action:
  289. """Performs one step using the CodeAct Agent.
  290. This includes gathering info on previous steps and prompting the model to make a command to execute.
  291. Parameters:
  292. - state (State): used to get updated info
  293. Returns:
  294. - CmdRunAction(command) - bash command to run
  295. - IPythonRunCellAction(code) - IPython code to run
  296. - AgentDelegateAction(agent, inputs) - delegate action for (sub)task
  297. - MessageAction(content) - Message action to run (e.g. ask for clarification)
  298. - AgentFinishAction() - end the interaction
  299. """
  300. # Continue with pending actions if any
  301. if self.pending_actions:
  302. return self.pending_actions.popleft()
  303. # if we're done, go back
  304. last_user_message = state.get_last_user_message()
  305. if last_user_message and last_user_message.strip() == '/exit':
  306. return AgentFinishAction()
  307. # prepare what we want to send to the LLM
  308. messages = self._get_messages(state)
  309. params: dict = {
  310. 'messages': self.llm.format_messages_for_llm(messages),
  311. }
  312. if self.function_calling_active:
  313. params['tools'] = self.tools
  314. params['parallel_tool_calls'] = False
  315. else:
  316. params['stop'] = [
  317. '</execute_ipython>',
  318. '</execute_bash>',
  319. '</execute_browse>',
  320. '</file_edit>',
  321. ]
  322. response = self.llm.completion(**params)
  323. if self.function_calling_active:
  324. actions = codeact_function_calling.response_to_actions(response)
  325. for action in actions:
  326. self.pending_actions.append(action)
  327. return self.pending_actions.popleft()
  328. else:
  329. return self.action_parser.parse(response)
  330. def _get_messages(self, state: State) -> list[Message]:
  331. """Constructs the message history for the LLM conversation.
  332. This method builds a structured conversation history by processing events from the state
  333. and formatting them into messages that the LLM can understand. It handles both regular
  334. message flow and function-calling scenarios.
  335. The method performs the following steps:
  336. 1. Initializes with system prompt and optional initial user message
  337. 2. Processes events (Actions and Observations) into messages
  338. 3. Handles tool calls and their responses in function-calling mode
  339. 4. Manages message role alternation (user/assistant/tool)
  340. 5. Applies caching for specific LLM providers (e.g., Anthropic)
  341. 6. Adds environment reminders for non-function-calling mode
  342. Args:
  343. state (State): The current state object containing conversation history and other metadata
  344. Returns:
  345. list[Message]: A list of formatted messages ready for LLM consumption, including:
  346. - System message with prompt
  347. - Initial user message (if configured)
  348. - Action messages (from both user and assistant)
  349. - Observation messages (including tool responses)
  350. - Environment reminders (in non-function-calling mode)
  351. Note:
  352. - In function-calling mode, tool calls and their responses are carefully tracked
  353. to maintain proper conversation flow
  354. - Messages from the same role are combined to prevent consecutive same-role messages
  355. - For Anthropic models, specific messages are cached according to their documentation
  356. """
  357. messages: list[Message] = [
  358. Message(
  359. role='system',
  360. content=[
  361. TextContent(
  362. text=self.system_prompt,
  363. cache_prompt=self.llm.is_caching_prompt_active(), # Cache system prompt
  364. )
  365. ],
  366. )
  367. ]
  368. if self.initial_user_message:
  369. messages.append(
  370. Message(
  371. role='user',
  372. content=[TextContent(text=self.initial_user_message)],
  373. )
  374. )
  375. pending_tool_call_action_messages: dict[str, Message] = {}
  376. tool_call_id_to_message: dict[str, Message] = {}
  377. events = list(state.history)
  378. for event in events:
  379. # create a regular message from an event
  380. if isinstance(event, Action):
  381. messages_to_add = self.get_action_message(
  382. action=event,
  383. pending_tool_call_action_messages=pending_tool_call_action_messages,
  384. )
  385. elif isinstance(event, Observation):
  386. messages_to_add = self.get_observation_message(
  387. obs=event,
  388. tool_call_id_to_message=tool_call_id_to_message,
  389. )
  390. else:
  391. raise ValueError(f'Unknown event type: {type(event)}')
  392. # Check pending tool call action messages and see if they are complete
  393. _response_ids_to_remove = []
  394. for (
  395. response_id,
  396. pending_message,
  397. ) in pending_tool_call_action_messages.items():
  398. assert pending_message.tool_calls is not None, (
  399. 'Tool calls should NOT be None when function calling is enabled & the message is considered pending tool call. '
  400. f'Pending message: {pending_message}'
  401. )
  402. if all(
  403. tool_call.id in tool_call_id_to_message
  404. for tool_call in pending_message.tool_calls
  405. ):
  406. # If complete:
  407. # -- 1. Add the message that **initiated** the tool calls
  408. messages_to_add.append(pending_message)
  409. # -- 2. Add the tool calls **results***
  410. for tool_call in pending_message.tool_calls:
  411. messages_to_add.append(tool_call_id_to_message[tool_call.id])
  412. tool_call_id_to_message.pop(tool_call.id)
  413. _response_ids_to_remove.append(response_id)
  414. # Cleanup the processed pending tool messages
  415. for response_id in _response_ids_to_remove:
  416. pending_tool_call_action_messages.pop(response_id)
  417. for message in messages_to_add:
  418. # add regular message
  419. if message:
  420. # handle error if the message is the SAME role as the previous message
  421. # litellm.exceptions.BadRequestError: litellm.BadRequestError: OpenAIException - Error code: 400 - {'detail': 'Only supports u/a/u/a/u...'}
  422. # there shouldn't be two consecutive messages from the same role
  423. # NOTE: we shouldn't combine tool messages because each of them has a different tool_call_id
  424. if (
  425. messages
  426. and messages[-1].role == message.role
  427. and message.role != 'tool'
  428. ):
  429. messages[-1].content.extend(message.content)
  430. else:
  431. messages.append(message)
  432. if self.llm.is_caching_prompt_active():
  433. # NOTE: this is only needed for anthropic
  434. # following logic here:
  435. # https://github.com/anthropics/anthropic-quickstarts/blob/8f734fd08c425c6ec91ddd613af04ff87d70c5a0/computer-use-demo/computer_use_demo/loop.py#L241-L262
  436. breakpoints_remaining = 3 # remaining 1 for system/tool
  437. for message in reversed(messages):
  438. if message.role == 'user' or message.role == 'tool':
  439. if breakpoints_remaining > 0:
  440. message.content[
  441. -1
  442. ].cache_prompt = True # Last item inside the message content
  443. breakpoints_remaining -= 1
  444. else:
  445. break
  446. if not self.function_calling_active:
  447. # The latest user message is important:
  448. # we want to remind the agent of the environment constraints
  449. latest_user_message = next(
  450. islice(
  451. (
  452. m
  453. for m in reversed(messages)
  454. if m.role == 'user'
  455. and any(isinstance(c, TextContent) for c in m.content)
  456. ),
  457. 1,
  458. ),
  459. None,
  460. )
  461. # do not add this for function calling
  462. if latest_user_message:
  463. reminder_text = f'\n\nENVIRONMENT REMINDER: You have {state.max_iterations - state.iteration} turns left to complete the task. When finished reply with <finish></finish>.'
  464. latest_user_message.content.append(TextContent(text=reminder_text))
  465. return messages