  1. import os
  2. from itertools import islice
  3. from openhands.agenthub.codeact_agent.action_parser import CodeActResponseParser
  4. from openhands.controller.agent import Agent
  5. from openhands.controller.state.state import State
  6. from openhands.core.config import AgentConfig
  7. from openhands.core.message import ImageContent, Message, TextContent
  8. from openhands.events.action import (
  9. Action,
  10. AgentDelegateAction,
  11. AgentFinishAction,
  12. CmdRunAction,
  13. FileEditAction,
  14. IPythonRunCellAction,
  15. MessageAction,
  16. )
  17. from openhands.events.observation import (
  18. AgentDelegateObservation,
  19. CmdOutputObservation,
  20. FileEditObservation,
  21. IPythonRunCellObservation,
  22. UserRejectObservation,
  23. )
  24. from openhands.events.observation.error import ErrorObservation
  25. from openhands.events.observation.observation import Observation
  26. from openhands.events.serialization.event import truncate_content
  27. from openhands.llm.llm import LLM
  28. from openhands.runtime.plugins import (
  29. AgentSkillsRequirement,
  30. JupyterRequirement,
  31. PluginRequirement,
  32. )
  33. from openhands.utils.microagent import MicroAgent
  34. from openhands.utils.prompt import PromptManager
  35. class CodeActAgent(Agent):
  36. VERSION = '2.0'
  37. """
  38. The Code Act Agent is a minimalist agent.
  39. The agent works by passing the model a list of action-observation pairs and prompting the model to take the next step.
  40. ### Overview
  41. This agent implements the CodeAct idea ([paper](https://arxiv.org/abs/2402.01030), [tweet](https://twitter.com/xingyaow_/status/1754556835703751087)) that consolidates LLM agents’ **act**ions into a unified **code** action space for both *simplicity* and *performance* (see paper for more details).
  42. The conceptual idea is illustrated below. At each turn, the agent can:
  43. 1. **Converse**: Communicate with humans in natural language to ask for clarification, confirmation, etc.
  44. 2. **CodeAct**: Choose to perform the task by executing code
  45. - Execute any valid Linux `bash` command
  46. - Execute any valid `Python` code with [an interactive Python interpreter](https://ipython.org/). This is simulated through `bash` command, see plugin system below for more details.
  47. ![image](https://github.com/All-Hands-AI/OpenHands/assets/38853559/92b622e3-72ad-4a61-8f41-8c040b6d5fb3)
  48. """
  49. sandbox_plugins: list[PluginRequirement] = [
  50. # NOTE: AgentSkillsRequirement need to go before JupyterRequirement, since
  51. # AgentSkillsRequirement provides a lot of Python functions,
  52. # and it needs to be initialized before Jupyter for Jupyter to use those functions.
  53. AgentSkillsRequirement(),
  54. JupyterRequirement(),
  55. ]
  56. action_parser = CodeActResponseParser()
  57. def __init__(
  58. self,
  59. llm: LLM,
  60. config: AgentConfig,
  61. ) -> None:
  62. """Initializes a new instance of the CodeActAgent class.
  63. Parameters:
  64. - llm (LLM): The llm to be used by this agent
  65. """
  66. super().__init__(llm, config)
  67. self.reset()
  68. self.micro_agent = (
  69. MicroAgent(
  70. os.path.join(
  71. os.path.dirname(__file__), 'micro', f'{config.micro_agent_name}.md'
  72. )
  73. )
  74. if config.micro_agent_name
  75. else None
  76. )
  77. self.prompt_manager = PromptManager(
  78. prompt_dir=os.path.join(os.path.dirname(__file__)),
  79. agent_skills_docs=AgentSkillsRequirement.documentation,
  80. micro_agent=self.micro_agent,
  81. )
  82. def action_to_str(self, action: Action) -> str:
  83. if isinstance(action, CmdRunAction):
  84. return (
  85. f'{action.thought}\n<execute_bash>\n{action.command}\n</execute_bash>'
  86. )
  87. elif isinstance(action, IPythonRunCellAction):
  88. return f'{action.thought}\n<execute_ipython>\n{action.code}\n</execute_ipython>'
  89. elif isinstance(action, AgentDelegateAction):
  90. return f'{action.thought}\n<execute_browse>\n{action.inputs["task"]}\n</execute_browse>'
  91. elif isinstance(action, FileEditAction):
  92. return f'{action.thought}\n<file_edit path={action.path}>\n{action.content}\n</file_edit>'
  93. elif isinstance(action, MessageAction):
  94. return action.content
  95. elif isinstance(action, AgentFinishAction) and action.source == 'agent':
  96. return action.thought
  97. return ''
  98. def get_action_message(self, action: Action) -> Message | None:
  99. if (
  100. isinstance(action, AgentDelegateAction)
  101. or isinstance(action, CmdRunAction)
  102. or isinstance(action, IPythonRunCellAction)
  103. or isinstance(action, MessageAction)
  104. or isinstance(action, FileEditAction)
  105. or (isinstance(action, AgentFinishAction) and action.source == 'agent')
  106. ):
  107. content = [TextContent(text=self.action_to_str(action))]
  108. if (
  109. self.llm.vision_is_active()
  110. and isinstance(action, MessageAction)
  111. and action.images_urls
  112. ):
  113. content.append(ImageContent(image_urls=action.images_urls))
  114. return Message(
  115. role='user' if action.source == 'user' else 'assistant', content=content
  116. )
  117. return None
  118. def get_observation_message(self, obs: Observation) -> Message | None:
  119. max_message_chars = self.llm.config.max_message_chars
  120. obs_prefix = 'OBSERVATION:\n'
  121. if isinstance(obs, CmdOutputObservation):
  122. text = obs_prefix + truncate_content(obs.content, max_message_chars)
  123. text += (
  124. f'\n[Command {obs.command_id} finished with exit code {obs.exit_code}]'
  125. )
  126. return Message(role='user', content=[TextContent(text=text)])
  127. elif isinstance(obs, IPythonRunCellObservation):
  128. text = obs_prefix + obs.content
  129. # replace base64 images with a placeholder
  130. splitted = text.split('\n')
  131. for i, line in enumerate(splitted):
  132. if '![image](data:image/png;base64,' in line:
  133. splitted[i] = (
  134. '![image](data:image/png;base64, ...) already displayed to user'
  135. )
  136. text = '\n'.join(splitted)
  137. text = truncate_content(text, max_message_chars)
  138. return Message(role='user', content=[TextContent(text=text)])
  139. elif isinstance(obs, FileEditObservation):
  140. text = obs_prefix + truncate_content(str(obs), max_message_chars)
  141. return Message(role='user', content=[TextContent(text=text)])
  142. elif isinstance(obs, AgentDelegateObservation):
  143. text = obs_prefix + truncate_content(
  144. obs.outputs['content'] if 'content' in obs.outputs else '',
  145. max_message_chars,
  146. )
  147. return Message(role='user', content=[TextContent(text=text)])
  148. elif isinstance(obs, ErrorObservation):
  149. text = obs_prefix + truncate_content(obs.content, max_message_chars)
  150. text += '\n[Error occurred in processing last action]'
  151. return Message(role='user', content=[TextContent(text=text)])
  152. elif isinstance(obs, UserRejectObservation):
  153. text = 'OBSERVATION:\n' + truncate_content(obs.content, max_message_chars)
  154. text += '\n[Last action has been rejected by the user]'
  155. return Message(role='user', content=[TextContent(text=text)])
  156. else:
  157. # If an observation message is not returned, it will cause an error
  158. # when the LLM tries to return the next message
  159. raise ValueError(f'Unknown observation type: {type(obs)}')
    def reset(self) -> None:
        """Resets the CodeAct Agent.

        Currently this just defers to the base Agent's reset.
        """
        super().reset()
  163. def step(self, state: State) -> Action:
  164. """Performs one step using the CodeAct Agent.
  165. This includes gathering info on previous steps and prompting the model to make a command to execute.
  166. Parameters:
  167. - state (State): used to get updated info
  168. Returns:
  169. - CmdRunAction(command) - bash command to run
  170. - IPythonRunCellAction(code) - IPython code to run
  171. - AgentDelegateAction(agent, inputs) - delegate action for (sub)task
  172. - MessageAction(content) - Message action to run (e.g. ask for clarification)
  173. - AgentFinishAction() - end the interaction
  174. """
  175. # if we're done, go back
  176. latest_user_message = state.history.get_last_user_message()
  177. if latest_user_message and latest_user_message.strip() == '/exit':
  178. return AgentFinishAction()
  179. # prepare what we want to send to the LLM
  180. messages = self._get_messages(state)
  181. params = {
  182. 'messages': self.llm.format_messages_for_llm(messages),
  183. 'stop': [
  184. '</execute_ipython>',
  185. '</execute_bash>',
  186. '</execute_browse>',
  187. '</file_edit>',
  188. ],
  189. }
  190. response = self.llm.completion(**params)
  191. return self.action_parser.parse(response)
  192. def _get_messages(self, state: State) -> list[Message]:
  193. messages: list[Message] = [
  194. Message(
  195. role='system',
  196. content=[
  197. TextContent(
  198. text=self.prompt_manager.system_message,
  199. cache_prompt=self.llm.is_caching_prompt_active(), # Cache system prompt
  200. )
  201. ],
  202. ),
  203. Message(
  204. role='user',
  205. content=[
  206. TextContent(
  207. text=self.prompt_manager.initial_user_message,
  208. cache_prompt=self.llm.is_caching_prompt_active(), # if the user asks the same query,
  209. )
  210. ],
  211. ),
  212. ]
  213. for event in state.history.get_events():
  214. # create a regular message from an event
  215. if isinstance(event, Action):
  216. message = self.get_action_message(event)
  217. elif isinstance(event, Observation):
  218. message = self.get_observation_message(event)
  219. else:
  220. raise ValueError(f'Unknown event type: {type(event)}')
  221. # add regular message
  222. if message:
  223. # handle error if the message is the SAME role as the previous message
  224. # litellm.exceptions.BadRequestError: litellm.BadRequestError: OpenAIException - Error code: 400 - {'detail': 'Only supports u/a/u/a/u...'}
  225. # there shouldn't be two consecutive messages from the same role
  226. if messages and messages[-1].role == message.role:
  227. messages[-1].content.extend(message.content)
  228. else:
  229. messages.append(message)
  230. # Add caching to the last 2 user messages
  231. if self.llm.is_caching_prompt_active():
  232. user_turns_processed = 0
  233. for message in reversed(messages):
  234. if message.role == 'user' and user_turns_processed < 2:
  235. message.content[
  236. -1
  237. ].cache_prompt = True # Last item inside the message content
  238. user_turns_processed += 1
  239. # The latest user message is important:
  240. # we want to remind the agent of the environment constraints
  241. latest_user_message = next(
  242. islice(
  243. (
  244. m
  245. for m in reversed(messages)
  246. if m.role == 'user'
  247. and any(isinstance(c, TextContent) for c in m.content)
  248. ),
  249. 1,
  250. ),
  251. None,
  252. )
  253. if latest_user_message:
  254. reminder_text = f'\n\nENVIRONMENT REMINDER: You have {state.max_iterations - state.iteration} turns left to complete the task. When finished reply with <finish></finish>.'
  255. latest_user_message.content.append(TextContent(text=reminder_text))
  256. return messages