function_calling.py 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524
  1. """This file contains the function calling implementation for different actions.
  2. This is similar to the functionality of `CodeActResponseParser`.
  3. """
  4. import json
  5. from browsergym.core.action.highlevel import HighLevelActionSet
  6. from litellm import (
  7. ChatCompletionToolParam,
  8. ChatCompletionToolParamFunctionChunk,
  9. ModelResponse,
  10. )
  11. from openhands.core.logger import openhands_logger as logger
  12. from openhands.events.action import (
  13. Action,
  14. AgentDelegateAction,
  15. AgentFinishAction,
  16. BrowseInteractiveAction,
  17. CmdRunAction,
  18. FileEditAction,
  19. IPythonRunCellAction,
  20. MessageAction,
  21. )
  22. from openhands.events.tool import ToolCallMetadata
# System prompt text for the agent. It is only defined here; nothing else in the
# visible part of this module reads it.
SYSTEM_PROMPT = """You are OpenHands agent, a helpful AI assistant that can interact with a computer to solve tasks.
<IMPORTANT>
* If user provides a path, you should NOT assume it's relative to the current working directory. Instead, you should explore the file system to find the file before working on it.
* When configuring git credentials, use "openhands" as the user.name and "openhands@all-hands.dev" as the user.email by default, unless explicitly instructed otherwise.
</IMPORTANT>
"""
# Model-facing description of the `execute_bash` tool. This text is sent as the
# tool's `description`, so it is runtime data, not a comment.
_BASH_DESCRIPTION = """Execute a bash command in the terminal.
* Long running commands: For commands that may run indefinitely, it should be run in the background and the output should be redirected to a file, e.g. command = `python3 app.py > server.log 2>&1 &`.
* Interactive: If a bash command returns exit code `-1`, this means the process is not yet finished. The assistant must then send a second call to terminal with an empty `command` (which will retrieve any additional logs), or it can send additional text (set `command` to the text) to STDIN of the running process, or it can send command=`ctrl+c` to interrupt the process.
* Timeout: If a command execution result says "Command timed out. Sending SIGINT to the process", the assistant should retry running the command in the background.
"""

# Tool schema for `execute_bash`: one required string argument, `command`.
# `response_to_actions` forwards the decoded arguments to `CmdRunAction(**arguments)`,
# so the property names here must stay in sync with that constructor.
CmdRunTool = ChatCompletionToolParam(
    type='function',
    function=ChatCompletionToolParamFunctionChunk(
        name='execute_bash',
        description=_BASH_DESCRIPTION,
        parameters={
            'type': 'object',
            'properties': {
                'command': {
                    'type': 'string',
                    'description': 'The bash command to execute. Can be empty to view additional logs when previous exit code is `-1`. Can be `ctrl+c` to interrupt the currently running process.',
                },
            },
            'required': ['command'],
        },
    ),
)
# Model-facing description of the `execute_ipython_cell` tool (runtime data).
_IPYTHON_DESCRIPTION = """Run a cell of Python code in an IPython environment.
* The assistant should define variables and import packages before using them.
* The variable defined in the IPython environment will not be available outside the IPython environment (e.g., in terminal).
"""
# We are not using agentskills's file_ops for viewing files now because StrReplaceEditorTool already supports viewing files
# The text below used to be appended to the description and is kept for reference only:
# """* Apart from the standard Python library, the assistant can also use the following functions (already imported):
# {AgentSkillsRequirement.documentation}"""

# Tool schema for `execute_ipython_cell`: one required string argument, `code`.
# `response_to_actions` forwards the decoded arguments to
# `IPythonRunCellAction(**arguments)`.
IPythonTool = ChatCompletionToolParam(
    type='function',
    function=ChatCompletionToolParamFunctionChunk(
        name='execute_ipython_cell',
        description=_IPYTHON_DESCRIPTION,
        parameters={
            'type': 'object',
            'properties': {
                'code': {
                    'type': 'string',
                    'description': 'The Python code to execute. Supports magic commands like %pip.',
                },
            },
            'required': ['code'],
        },
    ),
)
  75. _FILE_EDIT_DESCRIPTION = """Edit a file.
  76. * The assistant can edit files by specifying the file path and providing a draft of the new file content.
  77. * The draft content doesn't need to be exactly the same as the existing file; the assistant may skip unchanged lines using comments like `# unchanged` to indicate unchanged sections.
  78. * IMPORTANT: For large files (e.g., > 300 lines), specify the range of lines to edit using `start` and `end` (1-indexed, inclusive). The range should be smaller than 300 lines.
  79. * To append to a file, set both `start` and `end` to `-1`.
  80. * If the file doesn't exist, a new file will be created with the provided content.
  81. **Example 1: general edit for short files**
  82. For example, given an existing file `/path/to/file.py` that looks like this:
  83. (this is the end of the file)
  84. 1|class MyClass:
  85. 2| def __init__(self):
  86. 3| self.x = 1
  87. 4| self.y = 2
  88. 5| self.z = 3
  89. 6|
  90. 7|print(MyClass().z)
  91. 8|print(MyClass().x)
  92. (this is the end of the file)
  93. The assistant wants to edit the file to look like this:
  94. (this is the end of the file)
  95. 1|class MyClass:
  96. 2| def __init__(self):
  97. 3| self.x = 1
  98. 4| self.y = 2
  99. 5|
  100. 6|print(MyClass().y)
  101. (this is the end of the file)
  102. The assistant may produce an edit action like this:
  103. path="/path/to/file.txt" start=1 end=-1
  104. content=```
  105. class MyClass:
  106. def __init__(self):
  107. # no changes before
  108. self.y = 2
  109. # self.z is removed
  110. # MyClass().z is removed
  111. print(MyClass().y)
  112. ```
  113. **Example 2: append to file for short files**
  114. For example, given an existing file `/path/to/file.py` that looks like this:
  115. (this is the end of the file)
  116. 1|class MyClass:
  117. 2| def __init__(self):
  118. 3| self.x = 1
  119. 4| self.y = 2
  120. 5| self.z = 3
  121. 6|
  122. 7|print(MyClass().z)
  123. 8|print(MyClass().x)
  124. (this is the end of the file)
  125. To append the following lines to the file:
  126. ```python
  127. print(MyClass().y)
  128. ```
  129. The assistant may produce an edit action like this:
  130. path="/path/to/file.txt" start=-1 end=-1
  131. content=```
  132. print(MyClass().y)
  133. ```
  134. **Example 3: edit for long files**
  135. Given an existing file `/path/to/file.py` that looks like this:
  136. (1000 more lines above)
  137. 1001|class MyClass:
  138. 1002| def __init__(self):
  139. 1003| self.x = 1
  140. 1004| self.y = 2
  141. 1005| self.z = 3
  142. 1006|
  143. 1007|print(MyClass().z)
  144. 1008|print(MyClass().x)
  145. (2000 more lines below)
  146. The assistant wants to edit the file to look like this:
  147. (1000 more lines above)
  148. 1001|class MyClass:
  149. 1002| def __init__(self):
  150. 1003| self.x = 1
  151. 1004| self.y = 2
  152. 1005|
  153. 1006|print(MyClass().y)
  154. (2000 more lines below)
  155. The assistant may produce an edit action like this:
  156. path="/path/to/file.txt" start=1001 end=1008
  157. content=```
  158. class MyClass:
  159. def __init__(self):
  160. # no changes before
  161. self.y = 2
  162. # self.z is removed
  163. # MyClass().z is removed
  164. print(MyClass().y)
  165. ```
  166. """
  167. LLMBasedFileEditTool = ChatCompletionToolParam(
  168. type='function',
  169. function=ChatCompletionToolParamFunctionChunk(
  170. name='edit_file',
  171. description=_FILE_EDIT_DESCRIPTION,
  172. parameters={
  173. 'type': 'object',
  174. 'properties': {
  175. 'path': {
  176. 'type': 'string',
  177. 'description': 'The absolute path to the file to be edited.',
  178. },
  179. 'new_content_draft': {
  180. 'type': 'string',
  181. 'description': 'A draft of the new content for the file being edited. Note that the assistant may skip unchanged lines.',
  182. },
  183. 'start': {
  184. 'type': 'integer',
  185. 'description': 'The starting line number for the edit (1-indexed, inclusive). Default is 1.',
  186. },
  187. 'end': {
  188. 'type': 'integer',
  189. 'description': 'The ending line number for the edit (1-indexed, inclusive). Default is -1 (end of file).',
  190. },
  191. },
  192. 'required': ['path', 'content'],
  193. },
  194. ),
  195. )
# Model-facing description of the `str_replace_editor` tool (runtime data).
_STR_REPLACE_EDITOR_DESCRIPTION = """Custom editing tool for viewing, creating and editing files
* State is persistent across command calls and discussions with the user
* If `path` is a file, `view` displays the result of applying `cat -n`. If `path` is a directory, `view` lists non-hidden files and directories up to 2 levels deep
* The `create` command cannot be used if the specified `path` already exists as a file
* If a `command` generates a long output, it will be truncated and marked with `<response clipped>`
* The `undo_edit` command will revert the last edit made to the file at `path`
Notes for using the `str_replace` command:
* The `old_str` parameter should match EXACTLY one or more consecutive lines from the original file. Be mindful of whitespaces!
* If the `old_str` parameter is not unique in the file, the replacement will not be performed. Make sure to include enough context in `old_str` to make it unique
* The `new_str` parameter should contain the edited lines that should replace the `old_str`
"""

# Tool schema for `str_replace_editor`. Only `command` and `path` are required by
# the schema; the remaining properties are per-command arguments, as described in
# each property's own `description`.
# `response_to_actions` splats the decoded arguments into a `file_editor(...)` call,
# so the property names here are the keyword arguments that call receives.
StrReplaceEditorTool = ChatCompletionToolParam(
    type='function',
    function=ChatCompletionToolParamFunctionChunk(
        name='str_replace_editor',
        description=_STR_REPLACE_EDITOR_DESCRIPTION,
        parameters={
            'type': 'object',
            'properties': {
                'command': {
                    'description': 'The commands to run. Allowed options are: `view`, `create`, `str_replace`, `insert`, `undo_edit`.',
                    'enum': ['view', 'create', 'str_replace', 'insert', 'undo_edit'],
                    'type': 'string',
                },
                'path': {
                    'description': 'Absolute path to file or directory, e.g. `/repo/file.py` or `/repo`.',
                    'type': 'string',
                },
                'file_text': {
                    'description': 'Required parameter of `create` command, with the content of the file to be created.',
                    'type': 'string',
                },
                'old_str': {
                    'description': 'Required parameter of `str_replace` command containing the string in `path` to replace.',
                    'type': 'string',
                },
                'new_str': {
                    'description': 'Optional parameter of `str_replace` command containing the new string (if not given, no string will be added). Required parameter of `insert` command containing the string to insert.',
                    'type': 'string',
                },
                'insert_line': {
                    'description': 'Required parameter of `insert` command. The `new_str` will be inserted AFTER the line `insert_line` of `path`.',
                    'type': 'integer',
                },
                'view_range': {
                    'description': 'Optional parameter of `view` command when `path` points to a file. If none is given, the full file is shown. If provided, the file will be shown in the indicated line number range, e.g. [11, 12] will show lines 11 and 12. Indexing at 1 to start. Setting `[start_line, -1]` shows all lines from `start_line` to the end of the file.',
                    'items': {'type': 'integer'},
                    'type': 'array',
                },
            },
            'required': ['command', 'path'],
        },
    ),
)
# from browsergym/core/action/highlevel.py
# The BrowserGym action set the browser tool is documented against. Its
# `action_set` entries (signature + description) are compared with
# `_BROWSER_DESCRIPTION` when this module is imported.
_browser_action_space = HighLevelActionSet(
    subsets=['bid', 'nav'],
    strict=False,  # less strict on the parsing of the actions
    multiaction=True,  # enable to agent to take multiple actions at once
)
# Model-facing description of the `browser` tool (runtime data).
# Every action signature and description from `_browser_action_space` must appear
# verbatim in this text; the import-time check after this constant enforces that,
# so do not reword or re-wrap the signature/description lines.
# NOTE(review): this is not a raw string, so the `\n` inside the second `fill`
# example is turned into a real line break in the text the model sees — confirm
# whether a literal backslash-n was intended.
_BROWSER_DESCRIPTION = """Interact with the browser using Python code.
The following 15 functions are available. Nothing else is supported.
goto(url: str)
Description: Navigate to a url.
Examples:
goto('http://www.example.com')
go_back()
Description: Navigate to the previous page in history.
Examples:
go_back()
go_forward()
Description: Navigate to the next page in history.
Examples:
go_forward()
noop(wait_ms: float = 1000)
Description: Do nothing, and optionally wait for the given time (in milliseconds).
You can use this to get the current page content and/or wait for the page to load.
Examples:
noop()
noop(500)
scroll(delta_x: float, delta_y: float)
Description: Scroll horizontally and vertically. Amounts in pixels, positive for right or down scrolling, negative for left or up scrolling. Dispatches a wheel event.
Examples:
scroll(0, 200)
scroll(-50.2, -100.5)
fill(bid: str, value: str)
Description: Fill out a form field. It focuses the element and triggers an input event with the entered text. It works for <input>, <textarea> and [contenteditable] elements.
Examples:
fill('237', 'example value')
fill('45', 'multi-line\nexample')
fill('a12', 'example with "quotes"')
select_option(bid: str, options: str | list[str])
Description: Select one or multiple options in a <select> element. You can specify option value or label to select. Multiple options can be selected.
Examples:
select_option('a48', 'blue')
select_option('c48', ['red', 'green', 'blue'])
click(bid: str, button: Literal['left', 'middle', 'right'] = 'left', modifiers: list[typing.Literal['Alt', 'Control', 'ControlOrMeta', 'Meta', 'Shift']] = [])
Description: Click an element.
Examples:
click('a51')
click('b22', button='right')
click('48', button='middle', modifiers=['Shift'])
dblclick(bid: str, button: Literal['left', 'middle', 'right'] = 'left', modifiers: list[typing.Literal['Alt', 'Control', 'ControlOrMeta', 'Meta', 'Shift']] = [])
Description: Double click an element.
Examples:
dblclick('12')
dblclick('ca42', button='right')
dblclick('178', button='middle', modifiers=['Shift'])
hover(bid: str)
Description: Hover over an element.
Examples:
hover('b8')
press(bid: str, key_comb: str)
Description: Focus the matching element and press a combination of keys. It accepts the logical key names that are emitted in the keyboardEvent.key property of the keyboard events: Backquote, Minus, Equal, Backslash, Backspace, Tab, Delete, Escape, ArrowDown, End, Enter, Home, Insert, PageDown, PageUp, ArrowRight, ArrowUp, F1 - F12, Digit0 - Digit9, KeyA - KeyZ, etc. You can alternatively specify a single character you'd like to produce such as "a" or "#". Following modification shortcuts are also supported: Shift, Control, Alt, Meta, ShiftLeft, ControlOrMeta. ControlOrMeta resolves to Control on Windows and Linux and to Meta on macOS.
Examples:
press('88', 'Backspace')
press('a26', 'ControlOrMeta+a')
press('a61', 'Meta+Shift+t')
focus(bid: str)
Description: Focus the matching element.
Examples:
focus('b455')
clear(bid: str)
Description: Clear the input field.
Examples:
clear('996')
drag_and_drop(from_bid: str, to_bid: str)
Description: Perform a drag & drop. Hover the element that will be dragged. Press left mouse button. Move mouse to the element that will receive the drop. Release left mouse button.
Examples:
drag_and_drop('56', '498')
upload_file(bid: str, file: str | list[str])
Description: Click an element and wait for a "filechooser" event, then select one or multiple input files for upload. Relative file paths are resolved relative to the current working directory. An empty list clears the selected files.
Examples:
upload_file('572', '/home/user/my_receipt.pdf')
upload_file('63', ['/home/bob/Documents/image.jpg', '/home/bob/Documents/file.zip'])
Multiple actions can be provided at once, but will be executed sequentially without any feedback from the page.
More than 2-3 actions usually leads to failure or unexpected behavior. Example:
fill('a12', 'example with "quotes"')
click('a51')
click('48', button='middle', modifiers=['Shift'])
"""
  337. for _, action in _browser_action_space.action_set.items():
  338. assert (
  339. action.signature in _BROWSER_DESCRIPTION
  340. ), f'Browser description mismatch. Please double check if the BrowserGym updated their action space.\n\nAction: {action.signature}'
  341. assert (
  342. action.description in _BROWSER_DESCRIPTION
  343. ), f'Browser description mismatch. Please double check if the BrowserGym updated their action space.\n\nAction: {action.description}'
# Tool schema for `browser`: one required string argument, `code`.
# `response_to_actions` reads `arguments['code']` and passes it on as
# `BrowseInteractiveAction(browser_actions=...)`.
BrowserTool = ChatCompletionToolParam(
    type='function',
    function=ChatCompletionToolParamFunctionChunk(
        name='browser',
        description=_BROWSER_DESCRIPTION,
        parameters={
            'type': 'object',
            'properties': {
                'code': {
                    'type': 'string',
                    'description': 'The Python code that interacts with the browser.',
                }
            },
            'required': ['code'],
        },
    ),
)
# Model-facing description of the `finish` tool (runtime data).
_FINISH_DESCRIPTION = """Finish the interaction when the task is complete OR if the assistant cannot proceed further with the task."""

# Tool schema for `finish`. It declares no `parameters`; `response_to_actions`
# maps a call to it onto a bare `AgentFinishAction()`.
FinishTool = ChatCompletionToolParam(
    type='function',
    function=ChatCompletionToolParamFunctionChunk(
        name='finish',
        description=_FINISH_DESCRIPTION,
    ),
)
  369. def combine_thought(action: Action, thought: str) -> Action:
  370. if not hasattr(action, 'thought'):
  371. return action
  372. if thought:
  373. action.thought = thought
  374. return action
  375. def response_to_actions(response: ModelResponse) -> list[Action]:
  376. actions: list[Action] = []
  377. assert len(response.choices) == 1, 'Only one choice is supported for now'
  378. assistant_msg = response.choices[0].message
  379. if assistant_msg.tool_calls:
  380. # Check if there's assistant_msg.content. If so, add it to the thought
  381. thought = ''
  382. if isinstance(assistant_msg.content, str):
  383. thought = assistant_msg.content
  384. elif isinstance(assistant_msg.content, list):
  385. for msg in assistant_msg.content:
  386. if msg['type'] == 'text':
  387. thought += msg['text']
  388. # Process each tool call to OpenHands action
  389. for i, tool_call in enumerate(assistant_msg.tool_calls):
  390. action: Action
  391. try:
  392. arguments = json.loads(tool_call.function.arguments)
  393. except json.decoder.JSONDecodeError as e:
  394. raise RuntimeError(
  395. f'Failed to parse tool call arguments: {tool_call.function.arguments}'
  396. ) from e
  397. if tool_call.function.name == 'execute_bash':
  398. action = CmdRunAction(**arguments)
  399. elif tool_call.function.name == 'execute_ipython_cell':
  400. action = IPythonRunCellAction(**arguments)
  401. elif tool_call.function.name == 'delegate_to_browsing_agent':
  402. action = AgentDelegateAction(
  403. agent='BrowsingAgent',
  404. inputs=arguments,
  405. )
  406. elif tool_call.function.name == 'finish':
  407. action = AgentFinishAction()
  408. elif tool_call.function.name == 'edit_file':
  409. action = FileEditAction(**arguments)
  410. elif tool_call.function.name == 'str_replace_editor':
  411. # We implement this in agent_skills, which can be used via Jupyter
  412. # convert tool_call.function.arguments to kwargs that can be passed to file_editor
  413. code = f'print(file_editor(**{arguments}))'
  414. logger.debug(
  415. f'TOOL CALL: str_replace_editor -> file_editor with code: {code}'
  416. )
  417. action = IPythonRunCellAction(code=code, include_extra=False)
  418. elif tool_call.function.name == 'browser':
  419. action = BrowseInteractiveAction(browser_actions=arguments['code'])
  420. else:
  421. raise RuntimeError(f'Unknown tool call: {tool_call.function.name}')
  422. # We only add thought to the first action
  423. if i == 0:
  424. action = combine_thought(action, thought)
  425. # Add metadata for tool calling
  426. action.tool_call_metadata = ToolCallMetadata(
  427. tool_call_id=tool_call.id,
  428. function_name=tool_call.function.name,
  429. model_response=response,
  430. total_calls_in_response=len(assistant_msg.tool_calls),
  431. )
  432. actions.append(action)
  433. else:
  434. actions.append(
  435. MessageAction(content=assistant_msg.content, wait_for_response=True)
  436. )
  437. assert len(actions) >= 1
  438. return actions
  439. def get_tools(
  440. codeact_enable_browsing: bool = False,
  441. codeact_enable_llm_editor: bool = False,
  442. codeact_enable_jupyter: bool = False,
  443. ) -> list[ChatCompletionToolParam]:
  444. tools = [CmdRunTool, FinishTool]
  445. if codeact_enable_browsing:
  446. tools.append(BrowserTool)
  447. if codeact_enable_jupyter:
  448. tools.append(IPythonTool)
  449. if codeact_enable_llm_editor:
  450. tools.append(LLMBasedFileEditTool)
  451. else:
  452. tools.append(StrReplaceEditorTool)
  453. return tools