runtime.py 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483
  1. import asyncio
  2. import os
  3. import tempfile
  4. import threading
  5. import uuid
  6. from zipfile import ZipFile
  7. import aiohttp
  8. import docker
  9. import tenacity
  10. from openhands.core.config import AppConfig
  11. from openhands.core.logger import openhands_logger as logger
  12. from openhands.events import EventStream
  13. from openhands.events.action import (
  14. BrowseInteractiveAction,
  15. BrowseURLAction,
  16. CmdRunAction,
  17. FileReadAction,
  18. FileWriteAction,
  19. IPythonRunCellAction,
  20. )
  21. from openhands.events.action.action import Action
  22. from openhands.events.observation import (
  23. ErrorObservation,
  24. NullObservation,
  25. Observation,
  26. )
  27. from openhands.events.serialization import event_to_dict, observation_from_dict
  28. from openhands.events.serialization.action import ACTION_TYPE_TO_CLASS
  29. from openhands.runtime.builder import DockerRuntimeBuilder
  30. from openhands.runtime.plugins import PluginRequirement
  31. from openhands.runtime.runtime import Runtime
  32. from openhands.runtime.utils import find_available_tcp_port
  33. from openhands.runtime.utils.runtime_build import build_runtime_image
  34. class LogBuffer:
  35. """
  36. Synchronous buffer for Docker container logs.
  37. This class provides a thread-safe way to collect, store, and retrieve logs
  38. from a Docker container. It uses a list to store log lines and provides methods
  39. for appending, retrieving, and clearing logs.
  40. """
  41. def __init__(self, container: docker.models.containers.Container):
  42. self.buffer: list[str] = []
  43. self.lock = threading.Lock()
  44. self.log_generator = container.logs(stream=True, follow=True)
  45. self.log_stream_thread = threading.Thread(target=self.stream_logs)
  46. self.log_stream_thread.daemon = True
  47. self.log_stream_thread.start()
  48. self._stop_event = threading.Event()
  49. def append(self, log_line: str):
  50. with self.lock:
  51. self.buffer.append(log_line)
  52. def get_and_clear(self) -> list[str]:
  53. with self.lock:
  54. logs = list(self.buffer)
  55. self.buffer.clear()
  56. return logs
  57. def stream_logs(self):
  58. """
  59. Stream logs from the Docker container in a separate thread.
  60. This method runs in its own thread to handle the blocking
  61. operation of reading log lines from the Docker SDK's synchronous generator.
  62. """
  63. try:
  64. for log_line in self.log_generator:
  65. if self._stop_event.is_set():
  66. break
  67. if log_line:
  68. self.append(log_line.decode('utf-8').rstrip())
  69. except Exception as e:
  70. logger.error(f'Error in stream_logs: {e}')
  71. def __del__(self):
  72. if self.log_stream_thread.is_alive():
  73. logger.warn(
  74. "LogBuffer was not properly closed. Use 'log_buffer.close()' for clean shutdown."
  75. )
  76. self.close(timeout=5)
  77. def close(self, timeout: float = 10.0):
  78. self._stop_event.set()
  79. self.log_stream_thread.join(timeout)
class EventStreamRuntime(Runtime):
    """This runtime will subscribe the event stream.

    When receive an event, it will send the event to runtime-client which run inside the docker environment.
    """

    # Name prefix for sandbox containers; close() looks for this prefix
    # when deciding which containers to remove.
    container_name_prefix = 'openhands-sandbox-'
  85. def __init__(
  86. self,
  87. config: AppConfig,
  88. event_stream: EventStream,
  89. sid: str = 'default',
  90. plugins: list[PluginRequirement] | None = None,
  91. ):
  92. super().__init__(
  93. config, event_stream, sid, plugins
  94. ) # will initialize the event stream
  95. self._port = find_available_tcp_port()
  96. self.api_url = f'http://{self.config.sandbox.api_hostname}:{self._port}'
  97. self.session: aiohttp.ClientSession | None = None
  98. self.instance_id = (
  99. sid + '_' + str(uuid.uuid4()) if sid is not None else str(uuid.uuid4())
  100. )
  101. # TODO: We can switch to aiodocker when `build_sandbox_image` is updated to use aiodocker
  102. self.docker_client: docker.DockerClient = self._init_docker_client()
  103. self.base_container_image = self.config.sandbox.base_container_image
  104. self.runtime_container_image = self.config.sandbox.runtime_container_image
  105. self.container_name = self.container_name_prefix + self.instance_id
  106. self.container = None
  107. self.action_semaphore = asyncio.Semaphore(1) # Ensure one action at a time
  108. self.runtime_builder = DockerRuntimeBuilder(self.docker_client)
  109. logger.debug(f'EventStreamRuntime `{sid}` config:\n{self.config}')
  110. # Buffer for container logs
  111. self.log_buffer: LogBuffer | None = None
  112. self.startup_done = False
  113. async def ainit(self, env_vars: dict[str, str] | None = None):
  114. if self.config.sandbox.runtime_extra_deps:
  115. logger.info(
  116. f'Installing extra user-provided dependencies in the runtime image: {self.config.sandbox.runtime_extra_deps}'
  117. )
  118. if self.runtime_container_image is None:
  119. if self.base_container_image is None:
  120. raise ValueError(
  121. 'Neither runtime container image nor base container image is set'
  122. )
  123. self.runtime_container_image = build_runtime_image(
  124. self.base_container_image,
  125. self.runtime_builder,
  126. extra_deps=self.config.sandbox.runtime_extra_deps,
  127. )
  128. self.container = await self._init_container(
  129. self.sandbox_workspace_dir,
  130. mount_dir=self.config.workspace_mount_path,
  131. plugins=self.plugins,
  132. )
  133. # MUST call super().ainit() to initialize both default env vars
  134. # AND the ones in env vars!
  135. await super().ainit(env_vars)
  136. logger.info(
  137. f'Container initialized with plugins: {[plugin.name for plugin in self.plugins]}'
  138. )
  139. logger.info(f'Container initialized with env vars: {env_vars}')
  140. @staticmethod
  141. def _init_docker_client() -> docker.DockerClient:
  142. try:
  143. return docker.from_env()
  144. except Exception as ex:
  145. logger.error(
  146. 'Launch docker client failed. Please make sure you have installed docker and started the docker daemon.'
  147. )
  148. raise ex
    @tenacity.retry(
        stop=tenacity.stop_after_attempt(5),
        wait=tenacity.wait_exponential(multiplier=1, min=4, max=60),
    )
    async def _init_container(
        self,
        sandbox_workspace_dir: str,
        mount_dir: str | None = None,
        plugins: list[PluginRequirement] | None = None,
    ):
        """Start the Docker container that hosts the runtime client.

        Retried up to 5 times with exponential backoff (tenacity), since
        container startup can fail transiently.

        Args:
            sandbox_workspace_dir: working directory for the client inside the container.
            mount_dir: host directory to bind-mount at sandbox_workspace_dir;
                when None no workspace volume is mounted.
            plugins: plugins whose names are passed to the client via --plugins.

        Returns:
            The started container object.

        Raises:
            Exception: re-raised after cleanup if startup fails (triggers retry).
        """
        try:
            logger.info(
                f'Starting container with image: {self.runtime_container_image} and name: {self.container_name}'
            )
            plugin_arg = ''
            if plugins is not None and len(plugins) > 0:
                plugin_arg = (
                    f'--plugins {" ".join([plugin.name for plugin in plugins])} '
                )
            network_mode: str | None = None
            port_mapping: dict[str, int] | None = None
            if self.config.sandbox.use_host_network:
                network_mode = 'host'
                logger.warn(
                    'Using host network mode. If you are using MacOS, please make sure you have the latest version of Docker Desktop and enabled host network feature: https://docs.docker.com/network/drivers/host/#docker-desktop'
                )
            else:
                # Publish the client's port so the host can reach its HTTP API.
                port_mapping = {f'{self._port}/tcp': self._port}
            if mount_dir is not None:
                volumes = {mount_dir: {'bind': sandbox_workspace_dir, 'mode': 'rw'}}
                logger.info(f'Mount dir: {sandbox_workspace_dir}')
            else:
                logger.warn(
                    'Mount dir is not set, will not mount the workspace directory to the container.'
                )
                volumes = None
            if self.config.sandbox.browsergym_eval_env is not None:
                browsergym_arg = (
                    f'--browsergym-eval-env {self.config.sandbox.browsergym_eval_env}'
                )
            else:
                browsergym_arg = ''
            # Launch the runtime client module inside the container's mamba env;
            # -u / PYTHONUNBUFFERED keep its stdout flowing into the log buffer.
            container = self.docker_client.containers.run(
                self.runtime_container_image,
                command=(
                    f'/openhands/miniforge3/bin/mamba run --no-capture-output -n base '
                    'PYTHONUNBUFFERED=1 poetry run '
                    f'python -u -m openhands.runtime.client.client {self._port} '
                    f'--working-dir {sandbox_workspace_dir} '
                    f'{plugin_arg}'
                    f'--username {"openhands" if self.config.run_as_openhands else "root"} '
                    f'--user-id {self.config.sandbox.user_id} '
                    f'{browsergym_arg}'
                ),
                network_mode=network_mode,
                ports=port_mapping,
                working_dir='/openhands/code/',
                name=self.container_name,
                detach=True,
                environment={'DEBUG': 'true'} if self.config.debug else None,
                volumes=volumes,
            )
            # Start capturing logs immediately so startup output is not lost.
            self.log_buffer = LogBuffer(container)
            logger.info(f'Container started. Server url: {self.api_url}')
            return container
        except Exception as e:
            logger.error('Failed to start container')
            logger.exception(e)
            # Keep the docker client open; tenacity will retry this method.
            await self.close(close_client=False)
            raise e
  219. async def _ensure_session(self):
  220. if self.session is None or self.session.closed:
  221. self.session = aiohttp.ClientSession()
  222. return self.session
  223. @tenacity.retry(
  224. stop=tenacity.stop_after_attempt(10),
  225. wait=tenacity.wait_exponential(multiplier=2, min=10, max=60),
  226. )
  227. async def _wait_until_alive(self):
  228. init_msg = 'Runtime client initialized.'
  229. logger.debug('Getting container logs...')
  230. # Print and clear the log buffer
  231. assert (
  232. self.log_buffer is not None
  233. ), 'Log buffer is expected to be initialized when container is started'
  234. # Always process logs, regardless of startup_done status
  235. logs = self.log_buffer.get_and_clear()
  236. if logs:
  237. formatted_logs = '\n'.join([f' |{log}' for log in logs])
  238. logger.info(
  239. '\n'
  240. + '-' * 30
  241. + 'Container logs:'
  242. + '-' * 30
  243. + f'\n{formatted_logs}'
  244. + '\n'
  245. + '-' * 90
  246. )
  247. # Check for initialization message even if startup_done is True
  248. if any(init_msg in log for log in logs):
  249. self.startup_done = True
  250. if not self.startup_done:
  251. attempts = 0
  252. while not self.startup_done and attempts < 10:
  253. attempts += 1
  254. await asyncio.sleep(1)
  255. logs = self.log_buffer.get_and_clear()
  256. if logs:
  257. formatted_logs = '\n'.join([f' |{log}' for log in logs])
  258. logger.info(
  259. '\n'
  260. + '-' * 30
  261. + 'Container logs:'
  262. + '-' * 30
  263. + f'\n{formatted_logs}'
  264. + '\n'
  265. + '-' * 90
  266. )
  267. if any(init_msg in log for log in logs):
  268. self.startup_done = True
  269. break
  270. async with aiohttp.ClientSession() as session:
  271. async with session.get(f'{self.api_url}/alive') as response:
  272. if response.status == 200:
  273. return
  274. else:
  275. msg = f'Action execution API is not alive. Response: {response}'
  276. logger.error(msg)
  277. raise RuntimeError(msg)
  278. @property
  279. def sandbox_workspace_dir(self):
  280. return self.config.workspace_mount_path_in_sandbox
    async def close(self, close_client: bool = True):
        """Shut down the runtime: stop log streaming, close the HTTP session,
        and remove sandbox containers.

        Args:
            close_client: also close the shared Docker client. Passed as False
                from _init_container's failure path so tenacity can retry with
                the same client.
        """
        if self.log_buffer:
            self.log_buffer.close()
        if self.session is not None and not self.session.closed:
            await self.session.close()
        # NOTE(review): this removes EVERY container whose name starts with
        # container_name_prefix, not only the one owned by this instance —
        # concurrent sessions' sandboxes would be torn down too. Confirm this
        # blanket cleanup is intentional.
        containers = self.docker_client.containers.list(all=True)
        for container in containers:
            try:
                if container.name.startswith(self.container_name_prefix):
                    # Dump a tail of the container's logs before removal to aid debugging.
                    logs = container.logs(tail=1000).decode('utf-8')
                    logger.debug(
                        f'==== Container logs ====\n{logs}\n==== End of container logs ===='
                    )
                    container.remove(force=True)
            except docker.errors.NotFound:
                # Container vanished between list() and remove(); nothing to do.
                pass
        if close_client:
            self.docker_client.close()
    async def run_action(self, action: Action) -> Observation:
        """Execute `action` in the sandbox by POSTing it to the runtime client.

        Serializes the action, sends it to the client's /execute_action
        endpoint, and deserializes the returned observation. The semaphore
        ensures only one action is in flight at a time.

        Args:
            action: the action to run; its `timeout` is defaulted from the
                sandbox config when unset.

        Returns:
            The observation produced by the runtime client, a NullObservation
            for non-runnable actions, or an ErrorObservation on failure.
        """
        # set timeout to default if not set
        if action.timeout is None:
            action.timeout = self.config.sandbox.timeout
        async with self.action_semaphore:
            if not action.runnable:
                return NullObservation('')
            action_type = action.action  # type: ignore[attr-defined]
            if action_type not in ACTION_TYPE_TO_CLASS:
                return ErrorObservation(f'Action {action_type} does not exist.')
            if not hasattr(self, action_type):
                # This runtime exposes one method per supported action type.
                return ErrorObservation(
                    f'Action {action_type} is not supported in the current runtime.'
                )
            logger.info('Awaiting session')
            session = await self._ensure_session()
            await self._wait_until_alive()
            assert action.timeout is not None
            try:
                logger.info('Executing command')
                async with session.post(
                    f'{self.api_url}/execute_action',
                    json={'action': event_to_dict(action)},
                    timeout=action.timeout,
                ) as response:
                    if response.status == 200:
                        output = await response.json()
                        obs = observation_from_dict(output)
                        # Link the observation back to the action that caused it.
                        obs._cause = action.id  # type: ignore[attr-defined]
                        return obs
                    else:
                        error_message = await response.text()
                        logger.error(f'Error from server: {error_message}')
                        obs = ErrorObservation(
                            f'Command execution failed: {error_message}'
                        )
            except asyncio.TimeoutError:
                logger.error('No response received within the timeout period.')
                obs = ErrorObservation('Command execution timed out')
            except Exception as e:
                logger.error(f'Error during command execution: {e}')
                obs = ErrorObservation(f'Command execution failed: {str(e)}')
            return obs
    # Typed entry points required by the Runtime interface; each simply
    # forwards its action to run_action(), which POSTs it to the client.
    async def run(self, action: CmdRunAction) -> Observation:
        return await self.run_action(action)

    async def run_ipython(self, action: IPythonRunCellAction) -> Observation:
        return await self.run_action(action)

    async def read(self, action: FileReadAction) -> Observation:
        return await self.run_action(action)

    async def write(self, action: FileWriteAction) -> Observation:
        return await self.run_action(action)

    async def browse(self, action: BrowseURLAction) -> Observation:
        return await self.run_action(action)

    async def browse_interactive(self, action: BrowseInteractiveAction) -> Observation:
        return await self.run_action(action)
  354. # ====================================================================
  355. # Implement these methods (for file operations) in the subclass
  356. # ====================================================================
  357. async def copy_to(
  358. self, host_src: str, sandbox_dest: str, recursive: bool = False
  359. ) -> None:
  360. if not os.path.exists(host_src):
  361. raise FileNotFoundError(f'Source file {host_src} does not exist')
  362. session = await self._ensure_session()
  363. await self._wait_until_alive()
  364. try:
  365. if recursive:
  366. # For recursive copy, create a zip file
  367. with tempfile.NamedTemporaryFile(
  368. suffix='.zip', delete=False
  369. ) as temp_zip:
  370. temp_zip_path = temp_zip.name
  371. with ZipFile(temp_zip_path, 'w') as zipf:
  372. for root, _, files in os.walk(host_src):
  373. for file in files:
  374. file_path = os.path.join(root, file)
  375. arcname = os.path.relpath(
  376. file_path, os.path.dirname(host_src)
  377. )
  378. zipf.write(file_path, arcname)
  379. upload_data = {'file': open(temp_zip_path, 'rb')}
  380. else:
  381. # For single file copy
  382. upload_data = {'file': open(host_src, 'rb')}
  383. params = {'destination': sandbox_dest, 'recursive': str(recursive).lower()}
  384. async with session.post(
  385. f'{self.api_url}/upload_file', data=upload_data, params=params
  386. ) as response:
  387. if response.status == 200:
  388. return
  389. else:
  390. error_message = await response.text()
  391. raise Exception(f'Copy operation failed: {error_message}')
  392. except asyncio.TimeoutError:
  393. raise TimeoutError('Copy operation timed out')
  394. except Exception as e:
  395. raise RuntimeError(f'Copy operation failed: {str(e)}')
  396. finally:
  397. if recursive:
  398. os.unlink(temp_zip_path)
  399. logger.info(f'Copy completed: host:{host_src} -> runtime:{sandbox_dest}')
  400. async def list_files(self, path: str | None = None) -> list[str]:
  401. """List files in the sandbox.
  402. If path is None, list files in the sandbox's initial working directory (e.g., /workspace).
  403. """
  404. session = await self._ensure_session()
  405. await self._wait_until_alive()
  406. try:
  407. data = {}
  408. if path is not None:
  409. data['path'] = path
  410. async with session.post(
  411. f'{self.api_url}/list_files', json=data
  412. ) as response:
  413. if response.status == 200:
  414. response_json = await response.json()
  415. assert isinstance(response_json, list)
  416. return response_json
  417. else:
  418. error_message = await response.text()
  419. raise Exception(f'List files operation failed: {error_message}')
  420. except asyncio.TimeoutError:
  421. raise TimeoutError('List files operation timed out')
  422. except Exception as e:
  423. raise RuntimeError(f'List files operation failed: {str(e)}')