# swe_env_box.py
  1. import json
  2. import os
  3. import sys
  4. import tempfile
  5. import uuid
  6. from datasets import load_dataset
  7. from swebench.harness.constants import MAP_REPO_TO_TEST_FRAMEWORK
  8. from swebench.harness.utils import get_test_directives
  9. from opendevin.core.config import AppConfig, SandboxConfig, load_app_config
  10. from opendevin.core.logger import opendevin_logger as logger
  11. from opendevin.runtime.docker.ssh_box import DockerSSHBox
  12. from opendevin.runtime.plugins import (
  13. AgentSkillsRequirement,
  14. JupyterRequirement,
  15. PluginRequirement,
  16. )
  17. SWE_BENCH_CONTAINER_IMAGE = 'ghcr.io/opendevin/eval-swe-bench:full-v1.2.1'
  18. def get_image_name_from_instance_id(instance_id: str) -> str:
  19. return 'sweb.eval.x86_64.' + instance_id
  20. class SWEBenchSSHBox(DockerSSHBox):
  21. def __init__(
  22. self,
  23. config: AppConfig,
  24. container_image: str,
  25. timeout: int = 120,
  26. sid: str | None = None,
  27. swe_instance_id: str | None = None,
  28. swe_instance: dict | None = None,
  29. skip_workspace_mount: bool = True,
  30. sandbox_plugins: list[PluginRequirement] = [], # noqa: B006
  31. workspace_dir_name: str | None = None,
  32. use_instance_image: bool = False,
  33. ):
  34. if swe_instance_id is None:
  35. raise ValueError('swe_instance_id must be provided!')
  36. self.swe_instance_id = swe_instance_id
  37. self.swe_instance = swe_instance
  38. self.skip_workspace_mount = skip_workspace_mount
  39. self.workspace_dir_name = workspace_dir_name
  40. assert (
  41. container_image is not None
  42. ), 'container_image is required for SWEBenchSSHBox!'
  43. # Need to run as root to use SWEBench container
  44. sid = f'swe_bench_{swe_instance_id}_' + str(uuid.uuid4())
  45. logger.info(f'===Using container image: {container_image}')
  46. super().__init__(
  47. config=SandboxConfig(container_image=container_image, timeout=timeout),
  48. persist_sandbox=config.persist_sandbox,
  49. workspace_mount_path=config.workspace_mount_path,
  50. sandbox_workspace_dir=config.workspace_mount_path_in_sandbox,
  51. cache_dir=config.cache_dir,
  52. run_as_devin=config.run_as_devin,
  53. ssh_hostname=config.ssh_hostname,
  54. ssh_password=config.ssh_password,
  55. ssh_port=config.ssh_port,
  56. sid=sid,
  57. )
  58. self.init_plugins(sandbox_plugins)
  59. exit_code, output = self.execute('mv ~/.bashrc ~/.bashrc.bak')
  60. assert exit_code == 0, f'Failed to backup ~/.bashrc: {output}'
  61. exit_code, output = self.execute(
  62. f"echo 'export SWE_INSTANCE_ID={self.swe_instance_id}' >> ~/.bashrc && echo 'export PIP_CACHE_DIR=~/.cache/pip' >> ~/.bashrc && echo \"alias git='git --no-pager'\" >> ~/.bashrc"
  63. )
  64. assert exit_code == 0, f'Failed to set SWE_INSTANCE_ID in ~/.bashrc: {output}'
  65. logger.info('Sourcing swe_entry.sh to set up environment variables')
  66. logger.info(
  67. 'Initialization of SWEBench may take approximately 10 minutes due to long-running installations, such as those requiring compilation.'
  68. )
  69. logger.info(f'Use instance image: {use_instance_image}')
  70. if use_instance_image:
  71. # we directly inject the instance info into the container and the init script
  72. script_dir = os.path.dirname(__file__)
  73. # inject test command
  74. test_type = MAP_REPO_TO_TEST_FRAMEWORK[swe_instance['repo']][
  75. swe_instance['version']
  76. ]
  77. swe_instance['test_directives'] = get_test_directives(swe_instance)
  78. swe_instance['test_cmd'] = (
  79. f"{test_type} {' '.join(swe_instance['test_directives'])}"
  80. )
  81. exit_code, output = self.execute(
  82. f"""echo "export TEST_CMD='{swe_instance["test_cmd"]}'" >> ~/.bashrc"""
  83. )
  84. # assert exit_code == 0, f'Failed to set TEST_CMD in ~/.bashrc: {output}'
  85. # inject the instance info
  86. self.execute('mkdir -p /swe_util/eval_data/instances')
  87. swe_instance_json_name = 'swe-bench-instance.json'
  88. with tempfile.TemporaryDirectory() as temp_dir:
  89. # Construct the full path for the desired file name within the temporary directory
  90. temp_file_path = os.path.join(temp_dir, swe_instance_json_name)
  91. # Write to the file with the desired name within the temporary directory
  92. with open(temp_file_path, 'w') as f:
  93. if not isinstance(swe_instance, dict):
  94. json.dump([swe_instance.to_dict()], f)
  95. else:
  96. json.dump([swe_instance], f)
  97. # Copy the file to the desired location
  98. self.copy_to(temp_file_path, '/swe_util/eval_data/instances/')
  99. # inject the init script
  100. self.copy_to(
  101. str(os.path.join(script_dir, 'scripts/setup/instance_swe_entry.sh')),
  102. '/swe_util/',
  103. )
  104. self.execute('cat ~/.bashrc')
  105. self.execute('source ~/.bashrc')
  106. self.execute('source /swe_util/instance_swe_entry.sh', timeout=600)
  107. logger.info('exit code: %d', exit_code)
  108. logger.info(output)
  109. assert exit_code == 0, f'Failed to source swe_entry.sh: {output}'
  110. logger.info('Sourced swe_entry.sh successfully')
  111. else:
  112. exit_code, output = self.execute(
  113. 'source /swe_util/swe_entry.sh', timeout=600
  114. )
  115. logger.info('exit code: %d', exit_code)
  116. logger.info(output)
  117. assert exit_code == 0, f'Failed to source swe_entry.sh: {output}'
  118. logger.info('Sourced swe_entry.sh successfully')
  119. @property
  120. def volumes(self):
  121. if self.skip_workspace_mount:
  122. return {
  123. k: v
  124. for k, v in super().volumes.items()
  125. if not v['bind'] == self.sandbox_workspace_dir
  126. }
  127. return super().volumes
  128. @classmethod
  129. def get_box_for_instance(
  130. cls,
  131. config: AppConfig,
  132. instance,
  133. workspace_dir_name=None,
  134. skip_workspace_mount: bool = True,
  135. workspace_mount_path: str | None = None,
  136. sandbox_plugins: list[PluginRequirement] = [], # noqa: B006
  137. use_instance_image: bool = False,
  138. ) -> 'SWEBenchSSHBox':
  139. if workspace_dir_name is None:
  140. workspace_dir_name = f"{instance['repo']}__{instance['version']}".replace(
  141. '/', '__'
  142. )
  143. old_workspace_base = config.workspace_base
  144. old_workspace_mount_path = config.workspace_mount_path
  145. try:
  146. config.workspace_base = workspace_mount_path
  147. config.workspace_mount_path = workspace_mount_path
  148. # linting python after editing helps LLM fix indentations
  149. config.sandbox.enable_auto_lint = True
  150. # Need to run as root to use SWEBench container
  151. config.run_as_devin = False
  152. if use_instance_image:
  153. container_image = get_image_name_from_instance_id(
  154. instance['instance_id']
  155. )
  156. else:
  157. container_image = SWE_BENCH_CONTAINER_IMAGE
  158. sandbox = cls(
  159. container_image=container_image,
  160. config=config,
  161. swe_instance_id=instance['instance_id'],
  162. swe_instance=instance,
  163. skip_workspace_mount=skip_workspace_mount,
  164. sandbox_plugins=sandbox_plugins,
  165. workspace_dir_name=workspace_dir_name,
  166. use_instance_image=use_instance_image,
  167. )
  168. logger.info(f"SSH box started for instance {instance['instance_id']}.")
  169. # cd to the repo
  170. exit_code, output = sandbox.execute(f'cd /workspace/{workspace_dir_name}')
  171. if exit_code != 0:
  172. logger.error(f'Failed to cd to the repo: {output}')
  173. sys.exit(1)
  174. # remove all future commits & remote following Devin
  175. # https://www.cognition-labs.com/post/swe-bench-technical-report
  176. exit_code, output = sandbox.execute('git reset --hard')
  177. if exit_code != 0:
  178. logger.error(f'Failed to reset the repo: {output}')
  179. sys.exit(1)
  180. exit_code, output = sandbox.execute(
  181. 'for remote_name in $(git remote); do git remote remove "${remote_name}"; done'
  182. )
  183. if exit_code != 0:
  184. logger.error(f'Failed to remove remote: {output}')
  185. sys.exit(1)
  186. except Exception:
  187. raise
  188. finally:
  189. # restore workspace_base and workspace_mount_path
  190. config.workspace_base = old_workspace_base
  191. config.workspace_mount_path = old_workspace_mount_path
  192. return sandbox
  193. def get_diff_patch(self):
  194. # add everything to the index
  195. exit_code, output = self.execute(f'cd /workspace/{self.workspace_dir_name}')
  196. if exit_code != 0:
  197. logger.error('Failed to cd to the repo')
  198. return ''
  199. exit_code, _output = self.execute('git config --global core.pager ""')
  200. if exit_code != 0:
  201. logger.error('Failed to change git config')
  202. return ''
  203. # add everything to the index
  204. exit_code, output = self.execute('git add -A')
  205. if exit_code != 0:
  206. logger.error('Failed to add everything to the index')
  207. return ''
  208. # get the git diff
  209. exit_code, git_patch = self.execute(
  210. f'git diff --no-color --cached {self.swe_instance["base_commit"]}'
  211. )
  212. if exit_code != 0:
  213. logger.error('Failed to get git diff')
  214. return ''
  215. return git_patch
  216. if __name__ == '__main__':
  217. config = load_app_config()
  218. # NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
  219. # so we don't need to manage file uploading to OpenDevin's repo
  220. dataset = load_dataset('princeton-nlp/SWE-bench_Lite')
  221. swe_bench_tests = dataset['test'].to_pandas()
  222. USE_INSTANCE_IMAGE = os.environ.get('USE_INSTANCE_IMAGE', 'false') == 'true'
  223. logger.info(f'USE_INSTANCE_IMAGE: {USE_INSTANCE_IMAGE}')
  224. # INSTANCE_ID = 'django__django-11099'
  225. INSTANCE_ID = 'astropy__astropy-12907'
  226. swe_bench_tests = swe_bench_tests[swe_bench_tests['instance_id'] == INSTANCE_ID]
  227. EXAMPLE_INSTANCE = swe_bench_tests.iloc[0].to_dict()
  228. sandbox = SWEBenchSSHBox.get_box_for_instance(
  229. config=config,
  230. instance=EXAMPLE_INSTANCE,
  231. sandbox_plugins=[AgentSkillsRequirement(), JupyterRequirement()],
  232. use_instance_image=USE_INSTANCE_IMAGE,
  233. )
  234. # PRE TEST
  235. exit_code, output = sandbox.execute('cd $REPO_PATH')
  236. assert exit_code == 0, 'Failed to cd $REPO_PATH'
  237. logger.info(f'cd $REPO_PATH: {output}')
  238. # apply test patch
  239. exit_code, output = sandbox.execute('git apply $SWE_TASK_DIR/test.patch')
  240. assert exit_code == 0, 'Failed to apply test patch'
  241. logger.info(f'git apply $SWE_TASK_DIR/test.patch: {output}')
  242. # TEST
  243. exit_code, output = sandbox.execute('$TEST_CMD')
  244. assert exit_code == 1, 'Expected exit code 1 (since this is a FAIL_TO_PASS)'
  245. logger.info(f'$TEST_CMD:\n{output}')
  246. # apply gold patch
  247. exit_code, output = sandbox.execute('git apply $SWE_TASK_DIR/gold.patch')
  248. logger.info('exit code: %d', exit_code)
  249. logger.info(f'git apply $SWE_TASK_DIR/gold.patch: {output}')
  250. # TEST
  251. exit_code, output = sandbox.execute('$TEST_CMD')
  252. assert exit_code == 0, 'Expected exit code 0 (since we applied the gold patch)'
  253. logger.info(f'$TEST_CMD:\n{output}')
  254. # Reset the repo
  255. exit_code, output = sandbox.execute('git reset --hard')
  256. assert exit_code == 0, 'Failed to reset the repo'
  257. logger.info(f'git reset --hard: {output}')
  258. sys.stdout.flush()
  259. try:
  260. while True:
  261. try:
  262. user_input = input('>>> ')
  263. except EOFError:
  264. logger.info('Exiting...')
  265. break
  266. if user_input.lower() == 'exit':
  267. logger.info('Exiting...')
  268. break
  269. exit_code, output = sandbox.execute(user_input)
  270. logger.info('exit code: %d', exit_code)
  271. logger.info(output)
  272. sys.stdout.flush()
  273. except KeyboardInterrupt:
  274. logger.info('Exiting...')
  275. sandbox.close()