swe_env_box.py 8.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230
  1. import sys
  2. import uuid
  3. from datasets import load_dataset
  4. from opendevin.core.config import config
  5. from opendevin.core.logger import opendevin_logger as logger
  6. from opendevin.runtime.docker.ssh_box import DockerSSHBox
  7. from opendevin.runtime.plugins import (
  8. AgentSkillsRequirement,
  9. JupyterRequirement,
  10. PluginRequirement,
  11. )
  12. SWE_BENCH_CONTAINER_IMAGE = 'ghcr.io/opendevin/eval-swe-bench:full-v1.2.1'
  13. class SWEBenchSSHBox(DockerSSHBox):
  14. def __init__(
  15. self,
  16. container_image: str,
  17. timeout: int = 120,
  18. sid: str | None = None,
  19. swe_instance_id: str | None = None,
  20. swe_instance: dict | None = None,
  21. skip_workspace_mount: bool = True,
  22. sandbox_plugins: list[PluginRequirement] = [], # noqa: B006
  23. workspace_dir_name: str | None = None,
  24. ):
  25. if swe_instance_id is None:
  26. raise ValueError('swe_instance_id must be provided!')
  27. self.swe_instance_id = swe_instance_id
  28. self.swe_instance = swe_instance
  29. self.skip_workspace_mount = skip_workspace_mount
  30. self.workspace_dir_name = workspace_dir_name
  31. assert (
  32. container_image is not None
  33. ), 'container_image is required for SWEBenchSSHBox!'
  34. # Need to run as root to use SWEBench container
  35. sid = f'swe_bench_{swe_instance_id}' + str(uuid.uuid4())
  36. super().__init__(container_image, timeout, sid)
  37. self.init_plugins(sandbox_plugins)
  38. exit_code, output = self.execute('mv ~/.bashrc ~/.bashrc.bak')
  39. assert exit_code == 0, f'Failed to backup ~/.bashrc: {output}'
  40. exit_code, output = self.execute(
  41. f"echo 'export SWE_INSTANCE_ID={self.swe_instance_id}' >> ~/.bashrc && echo 'export PIP_CACHE_DIR=~/.cache/pip' >> ~/.bashrc && echo \"alias git='git --no-pager'\" >> ~/.bashrc"
  42. )
  43. assert exit_code == 0, f'Failed to set SWE_INSTANCE_ID in ~/.bashrc: {output}'
  44. logger.info('Sourcing swe_entry.sh to set up environment variables')
  45. logger.info(
  46. 'Initialization of SWEBench may take approximately 10 minutes due to long-running installations, such as those requiring compilation.'
  47. )
  48. exit_code, output = self.execute('source /swe_util/swe_entry.sh', timeout=600)
  49. logger.info('exit code: %d', exit_code)
  50. logger.info(output)
  51. assert exit_code == 0, f'Failed to source swe_entry.sh: {output}'
  52. logger.info('Sourced swe_entry.sh successfully')
  53. @property
  54. def volumes(self):
  55. if self.skip_workspace_mount:
  56. return {
  57. k: v
  58. for k, v in super().volumes.items()
  59. if not v['bind'] == self.sandbox_workspace_dir
  60. }
  61. return super().volumes
  62. @classmethod
  63. def get_box_for_instance(
  64. cls,
  65. instance,
  66. workspace_dir_name=None,
  67. skip_workspace_mount: bool = True,
  68. workspace_mount_path: str | None = None,
  69. sandbox_plugins: list[PluginRequirement] = [], # noqa: B006
  70. ) -> 'SWEBenchSSHBox':
  71. if workspace_dir_name is None:
  72. workspace_dir_name = f"{instance['repo']}__{instance['version']}".replace(
  73. '/', '__'
  74. )
  75. old_workspace_base = config.workspace_base
  76. old_workspace_mount_path = config.workspace_mount_path
  77. try:
  78. config.workspace_base = workspace_mount_path
  79. config.workspace_mount_path = workspace_mount_path
  80. # linting python after editing helps LLM fix indentations
  81. config.enable_auto_lint = True
  82. # Need to run as root to use SWEBench container
  83. config.run_as_devin = False
  84. sandbox = cls(
  85. container_image=SWE_BENCH_CONTAINER_IMAGE,
  86. swe_instance_id=instance['instance_id'],
  87. swe_instance=instance,
  88. skip_workspace_mount=skip_workspace_mount,
  89. sandbox_plugins=sandbox_plugins,
  90. workspace_dir_name=workspace_dir_name,
  91. )
  92. logger.info(f"SSH box started for instance {instance['instance_id']}.")
  93. # cd to the repo
  94. exit_code, output = sandbox.execute(f'cd /workspace/{workspace_dir_name}')
  95. if exit_code != 0:
  96. logger.error(f'Failed to cd to the repo: {output}')
  97. sys.exit(1)
  98. # remove all future commits & remote following Devin
  99. # https://www.cognition-labs.com/post/swe-bench-technical-report
  100. exit_code, output = sandbox.execute('git reset --hard')
  101. if exit_code != 0:
  102. logger.error(f'Failed to reset the repo: {output}')
  103. sys.exit(1)
  104. exit_code, output = sandbox.execute(
  105. 'for remote_name in $(git remote); do git remote remove "${remote_name}"; done'
  106. )
  107. if exit_code != 0:
  108. logger.error(f'Failed to remove remote: {output}')
  109. sys.exit(1)
  110. except Exception:
  111. raise
  112. finally:
  113. # restore workspace_base and workspace_mount_path
  114. config.workspace_base = old_workspace_base
  115. config.workspace_mount_path = old_workspace_mount_path
  116. return sandbox
  117. def get_diff_patch(self):
  118. # add everything to the index
  119. exit_code, output = self.execute(f'cd /workspace/{self.workspace_dir_name}')
  120. if exit_code != 0:
  121. logger.error('Failed to cd to the repo')
  122. return ''
  123. # add everything to the index
  124. exit_code, output = self.execute('git add -A')
  125. if exit_code != 0:
  126. logger.error('Failed to add everything to the index')
  127. return ''
  128. # get the git diff
  129. exit_code, git_patch = self.execute(
  130. f'git diff --no-color --cached {self.swe_instance["base_commit"]}'
  131. )
  132. if exit_code != 0:
  133. logger.error('Failed to get git diff')
  134. return ''
  135. return git_patch
  136. if __name__ == '__main__':
  137. # NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
  138. # so we don't need to manage file uploading to OpenDevin's repo
  139. dataset = load_dataset('princeton-nlp/SWE-bench_Lite')
  140. swe_bench_tests = dataset['test'].to_pandas()
  141. # INSTANCE_ID = 'django__django-11099'
  142. INSTANCE_ID = 'astropy__astropy-12907'
  143. swe_bench_tests = swe_bench_tests[swe_bench_tests['instance_id'] == INSTANCE_ID]
  144. EXAMPLE_INSTANCE = swe_bench_tests.iloc[0].to_dict()
  145. sandbox = SWEBenchSSHBox.get_box_for_instance(
  146. instance=EXAMPLE_INSTANCE,
  147. sandbox_plugins=[AgentSkillsRequirement(), JupyterRequirement()],
  148. )
  149. # PRE TEST
  150. exit_code, output = sandbox.execute('cd $REPO_PATH')
  151. assert exit_code == 0, 'Failed to cd $REPO_PATH'
  152. logger.info(f'cd $REPO_PATH: {output}')
  153. # apply test patch
  154. exit_code, output = sandbox.execute('git apply $SWE_TASK_DIR/test.patch')
  155. assert exit_code == 0, 'Failed to apply test patch'
  156. logger.info(f'git apply $SWE_TASK_DIR/test.patch: {output}')
  157. # TEST
  158. exit_code, output = sandbox.execute('$TEST_CMD')
  159. assert exit_code == 1, 'Expected exit code 1 (since this is a FAIL_TO_PASS)'
  160. logger.info(f'$TEST_CMD:\n{output}')
  161. # apply gold patch
  162. exit_code, output = sandbox.execute('git apply $SWE_TASK_DIR/gold.patch')
  163. logger.info('exit code: %d', exit_code)
  164. logger.info(f'git apply $SWE_TASK_DIR/gold.patch: {output}')
  165. # TEST
  166. exit_code, output = sandbox.execute('$TEST_CMD')
  167. assert exit_code == 0, 'Expected exit code 0 (since we applied the gold patch)'
  168. logger.info(f'$TEST_CMD:\n{output}')
  169. # Reset the repo
  170. exit_code, output = sandbox.execute('git reset --hard')
  171. assert exit_code == 0, 'Failed to reset the repo'
  172. logger.info(f'git reset --hard: {output}')
  173. bg_cmd = sandbox.execute_in_background(
  174. "while true; do echo 'dot ' && sleep 10; done"
  175. )
  176. sys.stdout.flush()
  177. try:
  178. while True:
  179. try:
  180. user_input = input('>>> ')
  181. except EOFError:
  182. logger.info('Exiting...')
  183. break
  184. if user_input.lower() == 'exit':
  185. logger.info('Exiting...')
  186. break
  187. if user_input.lower() == 'kill':
  188. sandbox.kill_background(bg_cmd.pid)
  189. logger.info('Background process killed')
  190. continue
  191. exit_code, output = sandbox.execute(user_input)
  192. logger.info('exit code: %d', exit_code)
  193. logger.info(output)
  194. if bg_cmd.pid in sandbox.background_commands:
  195. logs = sandbox.read_logs(bg_cmd.pid)
  196. logger.info('background logs: %s', logs)
  197. sys.stdout.flush()
  198. except KeyboardInterrupt:
  199. logger.info('Exiting...')
  200. sandbox.close()