swe_env_box.py 8.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215
  1. import sys
  2. import uuid
  3. from datasets import load_dataset
  4. from opendevin.core.config import config
  5. from opendevin.core.logger import opendevin_logger as logger
  6. from opendevin.runtime.docker.ssh_box import DockerSSHBox
  7. from opendevin.runtime.plugins import (
  8. AgentSkillsRequirement,
  9. JupyterRequirement,
  10. PluginRequirement,
  11. )
  12. SWE_BENCH_CONTAINER_IMAGE = 'ghcr.io/opendevin/eval-swe-bench:full-v1.2.1'
  13. class SWEBenchSSHBox(DockerSSHBox):
  14. def __init__(
  15. self,
  16. container_image: str,
  17. timeout: int = 120,
  18. sid: str | None = None,
  19. swe_instance_id: str | None = None,
  20. swe_instance: dict | None = None,
  21. skip_workspace_mount: bool = True,
  22. sandbox_plugins: list[PluginRequirement] = [], # noqa: B006
  23. ):
  24. if swe_instance_id is None:
  25. raise ValueError('swe_instance_id must be provided!')
  26. self.swe_instance_id = swe_instance_id
  27. self.swe_instance = swe_instance
  28. self.skip_workspace_mount = skip_workspace_mount
  29. assert (
  30. container_image is not None
  31. ), 'container_image is required for SWEBenchSSHBox!'
  32. # Need to run as root to use SWEBench container
  33. sid = f'swe_bench_{swe_instance_id}' + str(uuid.uuid4())
  34. super().__init__(container_image, timeout, sid)
  35. self.init_plugins(sandbox_plugins)
  36. exit_code, output = self.execute('mv ~/.bashrc ~/.bashrc.bak')
  37. assert exit_code == 0, f'Failed to backup ~/.bashrc: {output}'
  38. exit_code, output = self.execute(
  39. f"echo 'export SWE_INSTANCE_ID={self.swe_instance_id}' >> ~/.bashrc && echo 'export PIP_CACHE_DIR=~/.cache/pip' >> ~/.bashrc && echo \"alias git='git --no-pager'\" >> ~/.bashrc"
  40. )
  41. assert exit_code == 0, f'Failed to set SWE_INSTANCE_ID in ~/.bashrc: {output}'
  42. logger.info('Sourcing swe_entry.sh to set up environment variables')
  43. # larger timeout for SWEBench init to account for long-running installations (e.g., require compilation)
  44. exit_code, output = self.execute('source /swe_util/swe_entry.sh', timeout=600)
  45. logger.info('exit code: %d', exit_code)
  46. logger.info(output)
  47. assert exit_code == 0, f'Failed to source swe_entry.sh: {output}'
  48. logger.info('Sourced swe_entry.sh successfully')
  49. @property
  50. def volumes(self):
  51. if self.skip_workspace_mount:
  52. return {
  53. k: v
  54. for k, v in super().volumes.items()
  55. if not v['bind'] == self.sandbox_workspace_dir
  56. }
  57. return super().volumes
  58. @classmethod
  59. def get_box_for_instance(
  60. cls,
  61. instance,
  62. workspace_dir_name=None,
  63. skip_workspace_mount: bool = True,
  64. workspace_mount_path: str | None = None,
  65. sandbox_plugins: list[PluginRequirement] = [], # noqa: B006
  66. ) -> 'SWEBenchSSHBox':
  67. if workspace_dir_name is None:
  68. workspace_dir_name = f"{instance['repo']}__{instance['version']}".replace(
  69. '/', '__'
  70. )
  71. old_workspace_base = config.workspace_base
  72. old_workspace_mount_path = config.workspace_mount_path
  73. config.workspace_base = workspace_mount_path
  74. config.workspace_mount_path = workspace_mount_path
  75. # linting python after editing helps LLM fix indentations
  76. config.enable_auto_lint = True
  77. # Need to run as root to use SWEBench container
  78. config.run_as_devin = False
  79. sandbox = cls(
  80. container_image=SWE_BENCH_CONTAINER_IMAGE,
  81. swe_instance_id=instance['instance_id'],
  82. swe_instance=instance,
  83. skip_workspace_mount=skip_workspace_mount,
  84. sandbox_plugins=sandbox_plugins,
  85. )
  86. logger.info(f"SSH box started for instance {instance['instance_id']}.")
  87. # cd to the repo
  88. exit_code, output = sandbox.execute(f'cd /workspace/{workspace_dir_name}')
  89. if exit_code != 0:
  90. logger.error(f'Failed to cd to the repo: {output}')
  91. sys.exit(1)
  92. # remove all future commits & remote following Devin
  93. # https://www.cognition-labs.com/post/swe-bench-technical-report
  94. exit_code, output = sandbox.execute('git reset --hard')
  95. if exit_code != 0:
  96. logger.error(f'Failed to reset the repo: {output}')
  97. sys.exit(1)
  98. exit_code, output = sandbox.execute(
  99. 'for remote_name in $(git remote); do git remote remove "${remote_name}"; done'
  100. )
  101. if exit_code != 0:
  102. logger.error(f'Failed to remove remote: {output}')
  103. sys.exit(1)
  104. # restore workspace_base and workspace_mount_path
  105. config.workspace_base = old_workspace_base
  106. config.workspace_mount_path = old_workspace_mount_path
  107. return sandbox
  108. def get_diff_patch(self):
  109. # add everything to the index
  110. exit_code, output = self.execute('git add --all')
  111. if exit_code != 0:
  112. logger.error('Failed to add everything to the index')
  113. return ''
  114. # get the git diff
  115. exit_code, git_patch = self.execute(
  116. f'git diff --no-color --cached {self.swe_instance["base_commit"]}'
  117. )
  118. if exit_code != 0:
  119. logger.error('Failed to get git diff')
  120. return ''
  121. return git_patch
  122. if __name__ == '__main__':
  123. # NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
  124. # so we don't need to manage file uploading to OpenDevin's repo
  125. dataset = load_dataset('princeton-nlp/SWE-bench_Lite')
  126. swe_bench_tests = dataset['test'].to_pandas()
  127. # INSTANCE_ID = 'django__django-11099'
  128. INSTANCE_ID = 'astropy__astropy-12907'
  129. swe_bench_tests = swe_bench_tests[swe_bench_tests['instance_id'] == INSTANCE_ID]
  130. EXAMPLE_INSTANCE = swe_bench_tests.iloc[0].to_dict()
  131. sandbox = SWEBenchSSHBox.get_box_for_instance(
  132. instance=EXAMPLE_INSTANCE,
  133. sandbox_plugins=[AgentSkillsRequirement(), JupyterRequirement()],
  134. )
  135. # PRE TEST
  136. exit_code, output = sandbox.execute('cd $REPO_PATH')
  137. assert exit_code == 0, 'Failed to cd $REPO_PATH'
  138. logger.info(f'cd $REPO_PATH: {output}')
  139. # apply test patch
  140. exit_code, output = sandbox.execute('git apply $SWE_TASK_DIR/test.patch')
  141. assert exit_code == 0, 'Failed to apply test patch'
  142. logger.info(f'git apply $SWE_TASK_DIR/test.patch: {output}')
  143. # TEST
  144. exit_code, output = sandbox.execute('$TEST_CMD')
  145. assert exit_code == 1, 'Expected exit code 1 (since this is a FAIL_TO_PASS)'
  146. logger.info(f'$TEST_CMD:\n{output}')
  147. # apply gold patch
  148. exit_code, output = sandbox.execute('git apply $SWE_TASK_DIR/gold.patch')
  149. logger.info('exit code: %d', exit_code)
  150. logger.info(f'git apply $SWE_TASK_DIR/gold.patch: {output}')
  151. # TEST
  152. exit_code, output = sandbox.execute('$TEST_CMD')
  153. assert exit_code == 0, 'Expected exit code 0 (since we applied the gold patch)'
  154. logger.info(f'$TEST_CMD:\n{output}')
  155. # Reset the repo
  156. exit_code, output = sandbox.execute('git reset --hard')
  157. assert exit_code == 0, 'Failed to reset the repo'
  158. logger.info(f'git reset --hard: {output}')
  159. bg_cmd = sandbox.execute_in_background(
  160. "while true; do echo 'dot ' && sleep 10; done"
  161. )
  162. sys.stdout.flush()
  163. try:
  164. while True:
  165. try:
  166. user_input = input('>>> ')
  167. except EOFError:
  168. logger.info('Exiting...')
  169. break
  170. if user_input.lower() == 'exit':
  171. logger.info('Exiting...')
  172. break
  173. if user_input.lower() == 'kill':
  174. sandbox.kill_background(bg_cmd.pid)
  175. logger.info('Background process killed')
  176. continue
  177. exit_code, output = sandbox.execute(user_input)
  178. logger.info('exit code: %d', exit_code)
  179. logger.info(output)
  180. if bg_cmd.pid in sandbox.background_commands:
  181. logs = sandbox.read_logs(bg_cmd.pid)
  182. logger.info('background logs: %s', logs)
  183. sys.stdout.flush()
  184. except KeyboardInterrupt:
  185. logger.info('Exiting...')
  186. sandbox.close()