Browse Source

Some SWE-Bench infer fixes and improvements (#2065)

* reset workspace base properly

* support running without hint

* support running without hint

* bump swe-bench eval docker to v1.2 for latest agentskills

* only give hint when use_hint_text is true

* add swe-agent instructions for validation

* update dockerfile

* pin the python interpreter for execute_cli

* avoid initialize plugins twice

* default to use hint

* save results to swe_bench_lite

* unset gh token and increase max iter to 50

* remove printing of use hint status

* refactor ssh login into one function

* ok drop to 30 turns bc it is so expensive :(

* remove reproduce comments to avoid stuck
Xingyao Wang 1 year ago
parent
commit
5114230e53

+ 2 - 2
evaluation/swe_bench/BUILD_TESTBED_AND_ENV.md

@@ -34,6 +34,6 @@ Run the following command to do the above two steps. The results will be saved t
 
 ```bash
 pushd evaluation/swe_bench
-docker build -t ghcr.io/opendevin/eval-swe-bench:full-v1.1 -f ./scripts/docker/Dockerfile.full.v1.1 .
-docker push ghcr.io/opendevin/eval-swe-bench:full-v1.1
+docker build -t ghcr.io/opendevin/eval-swe-bench:full-v1.2 -f ./scripts/docker/Dockerfile.full.v1.2 .
+docker push ghcr.io/opendevin/eval-swe-bench:full-v1.2
 ```

+ 1 - 1
evaluation/swe_bench/EVAL_PATCH.md

@@ -117,7 +117,7 @@ Before evaluating generated patches, you need to set up the Docker environment.
 ```shell
 docker run -it \
 -v DIR_TO_YOUR_PATCH_FILES_ON_HOST:/swe_bench_output \
-ghcr.io/opendevin/eval-swe-bench:full-v1.1 /bin/bash
+ghcr.io/opendevin/eval-swe-bench:full-v1.2 /bin/bash
 ```
 
 ### Evaluate Model Generated Patches

+ 2 - 2
evaluation/swe_bench/README.md

@@ -15,7 +15,7 @@ In [OpenDevin-SWE-Bench fork](https://github.com/OpenDevin/OD-SWE-bench.git) (mo
 **We pack everything you need for SWE-Bench evaluation into one, gigantic, docker image.** To use it:
 
 ```bash
-docker pull ghcr.io/opendevin/eval-swe-bench:full-v1.1
+docker pull ghcr.io/opendevin/eval-swe-bench:full-v1.2
 ```
 
 The Docker image contains several important directories:
@@ -68,7 +68,7 @@ temperature = 0.0
 
 ## Test if your environment works
 
-Make sure your Docker daemon is running, and you have pulled the `eval-swe-bench:full-v1.1`
+Make sure your Docker daemon is running, and you have pulled the `eval-swe-bench:full-v1.2`
 docker image. Then run this python script:
 
 ```bash

+ 4 - 2
evaluation/swe_bench/run_infer.py

@@ -24,6 +24,8 @@ from opendevin.core.main import main
 from opendevin.events.action import MessageAction
 from opendevin.events.serialization.event import event_to_dict
 
+USE_HINT_TEXT = os.environ.get('USE_HINT_TEXT', 'false') == 'true'
+
 
 def cleanup():
     print('Cleaning up child processes...')
@@ -247,7 +249,7 @@ def process_instance(
         '# Problem Statement\n'
         f'{instance.problem_statement}\n\n'
     )
-    if instance.hints_text:
+    if USE_HINT_TEXT and instance.hints_text:
         instruction += f'# Hints\n{instance.hints_text}\n\n'
     instruction += (
         'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
@@ -346,7 +348,7 @@ if __name__ == '__main__':
         eval_note += '_N_' + args.eval_note
     eval_output_dir = os.path.join(
         args.eval_output_dir,
-        'swe_bench',
+        'swe_bench_lite',
         agent_class,
         model_name + '_maxiter_' + str(max_iterations) + eval_note,
     )

+ 12 - 0
evaluation/swe_bench/scripts/docker/Dockerfile.full.v1.2

@@ -0,0 +1,12 @@
+FROM ghcr.io/opendevin/eval-swe-bench:full-v1.1
+
+RUN apt-get update && apt-get install -y \
+    libgl1-mesa-glx \
+    && rm -rf /var/lib/apt/lists/*
+
+# install basic dependencies for CodeActAgent
+RUN pip3 install --upgrade pip
+RUN pip3 install jupyterlab notebook jupyter_kernel_gateway flake8
+# TODO: those dependencies are needed for agentskills, we should pack them in a new sandbox image
+RUN pip3 install python-docx PyPDF2 python-pptx pylatexenc openai opencv-python
+# docker build -t ghcr.io/opendevin/eval-swe-bench:full-v1.2 -f ./scripts/docker/Dockerfile.full.v1.2 .

+ 1 - 1
evaluation/swe_bench/scripts/eval_infer.sh

@@ -26,7 +26,7 @@ docker run --rm \
     -e OD_SWE_BENCH=/swe_util/OD-SWE-bench \
     -e EVAL_DATA_DIR=/swe_util/eval_data \
     -w /swe_util \
-    ghcr.io/opendevin/eval-swe-bench:full-v1.1 \
+    ghcr.io/opendevin/eval-swe-bench:full-v1.2 \
     bash -c "./get_agent_report.sh --output-file /swe_bench_output/$FILE_NAME \
     --agent-name CodeActAgent \
     --dataset swe-bench-test-lite \

+ 14 - 1
evaluation/swe_bench/scripts/run_infer.sh

@@ -16,13 +16,26 @@ echo "AGENT: $AGENT"
 echo "AGENT_VERSION: $AGENT_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"
 
+# Default to use Hint
+if [ -z "$USE_HINT_TEXT" ]; then
+  export USE_HINT_TEXT=true
+fi
+echo "USE_HINT_TEXT: $USE_HINT_TEXT"
+EVAL_NOTE="$AGENT_VERSION"
+# if not using Hint, add -no-hint to the eval note
+if [ "$USE_HINT_TEXT" = false ]; then
+  EVAL_NOTE="$EVAL_NOTE-no-hint"
+fi
+
+unset SANDBOX_ENV_GITHUB_TOKEN # prevent the agent from using the github token to push
+
 COMMAND="poetry run python evaluation/swe_bench/run_infer.py \
   --agent-cls $AGENT \
   --llm-config $MODEL_CONFIG \
   --max-iterations 30 \
   --max-chars 10000000 \
   --eval-num-workers 8 \
-  --eval-note $AGENT_VERSION"
+  --eval-note $EVAL_NOTE"
 
 if [ -n "$EVAL_LIMIT" ]; then
   echo "EVAL_LIMIT: $EVAL_LIMIT"

+ 7 - 1
evaluation/swe_bench/swe_env_box.py

@@ -12,7 +12,7 @@ from opendevin.runtime.plugins import (
     PluginRequirement,
 )
 
-SWE_BENCH_CONTAINER_IMAGE = 'ghcr.io/opendevin/eval-swe-bench:full-v1.1'
+SWE_BENCH_CONTAINER_IMAGE = 'ghcr.io/opendevin/eval-swe-bench:full-v1.2'
 
 
 class SWEBenchSSHBox(DockerSSHBox):
@@ -79,6 +79,8 @@ class SWEBenchSSHBox(DockerSSHBox):
             workspace_dir_name = f"{instance['repo']}__{instance['version']}".replace(
                 '/', '__'
             )
+        old_workspace_base = config.workspace_base
+        old_workspace_mount_path = config.workspace_mount_path
         config.workspace_base = workspace_mount_path
         config.workspace_mount_path = workspace_mount_path
 
@@ -112,6 +114,10 @@ class SWEBenchSSHBox(DockerSSHBox):
         if exit_code != 0:
             logger.error(f'Failed to remove remote: {output}')
             sys.exit(1)
+
+        # restore workspace_base and workspace_mount_path
+        config.workspace_base = old_workspace_base
+        config.workspace_mount_path = old_workspace_mount_path
         return sandbox
 
     def get_diff_patch(self):

+ 3 - 27
opendevin/runtime/plugins/jupyter/execute_cli

@@ -1,27 +1,3 @@
-#!/usr/bin/env python3
-import os
-import sys
-import time
-
-import requests
-
-# Read the Python code from STDIN
-code = sys.stdin.read()
-
-# Set the default kernel ID
-kernel_id = 'default'
-
-PORT = os.environ.get('JUPYTER_EXEC_SERVER_PORT')
-POST_URL = f'http://localhost:{PORT}/execute'
-
-for i in range(10):
-    try:
-        response = requests.post(POST_URL, json={'kernel_id': kernel_id, 'code': code})
-        if '500: Internal Server Error' not in response.text:
-            print(response.text)
-            break
-    except requests.exceptions.ConnectionError:
-        pass
-    time.sleep(2)
-else:
-    print('Failed to connect to the Jupyter server')
+#!/bin/bash
+# Run the Python script with the specified interpreter
+$OPENDEVIN_PYTHON_INTERPRETER /opendevin/plugins/jupyter/execute_cli.py

+ 26 - 0
opendevin/runtime/plugins/jupyter/execute_cli.py

@@ -0,0 +1,26 @@
+import os
+import sys
+import time
+
+import requests
+
+# Read the Python code from STDIN
+code = sys.stdin.read()
+
+# Set the default kernel ID
+kernel_id = 'default'
+
+PORT = os.environ.get('JUPYTER_EXEC_SERVER_PORT')
+POST_URL = f'http://localhost:{PORT}/execute'
+
+for i in range(10):
+    try:
+        response = requests.post(POST_URL, json={'kernel_id': kernel_id, 'code': code})
+        if '500: Internal Server Error' not in response.text:
+            print(response.text)
+            break
+    except requests.exceptions.ConnectionError:
+        pass
+    time.sleep(2)
+else:
+    print('Failed to connect to the Jupyter server')

+ 5 - 0
opendevin/runtime/plugins/jupyter/setup.sh

@@ -7,15 +7,20 @@ source ~/.bashrc
 echo 'export PATH=$PATH:/opendevin/plugins/jupyter' >> ~/.bashrc
 export PATH=/opendevin/plugins/jupyter:$PATH
 
+# Capture the path of the currently active Python interpreter
+OPENDEVIN_PYTHON_INTERPRETER=$(which python3)
+
 # if user name is `opendevin`, add '/home/opendevin/.local/bin' to PATH
 if [ "$USER" = "opendevin" ]; then
     echo 'export PATH=$PATH:/home/opendevin/.local/bin' >> ~/.bashrc
+    echo "export OPENDEVIN_PYTHON_INTERPRETER=$OPENDEVIN_PYTHON_INTERPRETER" >> ~/.bashrc
     export PATH=$PATH:/home/opendevin/.local/bin
     export PIP_CACHE_DIR=$HOME/.cache/pip
 fi
 # if user name is `root`, add '/root/.local/bin' to PATH
 if [ "$USER" = "root" ]; then
     echo 'export PATH=$PATH:/root/.local/bin' >> ~/.bashrc
+    echo "export OPENDEVIN_PYTHON_INTERPRETER=$OPENDEVIN_PYTHON_INTERPRETER" >> ~/.bashrc
     export PATH=$PATH:/root/.local/bin
     export PIP_CACHE_DIR=$HOME/.cache/pip
 

+ 5 - 0
opendevin/runtime/plugins/mixin.py

@@ -22,6 +22,9 @@ class PluginMixin:
     def init_plugins(self: SandboxProtocol, requirements: list[PluginRequirement]):
         """Load a plugin into the sandbox."""
 
+        if hasattr(self, 'plugin_initialized') and self.plugin_initialized:
+            return
+
         # clean-up ~/.bashrc and touch ~/.bashrc
         exit_code, output = self.execute('rm -f ~/.bashrc && touch ~/.bashrc')
 
@@ -65,3 +68,5 @@ class PluginMixin:
                     f'Failed to source ~/.bashrc with exit code {exit_code} and output: {output}'
                 )
             logger.info('Sourced ~/.bashrc successfully')
+
+        self.plugin_initialized = True