Browse Source

Some SWE-Bench infer fixes and improvements (#2065)

* reset workspace base properly

* support running without hint

* support running without hint

* bump swe-bench eval docker to v1.2 for latest agentskills

* only give hint when use_hint_text is true

* add swe-agent instructions for validation

* update dockerfile

* pin the python interpreter for execute_cli

* avoid initialize plugins twice

* default to use hint

* save results to swe_bench_lite

* unset gh token and increase max iter to 50

* remove printing of use hint status

* refactor ssh login into one function

* ok drop to 30 turns bc it is so expensive :(

* remove reproduce comments to avoid stuck
Xingyao Wang 1 year ago
parent
commit
5114230e53

+ 2 - 2
evaluation/swe_bench/BUILD_TESTBED_AND_ENV.md

@@ -34,6 +34,6 @@ Run the following command to do the above two steps. The results will be saved t
 
 ```bash
 pushd evaluation/swe_bench
-docker build -t ghcr.io/opendevin/eval-swe-bench:full-v1.1 -f ./scripts/docker/Dockerfile.full.v1.1 .
-docker push ghcr.io/opendevin/eval-swe-bench:full-v1.1
+docker build -t ghcr.io/opendevin/eval-swe-bench:full-v1.2 -f ./scripts/docker/Dockerfile.full.v1.2 .
+docker push ghcr.io/opendevin/eval-swe-bench:full-v1.2
 ```

+ 1 - 1
evaluation/swe_bench/EVAL_PATCH.md

@@ -117,7 +117,7 @@ Before evaluating generated patches, you need to set up the Docker environment.
 ```shell
 docker run -it \
 -v DIR_TO_YOUR_PATCH_FILES_ON_HOST:/swe_bench_output \
-ghcr.io/opendevin/eval-swe-bench:full-v1.1 /bin/bash
+ghcr.io/opendevin/eval-swe-bench:full-v1.2 /bin/bash
 ```
 
 ### Evaluate Model Generated Patches

+ 2 - 2
evaluation/swe_bench/README.md

@@ -15,7 +15,7 @@ In [OpenDevin-SWE-Bench fork](https://github.com/OpenDevin/OD-SWE-bench.git) (mo
 **We pack everything you need for SWE-Bench evaluation into one, gigantic, docker image.** To use it:
 
 ```bash
-docker pull ghcr.io/opendevin/eval-swe-bench:full-v1.1
+docker pull ghcr.io/opendevin/eval-swe-bench:full-v1.2
 ```
 
 The Docker image contains several important directories:
@@ -68,7 +68,7 @@ temperature = 0.0
 
 ## Test if your environment works
 
-Make sure your Docker daemon is running, and you have pulled the `eval-swe-bench:full-v1.1`
+Make sure your Docker daemon is running, and you have pulled the `eval-swe-bench:full-v1.2`
 docker image. Then run this python script:
 
 ```bash

+ 4 - 2
evaluation/swe_bench/run_infer.py

@@ -24,6 +24,8 @@ from opendevin.core.main import main
 from opendevin.events.action import MessageAction
 from opendevin.events.serialization.event import event_to_dict
 
+USE_HINT_TEXT = os.environ.get('USE_HINT_TEXT', 'false') == 'true'
+
 
 def cleanup():
     print('Cleaning up child processes...')
@@ -247,7 +249,7 @@ def process_instance(
         '# Problem Statement\n'
         f'{instance.problem_statement}\n\n'
     )
-    if instance.hints_text:
+    if USE_HINT_TEXT and instance.hints_text:
         instruction += f'# Hints\n{instance.hints_text}\n\n'
     instruction += (
         'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
@@ -346,7 +348,7 @@ if __name__ == '__main__':
         eval_note += '_N_' + args.eval_note
     eval_output_dir = os.path.join(
         args.eval_output_dir,
-        'swe_bench',
+        'swe_bench_lite',
         agent_class,
         model_name + '_maxiter_' + str(max_iterations) + eval_note,
     )

+ 12 - 0
evaluation/swe_bench/scripts/docker/Dockerfile.full.v1.2

@@ -0,0 +1,12 @@
+FROM ghcr.io/opendevin/eval-swe-bench:full-v1.1
+
+RUN apt-get update && apt-get install -y \
+    libgl1-mesa-glx \
+    && rm -rf /var/lib/apt/lists/*
+
+# install basic dependencies for CodeActAgent
+RUN pip3 install --upgrade pip
+RUN pip3 install jupyterlab notebook jupyter_kernel_gateway flake8
+# TODO: those dependencies are needed for agentskills, we should pack them in a new sandbox image
+RUN pip3 install python-docx PyPDF2 python-pptx pylatexenc openai opencv-python
+# docker build -t ghcr.io/opendevin/eval-swe-bench:full-v1.2 -f ./scripts/docker/Dockerfile.full.v1.2 .

+ 1 - 1
evaluation/swe_bench/scripts/eval_infer.sh

@@ -26,7 +26,7 @@ docker run --rm \
     -e OD_SWE_BENCH=/swe_util/OD-SWE-bench \
     -e EVAL_DATA_DIR=/swe_util/eval_data \
     -w /swe_util \
-    ghcr.io/opendevin/eval-swe-bench:full-v1.1 \
+    ghcr.io/opendevin/eval-swe-bench:full-v1.2 \
     bash -c "./get_agent_report.sh --output-file /swe_bench_output/$FILE_NAME \
     --agent-name CodeActAgent \
     --dataset swe-bench-test-lite \

+ 14 - 1
evaluation/swe_bench/scripts/run_infer.sh

@@ -16,13 +16,26 @@ echo "AGENT: $AGENT"
 echo "AGENT_VERSION: $AGENT_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"
 
+# Default to use Hint
+if [ -z "$USE_HINT_TEXT" ]; then
+  export USE_HINT_TEXT=true
+fi
+echo "USE_HINT_TEXT: $USE_HINT_TEXT"
+EVAL_NOTE="$AGENT_VERSION"
+# if not using Hint, add -no-hint to the eval note
+if [ "$USE_HINT_TEXT" = false ]; then
+  EVAL_NOTE="$EVAL_NOTE-no-hint"
+fi
+
+unset SANDBOX_ENV_GITHUB_TOKEN # prevent the agent from using the github token to push
+
 COMMAND="poetry run python evaluation/swe_bench/run_infer.py \
   --agent-cls $AGENT \
   --llm-config $MODEL_CONFIG \
   --max-iterations 30 \
   --max-chars 10000000 \
   --eval-num-workers 8 \
-  --eval-note $AGENT_VERSION"
+  --eval-note $EVAL_NOTE"
 
 if [ -n "$EVAL_LIMIT" ]; then
   echo "EVAL_LIMIT: $EVAL_LIMIT"

+ 7 - 1
evaluation/swe_bench/swe_env_box.py

@@ -12,7 +12,7 @@ from opendevin.runtime.plugins import (
     PluginRequirement,
 )
 
-SWE_BENCH_CONTAINER_IMAGE = 'ghcr.io/opendevin/eval-swe-bench:full-v1.1'
+SWE_BENCH_CONTAINER_IMAGE = 'ghcr.io/opendevin/eval-swe-bench:full-v1.2'
 
 
 class SWEBenchSSHBox(DockerSSHBox):
@@ -79,6 +79,8 @@ class SWEBenchSSHBox(DockerSSHBox):
             workspace_dir_name = f"{instance['repo']}__{instance['version']}".replace(
                 '/', '__'
             )
+        old_workspace_base = config.workspace_base
+        old_workspace_mount_path = config.workspace_mount_path
         config.workspace_base = workspace_mount_path
         config.workspace_mount_path = workspace_mount_path
 
@@ -112,6 +114,10 @@ class SWEBenchSSHBox(DockerSSHBox):
         if exit_code != 0:
             logger.error(f'Failed to remove remote: {output}')
             sys.exit(1)
+
+        # restore workspace_base and workspace_mount_path
+        config.workspace_base = old_workspace_base
+        config.workspace_mount_path = old_workspace_mount_path
         return sandbox
 
     def get_diff_patch(self):

+ 3 - 27
opendevin/runtime/plugins/jupyter/execute_cli

@@ -1,27 +1,3 @@
-#!/usr/bin/env python3
-import os
-import sys
-import time
-
-import requests
-
-# Read the Python code from STDIN
-code = sys.stdin.read()
-
-# Set the default kernel ID
-kernel_id = 'default'
-
-PORT = os.environ.get('JUPYTER_EXEC_SERVER_PORT')
-POST_URL = f'http://localhost:{PORT}/execute'
-
-for i in range(10):
-    try:
-        response = requests.post(POST_URL, json={'kernel_id': kernel_id, 'code': code})
-        if '500: Internal Server Error' not in response.text:
-            print(response.text)
-            break
-    except requests.exceptions.ConnectionError:
-        pass
-    time.sleep(2)
-else:
-    print('Failed to connect to the Jupyter server')
+#!/bin/bash
+# Run the Python script with the specified interpreter
+$OPENDEVIN_PYTHON_INTERPRETER /opendevin/plugins/jupyter/execute_cli.py

+ 26 - 0
opendevin/runtime/plugins/jupyter/execute_cli.py

@@ -0,0 +1,26 @@
+import os
+import sys
+import time
+
+import requests
+
+# Read the Python code from STDIN
+code = sys.stdin.read()
+
+# Set the default kernel ID
+kernel_id = 'default'
+
+PORT = os.environ.get('JUPYTER_EXEC_SERVER_PORT')
+POST_URL = f'http://localhost:{PORT}/execute'
+
+for i in range(10):
+    try:
+        response = requests.post(POST_URL, json={'kernel_id': kernel_id, 'code': code})
+        if '500: Internal Server Error' not in response.text:
+            print(response.text)
+            break
+    except requests.exceptions.ConnectionError:
+        pass
+    time.sleep(2)
+else:
+    print('Failed to connect to the Jupyter server')

+ 5 - 0
opendevin/runtime/plugins/jupyter/setup.sh

@@ -7,15 +7,20 @@ source ~/.bashrc
 echo 'export PATH=$PATH:/opendevin/plugins/jupyter' >> ~/.bashrc
 export PATH=/opendevin/plugins/jupyter:$PATH
 
+# Capture the path of the currently active Python interpreter
+OPENDEVIN_PYTHON_INTERPRETER=$(which python3)
+
 # if user name is `opendevin`, add '/home/opendevin/.local/bin' to PATH
 if [ "$USER" = "opendevin" ]; then
     echo 'export PATH=$PATH:/home/opendevin/.local/bin' >> ~/.bashrc
+    echo "export OPENDEVIN_PYTHON_INTERPRETER=$OPENDEVIN_PYTHON_INTERPRETER" >> ~/.bashrc
     export PATH=$PATH:/home/opendevin/.local/bin
     export PIP_CACHE_DIR=$HOME/.cache/pip
 fi
 # if user name is `root`, add '/root/.local/bin' to PATH
 if [ "$USER" = "root" ]; then
     echo 'export PATH=$PATH:/root/.local/bin' >> ~/.bashrc
+    echo "export OPENDEVIN_PYTHON_INTERPRETER=$OPENDEVIN_PYTHON_INTERPRETER" >> ~/.bashrc
     export PATH=$PATH:/root/.local/bin
     export PIP_CACHE_DIR=$HOME/.cache/pip
 

+ 5 - 0
opendevin/runtime/plugins/mixin.py

@@ -22,6 +22,9 @@ class PluginMixin:
     def init_plugins(self: SandboxProtocol, requirements: list[PluginRequirement]):
         """Load a plugin into the sandbox."""
 
+        if hasattr(self, 'plugin_initialized') and self.plugin_initialized:
+            return
+
         # clean-up ~/.bashrc and touch ~/.bashrc
         exit_code, output = self.execute('rm -f ~/.bashrc && touch ~/.bashrc')
 
@@ -65,3 +68,5 @@ class PluginMixin:
                     f'Failed to source ~/.bashrc with exit code {exit_code} and output: {output}'
                 )
             logger.info('Sourced ~/.bashrc successfully')
+
+        self.plugin_initialized = True