Просмотр исходного кода

Support Instance Level Images for SWE-Bench Evaluation (#2874)

* rename pulled instance images

* Swebench: add support to instance level images

* Update evaluation/swe_bench/run_infer.py

Co-authored-by: Xingyao Wang <xingyao6@illinois.edu>

* instance swebench: use env var and docker tags instead

* swebench disable instance report for instance images

* Update evaluation/swe_bench/README.md

Co-authored-by: Xingyao Wang <xingyao6@illinois.edu>

---------

Co-authored-by: Xingyao Wang <xingyao6@illinois.edu>
Jiayi Pan 1 год назад
Родитель
Commit
7111e8ee14

+ 15 - 1
evaluation/swe_bench/README.md

@@ -75,6 +75,7 @@ Make sure your Docker daemon is running, and you have pulled the `eval-swe-bench
 docker image. Then run this python script:
 
 ```bash
+# export USE_INSTANCE_IMAGE=true # if you want to test support for instance-level docker images
 poetry run python evaluation/swe_bench/swe_env_box.py
 ```
 
@@ -85,7 +86,7 @@ If you see an error, please make sure your `config.toml` contains all
 ## Run Inference on SWE-Bench Instances
 
 ```bash
-./evaluation/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit]
+./evaluation/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers]
 # e.g., ./evaluation/swe_bench/scripts/run_infer.sh eval_gpt4_1106_preview_llm HEAD CodeActAgent 300
 ```
 
@@ -104,7 +105,20 @@ to `CodeActAgent`.
 default, the script evaluates the entire SWE-bench_Lite test set (300 issues). Note:
 in order to use `eval_limit`, you must also set `agent`.
 
+`max_iter`, e.g. `20`, is the maximum number of iterations for the agent to run. By
+default, it is set to 30.
+
+`num_workers`, e.g. `3`, is the number of parallel workers to run the evaluation. By
+default, it is set to 1.
+
+There are also two optional environment variables you can set.
+```bash
+export USE_HINT_TEXT=true # if you want to use hint text in the evaluation. Ignore this if you are not sure.
+export USE_INSTANCE_IMAGE=true # if you want to use instance-level docker images
+```
+
 Let's say you'd like to run 10 instances using `eval_gpt4_1106_preview_llm` and CodeActAgent,
+
 then your command would be:
 
 ```bash

+ 40 - 29
evaluation/swe_bench/run_infer.py

@@ -27,6 +27,7 @@ from opendevin.core.main import run_agent_controller
 from opendevin.llm.llm import LLM
 
 USE_HINT_TEXT = os.environ.get('USE_HINT_TEXT', 'false') == 'true'
+USE_INSTANCE_IMAGE = os.environ.get('USE_INSTANCE_IMAGE', 'false') == 'true'
 
 AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
     'CodeActAgent': codeact_user_response,
@@ -123,37 +124,45 @@ def get_test_result(instance, sandbox, workspace_dir_name):
     else:
         test_result['metadata']['5_reformat_instance_json_success'] = True
 
-    # Get the instance report
-    err_code, output = sandbox.execute(
-        (
-            'cd /swe_util/OD-SWE-bench '
-            '&& export PYTHONPATH=$(pwd):$PYTHONPATH '
-            '&& conda run -n swe-bench-eval python swebench/metrics/get_instance_report.py --swe_bench_task /workspace/instance.json --log_path /workspace/$SWE_INSTANCE_ID.log'
-        )
-    )
-    if err_code != 0:
-        logger.error(f'Error getting instance report: {output}')
+    if USE_INSTANCE_IMAGE:
+        # instance report is not supported in instance image mode
         test_result['metadata']['6_get_instance_report_success'] = False
-        test_result['metadata']['6_get_instance_report_error'] = output
+        test_result['metadata']['6_get_instance_report_error'] = (
+            'Instance report is not supported in instance image mode.'
+        )
+
     else:
-        test_result['metadata']['6_get_instance_report_success'] = True
-        test_result['result_raw'] = output
-
-        # try to parse output
-        for line in output.strip().split('\n'):
-            line = line.strip('-')
-            try:
-                key, value = line.split(':')
-            except ValueError:
-                # skip this line
-                print(f'Error parsing result line: {line}')
-                continue
-            value = value.strip()
-            try:
-                value = int(value)
-            except ValueError:
-                pass
-            test_result['result'][key.strip()] = value
+        # Get the instance report
+        err_code, output = sandbox.execute(
+            (
+                'cd /swe_util/OD-SWE-bench '
+                '&& export PYTHONPATH=$(pwd):$PYTHONPATH '
+                '&& conda run -n swe-bench-eval python swebench/metrics/get_instance_report.py --swe_bench_task /workspace/instance.json --log_path /workspace/$SWE_INSTANCE_ID.log'
+            )
+        )
+        if err_code != 0:
+            logger.error(f'Error getting instance report: {output}')
+            test_result['metadata']['6_get_instance_report_success'] = False
+            test_result['metadata']['6_get_instance_report_error'] = output
+        else:
+            test_result['metadata']['6_get_instance_report_success'] = True
+            test_result['result_raw'] = output
+
+            # try to parse output
+            for line in output.strip().split('\n'):
+                line = line.strip('-')
+                try:
+                    key, value = line.split(':')
+                except ValueError:
+                    # skip this line
+                    print(f'Error parsing result line: {line}')
+                    continue
+                value = value.strip()
+                try:
+                    value = int(value)
+                except ValueError:
+                    pass
+                test_result['result'][key.strip()] = value
     return test_result
 
 
@@ -189,6 +198,7 @@ def process_instance(
         # Remove all existing handlers from logger
         for handler in logger.handlers[:]:
             logger.removeHandler(handler)
+        os.makedirs(os.path.dirname(log_file), exist_ok=True)
         file_handler = logging.FileHandler(log_file)
         file_handler.setFormatter(
             logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
@@ -205,6 +215,7 @@ def process_instance(
         workspace_dir_name,
         workspace_mount_path=workspace_mount_path,
         sandbox_plugins=agenthub.Agent.get_cls(metadata.agent_class).sandbox_plugins,
+        use_instance_image=USE_INSTANCE_IMAGE,
     )
 
     # Prepare instruction

+ 3 - 1
evaluation/swe_bench/scripts/docker/pull_all_eval_docker.sh

@@ -43,5 +43,7 @@ echo "Image file: $IMAGE_FILE"
 grep "$PATTERN" "$IMAGE_FILE" | while IFS= read -r image; do
     echo "Pulling $NAMESPACE/$image into $image"
     docker pull $NAMESPACE/$image
-    docker tag $NAMESPACE/$image $image
+    # replace _s_ with __ in the image name
+    renamed_image=$(echo "$image" | sed 's/_s_/__/g')
+    docker tag $NAMESPACE/$image $renamed_image
 done

+ 8 - 0
evaluation/swe_bench/scripts/run_infer.sh

@@ -26,6 +26,14 @@ if [ -z "$MAX_ITER" ]; then
   MAX_ITER=30
 fi
 
+if [ -z "$USE_INSTANCE_IMAGE" ]; then
+  echo "USE_INSTANCE_IMAGE not specified, use default false"
+  USE_INSTANCE_IMAGE=false
+fi
+
+export USE_INSTANCE_IMAGE=$USE_INSTANCE_IMAGE
+echo "USE_INSTANCE_IMAGE: $USE_INSTANCE_IMAGE"
+
 get_agent_version
 
 echo "AGENT: $AGENT"

+ 77 - 0
evaluation/swe_bench/scripts/setup/instance_swe_entry.sh

@@ -0,0 +1,77 @@
+#!/bin/bash
+
+# set -e
+
+# assert user name is `root`
+if [ "$USER" != "root" ]; then
+    echo "Error: This script is intended to be run by the 'root' user only." >&2
+    exit 1
+fi
+
+source ~/.bashrc
+
+SWEUTIL_DIR=/swe_util
+
+# Create logs directory
+LOG_DIR=/opendevin/logs
+mkdir -p $LOG_DIR && chmod 777 $LOG_DIR
+
+# FIXME: Cannot read SWE_INSTANCE_ID from the environment variable
+# SWE_INSTANCE_ID=django__django-11099
+if [ -z "$SWE_INSTANCE_ID" ]; then
+    echo "Error: SWE_INSTANCE_ID is not set." >&2
+    exit 1
+fi
+
+# Read the swe-bench-test-lite.json file and extract the required item based on instance_id
+item=$(jq --arg INSTANCE_ID "$SWE_INSTANCE_ID" '.[] | select(.instance_id == $INSTANCE_ID)' $SWEUTIL_DIR/eval_data/instances/swe-bench-instance.json)
+
+if [[ -z "$item" ]]; then
+  echo "No item found for the provided instance ID."
+  exit 1
+fi
+
+WORKSPACE_NAME=$(echo "$item" | jq -r '.repo + "__" + .version | gsub("/"; "__")')
+
+echo "WORKSPACE_NAME: $WORKSPACE_NAME"
+
+SWE_TASK_DIR=/opendevin/swe_tasks
+mkdir -p $SWE_TASK_DIR
+# Dump test_patch to /workspace/test.patch
+echo "$item" | jq -r '.test_patch' > $SWE_TASK_DIR/test.patch
+# Dump patch to /workspace/gold.patch
+echo "$item" | jq -r '.patch' > $SWE_TASK_DIR/gold.patch
+# Dump the item to /workspace/instance.json except for the "test_patch" and "patch" fields
+echo "$item" | jq 'del(.test_patch, .patch)' > $SWE_TASK_DIR/instance.json
+
+# Clear the workspace
+rm -rf /workspace/*
+# Copy repo to workspace
+if [ -d /workspace/$WORKSPACE_NAME ]; then
+    rm -rf /workspace/$WORKSPACE_NAME
+fi
+cp -r /testbed/ /workspace/$WORKSPACE_NAME/
+
+# Reset swe-bench testbed and install the repo
+. /opt/miniconda3/etc/profile.d/conda.sh
+conda activate testbed
+
+mkdir -p $SWE_TASK_DIR/reset_testbed_temp
+mkdir -p $SWE_TASK_DIR/reset_testbed_log_dir
+
+REPO_PATH=/workspace/$WORKSPACE_NAME
+echo "Repo Path: $REPO_PATH"
+echo "Test Command: $TEST_CMD"
+echo "export REPO_PATH=\"$REPO_PATH\"" >> ~/.bashrc
+# echo "export TEST_CMD=\"$TEST_CMD\"" >> ~/.bashrc
+
+if [[ "$REPO_PATH" == "None" ]]; then
+    echo "Error: Failed to retrieve repository path. Tests may not have passed or output was not as expected." >&2
+    exit 1
+fi
+
+# Activate instance-specific environment
+. /opt/miniconda3/etc/profile.d/conda.sh
+conda activate testbed
+
+# set +e

+ 78 - 6
evaluation/swe_bench/swe_env_box.py

@@ -1,7 +1,12 @@
+import json
+import os
 import sys
+import tempfile
 import uuid
 
 from datasets import load_dataset
+from swebench.harness.constants import MAP_REPO_TO_TEST_FRAMEWORK
+from swebench.harness.utils import get_test_directives
 
 from opendevin.core.config import config
 from opendevin.core.logger import opendevin_logger as logger
@@ -15,6 +20,10 @@ from opendevin.runtime.plugins import (
 SWE_BENCH_CONTAINER_IMAGE = 'ghcr.io/opendevin/eval-swe-bench:full-v1.2.1'
 
 
+def get_image_name_from_instance_id(instance_id: str) -> str:
+    return 'sweb.eval.x86_64.' + instance_id
+
+
 class SWEBenchSSHBox(DockerSSHBox):
     def __init__(
         self,
@@ -26,6 +35,7 @@ class SWEBenchSSHBox(DockerSSHBox):
         skip_workspace_mount: bool = True,
         sandbox_plugins: list[PluginRequirement] = [],  # noqa: B006
         workspace_dir_name: str | None = None,
+        use_instance_image: bool = False,
     ):
         if swe_instance_id is None:
             raise ValueError('swe_instance_id must be provided!')
@@ -39,6 +49,7 @@ class SWEBenchSSHBox(DockerSSHBox):
         ), 'container_image is required for SWEBenchSSHBox!'
         # Need to run as root to use SWEBench container
         sid = f'swe_bench_{swe_instance_id}_' + str(uuid.uuid4())
+        logger.info(f'===Using container image: {container_image}')
         super().__init__(container_image, timeout, sid)
         self.init_plugins(sandbox_plugins)
 
@@ -54,11 +65,61 @@ class SWEBenchSSHBox(DockerSSHBox):
         logger.info(
             'Initialization of SWEBench may take approximately 10 minutes due to long-running installations, such as those requiring compilation.'
         )
-        exit_code, output = self.execute('source /swe_util/swe_entry.sh', timeout=600)
-        logger.info('exit code: %d', exit_code)
-        logger.info(output)
-        assert exit_code == 0, f'Failed to source swe_entry.sh: {output}'
-        logger.info('Sourced swe_entry.sh successfully')
+        logger.info(f'Use instance image: {use_instance_image}')
+        if use_instance_image:
+            # we directly inject the instance info into the container and the init script
+            script_dir = os.path.dirname(__file__)
+
+            # inject test command
+            test_type = MAP_REPO_TO_TEST_FRAMEWORK[swe_instance['repo']][
+                swe_instance['version']
+            ]
+            swe_instance['test_directives'] = get_test_directives(swe_instance)
+            swe_instance['test_cmd'] = (
+                f"{test_type} {' '.join(swe_instance['test_directives'])}"
+            )
+            exit_code, output = self.execute(
+                f"""echo "export TEST_CMD='{swe_instance["test_cmd"]}'" >> ~/.bashrc"""
+            )
+            # assert exit_code == 0, f'Failed to set TEST_CMD in ~/.bashrc: {output}'
+
+            # inject the instance info
+            self.execute('mkdir -p /swe_util/eval_data/instances')
+            swe_instance_json_name = 'swe-bench-instance.json'
+            with tempfile.TemporaryDirectory() as temp_dir:
+                # Construct the full path for the desired file name within the temporary directory
+                temp_file_path = os.path.join(temp_dir, swe_instance_json_name)
+                # Write to the file with the desired name within the temporary directory
+                with open(temp_file_path, 'w') as f:
+                    if not isinstance(swe_instance, dict):
+                        json.dump([swe_instance.to_dict()], f)
+                    else:
+                        json.dump([swe_instance], f)
+
+                # Copy the file to the desired location
+                self.copy_to(temp_file_path, '/swe_util/eval_data/instances/')
+
+            # inject the init script
+            self.copy_to(
+                str(os.path.join(script_dir, 'scripts/setup/instance_swe_entry.sh')),
+                '/swe_util/',
+            )
+            self.execute('cat ~/.bashrc')
+            self.execute('source ~/.bashrc')
+
+            self.execute('source /swe_util/instance_swe_entry.sh', timeout=600)
+            logger.info('exit code: %d', exit_code)
+            logger.info(output)
+            assert exit_code == 0, f'Failed to source swe_entry.sh: {output}'
+            logger.info('Sourced swe_entry.sh successfully')
+        else:
+            exit_code, output = self.execute(
+                'source /swe_util/swe_entry.sh', timeout=600
+            )
+            logger.info('exit code: %d', exit_code)
+            logger.info(output)
+            assert exit_code == 0, f'Failed to source swe_entry.sh: {output}'
+            logger.info('Sourced swe_entry.sh successfully')
 
     @property
     def volumes(self):
@@ -78,6 +139,7 @@ class SWEBenchSSHBox(DockerSSHBox):
         skip_workspace_mount: bool = True,
         workspace_mount_path: str | None = None,
         sandbox_plugins: list[PluginRequirement] = [],  # noqa: B006
+        use_instance_image: bool = False,
     ) -> 'SWEBenchSSHBox':
         if workspace_dir_name is None:
             workspace_dir_name = f"{instance['repo']}__{instance['version']}".replace(
@@ -94,13 +156,20 @@ class SWEBenchSSHBox(DockerSSHBox):
             config.enable_auto_lint = True
             # Need to run as root to use SWEBench container
             config.run_as_devin = False
+            if use_instance_image:
+                container_image = get_image_name_from_instance_id(
+                    instance['instance_id']
+                )
+            else:
+                container_image = SWE_BENCH_CONTAINER_IMAGE
             sandbox = cls(
-                container_image=SWE_BENCH_CONTAINER_IMAGE,
+                container_image=container_image,
                 swe_instance_id=instance['instance_id'],
                 swe_instance=instance,
                 skip_workspace_mount=skip_workspace_mount,
                 sandbox_plugins=sandbox_plugins,
                 workspace_dir_name=workspace_dir_name,
+                use_instance_image=use_instance_image,
             )
             logger.info(f"SSH box started for instance {instance['instance_id']}.")
 
@@ -163,6 +232,8 @@ if __name__ == '__main__':
     # so we don't need to manage file uploading to OpenDevin's repo
     dataset = load_dataset('princeton-nlp/SWE-bench_Lite')
     swe_bench_tests = dataset['test'].to_pandas()
+    USE_INSTANCE_IMAGE = os.environ.get('USE_INSTANCE_IMAGE', 'false') == 'true'
+    logger.info(f'USE_INSTANCE_IMAGE: {USE_INSTANCE_IMAGE}')
 
     # INSTANCE_ID = 'django__django-11099'
     INSTANCE_ID = 'astropy__astropy-12907'
@@ -172,6 +243,7 @@ if __name__ == '__main__':
     sandbox = SWEBenchSSHBox.get_box_for_instance(
         instance=EXAMPLE_INSTANCE,
         sandbox_plugins=[AgentSkillsRequirement(), JupyterRequirement()],
+        use_instance_image=USE_INSTANCE_IMAGE,
     )
 
     # PRE TEST