Просмотр исходного кода

Support Instance Level Images for SWE-Bench Evaluation (#2874)

* rename pulled instance images

* Swebench: add support to instance level images

* Update evaluation/swe_bench/run_infer.py

Co-authored-by: Xingyao Wang <xingyao6@illinois.edu>

* instance swebench: use env var and docker tags instead

* swebench disable instance report for instance images

* Update evaluation/swe_bench/README.md

Co-authored-by: Xingyao Wang <xingyao6@illinois.edu>

---------

Co-authored-by: Xingyao Wang <xingyao6@illinois.edu>
Jiayi Pan 1 год назад
Родитель
Commit
7111e8ee14

+ 15 - 1
evaluation/swe_bench/README.md

@@ -75,6 +75,7 @@ Make sure your Docker daemon is running, and you have pulled the `eval-swe-bench
 docker image. Then run this python script:
 
 ```bash
+# export USE_INSTANCE_IMAGE=true # if you want to test support for instance-level docker images
 poetry run python evaluation/swe_bench/swe_env_box.py
 ```
 
@@ -85,7 +86,7 @@ If you see an error, please make sure your `config.toml` contains all
 ## Run Inference on SWE-Bench Instances
 
 ```bash
-./evaluation/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit]
+./evaluation/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers]
 # e.g., ./evaluation/swe_bench/scripts/run_infer.sh eval_gpt4_1106_preview_llm HEAD CodeActAgent 300
 ```
 
@@ -104,7 +105,20 @@ to `CodeActAgent`.
 default, the script evaluates the entire SWE-bench_Lite test set (300 issues). Note:
 in order to use `eval_limit`, you must also set `agent`.
 
+`max_iter`, e.g. `20`, is the maximum number of iterations for the agent to run. By
+default, it is set to 30.
+
+`num_workers`, e.g. `3`, is the number of parallel workers to run the evaluation. By
+default, it is set to 1.
+
+There are also two optional environment variables you can set.
+```bash
+export USE_HINT_TEXT=true # if you want to use hint text in the evaluation. Ignore this if you are not sure.
+export USE_INSTANCE_IMAGE=true # if you want to use instance-level docker images
+```
+
 Let's say you'd like to run 10 instances using `eval_gpt4_1106_preview_llm` and CodeActAgent,
+
 then your command would be:
 
 ```bash

+ 40 - 29
evaluation/swe_bench/run_infer.py

@@ -27,6 +27,7 @@ from opendevin.core.main import run_agent_controller
 from opendevin.llm.llm import LLM
 
 USE_HINT_TEXT = os.environ.get('USE_HINT_TEXT', 'false') == 'true'
+USE_INSTANCE_IMAGE = os.environ.get('USE_INSTANCE_IMAGE', 'false') == 'true'
 
 AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
     'CodeActAgent': codeact_user_response,
@@ -123,37 +124,45 @@ def get_test_result(instance, sandbox, workspace_dir_name):
     else:
         test_result['metadata']['5_reformat_instance_json_success'] = True
 
-    # Get the instance report
-    err_code, output = sandbox.execute(
-        (
-            'cd /swe_util/OD-SWE-bench '
-            '&& export PYTHONPATH=$(pwd):$PYTHONPATH '
-            '&& conda run -n swe-bench-eval python swebench/metrics/get_instance_report.py --swe_bench_task /workspace/instance.json --log_path /workspace/$SWE_INSTANCE_ID.log'
-        )
-    )
-    if err_code != 0:
-        logger.error(f'Error getting instance report: {output}')
+    if USE_INSTANCE_IMAGE:
+        # instance report is not supported in instance image mode
         test_result['metadata']['6_get_instance_report_success'] = False
-        test_result['metadata']['6_get_instance_report_error'] = output
+        test_result['metadata']['6_get_instance_report_error'] = (
+            'Instance report is not supported in instance image mode.'
+        )
+
     else:
-        test_result['metadata']['6_get_instance_report_success'] = True
-        test_result['result_raw'] = output
-
-        # try to parse output
-        for line in output.strip().split('\n'):
-            line = line.strip('-')
-            try:
-                key, value = line.split(':')
-            except ValueError:
-                # skip this line
-                print(f'Error parsing result line: {line}')
-                continue
-            value = value.strip()
-            try:
-                value = int(value)
-            except ValueError:
-                pass
-            test_result['result'][key.strip()] = value
+        # Get the instance report
+        err_code, output = sandbox.execute(
+            (
+                'cd /swe_util/OD-SWE-bench '
+                '&& export PYTHONPATH=$(pwd):$PYTHONPATH '
+                '&& conda run -n swe-bench-eval python swebench/metrics/get_instance_report.py --swe_bench_task /workspace/instance.json --log_path /workspace/$SWE_INSTANCE_ID.log'
+            )
+        )
+        if err_code != 0:
+            logger.error(f'Error getting instance report: {output}')
+            test_result['metadata']['6_get_instance_report_success'] = False
+            test_result['metadata']['6_get_instance_report_error'] = output
+        else:
+            test_result['metadata']['6_get_instance_report_success'] = True
+            test_result['result_raw'] = output
+
+            # try to parse output
+            for line in output.strip().split('\n'):
+                line = line.strip('-')
+                try:
+                    key, value = line.split(':')
+                except ValueError:
+                    # skip this line
+                    print(f'Error parsing result line: {line}')
+                    continue
+                value = value.strip()
+                try:
+                    value = int(value)
+                except ValueError:
+                    pass
+                test_result['result'][key.strip()] = value
     return test_result
 
 
@@ -189,6 +198,7 @@ def process_instance(
         # Remove all existing handlers from logger
         for handler in logger.handlers[:]:
             logger.removeHandler(handler)
+        os.makedirs(os.path.dirname(log_file), exist_ok=True)
         file_handler = logging.FileHandler(log_file)
         file_handler.setFormatter(
             logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
@@ -205,6 +215,7 @@ def process_instance(
         workspace_dir_name,
         workspace_mount_path=workspace_mount_path,
         sandbox_plugins=agenthub.Agent.get_cls(metadata.agent_class).sandbox_plugins,
+        use_instance_image=USE_INSTANCE_IMAGE,
     )
 
     # Prepare instruction

+ 3 - 1
evaluation/swe_bench/scripts/docker/pull_all_eval_docker.sh

@@ -43,5 +43,7 @@ echo "Image file: $IMAGE_FILE"
 grep "$PATTERN" "$IMAGE_FILE" | while IFS= read -r image; do
     echo "Pulling $NAMESPACE/$image into $image"
     docker pull $NAMESPACE/$image
-    docker tag $NAMESPACE/$image $image
+    # replace _s_ with __ in the image name
+    renamed_image=$(echo "$image" | sed 's/_s_/__/g')
+    docker tag $NAMESPACE/$image $renamed_image
 done

+ 8 - 0
evaluation/swe_bench/scripts/run_infer.sh

@@ -26,6 +26,14 @@ if [ -z "$MAX_ITER" ]; then
   MAX_ITER=30
 fi
 
+if [ -z "$USE_INSTANCE_IMAGE" ]; then
+  echo "USE_INSTANCE_IMAGE not specified, use default false"
+  USE_INSTANCE_IMAGE=false
+fi
+
+export USE_INSTANCE_IMAGE=$USE_INSTANCE_IMAGE
+echo "USE_INSTANCE_IMAGE: $USE_INSTANCE_IMAGE"
+
 get_agent_version
 
 echo "AGENT: $AGENT"

+ 77 - 0
evaluation/swe_bench/scripts/setup/instance_swe_entry.sh

@@ -0,0 +1,77 @@
+#!/bin/bash
+
+# set -e
+
+# assert user name is `root`
+if [ "$USER" != "root" ]; then
+    echo "Error: This script is intended to be run by the 'root' user only." >&2
+    exit 1
+fi
+
+source ~/.bashrc
+
+SWEUTIL_DIR=/swe_util
+
+# Create logs directory
+LOG_DIR=/opendevin/logs
+mkdir -p $LOG_DIR && chmod 777 $LOG_DIR
+
+# FIXME: Cannot read SWE_INSTANCE_ID from the environment variable
+# SWE_INSTANCE_ID=django__django-11099
+if [ -z "$SWE_INSTANCE_ID" ]; then
+    echo "Error: SWE_INSTANCE_ID is not set." >&2
+    exit 1
+fi
+
+# Read the swe-bench-test-lite.json file and extract the required item based on instance_id
+item=$(jq --arg INSTANCE_ID "$SWE_INSTANCE_ID" '.[] | select(.instance_id == $INSTANCE_ID)' $SWEUTIL_DIR/eval_data/instances/swe-bench-instance.json)
+
+if [[ -z "$item" ]]; then
+  echo "No item found for the provided instance ID."
+  exit 1
+fi
+
+WORKSPACE_NAME=$(echo "$item" | jq -r '.repo + "__" + .version | gsub("/"; "__")')
+
+echo "WORKSPACE_NAME: $WORKSPACE_NAME"
+
+SWE_TASK_DIR=/opendevin/swe_tasks
+mkdir -p $SWE_TASK_DIR
+# Dump test_patch to /workspace/test.patch
+echo "$item" | jq -r '.test_patch' > $SWE_TASK_DIR/test.patch
+# Dump patch to /workspace/gold.patch
+echo "$item" | jq -r '.patch' > $SWE_TASK_DIR/gold.patch
+# Dump the item to /workspace/instance.json except for the "test_patch" and "patch" fields
+echo "$item" | jq 'del(.test_patch, .patch)' > $SWE_TASK_DIR/instance.json
+
+# Clear the workspace
+rm -rf /workspace/*
+# Copy repo to workspace
+if [ -d /workspace/$WORKSPACE_NAME ]; then
+    rm -rf /workspace/$WORKSPACE_NAME
+fi
+cp -r /testbed/ /workspace/$WORKSPACE_NAME/
+
+# Reset swe-bench testbed and install the repo
+. /opt/miniconda3/etc/profile.d/conda.sh
+conda activate testbed
+
+mkdir -p $SWE_TASK_DIR/reset_testbed_temp
+mkdir -p $SWE_TASK_DIR/reset_testbed_log_dir
+
+REPO_PATH=/workspace/$WORKSPACE_NAME
+echo "Repo Path: $REPO_PATH"
+echo "Test Command: $TEST_CMD"
+echo "export REPO_PATH=\"$REPO_PATH\"" >> ~/.bashrc
+# echo "export TEST_CMD=\"$TEST_CMD\"" >> ~/.bashrc
+
+if [[ "$REPO_PATH" == "None" ]]; then
+    echo "Error: Failed to retrieve repository path. Tests may not have passed or output was not as expected." >&2
+    exit 1
+fi
+
+# Activate instance-specific environment
+. /opt/miniconda3/etc/profile.d/conda.sh
+conda activate testbed
+
+# set +e

+ 78 - 6
evaluation/swe_bench/swe_env_box.py

@@ -1,7 +1,12 @@
+import json
+import os
 import sys
+import tempfile
 import uuid
 
 from datasets import load_dataset
+from swebench.harness.constants import MAP_REPO_TO_TEST_FRAMEWORK
+from swebench.harness.utils import get_test_directives
 
 from opendevin.core.config import config
 from opendevin.core.logger import opendevin_logger as logger
@@ -15,6 +20,10 @@ from opendevin.runtime.plugins import (
 SWE_BENCH_CONTAINER_IMAGE = 'ghcr.io/opendevin/eval-swe-bench:full-v1.2.1'
 
 
+def get_image_name_from_instance_id(instance_id: str) -> str:
+    return 'sweb.eval.x86_64.' + instance_id
+
+
 class SWEBenchSSHBox(DockerSSHBox):
     def __init__(
         self,
@@ -26,6 +35,7 @@ class SWEBenchSSHBox(DockerSSHBox):
         skip_workspace_mount: bool = True,
         sandbox_plugins: list[PluginRequirement] = [],  # noqa: B006
         workspace_dir_name: str | None = None,
+        use_instance_image: bool = False,
     ):
         if swe_instance_id is None:
             raise ValueError('swe_instance_id must be provided!')
@@ -39,6 +49,7 @@ class SWEBenchSSHBox(DockerSSHBox):
         ), 'container_image is required for SWEBenchSSHBox!'
         # Need to run as root to use SWEBench container
         sid = f'swe_bench_{swe_instance_id}_' + str(uuid.uuid4())
+        logger.info(f'===Using container image: {container_image}')
         super().__init__(container_image, timeout, sid)
         self.init_plugins(sandbox_plugins)
 
@@ -54,11 +65,61 @@ class SWEBenchSSHBox(DockerSSHBox):
         logger.info(
             'Initialization of SWEBench may take approximately 10 minutes due to long-running installations, such as those requiring compilation.'
         )
-        exit_code, output = self.execute('source /swe_util/swe_entry.sh', timeout=600)
-        logger.info('exit code: %d', exit_code)
-        logger.info(output)
-        assert exit_code == 0, f'Failed to source swe_entry.sh: {output}'
-        logger.info('Sourced swe_entry.sh successfully')
+        logger.info(f'Use instance image: {use_instance_image}')
+        if use_instance_image:
+            # we directly inject the instance info into the container and the init script
+            script_dir = os.path.dirname(__file__)
+
+            # inject test command
+            test_type = MAP_REPO_TO_TEST_FRAMEWORK[swe_instance['repo']][
+                swe_instance['version']
+            ]
+            swe_instance['test_directives'] = get_test_directives(swe_instance)
+            swe_instance['test_cmd'] = (
+                f"{test_type} {' '.join(swe_instance['test_directives'])}"
+            )
+            exit_code, output = self.execute(
+                f"""echo "export TEST_CMD='{swe_instance["test_cmd"]}'" >> ~/.bashrc"""
+            )
+            # assert exit_code == 0, f'Failed to set TEST_CMD in ~/.bashrc: {output}'
+
+            # inject the instance info
+            self.execute('mkdir -p /swe_util/eval_data/instances')
+            swe_instance_json_name = 'swe-bench-instance.json'
+            with tempfile.TemporaryDirectory() as temp_dir:
+                # Construct the full path for the desired file name within the temporary directory
+                temp_file_path = os.path.join(temp_dir, swe_instance_json_name)
+                # Write to the file with the desired name within the temporary directory
+                with open(temp_file_path, 'w') as f:
+                    if not isinstance(swe_instance, dict):
+                        json.dump([swe_instance.to_dict()], f)
+                    else:
+                        json.dump([swe_instance], f)
+
+                # Copy the file to the desired location
+                self.copy_to(temp_file_path, '/swe_util/eval_data/instances/')
+
+            # inject the init script
+            self.copy_to(
+                str(os.path.join(script_dir, 'scripts/setup/instance_swe_entry.sh')),
+                '/swe_util/',
+            )
+            self.execute('cat ~/.bashrc')
+            self.execute('source ~/.bashrc')
+
+            self.execute('source /swe_util/instance_swe_entry.sh', timeout=600)
+            logger.info('exit code: %d', exit_code)
+            logger.info(output)
+            assert exit_code == 0, f'Failed to source swe_entry.sh: {output}'
+            logger.info('Sourced swe_entry.sh successfully')
+        else:
+            exit_code, output = self.execute(
+                'source /swe_util/swe_entry.sh', timeout=600
+            )
+            logger.info('exit code: %d', exit_code)
+            logger.info(output)
+            assert exit_code == 0, f'Failed to source swe_entry.sh: {output}'
+            logger.info('Sourced swe_entry.sh successfully')
 
     @property
     def volumes(self):
@@ -78,6 +139,7 @@ class SWEBenchSSHBox(DockerSSHBox):
         skip_workspace_mount: bool = True,
         workspace_mount_path: str | None = None,
         sandbox_plugins: list[PluginRequirement] = [],  # noqa: B006
+        use_instance_image: bool = False,
     ) -> 'SWEBenchSSHBox':
         if workspace_dir_name is None:
             workspace_dir_name = f"{instance['repo']}__{instance['version']}".replace(
@@ -94,13 +156,20 @@ class SWEBenchSSHBox(DockerSSHBox):
             config.enable_auto_lint = True
             # Need to run as root to use SWEBench container
             config.run_as_devin = False
+            if use_instance_image:
+                container_image = get_image_name_from_instance_id(
+                    instance['instance_id']
+                )
+            else:
+                container_image = SWE_BENCH_CONTAINER_IMAGE
             sandbox = cls(
-                container_image=SWE_BENCH_CONTAINER_IMAGE,
+                container_image=container_image,
                 swe_instance_id=instance['instance_id'],
                 swe_instance=instance,
                 skip_workspace_mount=skip_workspace_mount,
                 sandbox_plugins=sandbox_plugins,
                 workspace_dir_name=workspace_dir_name,
+                use_instance_image=use_instance_image,
             )
             logger.info(f"SSH box started for instance {instance['instance_id']}.")
 
@@ -163,6 +232,8 @@ if __name__ == '__main__':
     # so we don't need to manage file uploading to OpenDevin's repo
     dataset = load_dataset('princeton-nlp/SWE-bench_Lite')
     swe_bench_tests = dataset['test'].to_pandas()
+    USE_INSTANCE_IMAGE = os.environ.get('USE_INSTANCE_IMAGE', 'false') == 'true'
+    logger.info(f'USE_INSTANCE_IMAGE: {USE_INSTANCE_IMAGE}')
 
     # INSTANCE_ID = 'django__django-11099'
     INSTANCE_ID = 'astropy__astropy-12907'
@@ -172,6 +243,7 @@ if __name__ == '__main__':
     sandbox = SWEBenchSSHBox.get_box_for_instance(
         instance=EXAMPLE_INSTANCE,
         sandbox_plugins=[AgentSkillsRequirement(), JupyterRequirement()],
+        use_instance_image=USE_INSTANCE_IMAGE,
     )
 
     # PRE TEST