1 year ago · 6ff50ed369
--- a/evaluation/swe_bench/BUILD_TESTBED_AND_ENV.md
+++ b/evaluation/swe_bench/BUILD_TESTBED_AND_ENV.md
@@ -34,6 +34,6 @@ Run the following command to do the above two steps. The results will be saved t
 
				 
			
 
				 ```bash
			
 
				 pushd evaluation/swe_bench
			
 
				-docker build -t ghcr.io/opendevin/eval-swe-bench:full-v1.0 -f ./scripts/docker/Dockerfile.full.v1.0 .
			
 
				-docker push ghcr.io/opendevin/eval-swe-bench:full-v1.0
			
 
				+docker build -t ghcr.io/opendevin/eval-swe-bench:full-v1.1 -f ./scripts/docker/Dockerfile.full.v1.1 .
			
 
				+docker push ghcr.io/opendevin/eval-swe-bench:full-v1.1
			
 
				 ```
			
--- a/evaluation/swe_bench/EVAL_PATCH.md
+++ b/evaluation/swe_bench/EVAL_PATCH.md
@@ -117,7 +117,7 @@ Before evaluating generated patches, you need to set up the Docker environment.
 
				 ```shell
			
 
				 docker run -it \
			
 
				 -v DIR_TO_YOUR_PATCH_FILES_ON_HOST:/swe_bench_output \
			
 
				-ghcr.io/opendevin/eval-swe-bench:full-v1.0 /bin/bash
			
 
				+ghcr.io/opendevin/eval-swe-bench:full-v1.1 /bin/bash
			
 
				 ```
			
 
				 
			
 
				 ### Evaluate Model Generated Patches
			
--- a/evaluation/swe_bench/README.md
+++ b/evaluation/swe_bench/README.md
@@ -15,7 +15,7 @@ In [OpenDevin-SWE-Bench fork](https://github.com/OpenDevin/OD-SWE-bench.git) (mo
 
				 **We pack everything you need for SWE-Bench evaluation into one gigantic Docker image.** To use it:
			
 
				 
			
 
				 ```bash
			
 
				-docker pull ghcr.io/opendevin/eval-swe-bench:full-v1.0
			
 
				+docker pull ghcr.io/opendevin/eval-swe-bench:full-v1.1
			
 
				 ```
			
 
				 
			
 
				 The Docker image contains several important directories:
			
@@ -68,7 +68,7 @@ temperature = 0.0
 
				 
			
 
				 ## Test if your environment works
			
 
				 
			
 
				-Make sure your Docker daemon is running, and you have pulled the `eval-swe-bench:full-v1.0`
			
 
				+Make sure your Docker daemon is running, and you have pulled the `eval-swe-bench:full-v1.1`
			
 
				 docker image. Then run this python script:
			
 
				 
			
 
				 ```bash
			
--- a/evaluation/swe_bench/run_infer.py
+++ b/evaluation/swe_bench/run_infer.py
@@ -222,6 +222,8 @@ def process_instance(
 
				             logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
			
 
				         )
			
 
				         logger.addHandler(file_handler)
			
 
				+    else:
			
 
				+        logger.info(f'Starting evaluation for instance {instance.instance_id}.')
			
 
				 
			
 
				     if not skip_workspace_mount:
			
 
				         logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
			
--- a/evaluation/swe_bench/scripts/docker/Dockerfile.full.v1.1
+++ b/evaluation/swe_bench/scripts/docker/Dockerfile.full.v1.1
@@ -10,4 +10,4 @@ RUN --mount=type=bind,source=./eval_workspace,target=/eval_workspace \
 
				     /eval_workspace/ /swe_util/
			
 
				 
			
 
				 # pushd evaluation/swe_bench
			
 
				-# docker build -t ghcr.io/opendevin/eval-swe-bench:full-v1.0 -f ./scripts/docker/Dockerfile.full.v1.0 .
			
 
				+# docker build -t ghcr.io/opendevin/eval-swe-bench:full-v1.1 -f ./scripts/docker/Dockerfile.full.v1.1 .
			
--- a/evaluation/swe_bench/scripts/eval_infer.sh
+++ b/evaluation/swe_bench/scripts/eval_infer.sh
@@ -26,7 +26,7 @@ docker run --rm \
 
				     -e OD_SWE_BENCH=/swe_util/OD-SWE-bench \
			
 
				     -e EVAL_DATA_DIR=/swe_util/eval_data \
			
 
				     -w /swe_util \
			
 
				-    ghcr.io/opendevin/eval-swe-bench:full-v1.0 \
			
 
				+    ghcr.io/opendevin/eval-swe-bench:full-v1.1 \
			
 
				     bash -c "./get_agent_report.sh --output-file /swe_bench_output/$FILE_NAME \
			
 
				     --agent-name CodeActAgent \
			
 
				     --dataset swe-bench-test-lite \
			
--- a/evaluation/swe_bench/swe_env_box.py
+++ b/evaluation/swe_bench/swe_env_box.py
@@ -1,12 +1,14 @@
 
				 import sys
			
 
				 import uuid
			
 
				 
			
 
				+from datasets import load_dataset
			
 
				+
			
 
				 from opendevin.core.config import config
			
 
				 from opendevin.core.logger import opendevin_logger as logger
			
 
				 from opendevin.runtime.docker.ssh_box import DockerSSHBox
			
 
				 from opendevin.runtime.plugins import JupyterRequirement, SWEAgentCommandsRequirement
			
 
				 
			
 
				-SWE_BENCH_CONTAINER_IMAGE = 'ghcr.io/opendevin/eval-swe-bench:full-v1.0'
			
 
				+SWE_BENCH_CONTAINER_IMAGE = 'ghcr.io/opendevin/eval-swe-bench:full-v1.1'
			
 
				 
			
 
				 
			
 
				 class SWEBenchSSHBox(DockerSSHBox):
			
@@ -123,20 +125,15 @@ class SWEBenchSSHBox(DockerSSHBox):
 
				 
			
 
				 
			
 
				 if __name__ == '__main__':
			
 
				-    EXAMPLE_INSTANCE = {
			
 
				-        'repo': 'django/django',
			
 
				-        'instance_id': 'django__django-11099',
			
 
				-        'base_commit': 'd26b2424437dabeeca94d7900b37d2df4410da0c',
			
 
				-        'patch': "diff --git a/django/contrib/auth/validators.py b/django/contrib/auth/validators.py\n--- a/django/contrib/auth/validators.py\n+++ b/django/contrib/auth/validators.py\n@@ -7,7 +7,7 @@\n \n @deconstructible\n class ASCIIUsernameValidator(validators.RegexValidator):\n-    regex = r'^[\\w.@+-]+$'\n+    regex = r'^[\\w.@+-]+\\Z'\n     message = _(\n         'Enter a valid username. This value may contain only English letters, '\n         'numbers, and @/./+/-/_ characters.'\n@@ -17,7 +17,7 @@ class ASCIIUsernameValidator(validators.RegexValidator):\n \n @deconstructible\n class UnicodeUsernameValidator(validators.RegexValidator):\n-    regex = r'^[\\w.@+-]+$'\n+    regex = r'^[\\w.@+-]+\\Z'\n     message = _(\n         'Enter a valid username. This value may contain only letters, '\n         'numbers, and @/./+/-/_ characters.'\n",
			
 
				-        'test_patch': "diff --git a/tests/auth_tests/test_validators.py b/tests/auth_tests/test_validators.py\n--- a/tests/auth_tests/test_validators.py\n+++ b/tests/auth_tests/test_validators.py\n@@ -237,7 +237,7 @@ def test_unicode_validator(self):\n         invalid_usernames = [\n             \"o'connell\", \"عبد ال\",\n             \"zerowidth\\u200Bspace\", \"nonbreaking\\u00A0space\",\n-            \"en\\u2013dash\",\n+            \"en\\u2013dash\", 'trailingnewline\\u000A',\n         ]\n         v = validators.UnicodeUsernameValidator()\n         for valid in valid_usernames:\n@@ -250,7 +250,7 @@ def test_unicode_validator(self):\n \n     def test_ascii_validator(self):\n         valid_usernames = ['glenn', 'GLEnN', 'jean-marc']\n-        invalid_usernames = [\"o'connell\", 'Éric', 'jean marc', \"أحمد\"]\n+        invalid_usernames = [\"o'connell\", 'Éric', 'jean marc', \"أحمد\", 'trailingnewline\\n']\n         v = validators.ASCIIUsernameValidator()\n         for valid in valid_usernames:\n             with self.subTest(valid=valid):\n",
			
 
				-        'problem_statement': "UsernameValidator allows trailing newline in usernames\nDescription\n\t\nASCIIUsernameValidator and UnicodeUsernameValidator use the regex \nr'^[\\w.@+-]+$'\nThe intent is to only allow alphanumeric characters as well as ., @, +, and -. However, a little known quirk of Python regexes is that $ will also match a trailing newline. Therefore, the user name validators will accept usernames which end with a newline. You can avoid this behavior by instead using \\A and \\Z to terminate regexes. For example, the validator regex could be changed to\nr'\\A[\\w.@+-]+\\Z'\nin order to reject usernames that end with a newline.\nI am not sure how to officially post a patch, but the required change is trivial - using the regex above in the two validators in contrib.auth.validators.\n",
			
 
				-        'hints_text': '',
			
 
				-        'created_at': '2019-03-20T03:46:18Z',
			
 
				-        'version': '3.0',
			
 
				-        'FAIL_TO_PASS': '["test_ascii_validator (auth_tests.test_validators.UsernameValidatorsTests)", "test_unicode_validator (auth_tests.test_validators.UsernameValidatorsTests)", "test_help_text (auth_tests.test_validators.UserAttributeSimilarityValidatorTest)"]',
			
 
				-        'PASS_TO_PASS': '["test_help_text (auth_tests.test_validators.MinimumLengthValidatorTest)", "test_validate (auth_tests.test_validators.MinimumLengthValidatorTest)", "test_help_text (auth_tests.test_validators.NumericPasswordValidatorTest)", "test_validate (auth_tests.test_validators.NumericPasswordValidatorTest)", "test_validate (auth_tests.test_validators.UserAttributeSimilarityValidatorTest)", "test_validate_property (auth_tests.test_validators.UserAttributeSimilarityValidatorTest)", "test_empty_password_validator_help_text_html (auth_tests.test_validators.PasswordValidationTest)", "test_get_default_password_validators (auth_tests.test_validators.PasswordValidationTest)", "test_get_password_validators_custom (auth_tests.test_validators.PasswordValidationTest)", "test_password_changed (auth_tests.test_validators.PasswordValidationTest)", "test_password_changed_with_custom_validator (auth_tests.test_validators.PasswordValidationTest)", "test_password_validators_help_text_html (auth_tests.test_validators.PasswordValidationTest)", "test_password_validators_help_text_html_escaping (auth_tests.test_validators.PasswordValidationTest)", "test_password_validators_help_texts (auth_tests.test_validators.PasswordValidationTest)", "test_validate_password (auth_tests.test_validators.PasswordValidationTest)", "test_help_text (auth_tests.test_validators.CommonPasswordValidatorTest)", "test_validate (auth_tests.test_validators.CommonPasswordValidatorTest)", "test_validate_custom_list (auth_tests.test_validators.CommonPasswordValidatorTest)", "test_validate_django_supplied_file (auth_tests.test_validators.CommonPasswordValidatorTest)"]',
			
 
				-        'environment_setup_commit': '419a78300f7cd27611196e1e464d50fd0385ff27',
			
 
				-    }
			
 
				+    # NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
			
 
				+    # so we don't need to manage file uploading to OpenDevin's repo
			
 
				+    dataset = load_dataset('princeton-nlp/SWE-bench_Lite')
			
 
				+    swe_bench_tests = dataset['test'].to_pandas()
			
 
				+
			
 
				+    # INSTANCE_ID = 'django__django-11099'
			
 
				+    INSTANCE_ID = 'astropy__astropy-12907'
			
 
				+    swe_bench_tests = swe_bench_tests[swe_bench_tests['instance_id'] == INSTANCE_ID]
			
 
				+    EXAMPLE_INSTANCE = swe_bench_tests.iloc[0].to_dict()
			
 
				 
			
 
				     sandbox = SWEBenchSSHBox.get_box_for_instance(instance=EXAMPLE_INSTANCE)
			
 
				 
			
@@ -154,9 +151,7 @@ if __name__ == '__main__':
 
				     logger.info(f'git apply $SWE_TASK_DIR/test.patch: {output}')
			
 
				 
			
 
				     # TEST
			
 
				-    exit_code, output = sandbox.execute(
			
 
				-        './tests/runtests.py --verbosity 2 auth_tests.test_validators'
			
 
				-    )
			
 
				+    exit_code, output = sandbox.execute('$TEST_CMD')
			
 
				     assert exit_code == 1, 'Expected exit code 1 (since this is a FAIL_TO_PASS)'
			
 
				     logger.info(f'$TEST_CMD:\n{output}')
			
 
				 
			
@@ -166,9 +161,7 @@ if __name__ == '__main__':
 
				     logger.info(f'git apply $SWE_TASK_DIR/gold.patch: {output}')
			
 
				 
			
 
				     # TEST
			
 
				-    exit_code, output = sandbox.execute(
			
 
				-        './tests/runtests.py --verbosity 2 auth_tests.test_validators'
			
 
				-    )
			
 
				+    exit_code, output = sandbox.execute('$TEST_CMD')
			
 
				     assert exit_code == 0, 'Expected exit code 0 (since we applied the gold patch)'
			
 
				     logger.info(f'$TEST_CMD:\n{output}')