1 rok temu · 688068a44e
--- a/evaluation/swe_bench/README.md
+++ b/evaluation/swe_bench/README.md
@@ -63,11 +63,22 @@ then your command would be:
 
				 ./evaluation/swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 10
			
 
				 ```
			
 
				 
			
 
				-**Evaluate on `RemoteRuntime` (alpha)** (contact Xingyao over slack if you want to try this out!)
			
 
				+### Run Inference on `RemoteRuntime`
			
 
				+
			
 
				+This is in limited beta. Contact Xingyao over Slack if you want to try this out!
			
 
				+
			
 
				+```bash
			
 
				+# ./evaluation/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split]
			
 
				+ALLHANDS_API_KEY="YOUR-API-KEY" RUNTIME=remote EVAL_DOCKER_IMAGE_PREFIX="us-docker.pkg.dev/evaluation-428620/swe-bench-images" \
			
 
				+./evaluation/swe_bench/scripts/run_infer.sh llm.eval HEAD CodeActAgent 300 30 16 "princeton-nlp/SWE-bench_Lite" test
			
 
				+# This example runs evaluation on CodeActAgent for 300 instances on the test split of "princeton-nlp/SWE-bench_Lite", with a maximum of 30 iterations per instance and 16 workers running in parallel
			
 
				+```
			
 
				+
			
 
				+To clean up all existing runtimes you've already started, run:
			
 
				+
			
 
				 ```bash
			
 
				-SANDBOX_API_KEY="CONTACT-XINGYAO-TO-GET-A-TESTING-API-KEY" RUNTIME=remote EVAL_DOCKER_IMAGE_PREFIX="us-docker.pkg.dev/evaluation-428620/swe-bench-images" ./evaluation/swe_bench/scripts/run_infer.sh llm.eval HEAD CodeActAgent 300
			
 
				+ALLHANDS_API_KEY="YOUR-API-KEY" ./evaluation/swe_bench/scripts/cleanup_remote_runtime.sh
			
 
				 ```
			
 
				-Multi-processing is still WIP.
			
 
				 
			
 
				 ### Specify a subset of tasks to run infer
			
 
				 
			
--- a/evaluation/swe_bench/run_infer.py
+++ b/evaluation/swe_bench/run_infer.py
@@ -26,7 +26,6 @@ from openhands.core.config import (
 
				     SandboxConfig,
			
 
				     get_llm_config_arg,
			
 
				     get_parser,
			
 
				-    load_from_env,
			
 
				 )
			
 
				 from openhands.core.logger import openhands_logger as logger
			
 
				 from openhands.core.main import create_runtime, run_controller
			
@@ -123,26 +122,19 @@ def get_config(
 
				         run_as_openhands=False,
			
 
				         max_budget_per_task=4,
			
 
				         max_iterations=metadata.max_iterations,
			
 
				+        runtime=os.environ.get('RUNTIME', 'eventstream'),
			
 
				         sandbox=SandboxConfig(
			
 
				             base_container_image=base_container_image,
			
 
				             enable_auto_lint=True,
			
 
				             use_host_network=False,
			
 
				             # large enough timeout, since some testcases take very long to run
			
 
				             timeout=300,
			
 
				+            api_key=os.environ.get('ALLHANDS_API_KEY', None),
			
 
				         ),
			
 
				         # do not mount workspace
			
 
				         workspace_base=None,
			
 
				         workspace_mount_path=None,
			
 
				     )
			
 
				-    selected_env_vars = {'runtime', 'sandbox_api_key'}
			
 
				-    selected_env_vars = {
			
 
				-        k: v for k, v in os.environ.items() if k.lower() in selected_env_vars
			
 
				-    }
			
 
				-    if selected_env_vars:
			
 
				-        logger.info(
			
 
				-            f'Loading config keys from env vars: {list(selected_env_vars.keys())}'
			
 
				-        )
			
 
				-        load_from_env(config, selected_env_vars)
			
 
				     config.set_llm_config(metadata.llm_config)
			
 
				     return config
			
 
				 
			
--- a/evaluation/swe_bench/scripts/cleanup_remote_runtime.sh
+++ b/evaluation/swe_bench/scripts/cleanup_remote_runtime.sh
@@ -0,0 +1,21 @@
 
				+#!/bin/bash
			
 
				+
			
 
				+
			
 
				+# API base URL
			
 
				+BASE_URL="https://api.all-hands.dev/v0"
			
 
				+
			
 
				+# Get the list of runtimes
			
 
				+runtimes=$(curl --silent --location --request GET "${BASE_URL}/runtime/list" \
			
 
				+  --header "X-API-Key: ${ALLHANDS_API_KEY}" | jq -r '.runtimes | .[].runtime_id')
			
 
				+
			
 
				+# Loop through each runtime and stop it
			
 
				+for runtime_id in $runtimes; do
			
 
				+  echo "Stopping runtime: ${runtime_id}"
			
 
				+  curl --silent --location --request POST "${BASE_URL}/runtime/stop" \
			
 
				+    --header "X-API-Key: ${ALLHANDS_API_KEY}" \
			
 
				+    --header "Content-Type: application/json" \
			
 
				+    --data-raw "{\"runtime_id\": \"${runtime_id}\"}"
			
 
				+  echo
			
 
				+done
			
 
				+
			
 
				+echo "All runtimes have been stopped."
			
--- a/openhands/runtime/utils/runtime_build.py
+++ b/openhands/runtime/utils/runtime_build.py
@@ -1,4 +1,5 @@
 
				 import argparse
			
 
				+import hashlib
			
 
				 import os
			
 
				 import shutil
			
 
				 import subprocess
			
@@ -183,11 +184,25 @@ def get_runtime_image_repo_and_tag(base_image: str) -> tuple[str, str]:
 
				         if ':' not in base_image:
			
 
				             base_image = base_image + ':latest'
			
 
				         [repo, tag] = base_image.split(':')
			
 
				-        # replace '/' with '_s_' to avoid '/' in the image name
			
 
				-        # while make it a valid docker image name
			
 
				-        repo = repo.replace('/', '_s_')
			
 
				-        od_version = _get_package_version()
			
 
				-        return get_runtime_image_repo(), f'od_v{od_version}_image_{repo}_tag_{tag}'
			
 
				+        oh_version = _get_package_version()
			
 
				+
			
 
				+        # Hash the repo if it's too long
			
 
				+        if len(repo) > 32:
			
 
				+            repo_hash = hashlib.md5(repo[:-24].encode()).hexdigest()[:8]
			
 
				+            repo = f'{repo_hash}_{repo[-24:]}'  # Use 8 char hash + last 24 chars
			
 
				+        else:
			
 
				+            repo = repo.replace('/', '_s_')
			
 
				+
			
 
				+        new_tag = f'oh_v{oh_version}_image_{repo}_tag_{tag}'
			
 
				+
			
 
				+        # if it's still too long, hash the entire image name
			
 
				+        if len(new_tag) > 128:
			
 
				+            new_tag = f'oh_v{oh_version}_image_{hashlib.md5(new_tag.encode()).hexdigest()[:64]}'
			
 
				+            logger.warning(
			
 
				+                f'The new tag [{new_tag}] is still too long, so we use an hash of the entire image name: {new_tag}'
			
 
				+            )
			
 
				+
			
 
				+        return get_runtime_image_repo(), new_tag
			
 
				 
			
 
				 
			
 
				 def build_runtime_image(
			
--- a/tests/unit/test_runtime_build.py
+++ b/tests/unit/test_runtime_build.py
@@ -17,7 +17,7 @@ from openhands.runtime.utils.runtime_build import (
 
				     prep_docker_build_folder,
			
 
				 )
			
 
				 
			
 
				-OD_VERSION = f'od_v{_get_package_version()}'
			
 
				+OH_VERSION = f'oh_v{_get_package_version()}'
			
 
				 
			
 
				 
			
 
				 @pytest.fixture
			
@@ -176,7 +176,7 @@ def test_get_runtime_image_repo_and_tag_eventstream():
 
				     img_repo, img_tag = get_runtime_image_repo_and_tag(base_image)
			
 
				     assert (
			
 
				         img_repo == f'{get_runtime_image_repo()}'
			
 
				-        and img_tag == f'{OD_VERSION}_image_debian_tag_11'
			
 
				+        and img_tag == f'{OH_VERSION}_image_debian_tag_11'
			
 
				     )
			
 
				 
			
 
				     base_image = 'nikolaik/python-nodejs:python3.11-nodejs22'
			
@@ -184,14 +184,14 @@ def test_get_runtime_image_repo_and_tag_eventstream():
 
				     assert (
			
 
				         img_repo == f'{get_runtime_image_repo()}'
			
 
				         and img_tag
			
 
				-        == f'{OD_VERSION}_image_nikolaik_s_python-nodejs_tag_python3.11-nodejs22'
			
 
				+        == f'{OH_VERSION}_image_nikolaik_s_python-nodejs_tag_python3.11-nodejs22'
			
 
				     )
			
 
				 
			
 
				     base_image = 'ubuntu'
			
 
				     img_repo, img_tag = get_runtime_image_repo_and_tag(base_image)
			
 
				     assert (
			
 
				         img_repo == f'{get_runtime_image_repo()}'
			
 
				-        and img_tag == f'{OD_VERSION}_image_ubuntu_tag_latest'
			
 
				+        and img_tag == f'{OH_VERSION}_image_ubuntu_tag_latest'
			
 
				     )
			
 
				 
			
 
				 
			
@@ -215,7 +215,7 @@ def test_build_runtime_image_from_scratch(temp_dir):
 
				         path=ANY,
			
 
				         tags=[
			
 
				             f'{get_runtime_image_repo()}:{from_scratch_hash}',
			
 
				-            f'{get_runtime_image_repo()}:{OD_VERSION}_image_debian_tag_11',
			
 
				+            f'{get_runtime_image_repo()}:{OH_VERSION}_image_debian_tag_11',
			
 
				         ],
			
 
				     )
			
 
				     assert image_name == f'{get_runtime_image_repo()}:{from_scratch_hash}'