Przeglądaj źródła

[eval] Allow evaluation of SWE-Bench patches on `RemoteRuntime` (#3927)

Co-authored-by: Boxuan Li <liboxuan@connect.hku.hk>
Co-authored-by: Graham Neubig <neubig@gmail.com>
Xingyao Wang 1 rok temu
rodzic
commit
5d7f2fd4ae

+ 18 - 1
evaluation/swe_bench/README.md

@@ -63,7 +63,7 @@ then your command would be:
 ./evaluation/swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 10
 ./evaluation/swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 10
 ```
 ```
 
 
-### Run Inference on `RemoteRuntime`
+### Run Inference on `RemoteRuntime` (experimental)
 
 
 This is in limited beta. Contact Xingyao over slack if you want to try this out!
 This is in limited beta. Contact Xingyao over slack if you want to try this out!
 
 
@@ -157,6 +157,23 @@ The final results will be saved to `evaluation/evaluation_outputs/outputs/swe_be
 - `report.json`: a JSON file that contains keys like `"resolved_ids"` pointing to instance IDs that are resolved by the agent.
 - `report.json`: a JSON file that contains keys like `"resolved_ids"` pointing to instance IDs that are resolved by the agent.
 - `logs/`: a directory of test logs
 - `logs/`: a directory of test logs
 
 
+### Run evaluation with `RemoteRuntime` (experimental)
+
+This is in limited beta. Contact Xingyao over slack if you want to try this out!
+
+```bash
+# ./evaluation/swe_bench/scripts/eval_infer_remote.sh [output.jsonl filepath] [num_workers]
+ALLHANDS_API_KEY="YOUR-API-KEY" RUNTIME=remote EVAL_DOCKER_IMAGE_PREFIX="us-docker.pkg.dev/evaluation-428620/swe-bench-images" evaluation/swe_bench/scripts/eval_infer_remote.sh evaluation/outputs/swe_bench_lite/CodeActAgent/Llama-3.1-70B-Instruct-Turbo_maxiter_30_N_v1.9-no-hint/output.jsonl 16 "princeton-nlp/SWE-bench_Lite" "test"
+# This example evaluates patches generated by CodeActAgent with Llama-3.1-70B-Instruct-Turbo on the "princeton-nlp/SWE-bench_Lite" test split, running 16 workers in parallel
+```
+
+To clean-up all existing runtimes that you've already started, run:
+
+```bash
+ALLHANDS_API_KEY="YOUR-API-KEY" ./evaluation/swe_bench/scripts/cleanup_remote_runtime.sh
+```
+
+
 ## Visualize Results
 ## Visualize Results
 
 
 First you need to clone `https://huggingface.co/spaces/OpenHands/evaluation` and add your own running results from openhands into the `outputs` of the cloned repo.
 First you need to clone `https://huggingface.co/spaces/OpenHands/evaluation` and add your own running results from openhands into the `outputs` of the cloned repo.

+ 349 - 0
evaluation/swe_bench/eval_infer.py

@@ -0,0 +1,349 @@
+import os
+import tempfile
+import time
+
+import pandas as pd
+from pydantic import BaseModel
+from swebench.harness.grading import get_eval_report
+from swebench.harness.run_evaluation import (
+    APPLY_PATCH_FAIL,
+    APPLY_PATCH_PASS,
+)
+from swebench.harness.test_spec import SWEbenchInstance, TestSpec, make_test_spec
+from swebench.harness.utils import load_swebench_dataset
+
+from evaluation.swe_bench.run_infer import get_instance_docker_image
+from evaluation.utils.shared import (
+    EvalMetadata,
+    EvalOutput,
+    prepare_dataset,
+    reset_logger_for_multiprocessing,
+    run_evaluation,
+)
+from openhands.core.config import (
+    AppConfig,
+    SandboxConfig,
+    get_parser,
+)
+from openhands.core.logger import openhands_logger as logger
+from openhands.core.main import create_runtime
+from openhands.events.action import CmdRunAction
+from openhands.events.observation import CmdOutputObservation
+
+# TODO: migrate all swe-bench docker to ghcr.io/openhands
+DOCKER_IMAGE_PREFIX = os.environ.get('EVAL_DOCKER_IMAGE_PREFIX', 'docker.io/xingyaoww/')
+logger.info(f'Using docker image prefix: {DOCKER_IMAGE_PREFIX}')
+
+
def get_config(instance: pd.Series) -> AppConfig:
    """Build an AppConfig that evaluates this instance inside its own container.

    Each SWE-Bench instance is evaluated in a dedicated, prebuilt docker image;
    the runtime backend and the remote-runtime API key come from the
    ``RUNTIME`` / ``ALLHANDS_API_KEY`` environment variables.
    """
    image_name = get_instance_docker_image(instance['instance_id'])
    logger.info(
        f'Using instance container image: {image_name}. '
        f'Please make sure this image exists. '
        f'Submit an issue on https://github.com/All-Hands-AI/OpenHands if you run into any issues.'
    )
    sandbox_config = SandboxConfig(
        base_container_image=image_name,
        use_host_network=False,
        # large enough timeout, since some testcases take very long to run
        timeout=1800,
        api_key=os.environ.get('ALLHANDS_API_KEY', None),
    )
    return AppConfig(
        run_as_openhands=False,
        runtime=os.environ.get('RUNTIME', 'eventstream'),
        sandbox=sandbox_config,
        # do not mount workspace
        workspace_base=None,
        workspace_mount_path=None,
    )
+
+
class SWEBenchEvalResult(BaseModel):
    """Structured result of evaluating one SWE-Bench instance.

    NOTE(review): this model is not referenced anywhere in this file —
    `process_instance` stores its results as plain dicts under
    `instance['test_result']`. Confirm whether external callers use it,
    or whether it is dead code.
    """

    instance_id: str
    apply_patch_output: str
    test_output: str
    resolved: bool
+
+
def process_instance(
    instance: pd.Series,
    metadata: EvalMetadata | None = None,
    reset_logger: bool = True,
) -> EvalOutput:
    """Evaluate one SWE-Bench model patch inside the instance's runtime.

    Copies ``instance['model_patch']`` and the test-spec eval script into the
    sandbox, applies the patch, runs the eval script in the background while
    polling for completion, and grades the captured test output with
    swebench's ``get_eval_report``.

    Args:
        instance: Row carrying ``instance_id``, ``model_patch`` and ``test_spec``.
        metadata: Unused here; kept so the signature matches what
            ``run_evaluation`` passes to its callback.
        reset_logger: If True, redirect this instance's logs to a per-instance
            file (needed when running under multiprocessing).

    Returns:
        EvalOutput whose ``test_result['report']`` carries the boolean flags
        ``empty_generation`` / ``resolved`` / ``failed_apply_patch`` /
        ``error_eval``.
    """
    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
    if reset_logger:
        # NOTE(review): depends on the module-level `output_file` assigned in
        # the __main__ block — this function is not importable standalone.
        global output_file
        log_dir = output_file.replace('.jsonl', '.logs')
        os.makedirs(log_dir, exist_ok=True)
        reset_logger_for_multiprocessing(logger, instance.instance_id, log_dir)
    else:
        logger.info(f'Starting evaluation for instance {instance.instance_id}.')

    config = get_config(instance)
    instance_id = instance.instance_id
    model_patch = instance['model_patch']
    test_spec: TestSpec = instance['test_spec']
    logger.info(f'Starting evaluation for instance {instance_id}.')

    # Start from an all-False report; flags are flipped as evaluation proceeds.
    if 'test_result' not in instance.keys():
        instance['test_result'] = {}
    instance['test_result']['report'] = {
        'empty_generation': False,
        'resolved': False,
        'failed_apply_patch': False,
        'error_eval': False,
    }

    # An empty patch cannot resolve anything — record and return early,
    # without ever starting a runtime.
    if model_patch == '':
        instance['test_result']['report']['empty_generation'] = True
        return EvalOutput(
            instance_id=instance_id,
            test_result=instance['test_result'],
        )

    runtime = create_runtime(config, sid=instance_id)

    # Get patch and save it to /tmp/patch.diff
    with tempfile.TemporaryDirectory() as temp_dir:
        # Patch file
        patch_file_path = os.path.join(temp_dir, 'patch.diff')
        with open(patch_file_path, 'w') as f:
            f.write(model_patch)
        runtime.copy_to(patch_file_path, '/tmp')
        # Eval script
        eval_script_path = os.path.join(temp_dir, 'eval.sh')
        with open(eval_script_path, 'w') as f:
            f.write(test_spec.eval_script)
        runtime.copy_to(eval_script_path, '/tmp')

    # Set +x
    action = CmdRunAction(command='chmod +x /tmp/eval.sh')
    action.timeout = 600
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    # NOTE(review): if the runtime returns an error observation without an
    # `exit_code` attribute this raises AttributeError, and the assert is
    # stripped under `python -O` — consider an explicit check.
    assert obs.exit_code == 0

    # Apply patch: try `git apply` first, then fall back to the more lenient
    # `patch --fuzz=5`; the APPLY_PATCH_* sentinel strings echoed here are
    # matched against the captured output below.
    exec_command = (
        'cd /testbed && '
        "(git apply -v /tmp/patch.diff && echo 'APPLY_PATCH_PASS' || "
        "(echo 'Failed to apply patch with git apply, trying with patch command...' && "
        "(patch --batch --fuzz=5 -p1 -i /tmp/patch.diff && echo 'APPLY_PATCH_PASS' || "
        "echo 'APPLY_PATCH_FAIL')))"
    )
    action = CmdRunAction(command=exec_command, keep_prompt=False)
    action.timeout = 600
    obs = runtime.run_action(action)
    assert isinstance(obs, CmdOutputObservation)
    apply_patch_output = obs.content
    assert isinstance(apply_patch_output, str)
    instance['test_result']['apply_patch_output'] = apply_patch_output

    try:
        if 'APPLY_PATCH_FAIL' in apply_patch_output:
            logger.info(f'[{instance_id}] {APPLY_PATCH_FAIL}:\n{apply_patch_output}')
            instance['test_result']['report']['failed_apply_patch'] = True

            return EvalOutput(
                instance_id=instance_id,
                test_result=instance['test_result'],
            )
        elif 'APPLY_PATCH_PASS' in apply_patch_output:
            logger.info(f'[{instance_id}] {APPLY_PATCH_PASS}:\n{apply_patch_output}')

            # Run eval script in background and save output to log file.
            # Running detached lets us enforce our own polling timeout instead
            # of holding a single long-lived command open.
            log_file = '/tmp/eval_output.log'
            action = CmdRunAction(
                command=f'/tmp/eval.sh > {log_file} 2>&1 & echo $!', keep_prompt=False
            )
            action.timeout = 60  # Short timeout just to get the process ID
            obs = runtime.run_action(action)

            if isinstance(obs, CmdOutputObservation) and obs.exit_code == 0:
                # `echo $!` printed the background PID as the last token.
                pid = obs.content.split()[-1].strip()
                logger.info(
                    f'[{instance_id}] Evaluation process started with PID: {pid}'
                )

                # Poll for completion: `ps -p <pid>` exits 1 once the process
                # is gone. On timeout we fall through and grade whatever
                # partial output exists in the log file.
                start_time = time.time()
                timeout = 900  # 15 minutes
                while True:
                    seconds_elapsed = time.time() - start_time
                    if seconds_elapsed > timeout:
                        logger.info(
                            f'[{instance_id}] Evaluation timed out after {timeout} seconds'
                        )
                        break
                    check_action = CmdRunAction(
                        command=f'ps -p {pid} > /dev/null; echo $?', keep_prompt=False
                    )
                    check_action.timeout = 60
                    check_obs = runtime.run_action(check_action)
                    if (
                        isinstance(check_obs, CmdOutputObservation)
                        and check_obs.content.split()[-1].strip() == '1'
                    ):
                        logger.info(
                            f'[{instance_id}] Evaluation process completed after {seconds_elapsed} seconds'
                        )
                        break
                    logger.info(
                        f'[{instance_id}] [{seconds_elapsed:.0f}s] Evaluation still running, waiting...'
                    )
                    time.sleep(30)  # Wait for 30 seconds before checking again

                # Read the log file
                cat_action = CmdRunAction(command=f'cat {log_file}', keep_prompt=False)
                cat_action.timeout = 300
                cat_obs = runtime.run_action(cat_action)

                # Grade answer
                if isinstance(cat_obs, CmdOutputObservation) and cat_obs.exit_code == 0:
                    test_output = cat_obs.content
                    assert isinstance(test_output, str)
                    instance['test_result']['test_output'] = test_output

                    # Get report from test output
                    logger.info(f'[{instance_id}] Grading answer...')
                    with tempfile.TemporaryDirectory() as temp_dir:
                        # Create a directory structure that matches the expected format
                        # NOTE: this is a hack to make the eval report format consistent
                        # with the original SWE-Bench eval script
                        log_dir = os.path.join(temp_dir, 'logs', instance_id)
                        os.makedirs(log_dir, exist_ok=True)
                        test_output_path = os.path.join(log_dir, 'test_output.txt')
                        with open(test_output_path, 'w') as f:
                            f.write(test_output)

                        _report = get_eval_report(
                            test_spec=test_spec,
                            prediction={
                                'model_patch': model_patch,
                                'instance_id': instance_id,
                            },
                            log_path=test_output_path,
                            include_tests_status=True,
                        )
                        report = _report[instance_id]
                        logger.info(
                            f"[{instance_id}] report: {report}\nResult for {instance_id}: resolved: {report['resolved']}"
                        )
                        instance['test_result']['report']['resolved'] = report[
                            'resolved'
                        ]
            else:
                logger.info(f'[{instance_id}] Error when starting eval:\n{obs.content}')
                instance['test_result']['report']['error_eval'] = True

            return EvalOutput(
                instance_id=instance_id,
                test_result=instance['test_result'],
            )
        else:
            logger.info(
                f'[{instance_id}] Unexpected output when applying patch:\n{apply_patch_output}'
            )
            # NOTE(review): passing `logger` as a RuntimeError argument looks
            # unintentional — exception args should be data, not a logger.
            raise RuntimeError(
                instance_id,
                f'Unexpected output when applying patch:\n{apply_patch_output}',
                logger,
            )
    finally:
        # Always tear the (possibly remote) runtime down, even on error.
        runtime.close()
+
+
def normalize_predictions(predictions: pd.DataFrame) -> pd.DataFrame:
    """Validate a predictions dataframe and ensure it has a `model_patch` column.

    Accepts either a `model_patch` column directly, or a `test_result` column
    whose entries are dicts carrying the patch under `git_patch` (the format
    produced by `run_infer.py`); in the latter case the patch is lifted into
    a `model_patch` column.

    Args:
        predictions: Dataframe loaded from the input jsonl file.

    Returns:
        The same dataframe, guaranteed to have `instance_id` and
        `model_patch` columns with unique instance ids.

    Raises:
        ValueError: if `instance_id` is missing, instance ids are not unique,
            or no patch can be found in either accepted form.
    """
    # Explicit raises instead of `assert`: asserts are stripped under `-O`.
    if 'instance_id' not in predictions.columns:
        raise ValueError('Input file must contain instance_id column.')
    if len(predictions['instance_id'].unique()) != len(predictions):
        raise ValueError('instance_id column must be unique.')

    if 'model_patch' not in predictions.columns:
        # BUG FIX: the original condition raised exactly when the recoverable
        # `test_result` fallback WAS available (and never when it was missing),
        # and it probed key 'model_patch' while the extraction below reads
        # 'git_patch'. Raise only when neither form is present.
        has_fallback = (
            'test_result' in predictions.columns
            and len(predictions) > 0
            and 'git_patch' in predictions['test_result'].iloc[0]
        )
        if not has_fallback:
            raise ValueError(
                'Input file must contain model_patch column OR test_result column with git_patch field.'
            )
        predictions['model_patch'] = predictions['test_result'].apply(
            lambda x: x['git_patch']
        )
    return predictions


if __name__ == '__main__':
    parser = get_parser()
    parser.add_argument(
        '--input-file',
        type=str,
        help='Path to input predictions file',
        required=True,
    )
    parser.add_argument(
        '--dataset',
        type=str,
        default='princeton-nlp/SWE-bench',
        help='dataset to evaluate on, either full-test or lite-test',
    )
    parser.add_argument(
        '--split',
        type=str,
        default='test',
        help='split to evaluate on',
    )
    args, _ = parser.parse_known_args()

    # Load SWE-Bench dataset and index it by instance_id for the merge below.
    full_dataset: list[SWEbenchInstance] = load_swebench_dataset(
        args.dataset, args.split
    )
    instance_id_to_instance = {
        instance['instance_id']: instance for instance in full_dataset
    }
    logger.info(
        f'Loaded dataset {args.dataset} with split {args.split} to run inference on.'
    )

    # Load & validate predictions (lifting model_patch out of test_result if needed)
    if not args.input_file.endswith('.jsonl'):
        raise ValueError('Input file must be a jsonl file.')
    predictions = normalize_predictions(pd.read_json(args.input_file, lines=True))

    # Merge predictions with dataset
    predictions['instance'] = predictions['instance_id'].apply(
        lambda x: instance_id_to_instance[x]
    )
    predictions['test_spec'] = predictions['instance'].apply(make_test_spec)

    # Prepare dataset; results land next to the input file.
    output_file = args.input_file.replace('.jsonl', '.swebench_eval.jsonl')
    instances = prepare_dataset(predictions, output_file, args.eval_n_limit)

    run_evaluation(
        instances,
        metadata=None,
        output_file=output_file,
        num_workers=args.eval_num_workers,
        process_instance_func=process_instance,
    )

    # Load evaluated predictions & print number of resolved predictions
    evaluated_predictions = pd.read_json(output_file, lines=True)
    fields = ['resolved', 'failed_apply_patch', 'error_eval', 'empty_generation']
    n_total = len(evaluated_predictions)

    def count_report_field(row, field):
        # Pull one boolean flag out of the nested report dict of a result row.
        return row['test_result']['report'][field]

    for field in fields:
        count = evaluated_predictions.apply(
            count_report_field, args=(field,), axis=1
        ).sum()
        # Guard the ratio against an empty result set.
        ratio = count / n_total if n_total else 0.0
        logger.info(f'# {field}: {count} / {n_total}. ({ratio:.2%})')

+ 10 - 4
evaluation/swe_bench/scripts/cleanup_remote_runtime.sh

@@ -5,17 +5,23 @@
 BASE_URL="https://api.all-hands.dev/v0"
 BASE_URL="https://api.all-hands.dev/v0"
 
 
 # Get the list of runtimes
 # Get the list of runtimes
-runtimes=$(curl --silent --location --request GET "${BASE_URL}/runtime/list" \
-  --header "X-API-Key: ${ALLHANDS_API_KEY}" | jq -r '.runtimes | .[].runtime_id')
+response=$(curl --silent --location --request GET "${BASE_URL}/runtime/list" \
+  --header "X-API-Key: ${ALLHANDS_API_KEY}")
 
 
+n_runtimes=$(echo $response | jq -r '.total')
+echo "Found ${n_runtimes} runtimes. Stopping them..."
+
+runtime_ids=$(echo $response | jq -r '.runtimes | .[].runtime_id')
 # Loop through each runtime and stop it
 # Loop through each runtime and stop it
-for runtime_id in $runtimes; do
-  echo "Stopping runtime: ${runtime_id}"
+counter=1
+for runtime_id in $runtime_ids; do
+  echo "Stopping runtime ${counter}/${n_runtimes}: ${runtime_id}"
   curl --silent --location --request POST "${BASE_URL}/runtime/stop" \
   curl --silent --location --request POST "${BASE_URL}/runtime/stop" \
     --header "X-API-Key: ${ALLHANDS_API_KEY}" \
     --header "X-API-Key: ${ALLHANDS_API_KEY}" \
     --header "Content-Type: application/json" \
     --header "Content-Type: application/json" \
     --data-raw "{\"runtime_id\": \"${runtime_id}\"}"
     --data-raw "{\"runtime_id\": \"${runtime_id}\"}"
   echo
   echo
+  ((counter++))
 done
 done
 
 
 echo "All runtimes have been stopped."
 echo "All runtimes have been stopped."

+ 27 - 0
evaluation/swe_bench/scripts/eval/download_gold_patch.py

@@ -0,0 +1,27 @@
import argparse

import pandas as pd
from datasets import load_dataset

# Download the gold (reference) patches of a SWE-Bench split and write them
# in the same jsonl layout that model predictions use, so the file can be fed
# straight into the evaluation scripts.
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('output_filepath', type=str, help='Path to save the output file')
arg_parser.add_argument(
    '--dataset_name',
    type=str,
    help='Name of the dataset to download',
    default='princeton-nlp/SWE-bench_Lite',
)
arg_parser.add_argument('--split', type=str, help='Split to download', default='test')
cli_args = arg_parser.parse_args()

output_filepath = cli_args.output_filepath
gold_dataset = load_dataset(cli_args.dataset_name, split=cli_args.split)
print(
    f'Downloading gold patches from {cli_args.dataset_name} (split: {cli_args.split}) to {output_filepath}'
)
# Each row's gold `patch` is relabeled `model_patch` to mimic a prediction.
gold_patches = [
    {'instance_id': record['instance_id'], 'model_patch': record['patch']}
    for record in gold_dataset
]
print(f'{len(gold_patches)} gold patches loaded')
pd.DataFrame(gold_patches).to_json(output_filepath, lines=True, orient='records')
print(f'Patches saved to {output_filepath}')

+ 43 - 0
evaluation/swe_bench/scripts/eval_infer_remote.sh

@@ -0,0 +1,43 @@
#!/bin/bash
set -eo pipefail

# Usage: eval_infer_remote.sh <output.jsonl filepath> [num_workers] [dataset] [split]
INPUT_FILE=$1
NUM_WORKERS=$2
DATASET=$3
SPLIT=$4

# The predictions file is mandatory; everything else falls back to a default.
if [ -z "$INPUT_FILE" ]; then
  echo "INPUT_FILE not specified (should be a path to a jsonl file)"
  exit 1
fi

if [ -z "$DATASET" ]; then
  echo "DATASET not specified, use default princeton-nlp/SWE-bench_Lite"
  DATASET="princeton-nlp/SWE-bench_Lite"
fi

if [ -z "$SPLIT" ]; then
  echo "SPLIT not specified, use default test"
  SPLIT="test"
fi

if [ -z "$NUM_WORKERS" ]; then
  echo "NUM_WORKERS not specified, use default 1"
  NUM_WORKERS=1
fi

echo "... Evaluating on $INPUT_FILE ..."

# Assemble the evaluation command piece by piece.
COMMAND="poetry run python evaluation/swe_bench/eval_infer.py"
COMMAND+=" --eval-num-workers $NUM_WORKERS"
COMMAND+=" --input-file $INPUT_FILE"
COMMAND+=" --dataset $DATASET"
COMMAND+=" --split $SPLIT"

# Optionally cap the number of evaluated instances.
if [ -n "$EVAL_LIMIT" ]; then
  echo "EVAL_LIMIT: $EVAL_LIMIT"
  COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
fi

# Run the command
eval $COMMAND

+ 33 - 15
evaluation/utils/shared.py

@@ -50,15 +50,16 @@ class EvalMetadata(BaseModel):
 class EvalOutput(BaseModel):
 class EvalOutput(BaseModel):
     # NOTE: User-specified
     # NOTE: User-specified
     instance_id: str
     instance_id: str
-    instruction: str
     # output of the evaluation
     # output of the evaluation
     # store anything that is needed for the score calculation
     # store anything that is needed for the score calculation
     test_result: dict[str, Any]
     test_result: dict[str, Any]
 
 
+    instruction: str | None = None
+
     # Interaction info
     # Interaction info
-    metadata: EvalMetadata
-    history: list[tuple[dict[str, Any], dict[str, Any]]]
-    metrics: dict[str, Any]
+    metadata: EvalMetadata | None = None
+    history: list[tuple[dict[str, Any], dict[str, Any]]] | None = None
+    metrics: dict[str, Any] | None = None
     error: str | None = None
     error: str | None = None
 
 
     # Optionally save the input test instance
     # Optionally save the input test instance
@@ -66,15 +67,19 @@ class EvalOutput(BaseModel):
 
 
     def model_dump(self, *args, **kwargs):
     def model_dump(self, *args, **kwargs):
         dumped_dict = super().model_dump(*args, **kwargs)
         dumped_dict = super().model_dump(*args, **kwargs)
+        # Remove None values
+        dumped_dict = {k: v for k, v in dumped_dict.items() if v is not None}
         # Apply custom serialization for metadata (to avoid leaking sensitive information)
         # Apply custom serialization for metadata (to avoid leaking sensitive information)
-        dumped_dict['metadata'] = self.metadata.model_dump()
+        if self.metadata is not None:
+            dumped_dict['metadata'] = self.metadata.model_dump()
         return dumped_dict
         return dumped_dict
 
 
     def model_dump_json(self, *args, **kwargs):
     def model_dump_json(self, *args, **kwargs):
         dumped = super().model_dump_json(*args, **kwargs)
         dumped = super().model_dump_json(*args, **kwargs)
         dumped_dict = json.loads(dumped)
         dumped_dict = json.loads(dumped)
         # Apply custom serialization for metadata (to avoid leaking sensitive information)
         # Apply custom serialization for metadata (to avoid leaking sensitive information)
-        dumped_dict['metadata'] = json.loads(self.metadata.model_dump_json())
+        if 'metadata' in dumped_dict:
+            dumped_dict['metadata'] = json.loads(self.metadata.model_dump_json())
         return json.dumps(dumped_dict)
         return json.dumps(dumped_dict)
 
 
 
 
@@ -260,32 +265,41 @@ def _process_instance_wrapper(
             result = process_instance_func(instance, metadata, use_mp)
             result = process_instance_func(instance, metadata, use_mp)
             return result
             return result
         except Exception as e:
         except Exception as e:
+            error = str(e)
+            stacktrace = traceback.format_exc()
             if attempt == max_retries:
             if attempt == max_retries:
+                logger.exception(e)
+                msg = (
+                    '-' * 10
+                    + '\n'
+                    + f'Error in instance [{instance.instance_id}]: {error}. Stacktrace:\n{stacktrace}'
+                    + '\n'
+                    + f'[Encountered after {max_retries} retries. Please check the logs and report the issue.]'
+                    + '-' * 10
+                )
                 # Raise an error after all retries & stop the evaluation
                 # Raise an error after all retries & stop the evaluation
                 raise RuntimeError(
                 raise RuntimeError(
                     f'Maximum error retries reached for instance {instance.instance_id}'
                     f'Maximum error retries reached for instance {instance.instance_id}'
                 ) from e
                 ) from e
-            error = str(e)
-            stacktrace = traceback.format_exc()
             msg = (
             msg = (
                 '-' * 10
                 '-' * 10
                 + '\n'
                 + '\n'
                 + f'Error in instance [{instance.instance_id}]: {error}. Stacktrace:\n{stacktrace}'
                 + f'Error in instance [{instance.instance_id}]: {error}. Stacktrace:\n{stacktrace}'
                 + '\n'
                 + '\n'
                 + '-' * 10
                 + '-' * 10
-                + '[This error occurred after maximum retries]'
+                + f'[The above error occurred. Retrying... (attempt {attempt + 1} of {max_retries})]'
                 + '-' * 10
                 + '-' * 10
                 + '\n'
                 + '\n'
             )
             )
             logger.error(msg)
             logger.error(msg)
             if use_mp:
             if use_mp:
                 print(msg)  # use print to directly print to console
                 print(msg)  # use print to directly print to console
-            time.sleep(1)  # Add a small delay before retrying
+            time.sleep(5)
 
 
 
 
 def run_evaluation(
 def run_evaluation(
     dataset: pd.DataFrame,
     dataset: pd.DataFrame,
-    metadata: EvalMetadata,
+    metadata: EvalMetadata | None,
     output_file: str,
     output_file: str,
     num_workers: int,
     num_workers: int,
     process_instance_func: Callable[
     process_instance_func: Callable[
@@ -294,10 +308,14 @@ def run_evaluation(
     max_retries: int = 5,  # number of retries for each instance
     max_retries: int = 5,  # number of retries for each instance
 ):
 ):
     use_multiprocessing = num_workers > 1
     use_multiprocessing = num_workers > 1
-    logger.info(
-        f'Evaluation started with Agent {metadata.agent_class}:\n'
-        f'model {metadata.llm_config.model}, max iterations {metadata.max_iterations}.\n'
-    )
+
+    if metadata is not None:
+        logger.info(
+            f'Evaluation started with Agent {metadata.agent_class}:\n'
+            f'model {metadata.llm_config.model}, max iterations {metadata.max_iterations}.\n'
+        )
+    else:
+        logger.info(f'Evaluation started with {num_workers} workers.')
 
 
     total_instances = len(dataset)
     total_instances = len(dataset)
     pbar = tqdm(total=total_instances, desc='Instances processed')
     pbar = tqdm(total=total_instances, desc='Instances processed')

+ 2 - 0
openhands/runtime/remote/runtime.py

@@ -197,6 +197,8 @@ class RemoteRuntime(Runtime):
             # because the runtime might just be starting up
             # because the runtime might just be starting up
             # and have not registered the endpoint yet
             # and have not registered the endpoint yet
             retry_fns=[is_404_error],
             retry_fns=[is_404_error],
+            # leave enough time for the runtime to start up
+            timeout=600,
         )
         )
         if response.status_code != 200:
         if response.status_code != 200:
             msg = f'Runtime is not alive yet (id={self.runtime_id}). Status: {response.status_code}.'
             msg = f'Runtime is not alive yet (id={self.runtime_id}). Status: {response.status_code}.'