[eval] Allow evaluation of SWE-Bench patches on `RemoteRuntime` (#3927)

Co-authored-by: Boxuan Li <liboxuan@connect.hku.hk>
Co-authored-by: Graham Neubig <neubig@gmail.com>
Xingyao Wang 1 year ago
Parent
Commit
5d7f2fd4ae

+ 18 - 1
evaluation/swe_bench/README.md

@@ -63,7 +63,7 @@ then your command would be:
 ./evaluation/swe_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 10
 ```
 
-### Run Inference on `RemoteRuntime`
+### Run Inference on `RemoteRuntime` (experimental)
 
 This is in limited beta. Contact Xingyao over Slack if you want to try this out!
 
@@ -157,6 +157,23 @@ The final results will be saved to `evaluation/evaluation_outputs/outputs/swe_be
 - `report.json`: a JSON file that contains keys like `"resolved_ids"` pointing to instance IDs that are resolved by the agent.
 - `logs/`: a directory of test logs
 
+### Run evaluation with `RemoteRuntime` (experimental)
+
+This is in limited beta. Contact Xingyao over Slack if you want to try this out!
+
+```bash
+# ./evaluation/swe_bench/scripts/eval_infer_remote.sh [output.jsonl filepath] [num_workers] [dataset] [split]
+ALLHANDS_API_KEY="YOUR-API-KEY" RUNTIME=remote EVAL_DOCKER_IMAGE_PREFIX="us-docker.pkg.dev/evaluation-428620/swe-bench-images" evaluation/swe_bench/scripts/eval_infer_remote.sh evaluation/outputs/swe_bench_lite/CodeActAgent/Llama-3.1-70B-Instruct-Turbo_maxiter_30_N_v1.9-no-hint/output.jsonl 16 "princeton-nlp/SWE-bench_Lite" "test"
+# This example evaluates patches generated by CodeActAgent (with Llama-3.1-70B-Instruct-Turbo) on the test split of "princeton-nlp/SWE-bench_Lite", using 16 workers running in parallel
+```
+
+To clean up all existing runtimes that you've already started, run:
+
+```bash
+ALLHANDS_API_KEY="YOUR-API-KEY" ./evaluation/swe_bench/scripts/cleanup_remote_runtime.sh
+```
+
+
 ## Visualize Results
 
 First you need to clone `https://huggingface.co/spaces/OpenHands/evaluation` and add your own running results from openhands into the `outputs` of the cloned repo.
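
Relatedly, the evaluation step added above writes a `*.swebench_eval.jsonl` file next to the input `output.jsonl`, with a per-instance `test_result.report` dict. A minimal sketch for summarizing it (the path is hypothetical; the report field names come from the new evaluation script):

```python
# Sketch: summarize a *.swebench_eval.jsonl file written by the evaluation script.
# The input path is hypothetical; report field names match the per-instance report dict.
import pandas as pd

results = pd.read_json('output.swebench_eval.jsonl', lines=True)
for field in ['resolved', 'failed_apply_patch', 'error_eval', 'empty_generation']:
    count = results.apply(lambda row: row['test_result']['report'][field], axis=1).sum()
    print(f'{field}: {count} / {len(results)} ({count / len(results):.2%})')
```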

+ 349 - 0
evaluation/swe_bench/eval_infer.py

@@ -0,0 +1,349 @@
+import os
+import tempfile
+import time
+
+import pandas as pd
+from pydantic import BaseModel
+from swebench.harness.grading import get_eval_report
+from swebench.harness.run_evaluation import (
+    APPLY_PATCH_FAIL,
+    APPLY_PATCH_PASS,
+)
+from swebench.harness.test_spec import SWEbenchInstance, TestSpec, make_test_spec
+from swebench.harness.utils import load_swebench_dataset
+
+from evaluation.swe_bench.run_infer import get_instance_docker_image
+from evaluation.utils.shared import (
+    EvalMetadata,
+    EvalOutput,
+    prepare_dataset,
+    reset_logger_for_multiprocessing,
+    run_evaluation,
+)
+from openhands.core.config import (
+    AppConfig,
+    SandboxConfig,
+    get_parser,
+)
+from openhands.core.logger import openhands_logger as logger
+from openhands.core.main import create_runtime
+from openhands.events.action import CmdRunAction
+from openhands.events.observation import CmdOutputObservation
+
+# TODO: migrate all swe-bench docker to ghcr.io/openhands
+DOCKER_IMAGE_PREFIX = os.environ.get('EVAL_DOCKER_IMAGE_PREFIX', 'docker.io/xingyaoww/')
+logger.info(f'Using docker image prefix: {DOCKER_IMAGE_PREFIX}')
+
+
+def get_config(instance: pd.Series) -> AppConfig:
+    # We use a different instance image for each instance of SWE-bench eval
+    base_container_image = get_instance_docker_image(instance['instance_id'])
+    logger.info(
+        f'Using instance container image: {base_container_image}. '
+        f'Please make sure this image exists. '
+        f'Submit an issue on https://github.com/All-Hands-AI/OpenHands if you run into any issues.'
+    )
+    config = AppConfig(
+        run_as_openhands=False,
+        runtime=os.environ.get('RUNTIME', 'eventstream'),
+        sandbox=SandboxConfig(
+            base_container_image=base_container_image,
+            use_host_network=False,
+            # large enough timeout, since some test cases take very long to run
+            timeout=1800,
+            api_key=os.environ.get('ALLHANDS_API_KEY', None),
+        ),
+        # do not mount workspace
+        workspace_base=None,
+        workspace_mount_path=None,
+    )
+    return config
+
+
+class SWEBenchEvalResult(BaseModel):
+    instance_id: str
+    apply_patch_output: str
+    test_output: str
+    resolved: bool
+
+
+def process_instance(
+    instance: pd.Series,
+    metadata: EvalMetadata | None = None,
+    reset_logger: bool = True,
+) -> EvalOutput:
+    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
+    if reset_logger:
+        global output_file
+        log_dir = output_file.replace('.jsonl', '.logs')
+        os.makedirs(log_dir, exist_ok=True)
+        reset_logger_for_multiprocessing(logger, instance.instance_id, log_dir)
+    else:
+        logger.info(f'Starting evaluation for instance {instance.instance_id}.')
+
+    config = get_config(instance)
+    instance_id = instance.instance_id
+    model_patch = instance['model_patch']
+    test_spec: TestSpec = instance['test_spec']
+    logger.info(f'Starting evaluation for instance {instance_id}.')
+
+    if 'test_result' not in instance.keys():
+        instance['test_result'] = {}
+    instance['test_result']['report'] = {
+        'empty_generation': False,
+        'resolved': False,
+        'failed_apply_patch': False,
+        'error_eval': False,
+    }
+
+    if model_patch == '':
+        instance['test_result']['report']['empty_generation'] = True
+        return EvalOutput(
+            instance_id=instance_id,
+            test_result=instance['test_result'],
+        )
+
+    runtime = create_runtime(config, sid=instance_id)
+
+    # Get patch and save it to /tmp/patch.diff
+    with tempfile.TemporaryDirectory() as temp_dir:
+        # Patch file
+        patch_file_path = os.path.join(temp_dir, 'patch.diff')
+        with open(patch_file_path, 'w') as f:
+            f.write(model_patch)
+        runtime.copy_to(patch_file_path, '/tmp')
+        # Eval script
+        eval_script_path = os.path.join(temp_dir, 'eval.sh')
+        with open(eval_script_path, 'w') as f:
+            f.write(test_spec.eval_script)
+        runtime.copy_to(eval_script_path, '/tmp')
+
+    # Set +x
+    action = CmdRunAction(command='chmod +x /tmp/eval.sh')
+    action.timeout = 600
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    assert obs.exit_code == 0
+
+    # Apply patch
+    exec_command = (
+        'cd /testbed && '
+        "(git apply -v /tmp/patch.diff && echo 'APPLY_PATCH_PASS' || "
+        "(echo 'Failed to apply patch with git apply, trying with patch command...' && "
+        "(patch --batch --fuzz=5 -p1 -i /tmp/patch.diff && echo 'APPLY_PATCH_PASS' || "
+        "echo 'APPLY_PATCH_FAIL')))"
+    )
+    action = CmdRunAction(command=exec_command, keep_prompt=False)
+    action.timeout = 600
+    obs = runtime.run_action(action)
+    assert isinstance(obs, CmdOutputObservation)
+    apply_patch_output = obs.content
+    assert isinstance(apply_patch_output, str)
+    instance['test_result']['apply_patch_output'] = apply_patch_output
+
+    try:
+        if 'APPLY_PATCH_FAIL' in apply_patch_output:
+            logger.info(f'[{instance_id}] {APPLY_PATCH_FAIL}:\n{apply_patch_output}')
+            instance['test_result']['report']['failed_apply_patch'] = True
+
+            return EvalOutput(
+                instance_id=instance_id,
+                test_result=instance['test_result'],
+            )
+        elif 'APPLY_PATCH_PASS' in apply_patch_output:
+            logger.info(f'[{instance_id}] {APPLY_PATCH_PASS}:\n{apply_patch_output}')
+
+            # Run eval script in background and save output to log file
+            log_file = '/tmp/eval_output.log'
+            action = CmdRunAction(
+                command=f'/tmp/eval.sh > {log_file} 2>&1 & echo $!', keep_prompt=False
+            )
+            action.timeout = 60  # Short timeout just to get the process ID
+            obs = runtime.run_action(action)
+
+            if isinstance(obs, CmdOutputObservation) and obs.exit_code == 0:
+                pid = obs.content.split()[-1].strip()
+                logger.info(
+                    f'[{instance_id}] Evaluation process started with PID: {pid}'
+                )
+
+                # Poll for completion
+                start_time = time.time()
+                timeout = 900  # 15 minutes
+                while True:
+                    seconds_elapsed = time.time() - start_time
+                    if seconds_elapsed > timeout:
+                        logger.info(
+                            f'[{instance_id}] Evaluation timed out after {timeout} seconds'
+                        )
+                        break
+                    check_action = CmdRunAction(
+                        command=f'ps -p {pid} > /dev/null; echo $?', keep_prompt=False
+                    )
+                    check_action.timeout = 60
+                    check_obs = runtime.run_action(check_action)
+                    if (
+                        isinstance(check_obs, CmdOutputObservation)
+                        and check_obs.content.split()[-1].strip() == '1'
+                    ):
+                        logger.info(
+                            f'[{instance_id}] Evaluation process completed after {seconds_elapsed} seconds'
+                        )
+                        break
+                    logger.info(
+                        f'[{instance_id}] [{seconds_elapsed:.0f}s] Evaluation still running, waiting...'
+                    )
+                    time.sleep(30)  # Wait for 30 seconds before checking again
+
+                # Read the log file
+                cat_action = CmdRunAction(command=f'cat {log_file}', keep_prompt=False)
+                cat_action.timeout = 300
+                cat_obs = runtime.run_action(cat_action)
+
+                # Grade answer
+                if isinstance(cat_obs, CmdOutputObservation) and cat_obs.exit_code == 0:
+                    test_output = cat_obs.content
+                    assert isinstance(test_output, str)
+                    instance['test_result']['test_output'] = test_output
+
+                    # Get report from test output
+                    logger.info(f'[{instance_id}] Grading answer...')
+                    with tempfile.TemporaryDirectory() as temp_dir:
+                        # Create a directory structure that matches the expected format
+                        # NOTE: this is a hack to make the eval report format consistent
+                        # with the original SWE-Bench eval script
+                        log_dir = os.path.join(temp_dir, 'logs', instance_id)
+                        os.makedirs(log_dir, exist_ok=True)
+                        test_output_path = os.path.join(log_dir, 'test_output.txt')
+                        with open(test_output_path, 'w') as f:
+                            f.write(test_output)
+
+                        _report = get_eval_report(
+                            test_spec=test_spec,
+                            prediction={
+                                'model_patch': model_patch,
+                                'instance_id': instance_id,
+                            },
+                            log_path=test_output_path,
+                            include_tests_status=True,
+                        )
+                        report = _report[instance_id]
+                        logger.info(
+                            f"[{instance_id}] report: {report}\nResult for {instance_id}: resolved: {report['resolved']}"
+                        )
+                        instance['test_result']['report']['resolved'] = report[
+                            'resolved'
+                        ]
+            else:
+                logger.info(f'[{instance_id}] Error when starting eval:\n{obs.content}')
+                instance['test_result']['report']['error_eval'] = True
+
+            return EvalOutput(
+                instance_id=instance_id,
+                test_result=instance['test_result'],
+            )
+        else:
+            logger.info(
+                f'[{instance_id}] Unexpected output when applying patch:\n{apply_patch_output}'
+            )
+            raise RuntimeError(
+                instance_id,
+                f'Unexpected output when applying patch:\n{apply_patch_output}',
+                logger,
+            )
+    finally:
+        runtime.close()
+
+
+if __name__ == '__main__':
+    parser = get_parser()
+    parser.add_argument(
+        '--input-file',
+        type=str,
+        help='Path to input predictions file',
+        required=True,
+    )
+    parser.add_argument(
+        '--dataset',
+        type=str,
+        default='princeton-nlp/SWE-bench',
+        help='dataset to evaluate on, e.g. princeton-nlp/SWE-bench or princeton-nlp/SWE-bench_Lite',
+    )
+    parser.add_argument(
+        '--split',
+        type=str,
+        default='test',
+        help='split to evaluate on',
+    )
+    args, _ = parser.parse_known_args()
+
+    # Load SWE-Bench dataset
+    full_dataset: list[SWEbenchInstance] = load_swebench_dataset(
+        args.dataset, args.split
+    )
+    instance_id_to_instance = {
+        instance['instance_id']: instance for instance in full_dataset
+    }
+    logger.info(
+        f'Loaded dataset {args.dataset} with split {args.split} to run inference on.'
+    )
+
+    # Load predictions
+    assert args.input_file.endswith('.jsonl'), 'Input file must be a jsonl file.'
+    predictions = pd.read_json(args.input_file, lines=True)
+    assert (
+        'instance_id' in predictions.columns
+    ), 'Input file must contain instance_id column.'
+
+    if 'model_patch' not in predictions.columns and not (
+        'test_result' in predictions.columns
+        and 'git_patch' in predictions['test_result'].iloc[0]
+    ):
+        raise ValueError(
+            'Input file must contain a model_patch column OR a test_result column with a git_patch field.'
+        )
+    assert len(predictions['instance_id'].unique()) == len(
+        predictions
+    ), 'instance_id column must be unique.'
+
+    if 'model_patch' not in predictions.columns:
+        predictions['model_patch'] = predictions['test_result'].apply(
+            lambda x: x['git_patch']
+        )
+    assert {'instance_id', 'model_patch'}.issubset(
+        set(predictions.columns)
+    ), 'Input file must contain instance_id and model_patch columns.'
+
+    # Merge predictions with dataset
+    predictions['instance'] = predictions['instance_id'].apply(
+        lambda x: instance_id_to_instance[x]
+    )
+    predictions['test_spec'] = predictions['instance'].apply(make_test_spec)
+
+    # Prepare dataset
+    output_file = args.input_file.replace('.jsonl', '.swebench_eval.jsonl')
+    instances = prepare_dataset(predictions, output_file, args.eval_n_limit)
+
+    run_evaluation(
+        instances,
+        metadata=None,
+        output_file=output_file,
+        num_workers=args.eval_num_workers,
+        process_instance_func=process_instance,
+    )
+
+    # Load evaluated predictions & print number of resolved predictions
+    evaluated_predictions = pd.read_json(output_file, lines=True)
+    fields = ['resolved', 'failed_apply_patch', 'error_eval', 'empty_generation']
+
+    def count_report_field(row, field):
+        return row['test_result']['report'][field]
+
+    for field in fields:
+        count = evaluated_predictions.apply(
+            count_report_field, args=(field,), axis=1
+        ).sum()
+        logger.info(
+            f'# {field}: {count} / {len(evaluated_predictions)}. ({count / len(evaluated_predictions):.2%})'
+        )
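
To see the grading step of this script in isolation, `get_eval_report` can be exercised directly against a previously saved test-output log. A rough sketch reusing only the harness APIs imported above (the instance ID and log path are hypothetical):

```python
# Sketch: grade one saved test output with the same SWE-bench harness calls used above.
from swebench.harness.grading import get_eval_report
from swebench.harness.test_spec import make_test_spec
from swebench.harness.utils import load_swebench_dataset

instance_id = 'astropy__astropy-12907'  # hypothetical instance ID
dataset = load_swebench_dataset('princeton-nlp/SWE-bench_Lite', 'test')
instance = next(i for i in dataset if i['instance_id'] == instance_id)
test_spec = make_test_spec(instance)

report = get_eval_report(
    test_spec=test_spec,
    # The gold patch is used here as a stand-in for a model-generated patch.
    prediction={'instance_id': instance_id, 'model_patch': instance['patch']},
    log_path='logs/astropy__astropy-12907/test_output.txt',  # hypothetical saved log
    include_tests_status=True,
)
print(report[instance_id]['resolved'])
```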

+ 10 - 4
evaluation/swe_bench/scripts/cleanup_remote_runtime.sh

@@ -5,17 +5,23 @@
 BASE_URL="https://api.all-hands.dev/v0"
 
 # Get the list of runtimes
-runtimes=$(curl --silent --location --request GET "${BASE_URL}/runtime/list" \
-  --header "X-API-Key: ${ALLHANDS_API_KEY}" | jq -r '.runtimes | .[].runtime_id')
+response=$(curl --silent --location --request GET "${BASE_URL}/runtime/list" \
+  --header "X-API-Key: ${ALLHANDS_API_KEY}")
 
+n_runtimes=$(echo $response | jq -r '.total')
+echo "Found ${n_runtimes} runtimes. Stopping them..."
+
+runtime_ids=$(echo $response | jq -r '.runtimes | .[].runtime_id')
 # Loop through each runtime and stop it
-for runtime_id in $runtimes; do
-  echo "Stopping runtime: ${runtime_id}"
+counter=1
+for runtime_id in $runtime_ids; do
+  echo "Stopping runtime ${counter}/${n_runtimes}: ${runtime_id}"
   curl --silent --location --request POST "${BASE_URL}/runtime/stop" \
     --header "X-API-Key: ${ALLHANDS_API_KEY}" \
     --header "Content-Type: application/json" \
     --data-raw "{\"runtime_id\": \"${runtime_id}\"}"
   echo
+  ((counter++))
 done
 
 echo "All runtimes have been stopped."

+ 27 - 0
evaluation/swe_bench/scripts/eval/download_gold_patch.py

@@ -0,0 +1,27 @@
+import argparse
+
+import pandas as pd
+from datasets import load_dataset
+
+parser = argparse.ArgumentParser()
+parser.add_argument('output_filepath', type=str, help='Path to save the output file')
+parser.add_argument(
+    '--dataset_name',
+    type=str,
+    help='Name of the dataset to download',
+    default='princeton-nlp/SWE-bench_Lite',
+)
+parser.add_argument('--split', type=str, help='Split to download', default='test')
+args = parser.parse_args()
+
+dataset = load_dataset(args.dataset_name, split=args.split)
+output_filepath = args.output_filepath
+print(
+    f'Downloading gold patches from {args.dataset_name} (split: {args.split}) to {output_filepath}'
+)
+patches = [
+    {'instance_id': row['instance_id'], 'model_patch': row['patch']} for row in dataset
+]
+print(f'{len(patches)} gold patches loaded')
+pd.DataFrame(patches).to_json(output_filepath, lines=True, orient='records')
+print(f'Patches saved to {output_filepath}')
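
Before handing the generated file to `eval_infer_remote.sh`, it can be sanity-checked against the same assertions the evaluation entry point makes on its `--input-file`; a small sketch (the output path is hypothetical):

```python
# Sketch: verify the gold-patch jsonl has the columns the evaluation script expects.
import pandas as pd

preds = pd.read_json('gold_patches.jsonl', lines=True)  # hypothetical output_filepath
assert {'instance_id', 'model_patch'}.issubset(preds.columns), 'missing required columns'
assert preds['instance_id'].is_unique, 'instance_id column must be unique'
print(f'{len(preds)} gold patches look well-formed')
```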

+ 43 - 0
evaluation/swe_bench/scripts/eval_infer_remote.sh

@@ -0,0 +1,43 @@
+#!/bin/bash
+set -eo pipefail
+
+INPUT_FILE=$1
+NUM_WORKERS=$2
+DATASET=$3
+SPLIT=$4
+
+if [ -z "$INPUT_FILE" ]; then
+  echo "INPUT_FILE not specified (should be a path to a jsonl file)"
+  exit 1
+fi
+
+if [ -z "$DATASET" ]; then
+  echo "DATASET not specified, use default princeton-nlp/SWE-bench_Lite"
+  DATASET="princeton-nlp/SWE-bench_Lite"
+fi
+
+if [ -z "$SPLIT" ]; then
+  echo "SPLIT not specified, use default test"
+  SPLIT="test"
+fi
+
+if [ -z "$NUM_WORKERS" ]; then
+  echo "NUM_WORKERS not specified, use default 1"
+  NUM_WORKERS=1
+fi
+
+echo "... Evaluating on $INPUT_FILE ..."
+
+COMMAND="poetry run python evaluation/swe_bench/eval_infer.py \
+  --eval-num-workers $NUM_WORKERS \
+  --input-file $INPUT_FILE \
+  --dataset $DATASET \
+  --split $SPLIT"
+
+if [ -n "$EVAL_LIMIT" ]; then
+  echo "EVAL_LIMIT: $EVAL_LIMIT"
+  COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
+fi
+
+# Run the command
+eval $COMMAND

+ 33 - 15
evaluation/utils/shared.py

@@ -50,15 +50,16 @@ class EvalMetadata(BaseModel):
 class EvalOutput(BaseModel):
     # NOTE: User-specified
     instance_id: str
-    instruction: str
     # output of the evaluation
     # store anything that is needed for the score calculation
     test_result: dict[str, Any]
 
+    instruction: str | None = None
+
     # Interaction info
-    metadata: EvalMetadata
-    history: list[tuple[dict[str, Any], dict[str, Any]]]
-    metrics: dict[str, Any]
+    metadata: EvalMetadata | None = None
+    history: list[tuple[dict[str, Any], dict[str, Any]]] | None = None
+    metrics: dict[str, Any] | None = None
     error: str | None = None
 
     # Optionally save the input test instance
@@ -66,15 +67,19 @@ class EvalOutput(BaseModel):
 
     def model_dump(self, *args, **kwargs):
         dumped_dict = super().model_dump(*args, **kwargs)
+        # Remove None values
+        dumped_dict = {k: v for k, v in dumped_dict.items() if v is not None}
         # Apply custom serialization for metadata (to avoid leaking sensitive information)
-        dumped_dict['metadata'] = self.metadata.model_dump()
+        if self.metadata is not None:
+            dumped_dict['metadata'] = self.metadata.model_dump()
         return dumped_dict
 
     def model_dump_json(self, *args, **kwargs):
         dumped = super().model_dump_json(*args, **kwargs)
         dumped_dict = json.loads(dumped)
         # Apply custom serialization for metadata (to avoid leaking sensitive information)
-        dumped_dict['metadata'] = json.loads(self.metadata.model_dump_json())
+        if 'metadata' in dumped_dict:
+            dumped_dict['metadata'] = json.loads(self.metadata.model_dump_json())
         return json.dumps(dumped_dict)
 
 
@@ -260,32 +265,41 @@ def _process_instance_wrapper(
             result = process_instance_func(instance, metadata, use_mp)
             return result
         except Exception as e:
+            error = str(e)
+            stacktrace = traceback.format_exc()
             if attempt == max_retries:
+                logger.exception(e)
+                msg = (
+                    '-' * 10
+                    + '\n'
+                    + f'Error in instance [{instance.instance_id}]: {error}. Stacktrace:\n{stacktrace}'
+                    + '\n'
+                    + f'[Encountered after {max_retries} retries. Please check the logs and report the issue.]'
+                    + '-' * 10
+                )
+                logger.error(msg)
                 # Raise an error after all retries & stop the evaluation
                 raise RuntimeError(
                     f'Maximum error retries reached for instance {instance.instance_id}'
                 ) from e
-            error = str(e)
-            stacktrace = traceback.format_exc()
             msg = (
                 '-' * 10
                 + '\n'
                 + f'Error in instance [{instance.instance_id}]: {error}. Stacktrace:\n{stacktrace}'
                 + '\n'
                 + '-' * 10
-                + '[This error occurred after maximum retries]'
+                + f'[The above error occurred. Retrying... (attempt {attempt + 1} of {max_retries})]'
                 + '-' * 10
                 + '\n'
             )
             logger.error(msg)
             if use_mp:
                 print(msg)  # use print to directly print to console
-            time.sleep(1)  # Add a small delay before retrying
+            time.sleep(5)
 
 
 def run_evaluation(
     dataset: pd.DataFrame,
-    metadata: EvalMetadata,
+    metadata: EvalMetadata | None,
     output_file: str,
     num_workers: int,
     process_instance_func: Callable[
@@ -294,10 +308,14 @@ def run_evaluation(
     max_retries: int = 5,  # number of retries for each instance
 ):
     use_multiprocessing = num_workers > 1
-    logger.info(
-        f'Evaluation started with Agent {metadata.agent_class}:\n'
-        f'model {metadata.llm_config.model}, max iterations {metadata.max_iterations}.\n'
-    )
+
+    if metadata is not None:
+        logger.info(
+            f'Evaluation started with Agent {metadata.agent_class}:\n'
+            f'model {metadata.llm_config.model}, max iterations {metadata.max_iterations}.\n'
+        )
+    else:
+        logger.info(f'Evaluation started with {num_workers} workers.')
 
     total_instances = len(dataset)
     pbar = tqdm(total=total_instances, desc='Instances processed')
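
With the fields above now optional, `EvalOutput` can be built from just `instance_id` and `test_result` (as `eval_infer.py` does), and `model_dump()` drops the unset fields. A minimal sketch, assuming the import path used elsewhere in this commit:

```python
# Sketch: None-valued optional fields are removed from model_dump() output.
from evaluation.utils.shared import EvalOutput

out = EvalOutput(
    instance_id='astropy__astropy-12907',  # hypothetical
    test_result={'report': {'resolved': True}},
)
print(out.model_dump())
# -> roughly {'instance_id': 'astropy__astropy-12907', 'test_result': {'report': {'resolved': True}}}
```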

+ 2 - 0
openhands/runtime/remote/runtime.py

@@ -197,6 +197,8 @@ class RemoteRuntime(Runtime):
             # because the runtime might just be starting up
             # and have not registered the endpoint yet
             retry_fns=[is_404_error],
+            # leave enough time for the runtime to start up
+            timeout=600,
         )
         if response.status_code != 200:
             msg = f'Runtime is not alive yet (id={self.runtime_id}). Status: {response.status_code}.'
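
For context, the extra 600-second budget covers the window where the liveness check may keep returning 404 while the remote runtime registers its endpoint. A rough, self-contained sketch of that wait-until-alive pattern (hypothetical helper and URL scheme, not the actual `RemoteRuntime` internals):

```python
# Sketch: poll a runtime's liveness endpoint, tolerating 404s while it starts up.
# The helper name, URL scheme, and header are hypothetical.
import time
import requests

def wait_until_alive(runtime_url: str, api_key: str, timeout: float = 600.0) -> None:
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        resp = requests.get(f'{runtime_url}/alive', headers={'X-API-Key': api_key})
        if resp.status_code == 200:
            return
        if resp.status_code == 404:
            # The runtime might just be starting up and has not registered yet.
            time.sleep(5)
            continue
        raise RuntimeError(f'Runtime is not alive. Status: {resp.status_code}')
    raise TimeoutError(f'Runtime did not become alive within {timeout} seconds')
```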