
[eval] Add ScienceAgentBench. (#4645)

Co-authored-by: Xingyao Wang <xingyao@all-hands.dev>
Ziru "Ron" Chen 1 year ago
commit db4e1dbbec

+ 1 - 0
.gitignore

@@ -174,6 +174,7 @@ evaluation/bird/data
 evaluation/gaia/data
 evaluation/gorilla/data
 evaluation/toolqa/data
+evaluation/scienceagentbench/benchmark
 
 # frontend
 

+ 17 - 0
evaluation/scienceagentbench/Dockerfile

@@ -0,0 +1,17 @@
+FROM python:3.11-bookworm
+
+
+# For OpenHands agents to explore the dataset directories, please download the full benchmark [here](https://buckeyemailosu-my.sharepoint.com/:u:/g/personal/chen_8336_buckeyemail_osu_edu/EQuA6uJ3CtRHvRfZ2GiN1tYBRVJE4DSUD10MW61fr7HuSQ?e=sCBegG) and unzip it with password `scienceagentbench`.
+# **Please DO NOT redistribute the unzipped data files online.**
+# The link above downloads a benchmark.zip file to the current directory.
+# Unzip it and put the resulting benchmark folder under evaluation/scienceagentbench/.
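+# For example, assuming benchmark.zip was saved to this folder and is a standard
+# password-protected zip (hypothetical invocation, adjust paths as needed):
+#   unzip -P scienceagentbench benchmark.zip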
+
+RUN mkdir -p /benchmark
+COPY benchmark /benchmark
+
+RUN mkdir -p /workspace
+WORKDIR /workspace
+
+# pushd evaluation/scienceagentbench
+# docker build -t xingyaoww/openhands-eval-scienceagentbench .
+# popd

+ 25 - 0
evaluation/scienceagentbench/Dockerfile.evaluator

@@ -0,0 +1,25 @@
+FROM mambaorg/micromamba:debian12
+
+USER root
+# For https://github.com/OSU-NLP-Group/ScienceAgentBench/tree/main?tab=readme-ov-file#code-generation-with-agents
+
+RUN micromamba create -n sci-agent-eval python=3.10 pip setuptools wheel
+RUN micromamba run -n sci-agent-eval pip install pip-tools
+
+RUN mkdir -p /workspace
+WORKDIR /workspace
+
+RUN apt-get update && apt-get install -y git
+
+RUN git clone https://github.com/OSU-NLP-Group/ScienceAgentBench.git /workspace/
+RUN git checkout 4eddc7db6449a5ade3e37285747c8b208cd54ce7
+
+RUN micromamba create -n sci-agent python=3.10 pip setuptools wheel
+RUN micromamba run -n sci-agent pip install -r requirements.txt
+
+# Replace all occurrences of conda with micromamba under /workspace
+RUN find ./ -type f -exec sed -i 's/conda/micromamba/g' {} \;
+
+# pushd evaluation/scienceagentbench
+# docker build -t xingyaoww/openhands-eval-scienceagentbench-evaluator -f Dockerfile.evaluator .
+# popd

+ 54 - 0
evaluation/scienceagentbench/README.md

@@ -0,0 +1,54 @@
+# ScienceAgentBench Evaluation with OpenHands
+
+This folder contains the evaluation harness for [ScienceAgentBench](https://osu-nlp-group.github.io/ScienceAgentBench/) (paper: https://arxiv.org/abs/2410.05080).
+
+## Setup Environment and LLM Configuration
+
+Please follow the instructions [here](../README.md#setup) to set up your local development environment and LLM.
+
+## Setup ScienceAgentBench
+
+To prevent benchmark data contamination, we only provide the annotation sheet on [Huggingface](https://huggingface.co/datasets/osunlp/ScienceAgentBench), which includes all necessary *inputs* to run an agent.
+
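+For reference, here is a minimal sketch of loading the annotation sheet, mirroring what `run_infer.py` below does internally (field names are taken from the harness):
+
+```python
+from datasets import load_dataset
+
+# Load the ScienceAgentBench annotation sheet (validation split), as run_infer.py does.
+dataset = load_dataset('osunlp/ScienceAgentBench', split='validation')
+
+# Each example carries the inputs an agent needs, e.g. the task instruction.
+example = dataset[0]
+print(example['instance_id'], example['task_inst'])
+```
+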
+## Run Inference on ScienceAgentBench
+
+```bash
+./evaluation/scienceagentbench/scripts/run_infer.sh [model_config] [git-version] [use_knowledge] [agent] [eval_limit] [num_workers]
+
+# Example
+./evaluation/scienceagentbench/scripts/run_infer.sh llm.eval_gpt4o 0.9.3
+```
+
+where `model_config` is mandatory, and the rest are optional.
+
+- `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your
+LLM settings, as defined in your `config.toml`.
+- `git-version`, e.g. `HEAD`, is the git commit hash of the OpenHands version you would
+like to evaluate. It could also be a release tag like `0.6.2`.
+- `use_knowledge`, e.g. `true`, specifies whether the agent is allowed to use expert-provided knowledge as additional input (see the sketch after this list). By default, it is set to `false`.
+- `agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks, defaulting
+to `CodeActAgent`.
+- `eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit` instances. By
+default, the script evaluates the entire ScienceAgentBench validation set. Note:
+in order to use `eval_limit`, you must also set `agent`.
+- `num_workers`, e.g. `3`, is the number of parallel workers to run the evaluation. By
+default, it is set to 1.
+
+The maximum number of iterations is fixed at 30 in `run_infer.sh` (passed to the harness as `--max-iterations 30`).
+
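+When `use_knowledge` is `true`, the harness simply appends the expert-provided `domain_knowledge` field to the task instruction before prompting the agent (see `format_task_dict` in `run_infer.py`). A minimal sketch; the helper name is illustrative:
+
+```python
+def build_task_instruction(example: dict, use_knowledge: bool) -> str:
+    """Return the task instruction, optionally augmented with expert-provided knowledge."""
+    task_inst = example['task_inst']
+    if use_knowledge:
+        # Mirrors the behavior of format_task_dict in run_infer.py.
+        task_inst += '\n' + str(example['domain_knowledge'])
+    return task_inst
+```
+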
+## Evaluate Generated Programs
+
+### Extract Necessary Information from OpenHands Log
+
+After the inference is completed, you may use the following command to extract the necessary information from the output log for evaluation:
+
+```bash
+python evaluation/scienceagentbench/post_proc.py [log_fname]
+```
+- `log_fname`, e.g. `evaluation/.../output.jsonl`, is the automatically saved trajectory log of an OpenHands agent.
+
+The output will be written to e.g. `evaluation/.../output.converted.jsonl`.
+
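+Each line of the converted file is a JSON object with the fields written by `post_proc.py` (`instance_id`, `instruction`, `test_result`, `cost`). A minimal sketch of inspecting it (the file path below is a placeholder):
+
+```python
+import json
+
+# Placeholder path; point this at your own output.converted.jsonl.
+converted_path = 'evaluation/evaluation_outputs/output.converted.jsonl'
+
+with open(converted_path, encoding='utf-8') as f:
+    records = [json.loads(line) for line in f]
+
+for rec in records[:3]:
+    print(rec['instance_id'], rec['cost'])
+```
+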
+### Run Evaluation
+
+Please follow the steps [here](https://github.com/OSU-NLP-Group/ScienceAgentBench/tree/main?tab=readme-ov-file#evaluation-of-generated-code) to evaluate the generated programs.

+ 30 - 0
evaluation/scienceagentbench/post_proc.py

@@ -0,0 +1,30 @@
+import json
+from argparse import ArgumentParser
+
+if __name__ == '__main__':
+    parser = ArgumentParser()
+    parser.add_argument(
+        'log_fname',
+        type=str,
+    )
+    args = parser.parse_args()
+
+    fname = args.log_fname
+    out_fname = args.log_fname.replace('.jsonl', '.converted.jsonl')
+
+    # Load the raw OpenHands trajectory log (one JSON object per line).
+    with open(fname, encoding='utf-8') as f:
+        log = [json.loads(line) for line in f]
+
+    # Keep only the fields needed for ScienceAgentBench evaluation.
+    simple_log = [
+        json.dumps(
+            {
+                'instance_id': ex['instance_id'],
+                'instruction': ex['instruction'],
+                'test_result': ex['test_result'],
+                'cost': ex['metrics']['accumulated_cost'],
+            }
+        )
+        for ex in log
+    ]
+
+    with open(out_fname, 'w+', encoding='utf-8') as f:
+        f.write('\n'.join(simple_log))

+ 292 - 0
evaluation/scienceagentbench/run_infer.py

@@ -0,0 +1,292 @@
+import asyncio
+import os
+from typing import Any
+
+import pandas as pd
+from datasets import load_dataset
+from tqdm import tqdm
+
+from evaluation.utils.shared import (
+    EvalMetadata,
+    EvalOutput,
+    codeact_user_response,
+    make_metadata,
+    prepare_dataset,
+    reset_logger_for_multiprocessing,
+    run_evaluation,
+)
+from openhands.controller.state.state import State
+from openhands.core.config import (
+    AppConfig,
+    SandboxConfig,
+    get_llm_config_arg,
+    get_parser,
+)
+from openhands.core.logger import openhands_logger as logger
+from openhands.core.main import create_runtime, run_controller
+from openhands.events.action import CmdRunAction, MessageAction
+from openhands.events.observation import CmdOutputObservation
+from openhands.runtime.base import Runtime
+from openhands.utils.async_utils import call_async_from_sync
+
+AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
+    'CodeActAgent': codeact_user_response,
+}
+
+LOCAL_DATASET_PATH = os.path.join(os.path.dirname(__file__), 'benchmark')
+
+
+def format_task_dict(example, use_knowledge):
+    task = {
+        'instance_id': example['instance_id'],
+        'task_inst': example['task_inst'],
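+        # Assumed format: the first line of dataset_folder_tree looks like
+        # "|-- <dataset_folder>", so [4:] strips the 4-character tree prefix.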
+        'dataset_path': '/benchmark/datasets/'
+        + example['dataset_folder_tree'].split('\n')[0][4:],
+        'dataset_folder_tree': example['dataset_folder_tree'],
+        'dataset_preview': example['dataset_preview'],
+        'pred_program_name': 'pred_' + example['gold_program_name'],
+    }
+
+    if use_knowledge:
+        task['task_inst'] += '\n' + str(example['domain_knowledge'])
+
+    return task
+
+
+def get_config(
+    metadata: EvalMetadata,
+    instance_id: str,
+) -> AppConfig:
+    config = AppConfig(
+        default_agent=metadata.agent_class,
+        run_as_openhands=False,
+        runtime=os.environ.get('RUNTIME', 'eventstream'),
+        max_budget_per_task=4,
+        max_iterations=metadata.max_iterations,
+        sandbox=SandboxConfig(
+            base_container_image='docker.io/xingyaoww/openhands-eval-scienceagentbench',
+            enable_auto_lint=True,
+            use_host_network=False,
+            timeout=300,
+            api_key=os.environ.get('ALLHANDS_API_KEY', None),
+            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
+            keep_remote_runtime_alive=False,
+        ),
+        # do not mount workspace
+        workspace_base=None,
+        workspace_mount_path=None,
+    )
+    config.set_llm_config(metadata.llm_config)
+    if metadata.llm_config.log_completions:
+        metadata.llm_config.log_completions_folder = os.path.join(
+            metadata.eval_output_dir, 'llm_completions', instance_id
+        )
+        logger.info(
+            f'Logging LLM completions for instance {instance_id} to '
+            f'{metadata.llm_config.log_completions_folder}'
+        )
+    return config
+
+
+def initialize_runtime(
+    runtime: Runtime,
+    instance: pd.Series,
+):
+    """Initialize the runtime for the agent.
+
+    This function is called before the runtime is used to run the agent.
+    """
+    logger.info(f"{'-' * 50} BEGIN Runtime Initialization Fn {'-' * 50}")
+    obs: CmdOutputObservation
+
+    # Set up workspace directories
+    action = CmdRunAction(command='mkdir -p /workspace/pred_programs')
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    assert obs.exit_code == 0
+
+    action = CmdRunAction(command='mkdir -p /workspace/pred_results')
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    assert obs.exit_code == 0
+
+    dataset_name = instance['dataset_folder_tree'].split('\n')[0][4:].rstrip('/')
+
+    # Copy the dataset to the workspace
+    dataset_dir = os.path.join(
+        LOCAL_DATASET_PATH,
+        'datasets',
+        dataset_name,
+    )
+    runtime.copy_to(dataset_dir, '/workspace/benchmark/datasets', recursive=True)
+
+    # Check the dataset exists
+    action = CmdRunAction(
+        command='cd /workspace/benchmark/datasets && ls',
+        keep_prompt=False,
+    )
+    obs = runtime.run_action(action)
+    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+    assert obs.exit_code == 0
+    assert dataset_name in obs.content
+
+    logger.info(f"{'-' * 50} END Runtime Initialization Fn {'-' * 50}")
+
+
+def complete_runtime(
+    runtime: Runtime,
+    instance: pd.Series,
+) -> dict[str, Any]:
+    """Complete the runtime for the agent.
+
+    This function is called after the agent has finished running.
+    If you need to do something in the sandbox to get the correctness metric after
+    the agent has run, modify this function.
+    """
+    logger.info(f"{'-' * 50} BEGIN Runtime Completion Fn {'-' * 50}")
+    obs: CmdOutputObservation
+
+    test_result = {}
+
+    action = CmdRunAction(command='cd /workspace')
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+
+    assert obs.exit_code == 0
+
+    action = CmdRunAction(
+        command=f'cat pred_programs/{instance.pred_program_name}',
+        keep_prompt=False,
+    )
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+
+    if obs.exit_code == 0:
+        test_result = {'program': obs.content}
+    else:
+        test_result = {'program': 'ERROR'}
+
+    logger.info(f"{'-' * 50} END Runtime Completion Fn {'-' * 50}")
+    return test_result
+
+
+def process_instance(
+    instance: pd.Series,
+    metadata: EvalMetadata,
+    reset_logger: bool = True,
+) -> EvalOutput:
+    instance_id = instance.instance_id.replace('/', '__')
+    config = get_config(metadata, instance_id)
+
+    # Set up the logger properly, so you can run multi-processing to parallelize the evaluation
+    if reset_logger:
+        log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
+        reset_logger_for_multiprocessing(logger, instance_id, log_dir)
+    else:
+        logger.info(f'Starting evaluation for instance {instance_id}.')
+
+    instruction = f"""You are an expert Python programming assistant that helps scientist users to write high-quality code to solve their tasks.
+Given a user request, you are expected to write a complete program that accomplishes the requested task and save any outputs to `/workspace/pred_results/` in the correct format.
+
+Here's the user request you need to work on:
+{instance.task_inst}
+
+You can access the dataset at `{instance.dataset_path}`. Here is the directory structure of the dataset:
+```
+{instance.dataset_folder_tree}
+```
+Here are some helpful previews for the dataset file(s):
+{instance.dataset_preview}
+
+Please save your program as `/workspace/pred_programs/{instance.pred_program_name}`.
+Then, please run the program to check and fix any errors.
+Please do NOT run the program in the background.
+If the program uses some packages that are incompatible, please figure out alternative implementations and do NOT restart the environment.
+
+"""
+
+    runtime = create_runtime(config)
+    call_async_from_sync(runtime.connect)
+    initialize_runtime(runtime, instance)
+
+    # Here's how you can run the agent (similar to the `main` function) and get the final task state
+    state: State | None = asyncio.run(
+        run_controller(
+            config=config,
+            initial_user_action=MessageAction(content=instruction),
+            runtime=runtime,
+            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
+                metadata.agent_class
+            ),
+        )
+    )
+
+    # ======= Attempt to evaluate the agent's edits =======
+    test_result = complete_runtime(runtime, instance)
+
+    # If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
+    # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
+    if state is None:
+        raise ValueError('State should not be None.')
+    metrics = state.metrics.get() if state.metrics else None
+
+    # history is now available as a stream of events, rather than list of pairs of (Action, Observation)
+    # for compatibility with the existing output format, we can remake the pairs here
+    # remove when it becomes unnecessary
+    histories = state.history.compatibility_for_eval_history_pairs()
+
+    # Save the output
+    output = EvalOutput(
+        instance_id=instance.instance_id,
+        instruction=instruction,
+        metadata=metadata,
+        history=histories,
+        metrics=metrics,
+        error=state.last_error if state and state.last_error else None,
+        test_result=test_result,
+    )
+    return output
+
+
+if __name__ == '__main__':
+    parser = get_parser()
+    parser.add_argument(
+        '--use_knowledge',
+        type=str,
+        default='false',
+        choices=['true', 'false'],
+        help='use expert-provided knowledge or not',
+    )
+    args, _ = parser.parse_known_args()
+
+    sab_dataset = load_dataset('osunlp/ScienceAgentBench', split='validation')
+
+    dataset_processed = []
+    for example in tqdm(sab_dataset):
+        dataset_processed.append(
+            format_task_dict(example, args.use_knowledge == 'true')
+        )
+
+    dataset = pd.DataFrame(dataset_processed)
+
+    llm_config = None
+    if args.llm_config:
+        llm_config = get_llm_config_arg(args.llm_config)
+    if llm_config is None:
+        raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
+
+    metadata = make_metadata(
+        llm_config,
+        'ScienceAgentBench',
+        args.agent_cls,
+        args.max_iterations,
+        args.eval_note,
+        args.eval_output_dir,
+    )
+    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
+    dataset['instance_id'] = dataset['instance_id'].apply(str)
+    instances = prepare_dataset(dataset, output_file, args.eval_n_limit)
+
+    run_evaluation(
+        instances, metadata, output_file, args.eval_num_workers, process_instance
+    )

+ 49 - 0
evaluation/scienceagentbench/scripts/run_infer.sh

@@ -0,0 +1,49 @@
+#!/bin/bash
+set -eo pipefail
+
+source "evaluation/utils/version_control.sh"
+
+MODEL_CONFIG=$1
+COMMIT_HASH=$2
+USE_KNOWLEDGE=$3
+AGENT=$4
+EVAL_LIMIT=$5
+NUM_WORKERS=$6
+
+if [ -z "$NUM_WORKERS" ]; then
+  NUM_WORKERS=1
+  echo "Number of workers not specified, use default $NUM_WORKERS"
+fi
+checkout_eval_branch
+
+if [ -z "$AGENT" ]; then
+  echo "Agent not specified, use default CodeActAgent"
+  AGENT="CodeActAgent"
+fi
+
+if [ -z "$USE_KNOWLEDGE" ]; then
+  echo "Use knowledge not specified, use default False"
+  USE_KNOWLEDGE=false
+fi
+
+get_agent_version
+
+echo "AGENT: $AGENT"
+echo "AGENT_VERSION: $AGENT_VERSION"
+echo "MODEL_CONFIG: $MODEL_CONFIG"
+
+COMMAND="poetry run python evaluation/scienceagentbench/run_infer.py \
+  --agent-cls $AGENT \
+  --llm-config $MODEL_CONFIG \
+  --use_knowledge $USE_KNOWLEDGE \
+  --max-iterations 30 \
+  --eval-num-workers $NUM_WORKERS \
+  --eval-note $AGENT_VERSION"
+
+if [ -n "$EVAL_LIMIT" ]; then
+  echo "EVAL_LIMIT: $EVAL_LIMIT"
+  COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
+fi
+
+# Run the command
+eval $COMMAND

+ 1 - 3
openhands/runtime/impl/e2b/sandbox.py

@@ -4,9 +4,7 @@ import tarfile
 from glob import glob
 
 from e2b import Sandbox as E2BSandbox
-from e2b.sandbox.exception import (
-    TimeoutException,
-)
+from e2b.sandbox.exception import TimeoutException
 
 from openhands.core.config import SandboxConfig
 from openhands.core.logger import openhands_logger as logger