há 1 ano atrás · 12dd3352c5
--- a/evaluation/benchmarks/agent_bench/README.md
+++ b/evaluation/benchmarks/agent_bench/README.md
@@ -36,3 +36,21 @@ You can update the arguments in the script `evaluation/benchmarks/agent_bench/sc
 
				 ```bash
			
 
				 ./evaluation/benchmarks/agent_bench/scripts/run_infer.sh eval_gpt35_turbo HEAD CodeActAgent 1
			
 
				 ```
			
 
				+
			
 
				+## Run with Remote Runtime (experimental)
			
 
				+
			
 
				+You can run the evaluation using a remote runtime instead of a local Docker container. This is useful when you want to run the evaluation in a cloud environment or when you don't have Docker installed locally.
			
 
				+
			
 
				+To use the remote runtime, set the following environment variables (if `RUNTIME` is unset, the evaluation falls back to the default `eventstream` runtime):
			
 
				+
			
 
				+```bash
			
 
				+# Required environment variables
			
 
				+export ALLHANDS_API_KEY="your-api-key"  # Contact the All Hands team to get an API key
			
 
				+export RUNTIME=remote
			
 
				+export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev"
			
 
				+
			
 
				+# Run the evaluation
			
 
				+./evaluation/benchmarks/agent_bench/scripts/run_infer.sh llm.eval_gpt4_1106_preview HEAD CodeActAgent 1
			
 
				+```
			
 
				+
			
 
				+The remote runtime builds a container image and runs the evaluation in a cloud environment. Results are saved locally, just as they are when running with a local runtime.
			
--- a/evaluation/benchmarks/agent_bench/run_infer.py
+++ b/evaluation/benchmarks/agent_bench/run_infer.py
@@ -43,12 +43,16 @@ def get_config(
 
				     config = AppConfig(
			
 
				         default_agent=metadata.agent_class,
			
 
				         run_as_openhands=False,
			
 
				-        runtime='eventstream',
			
 
				+        runtime=os.environ.get('RUNTIME', 'eventstream'),
			
 
				         max_iterations=metadata.max_iterations,
			
 
				         sandbox=SandboxConfig(
			
 
				-            base_container_image='python:3.12-bookworm',
			
 
				+            base_container_image='python:3.12-slim',
			
 
				             enable_auto_lint=True,
			
 
				             use_host_network=False,
			
 
				+            api_key=os.environ.get('ALLHANDS_API_KEY', None),
			
 
				+            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
			
 
				+            keep_runtime_alive=False,
			
 
				+            remote_runtime_init_timeout=3600,
			
 
				         ),
			
 
				         # do not mount workspace
			
 
				         workspace_base=None,