1 an în urmă · 42b49e6c43
--- a/evaluation/aider_bench/README.md
+++ b/evaluation/aider_bench/README.md
@@ -56,6 +56,20 @@ You can update the arguments in the script
 
				 ./evaluation/aider_bench/scripts/run_infer.sh eval_gpt35_turbo HEAD CodeActAgent 100 1 "1,3,10"
			
 
				 ```
			
 
				 
			
 
				+### Run Inference on `RemoteRuntime` (experimental)
			
 
				+
			
 
				+This is in limited beta. Contact Xingyao over Slack if you want to try this out!
			
 
				+
			
 
				+```bash
			
 
				+./evaluation/aider_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [eval-num-workers] [eval_ids]
			
 
				+
			
 
				+# Example - This runs evaluation on CodeActAgent for 133 instances on the aider_bench test set, with 2 workers running in parallel
			
 
				+export ALLHANDS_API_KEY="YOUR-API-KEY"
			
 
				+export RUNTIME=remote
			
 
				+export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev"
			
 
				+./evaluation/aider_bench/scripts/run_infer.sh llm.eval HEAD CodeActAgent 133 2
			
 
				+```
			
 
				+
			
 
				 ## Summarize Results
			
 
				 
			
 
				 ```bash
			
--- a/evaluation/aider_bench/run_infer.py
+++ b/evaluation/aider_bench/run_infer.py
@@ -58,6 +58,9 @@ def get_config(
 
				             use_host_network=False,
			
 
				             timeout=100,
			
 
				             api_key=os.environ.get('ALLHANDS_API_KEY', None),
			
 
				+            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
			
 
				+            keep_runtime_alive=False,
			
 
				+            remote_runtime_init_timeout=1800,
			
 
				         ),
			
 
				         # do not mount workspace
			
 
				         workspace_base=None,
			
--- a/evaluation/miniwob/run_infer.py
+++ b/evaluation/miniwob/run_infer.py
@@ -67,6 +67,8 @@ def get_config(
 
				             api_key=os.environ.get('ALLHANDS_API_KEY', None),
			
 
				             remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
			
 
				             keep_runtime_alive=False,
			
 
				+            timeout=120,
			
 
				+            remote_runtime_init_timeout=1800,
			
 
				         ),
			
 
				         # do not mount workspace
			
 
				         workspace_base=None,