
[fix eval] Fix issues with miniwob remote runtime evaluation (#5001)

Ketan Ramaneti 1 year ago
commit 852c90f64a

+ 14 - 0
evaluation/miniwob/README.md

@@ -16,6 +16,20 @@ Access the above MiniWoB URLs in a browser and see if they load correctly.
 ./evaluation/miniwob/scripts/run_infer.sh llm.claude-35-sonnet-eval
 ```
 
+### Run Inference on `RemoteRuntime` (experimental)
+
+This is in limited beta. Contact Xingyao over Slack if you want to try it out!
+
+```bash
+./evaluation/miniwob/scripts/run_infer.sh [model_config] [git-version] [agent] [note] [eval_limit] [num_workers]
+
+# Example - this runs evaluation on BrowsingAgent for 125 MiniWoB instances, with 2 workers running in parallel
+export ALLHANDS_API_KEY="YOUR-API-KEY"
+export RUNTIME=remote
+export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev"
+./evaluation/miniwob/scripts/run_infer.sh llm.eval HEAD BrowsingAgent "" 125 2
+```
+
 Results will be in `evaluation/evaluation_outputs/outputs/miniwob/`
 
 To calculate the average reward, run:
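Before launching a long remote-runtime run, it can be worth verifying that the environment variables from the `RemoteRuntime` section above are actually set. A hypothetical pre-flight check (not part of this commit; the variable names come from the README snippet):

```python
import os

# Hypothetical pre-flight check (not part of this commit); the variable names
# come from the RemoteRuntime snippet in the README above.
for var in ('ALLHANDS_API_KEY', 'RUNTIME', 'SANDBOX_REMOTE_RUNTIME_API_URL'):
    if not os.environ.get(var):
        raise SystemExit(f'{var} is not set; export it before running run_infer.sh')
print('Remote runtime environment looks complete.')
```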

+ 0 - 0
evaluation/miniwob/__init__.py


+ 1 - 1
evaluation/miniwob/get_avg_reward.py

@@ -23,7 +23,7 @@ if __name__ == '__main__':
             data = json.loads(line)
             actual_num += 1
             total_cost += data['metrics']['accumulated_cost']
-            total_reward += data['test_result']
+            total_reward += data['test_result']['reward']
 
     avg_reward = total_reward / total_num
     print('Avg Reward: ', avg_reward)
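The one-line change tracks a schema update: `test_result` on each output line is now an object carrying a `reward` field rather than a bare number. A minimal sketch of the aggregation, assuming an output file in the new format (the file name is hypothetical):

```python
import json

# Minimal sketch of the aggregation above; 'output.jsonl' is hypothetical.
total_reward = 0.0
num_lines = 0
with open('output.jsonl') as f:
    for line in f:
        data = json.loads(line)
        total_reward += data['test_result']['reward']  # previously: data['test_result']
        num_lines += 1
print('Avg Reward: ', total_reward / num_lines if num_lines else 0.0)
```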

+ 2 - 0
evaluation/miniwob/run_infer.py

@@ -47,6 +47,7 @@ SUPPORTED_AGENT_CLS = {'BrowsingAgent', 'CodeActAgent'}
 
 AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
     'CodeActAgent': codeact_user_response,
+    'BrowsingAgent': 'Continue the task. IMPORTANT: do not talk to the user until you have finished the task',
 }
 
 
@@ -66,6 +67,7 @@ def get_config(
             browsergym_eval_env=env_id,
             api_key=os.environ.get('ALLHANDS_API_KEY', None),
             remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
+            remote_runtime_init_timeout=1800,
             keep_runtime_alive=False,
             timeout=120,
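Two things stand out in this hunk: the init timeout is set to 1800 seconds, presumably so slow remote-runtime cold starts do not abort the run, and the new `BrowsingAgent` entry maps to a plain string while `CodeActAgent` maps to a callable, so any consumer of the mapping has to handle both shapes. A hypothetical dispatch sketch (the helper name and `state` argument are assumptions, not code from this commit):

```python
from typing import Callable, Union

# Stand-in for the mapping in the hunk above; codeact_user_response is the
# callable imported in run_infer.py (commented out here to stay self-contained).
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN: dict[str, Union[str, Callable]] = {
    # 'CodeActAgent': codeact_user_response,
    'BrowsingAgent': 'Continue the task. IMPORTANT: do not talk to the user until you have finished the task',
}

# Hypothetical dispatch helper; the name and `state` argument are assumptions.
def get_fake_user_response(agent_cls: str, state=None) -> str:
    fn_or_str = AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[agent_cls]
    if callable(fn_or_str):
        return fn_or_str(state)  # e.g. codeact_user_response(state)
    return fn_or_str  # BrowsingAgent: a fixed reminder string
```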

+ 1 - 1
evaluation/miniwob/scripts/run_infer.sh

@@ -33,7 +33,7 @@ echo "MODEL_CONFIG: $MODEL_CONFIG"
 
 EVAL_NOTE="${AGENT_VERSION}_${NOTE}"
 
-COMMAND="poetry run python evaluation/miniwob/run_infer.py \
+COMMAND="export PYTHONPATH=evaluation/miniwob:\$PYTHONPATH && poetry run python evaluation/miniwob/run_infer.py \
   --agent-cls $AGENT \
   --llm-config $MODEL_CONFIG \
   --max-iterations 10 \
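The `PYTHONPATH` prepend, together with the new empty `evaluation/miniwob/__init__.py`, presumably lets helper modules in the benchmark directory resolve when `run_infer.py` is launched through the script. A rough Python equivalent of what the export does for the entry point (the sketch itself is not part of this commit):

```python
import os
import sys

# Rough equivalent of `export PYTHONPATH=evaluation/miniwob:$PYTHONPATH`:
# modules that live next to run_infer.py become importable by bare name.
sys.path.insert(0, os.path.abspath('evaluation/miniwob'))
```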