
[fix eval] Fix issues with miniwob remote runtime evaluation (#5001)

Ketan Ramaneti 1 year ago
commit 852c90f64a

+ 14 - 0
evaluation/miniwob/README.md

@@ -16,6 +16,20 @@ Access the above MiniWoB URLs in a browser and see if they load correctly.
 ./evaluation/miniwob/scripts/run_infer.sh llm.claude-35-sonnet-eval
 ```
 
+### Run Inference on `RemoteRuntime` (experimental)
+
+This is in limited beta. Contact Xingyao over Slack if you want to try it out!
+
+```bash
+./evaluation/miniwob/scripts/run_infer.sh [model_config] [git-version] [agent] [note] [eval_limit] [num_workers]
+
+# Example - this runs evaluation on BrowsingAgent for 125 MiniWoB instances, with 2 workers running in parallel
+export ALLHANDS_API_KEY="YOUR-API-KEY"
+export RUNTIME=remote
+export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev"
+./evaluation/miniwob/scripts/run_infer.sh llm.eval HEAD BrowsingAgent "" 125 2
+```
+
 Results will be in `evaluation/evaluation_outputs/outputs/miniwob/`
 
 To calculate the average reward, run:
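Before launching a long remote-runtime run, it can be worth verifying that the environment variables from the `RemoteRuntime` section above are actually set. A hypothetical pre-flight check (not part of this commit; the variable names come from the README snippet):

```python
import os

# Hypothetical pre-flight check (not part of this commit); the variable names
# come from the RemoteRuntime snippet in the README above.
for var in ('ALLHANDS_API_KEY', 'RUNTIME', 'SANDBOX_REMOTE_RUNTIME_API_URL'):
    if not os.environ.get(var):
        raise SystemExit(f'{var} is not set; export it before running run_infer.sh')
print('Remote runtime environment looks complete.')
```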

+ 0 - 0
evaluation/miniwob/__init__.py


+ 1 - 1
evaluation/miniwob/get_avg_reward.py

@@ -23,7 +23,7 @@ if __name__ == '__main__':
             data = json.loads(line)
             actual_num += 1
             total_cost += data['metrics']['accumulated_cost']
-            total_reward += data['test_result']
+            total_reward += data['test_result']['reward']
 
     avg_reward = total_reward / total_num
     print('Avg Reward: ', avg_reward)
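The one-line change tracks a schema update: `test_result` on each output line is now an object carrying a `reward` field rather than a bare number. A minimal sketch of the aggregation, assuming an output file in the new format (the file name is hypothetical):

```python
import json

# Minimal sketch of the aggregation above; 'output.jsonl' is hypothetical.
total_reward = 0.0
num_lines = 0
with open('output.jsonl') as f:
    for line in f:
        data = json.loads(line)
        total_reward += data['test_result']['reward']  # previously: data['test_result']
        num_lines += 1
print('Avg Reward: ', total_reward / num_lines if num_lines else 0.0)
```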

+ 2 - 0
evaluation/miniwob/run_infer.py

@@ -47,6 +47,7 @@ SUPPORTED_AGENT_CLS = {'BrowsingAgent', 'CodeActAgent'}
 
 AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
     'CodeActAgent': codeact_user_response,
+    'BrowsingAgent': 'Continue the task. IMPORTANT: do not talk to the user until you have finished the task',
 }
 
 
@@ -66,6 +67,7 @@ def get_config(
             browsergym_eval_env=env_id,
             api_key=os.environ.get('ALLHANDS_API_KEY', None),
             remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
+            remote_runtime_init_timeout=1800,
             keep_runtime_alive=False,
             timeout=120,
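Two things stand out in this hunk: the init timeout is set to 1800 seconds, presumably so slow remote-runtime cold starts do not abort the run, and the new `BrowsingAgent` entry maps to a plain string while `CodeActAgent` maps to a callable, so any consumer of the mapping has to handle both shapes. A hypothetical dispatch sketch (the helper name and `state` argument are assumptions, not code from this commit):

```python
from typing import Callable, Union

# Stand-in for the mapping in the hunk above; codeact_user_response is the
# callable imported in run_infer.py (commented out here to stay self-contained).
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN: dict[str, Union[str, Callable]] = {
    # 'CodeActAgent': codeact_user_response,
    'BrowsingAgent': 'Continue the task. IMPORTANT: do not talk to the user until you have finished the task',
}

# Hypothetical dispatch helper; the name and `state` argument are assumptions.
def get_fake_user_response(agent_cls: str, state=None) -> str:
    fn_or_str = AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[agent_cls]
    if callable(fn_or_str):
        return fn_or_str(state)  # e.g. codeact_user_response(state)
    return fn_or_str  # BrowsingAgent: a fixed reminder string
```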

+ 1 - 1
evaluation/miniwob/scripts/run_infer.sh

@@ -33,7 +33,7 @@ echo "MODEL_CONFIG: $MODEL_CONFIG"
 
 EVAL_NOTE="${AGENT_VERSION}_${NOTE}"
 
-COMMAND="poetry run python evaluation/miniwob/run_infer.py \
+COMMAND="export PYTHONPATH=evaluation/miniwob:\$PYTHONPATH && poetry run python evaluation/miniwob/run_infer.py \
   --agent-cls $AGENT \
   --llm-config $MODEL_CONFIG \
   --max-iterations 10 \
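The `PYTHONPATH` prepend, together with the new empty `evaluation/miniwob/__init__.py`, presumably lets helper modules in the benchmark directory resolve when `run_infer.py` is launched through the script. A rough Python equivalent of what the export does for the entry point (the sketch itself is not part of this commit):

```python
import os
import sys

# Rough equivalent of `export PYTHONPATH=evaluation/miniwob:$PYTHONPATH`:
# modules that live next to run_infer.py become importable by bare name.
sys.path.insert(0, os.path.abspath('evaluation/miniwob'))
```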