
[Evaluation]: Log openhands version in eval output folder, instead of agent version (#5394)
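
This commit replaces the agent-class version (Agent.get_cls(...).VERSION) with the OpenHands package version in the --eval-note tag of every evaluation script, so output folders are keyed to the framework release rather than the agent's prompt version. A minimal sketch of the resulting usage pattern, assuming each run_infer.sh sources evaluation/utils/version_control.sh as the diffs below suggest (the exact source line, benchmark path, and version value are illustrative, not taken from the commit):

    # hypothetical excerpt from a run_infer.sh after this change
    source evaluation/utils/version_control.sh
    get_openhands_version    # sets OPENHANDS_VERSION, e.g. "v0.14.1"
    COMMAND="poetry run python evaluation/benchmarks/.../run_infer.py \
      --eval-note $OPENHANDS_VERSION"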

Xingyao Wang · 1 year ago · commit 9908e1b285

+ 3 - 6
evaluation/benchmarks/EDA/scripts/run_infer.sh

@@ -21,7 +21,7 @@ if [ -z "$AGENT" ]; then
   AGENT="CodeActAgent"
 fi
 
-get_agent_version
+get_openhands_version
 
 if [ -z "$DATASET" ]; then
   echo "Dataset not specified, use default 'things'"
@@ -34,12 +34,9 @@ if [ -z "$OPENAI_API_KEY" ]; then
   exit 1
 fi
 
-# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenHands
-# We need to track the version of Agent in the evaluation to make sure results are comparable
-AGENT_VERSION=v$(poetry run python -c "import openhands.agenthub; from openhands.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
 
 echo "AGENT: $AGENT"
-echo "AGENT_VERSION: $AGENT_VERSION"
+echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"
 echo "DATASET: $DATASET"
 
@@ -51,7 +48,7 @@ COMMAND="poetry run python evaluation/benchmarks/EDA/run_infer.py \
   --max-iterations 20 \
   --OPENAI_API_KEY $OPENAI_API_KEY \
   --eval-num-workers $NUM_WORKERS \
-  --eval-note ${AGENT_VERSION}_${DATASET}"
+  --eval-note ${OPENHANDS_VERSION}_${DATASET}"
 
 if [ -n "$EVAL_LIMIT" ]; then
   echo "EVAL_LIMIT: $EVAL_LIMIT"

+ 3 - 3
evaluation/benchmarks/agent_bench/scripts/run_infer.sh

@@ -20,10 +20,10 @@ if [ -z "$AGENT" ]; then
   AGENT="CodeActAgent"
 fi
 
-get_agent_version
+get_openhands_version
 
 echo "AGENT: $AGENT"
-echo "AGENT_VERSION: $AGENT_VERSION"
+echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"
 
 COMMAND="export PYTHONPATH=evaluation/benchmarks/agent_bench:\$PYTHONPATH && poetry run python evaluation/benchmarks/agent_bench/run_infer.py \
@@ -31,7 +31,7 @@ COMMAND="export PYTHONPATH=evaluation/benchmarks/agent_bench:\$PYTHONPATH && poe
   --llm-config $MODEL_CONFIG \
   --max-iterations 30 \
   --eval-num-workers $NUM_WORKERS \
-  --eval-note $AGENT_VERSION"
+  --eval-note $OPENHANDS_VERSION"
 
 if [ -n "$EVAL_LIMIT" ]; then
   echo "EVAL_LIMIT: $EVAL_LIMIT"

+ 3 - 3
evaluation/benchmarks/aider_bench/scripts/run_infer.sh

@@ -21,13 +21,13 @@ if [ -z "$AGENT" ]; then
   AGENT="CodeActAgent"
 fi
 
-get_agent_version
+get_openhands_version
 
 echo "AGENT: $AGENT"
-echo "AGENT_VERSION: $AGENT_VERSION"
+echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"
 
-EVAL_NOTE=$AGENT_VERSION
+EVAL_NOTE=$OPENHANDS_VERSION
 
 # Default to NOT use unit tests.
 if [ -z "$USE_UNIT_TESTS" ]; then

+ 3 - 3
evaluation/benchmarks/biocoder/scripts/run_infer.sh

@@ -21,10 +21,10 @@ if [ -z "$AGENT" ]; then
   AGENT="CodeActAgent"
 fi
 
-get_agent_version
+get_openhands_version
 
 echo "AGENT: $AGENT"
-echo "AGENT_VERSION: $AGENT_VERSION"
+echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"
 echo "DATASET: $DATASET"
 
@@ -33,7 +33,7 @@ COMMAND="poetry run python evaluation/benchmarks/biocoder/run_infer.py \
   --llm-config $MODEL_CONFIG \
   --max-iterations 10 \
   --eval-num-workers $NUM_WORKERS \
-  --eval-note ${AGENT_VERSION}_${DATASET}"
+  --eval-note ${OPENHANDS_VERSION}_${DATASET}"
 
 if [ -n "$EVAL_LIMIT" ]; then
   echo "EVAL_LIMIT: $EVAL_LIMIT"

+ 3 - 3
evaluation/benchmarks/bird/scripts/run_infer.sh

@@ -20,10 +20,10 @@ if [ -z "$AGENT" ]; then
   AGENT="CodeActAgent"
 fi
 
-get_agent_version
+get_openhands_version
 
 echo "AGENT: $AGENT"
-echo "AGENT_VERSION: $AGENT_VERSION"
+echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"
 
 COMMAND="poetry run python evaluation/benchmarks/bird/run_infer.py \
@@ -31,7 +31,7 @@ COMMAND="poetry run python evaluation/benchmarks/bird/run_infer.py \
   --llm-config $MODEL_CONFIG \
   --max-iterations 5 \
   --eval-num-workers $NUM_WORKERS \
-  --eval-note $AGENT_VERSION" \
+  --eval-note $OPENHANDS_VERSION" \
 
 if [ -n "$EVAL_LIMIT" ]; then
   echo "EVAL_LIMIT: $EVAL_LIMIT"

+ 3 - 3
evaluation/benchmarks/browsing_delegation/scripts/run_infer.sh

@@ -20,13 +20,13 @@ if [ -z "$AGENT" ]; then
   AGENT="CodeActAgent"
 fi
 
-get_agent_version
+get_openhands_version
 
 echo "AGENT: $AGENT"
-echo "AGENT_VERSION: $AGENT_VERSION"
+echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"
 
-EVAL_NOTE="$AGENT_VERSION"
+EVAL_NOTE="$OPENHANDS_VERSION"
 
 COMMAND="poetry run python evaluation/benchmarks/browsing_delegation/run_infer.py \
   --agent-cls $AGENT \

+ 3 - 3
evaluation/benchmarks/commit0_bench/scripts/run_infer.sh

@@ -61,10 +61,10 @@ echo "USE_INSTANCE_IMAGE: $USE_INSTANCE_IMAGE"
 export RUN_WITH_BROWSING=$RUN_WITH_BROWSING
 echo "RUN_WITH_BROWSING: $RUN_WITH_BROWSING"
 
-get_agent_version
+get_openhands_version
 
 echo "AGENT: $AGENT"
-echo "AGENT_VERSION: $AGENT_VERSION"
+echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"
 echo "DATASET: $DATASET"
 echo "HF SPLIT: $SPLIT"
@@ -75,7 +75,7 @@ if [ -z "$USE_HINT_TEXT" ]; then
   export USE_HINT_TEXT=false
 fi
 echo "USE_HINT_TEXT: $USE_HINT_TEXT"
-EVAL_NOTE="$AGENT_VERSION"
+EVAL_NOTE="$OPENHANDS_VERSION"
 # if not using Hint, add -no-hint to the eval note
 if [ "$USE_HINT_TEXT" = false ]; then
   EVAL_NOTE="$EVAL_NOTE-no-hint"

+ 3 - 3
evaluation/benchmarks/discoverybench/scripts/run_infer.sh

@@ -23,10 +23,10 @@ if [ -z "$AGENT" ]; then
   AGENT="CodeActAgent"
 fi
 
-get_agent_version
+get_openhands_version
 
 echo "AGENT: $AGENT"
-echo "AGENT_VERSION: $AGENT_VERSION"
+echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"
 
 COMMAND="poetry run python evaluation/benchmarks/discoverybench/run_infer.py \
@@ -35,7 +35,7 @@ COMMAND="poetry run python evaluation/benchmarks/discoverybench/run_infer.py \
   --max-iterations 10 \
   --max-chars 10000000 \
   --eval-num-workers $NUM_WORKERS \
-  --eval-note $AGENT_VERSION"
+  --eval-note $OPENHANDS_VERSION"
 
 if [ -n "$EVAL_LIMIT" ]; then
   echo "EVAL_LIMIT: $EVAL_LIMIT"

+ 4 - 4
evaluation/benchmarks/gaia/scripts/run_infer.sh

@@ -21,17 +21,17 @@ if [ -z "$AGENT" ]; then
   AGENT="CodeActAgent"
 fi
 
-get_agent_version
+get_openhands_version
 
 if [ -z "$LEVELS" ]; then
   LEVELS="2023_level1"
   echo "Levels not specified, use default $LEVELS"
 fi
 
-get_agent_version
+get_openhands_version
 
 echo "AGENT: $AGENT"
-echo "AGENT_VERSION: $AGENT_VERSION"
+echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"
 echo "LEVELS: $LEVELS"
 
@@ -42,7 +42,7 @@ COMMAND="poetry run python ./evaluation/benchmarks/gaia/run_infer.py \
   --level $LEVELS \
   --data-split validation \
   --eval-num-workers $NUM_WORKERS \
-  --eval-note ${AGENT_VERSION}_${LEVELS}"
+  --eval-note ${OPENHANDS_VERSION}_${LEVELS}"
 
 if [ -n "$EVAL_LIMIT" ]; then
   echo "EVAL_LIMIT: $EVAL_LIMIT"

+ 3 - 3
evaluation/benchmarks/gorilla/scripts/run_infer.sh

@@ -21,7 +21,7 @@ if [ -z "$AGENT" ]; then
   AGENT="CodeActAgent"
 fi
 
-get_agent_version
+get_openhands_version
 
 if [ -z "$HUBS" ]; then
   HUBS="hf,torch,tf"
@@ -29,7 +29,7 @@ if [ -z "$HUBS" ]; then
 fi
 
 echo "AGENT: $AGENT"
-echo "AGENT_VERSION: $AGENT_VERSION"
+echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"
 echo "HUBS: $HUBS"
 
@@ -40,7 +40,7 @@ COMMAND="poetry run python evaluation/benchmarks/gorilla/run_infer.py \
   --hubs $HUBS \
   --data-split validation \
   --eval-num-workers $NUM_WORKERS \
-  --eval-note ${AGENT_VERSION}_${LEVELS}"
+  --eval-note ${OPENHANDS_VERSION}_${LEVELS}"
 
 if [ -n "$EVAL_LIMIT" ]; then
   echo "EVAL_LIMIT: $EVAL_LIMIT"

+ 3 - 3
evaluation/benchmarks/gpqa/scripts/run_infer.sh

@@ -27,10 +27,10 @@ if [ -z "$DATA_SPLIT" ]; then
   DATA_SPLIT="gpqa_diamond"
 fi
 
-get_agent_version
+get_openhands_version
 
 echo "AGENT: $AGENT"
-echo "AGENT_VERSION: $AGENT_VERSION"
+echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"
 
 COMMAND="poetry run python evaluation/benchmarks/gpqa/run_infer.py \
@@ -39,7 +39,7 @@ COMMAND="poetry run python evaluation/benchmarks/gpqa/run_infer.py \
   --max-iterations 10 \
   --eval-num-workers $NUM_WORKERS \
   --data-split $DATA_SPLIT \
-  --eval-note $AGENT_VERSION"
+  --eval-note $OPENHANDS_VERSION"
 
 if [ -n "$EVAL_LIMIT" ]; then
   echo "EVAL_LIMIT: $EVAL_LIMIT"

+ 3 - 3
evaluation/benchmarks/humanevalfix/scripts/run_infer.sh

@@ -58,10 +58,10 @@ if [ -z "$AGENT" ]; then
   AGENT="CodeActAgent"
 fi
 
-get_agent_version
+get_openhands_version
 
 echo "AGENT: $AGENT"
-echo "AGENT_VERSION: $AGENT_VERSION"
+echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"
 
 COMMAND="poetry run python evaluation/benchmarks/humanevalfix/run_infer.py \
@@ -69,7 +69,7 @@ COMMAND="poetry run python evaluation/benchmarks/humanevalfix/run_infer.py \
   --llm-config $MODEL_CONFIG \
   --max-iterations 10 \
   --eval-num-workers $NUM_WORKERS \
-  --eval-note $AGENT_VERSION"
+  --eval-note $OPENHANDS_VERSION"
 
 if [ -n "$EVAL_LIMIT" ]; then
   echo "EVAL_LIMIT: $EVAL_LIMIT"

+ 3 - 3
evaluation/benchmarks/logic_reasoning/scripts/run_infer.sh

@@ -28,10 +28,10 @@ if [ -z "$DATASET" ]; then
   DATASET="ProofWriter"
 fi
 
-get_agent_version
+get_openhands_version
 
 echo "AGENT: $AGENT"
-echo "AGENT_VERSION: $AGENT_VERSION"
+echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"
 
 COMMAND="poetry run python evaluation/benchmarks/logic_reasoning/run_infer.py \
@@ -40,7 +40,7 @@ COMMAND="poetry run python evaluation/benchmarks/logic_reasoning/run_infer.py \
   --dataset $DATASET \
   --max-iterations 10 \
   --eval-num-workers $NUM_WORKERS \
-  --eval-note $AGENT_VERSION"
+  --eval-note $OPENHANDS_VERSION"
 
 if [ -n "$EVAL_LIMIT" ]; then
   echo "EVAL_LIMIT: $EVAL_LIMIT"

+ 3 - 3
evaluation/benchmarks/miniwob/scripts/run_infer.sh

@@ -25,13 +25,13 @@ if [ -z "$AGENT" ]; then
   AGENT="BrowsingAgent"
 fi
 
-get_agent_version
+get_openhands_version
 
 echo "AGENT: $AGENT"
-echo "AGENT_VERSION: $AGENT_VERSION"
+echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"
 
-EVAL_NOTE="${AGENT_VERSION}_${NOTE}"
+EVAL_NOTE="${OPENHANDS_VERSION}_${NOTE}"
 
 COMMAND="export PYTHONPATH=evaluation/benchmarks/miniwob:\$PYTHONPATH && poetry run python evaluation/benchmarks/miniwob/run_infer.py \
   --agent-cls $AGENT \

+ 2 - 2
evaluation/benchmarks/mint/scripts/run_infer.sh

@@ -18,10 +18,10 @@ checkout_eval_branch
 # Only 'CodeActAgent' is supported for MINT now
 AGENT="CodeActAgent"
 
-get_agent_version
+get_openhands_version
 
 echo "AGENT: $AGENT"
-echo "AGENT_VERSION: $AGENT_VERSION"
+echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
 
 export PYTHONPATH=$(pwd)
 

+ 3 - 3
evaluation/benchmarks/ml_bench/scripts/run_infer.sh

@@ -26,10 +26,10 @@ if [ -z "$AGENT" ]; then
   AGENT="CodeActAgent"
 fi
 
-get_agent_version
+get_openhands_version
 
 echo "AGENT: $AGENT"
-echo "AGENT_VERSION: $AGENT_VERSION"
+echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"
 
 COMMAND="poetry run python evaluation/benchmarks/ml_bench/run_infer.py \
@@ -37,7 +37,7 @@ COMMAND="poetry run python evaluation/benchmarks/ml_bench/run_infer.py \
   --llm-config $MODEL_CONFIG \
   --max-iterations 10 \
   --eval-num-workers $NUM_WORKERS \
-  --eval-note $AGENT_VERSION"
+  --eval-note $OPENHANDS_VERSION"
 
 if [ -n "$EVAL_LIMIT" ]; then
   echo "EVAL_LIMIT: $EVAL_LIMIT"

+ 3 - 3
evaluation/benchmarks/scienceagentbench/scripts/run_infer.sh

@@ -26,10 +26,10 @@ if [ -z "$USE_KNOWLEDGE" ]; then
   USE_KNOWLEDGE=false
 fi
 
-get_agent_version
+get_openhands_version
 
 echo "AGENT: $AGENT"
-echo "AGENT_VERSION: $AGENT_VERSION"
+echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"
 
 COMMAND="poetry run python evaluation/benchmarks/scienceagentbench/run_infer.py \
@@ -38,7 +38,7 @@ COMMAND="poetry run python evaluation/benchmarks/scienceagentbench/run_infer.py
   --use_knowledge $USE_KNOWLEDGE \
   --max-iterations 30 \
   --eval-num-workers $NUM_WORKERS \
-  --eval-note $AGENT_VERSION" \
+  --eval-note $OPENHANDS_VERSION" \
 
 if [ -n "$EVAL_LIMIT" ]; then
   echo "EVAL_LIMIT: $EVAL_LIMIT"

+ 3 - 3
evaluation/benchmarks/swe_bench/scripts/run_infer.sh

@@ -55,10 +55,10 @@ echo "USE_INSTANCE_IMAGE: $USE_INSTANCE_IMAGE"
 export RUN_WITH_BROWSING=$RUN_WITH_BROWSING
 echo "RUN_WITH_BROWSING: $RUN_WITH_BROWSING"
 
-get_agent_version
+get_openhands_version
 
 echo "AGENT: $AGENT"
-echo "AGENT_VERSION: $AGENT_VERSION"
+echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"
 echo "DATASET: $DATASET"
 echo "SPLIT: $SPLIT"
@@ -68,7 +68,7 @@ if [ -z "$USE_HINT_TEXT" ]; then
   export USE_HINT_TEXT=false
 fi
 echo "USE_HINT_TEXT: $USE_HINT_TEXT"
-EVAL_NOTE="$AGENT_VERSION"
+EVAL_NOTE="$OPENHANDS_VERSION"
 # if not using Hint, add -no-hint to the eval note
 if [ "$USE_HINT_TEXT" = false ]; then
   EVAL_NOTE="$EVAL_NOTE-no-hint"

+ 3 - 3
evaluation/benchmarks/toolqa/scripts/run_infer.sh

@@ -38,10 +38,10 @@ if [ -z "$WOLFRAM_APPID" ]; then
   echo "WOLFRAM_APPID not specified"
 fi
 
-get_agent_version
+get_openhands_version
 
 echo "AGENT: $AGENT"
-echo "AGENT_VERSION: $AGENT_VERSION"
+echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"
 echo "DATASET: $DATASET"
 echo "HARDNESS: $HARDNESS"
@@ -56,7 +56,7 @@ COMMAND="poetry run python evaluation/benchmarks/toolqa/run_infer.py \
   --wolfram_alpha_appid $WOLFRAM_APPID\
   --data-split validation \
   --eval-num-workers $NUM_WORKERS \
-  --eval-note ${AGENT_VERSION}_${LEVELS}"
+  --eval-note ${OPENHANDS_VERSION}_${LEVELS}"
 
 if [ -n "$EVAL_LIMIT" ]; then
   echo "EVAL_LIMIT: $EVAL_LIMIT"

+ 3 - 3
evaluation/benchmarks/webarena/scripts/run_infer.sh

@@ -27,13 +27,13 @@ if [ -z "$AGENT" ]; then
   AGENT="BrowsingAgent"
 fi
 
-get_agent_version
+get_openhands_version
 
 echo "AGENT: $AGENT"
-echo "AGENT_VERSION: $AGENT_VERSION"
+echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"
 
-EVAL_NOTE="$AGENT_VERSION"
+EVAL_NOTE="$OPENHANDS_VERSION"
 
 COMMAND="poetry run python evaluation/benchmarks/webarena/run_infer.py \
   --agent-cls $AGENT \

+ 3 - 3
evaluation/integration_tests/scripts/run_infer.sh

@@ -21,13 +21,13 @@ if [ -z "$AGENT" ]; then
   AGENT="CodeActAgent"
 fi
 
-get_agent_version
+get_openhands_version
 
 echo "AGENT: $AGENT"
-echo "AGENT_VERSION: $AGENT_VERSION"
+echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"
 
-EVAL_NOTE=$AGENT_VERSION
+EVAL_NOTE=$OPENHANDS_VERSION
 
 # Default to NOT use unit tests.
 if [ -z "$USE_UNIT_TESTS" ]; then

+ 2 - 2
evaluation/utils/version_control.sh

@@ -39,8 +39,8 @@ checkout_original_branch() {
     git checkout $current_branch
 }
 
-get_agent_version() {
+get_openhands_version() {
     # IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenHands
     # We need to track the version of Agent in the evaluation to make sure results are comparable
-    AGENT_VERSION=v$(poetry run python -c "import openhands.agenthub; from openhands.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
+    OPENHANDS_VERSION=v$(poetry run python -c "from openhands import get_version; print(get_version())")
 }
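
With this helper in place, every eval-note tag shares the same framework version string. A quick way to sanity-check the new helper in isolation, as a sketch (assumes a working poetry environment in which the openhands package exposes get_version, as the new function relies on; the printed value is illustrative):

    source evaluation/utils/version_control.sh
    get_openhands_version
    echo "$OPENHANDS_VERSION"    # e.g. "v0.14.1"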