Evaluation time travel: allow evaluation on a specific version (#2356)

* Time travel for evaluation

* Fix source script path

* Exit script if given version doesn't exist

* Exit on failure

* Update README

* Change scripts of all other benchmarks

* Modify README files

* Fix logic_reasoning README
Boxuan Li, 1 year ago
commit 6f235937cf
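
In short: each benchmark's `run_infer.sh` now takes a git version as an extra positional argument, checks that version of OpenDevin out before inference (while keeping the current evaluation harness), and restores the original branch afterwards. A sketch of the new calling convention, using the swe_bench script from the diffs below:

```bash
# Before this change:
./evaluation/swe_bench/scripts/run_infer.sh eval_gpt4_1106_preview CodeActAgent 10
# After: pass the version second -- a release tag like 0.6.2, a commit hash,
# or `head` as in the README examples.
./evaluation/swe_bench/scripts/run_infer.sh eval_gpt4_1106_preview 0.6.2 CodeActAgent 10
```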

+ 7 - 5
evaluation/EDA/README.md

@@ -11,14 +11,17 @@ Create a `config.toml` file if it does not exist at the root of the workspace. P
 
 ```bash
 export OPENAI_API_KEY="sk-XXX"; # This is required for evaluation (to simulate another party of conversation)
-./evaluation/EDA/scripts/run_infer.sh [model_config] [agent] [dataset] [eval_limit]
+./evaluation/EDA/scripts/run_infer.sh [model_config] [git-version] [agent] [dataset] [eval_limit]
 ```
 
-where `model_config` is mandatory, while `agent`, `dataset` and `eval_limit` are optional.
+where `model_config` is mandatory, while `git-version`, `agent`, `dataset` and `eval_limit` are optional.
 
 - `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your
 LLM settings, as defined in your `config.toml`.
 
+- `git-version`, e.g. `head`, is the git commit hash of the OpenDevin version you would
+like to evaluate. It could also be a release tag like `0.6.2`.
+
 - `agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks, defaulting
 to `CodeActAgent`.
 
@@ -26,11 +29,10 @@ to `CodeActAgent`.
 
 - `eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit` instances. By default it infers all instances.
 
-Let's say you'd like to run 10 instances using `eval_gpt4_1106_eval_gpt4o_2024_05_13preview` and CodeActAgent,
-then your command would be:
+For example,
 
 ```bash
-./evaluation/EDA/scripts/run_infer.sh eval_gpt4o_2024_05_13 CodeActAgent things
+./evaluation/EDA/scripts/run_infer.sh eval_gpt4o_2024_05_13 0.6.2 CodeActAgent things
 ```
 
 ## Reference

+ 14 - 3
evaluation/EDA/scripts/run_infer.sh

@@ -1,14 +1,23 @@
 #!/bin/bash
+set -eo pipefail
+
+source "evaluation/utils/version_control.sh"
+
 MODEL_CONFIG=$1
-AGENT=$2
-DATASET=$3
-EVAL_LIMIT=$4
+COMMIT_HASH=$2
+AGENT=$3
+DATASET=$4
+EVAL_LIMIT=$5
+
+checkout_eval_branch
 
 if [ -z "$AGENT" ]; then
   echo "Agent not specified, use default CodeActAgent"
   AGENT="CodeActAgent"
 fi
 
+get_agent_version
+
 if [ -z "$DATASET" ]; then
   echo "Dataset not specified, use default 'things'"
   DATASET="things"
@@ -48,3 +57,5 @@ fi
 # Run the command
 echo $COMMAND
 eval $COMMAND
+
+checkout_original_branch

+ 2 - 2
evaluation/agent_bench/README.md

@@ -42,7 +42,7 @@ temperature = 0.0
 ## Start the evaluation
 
 ```bash
-./evaluation/agent_bench/scripts/run_infer.sh [model_config] [agent] [eval_limit]
+./evaluation/agent_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit]
 ```
 
 Following is the basic command to start the evaluation. Here we are only evaluating the `osbench` for now.
@@ -56,5 +56,5 @@ You can update the arguments in the script `evaluation/agent_bench/scripts/run_i
 - `--eval-n-limit`: the number of examples to evaluate. For example, `100`.
 
 ```bash
-./evaluation/agent_bench/scripts/run_infer.sh eval_gpt35_turbo CodeActAgent 1
+./evaluation/agent_bench/scripts/run_infer.sh eval_gpt35_turbo 0.6.2 CodeActAgent 1
 ```

+ 12 - 5
evaluation/agent_bench/scripts/run_infer.sh

@@ -1,16 +1,21 @@
 #!/bin/bash
+set -eo pipefail
+
+source "evaluation/utils/version_control.sh"
+
 MODEL_CONFIG=$1
-AGENT=$2
-EVAL_LIMIT=$3
+COMMIT_HASH=$2
+AGENT=$3
+EVAL_LIMIT=$4
+
+checkout_eval_branch
 
 if [ -z "$AGENT" ]; then
   echo "Agent not specified, use default CodeActAgent"
   AGENT="CodeActAgent"
 fi
 
-# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenDevin
-# We need to track the version of Agent in the evaluation to make sure results are comparable
-AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
+get_agent_version
 
 echo "AGENT: $AGENT"
 echo "AGENT_VERSION: $AGENT_VERSION"
@@ -31,3 +36,5 @@ fi
 
 # Run the command
 eval $COMMAND
+
+checkout_original_branch

+ 8 - 5
evaluation/biocoder/README.md

@@ -24,26 +24,29 @@ To reproduce this image, please see the Dockerfile_Opendevin in the `biocoder` r
 
 
 ```bash
-./evaluation/biocoder/scripts/run_infer.sh [model_config] [agent] [eval_limit]
+./evaluation/biocoder/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit]
 ```
 
-where `model_config` is mandatory, while `agent`, `dataset` and `eval_limit` are optional.
+where `model_config` is mandatory, while `git-version`, `agent` and `eval_limit` are optional.
 
 - `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your
 LLM settings, as defined in your `config.toml`.
 
+- `git-version`, e.g. `head`, is the git commit hash of the OpenDevin version you would
+like to evaluate. It could also be a release tag like `0.6.2`.
+
 - `agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks, defaulting
 to `CodeActAgent`.
 
 - `eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit` instances. By default it infers all instances.
 
-Let's say you'd like to run 10 instances using `eval_gpt4_1106_eval_gpt4o_2024_05_13preview` and CodeActAgent,
-then your command would be:
+Let's say you'd like to run 1 instance using `eval_gpt4o_2024_05_13` and CodeActAgent
+with OpenDevin version 0.6.2, then your command would be:
 
 ## Examples
 
 ```bash
-./evaluation/biocoder/scripts/run_infer.sh eval_gpt4o_2024_05_13 CodeActAgent 1
+./evaluation/biocoder/scripts/run_infer.sh eval_gpt4o_2024_05_13 0.6.2 CodeActAgent 1
 ```
 
 ## Reference

+ 11 - 5
evaluation/biocoder/scripts/run_infer.sh

@@ -1,18 +1,22 @@
 #!/bin/bash
+set -eo pipefail
+
+source "evaluation/utils/version_control.sh"
+
 MODEL_CONFIG=$1
-AGENT=$2
-EVAL_LIMIT=$3
+COMMIT_HASH=$2
+AGENT=$3
+EVAL_LIMIT=$4
 DATASET="biocoder"
 
+checkout_eval_branch
 
 if [ -z "$AGENT" ]; then
   echo "Agent not specified, use default CodeActAgent"
   AGENT="CodeActAgent"
 fi
 
-# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenDevin
-# We need to track the version of Agent in the evaluation to make sure results are comparable
-AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
+get_agent_version
 
 echo "AGENT: $AGENT"
 echo "AGENT_VERSION: $AGENT_VERSION"
@@ -35,3 +39,5 @@ fi
 # Run the command
 echo $COMMAND
 eval $COMMAND
+
+checkout_original_branch

+ 5 - 2
evaluation/bird/README.md

@@ -36,11 +36,14 @@ temperature = 0.0
 ## Run Inference on Bird
 
 ```bash
-./evaluation/bird/scripts/run_infer.sh eval_gpt4_1106_preview
+./evaluation/bird/scripts/run_infer.sh [model_config] [git-version]
 ```
 
-You can replace `eval_gpt4_1106_preview` with any model you set up in `config.toml`.
+- `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your
+LLM settings, as defined in your `config.toml`.
 
+- `git-version`, e.g. `head`, is the git commit hash of the OpenDevin version you would
+like to evaluate. It could also be a release tag like `0.6.2`.
 
 ## Examples
 

+ 12 - 5
evaluation/bird/scripts/run_infer.sh

@@ -1,16 +1,21 @@
 #!/bin/bash
+set -eo pipefail
+
+source "evaluation/utils/version_control.sh"
+
 MODEL_CONFIG=$1
-AGENT=$2
-EVAL_LIMIT=$3
+COMMIT_HASH=$2
+AGENT=$3
+EVAL_LIMIT=$4
+
+checkout_eval_branch
 
 if [ -z "$AGENT" ]; then
   echo "Agent not specified, use default CodeActAgent"
   AGENT="CodeActAgent"
 fi
 
-# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenDevin
-# We need to track the version of Agent in the evaluation to make sure results are comparable
-AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
+get_agent_version
 
 echo "AGENT: $AGENT"
 echo "AGENT_VERSION: $AGENT_VERSION"
@@ -31,3 +36,5 @@ fi
 
 # Run the command
 eval $COMMAND
+
+checkout_original_branch

+ 8 - 6
evaluation/gaia/README.md

@@ -13,15 +13,18 @@ Please accept the terms and make sure to have logged in on your computer by `hug
 Following is the basic command to start the evaluation. Here we are evaluating on the validation set for the `2023_all` split. You can adjust `./evaluation/gaia/scripts/run_infer.sh` to change the subset you want to evaluate on.
 
 ```bash
-./evaluation/gaia/scripts/run_infer.sh [model_config] [agent] [eval_limit] [gaia_subset]
-# e.g., ./evaluation/gaia/scripts/run_infer.sh eval_gpt4_1106_preview CodeActAgent 300
+./evaluation/gaia/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [gaia_subset]
+# e.g., ./evaluation/gaia/scripts/run_infer.sh eval_gpt4_1106_preview 0.6.2 CodeActAgent 300
 ```
 
-where `model_config` is mandatory, while `agent`, `eval_limit` and `gaia_subset` are optional.
+where `model_config` is mandatory, while `git-version`, `agent`, `eval_limit` and `gaia_subset` are optional.
 
 - `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your
 LLM settings, as defined in your `config.toml`, defaulting to `gpt-3.5-turbo`
 
+- `git-version`, e.g. `head`, is the git commit hash of the OpenDevin version you would
+like to evaluate. It could also be a release tag like `0.6.2`.
+
 - `agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks, defaulting
 to `CodeActAgent`.
 
@@ -29,11 +32,10 @@ to `CodeActAgent`.
 
 - `gaia_subset`, GAIA benchmark has multiple subsets: `2023_level1`, `2023_level2`, `2023_level3`, `2023_all`, defaulting to `2023_level1`.
 
-Let's say you'd like to run 10 instances using `eval_gpt4_1106_preview` and CodeActAgent,
-then your command would be:
+For example,
 
 ```bash
-./evaluation/gaia/scripts/run_infer.sh eval_gpt4_1106_preview CodeActAgent 10
+./evaluation/gaia/scripts/run_infer.sh eval_gpt4_1106_preview 0.6.2 CodeActAgent 10
 ```
 
 ## Get score

+ 13 - 6
evaluation/gaia/scripts/run_infer.sh

@@ -1,22 +1,27 @@
 #!/bin/bash
+set -eo pipefail
+
+source "evaluation/utils/version_control.sh"
+
 MODEL_CONFIG=$1
-AGENT=$2
-EVAL_LIMIT=$3
-LEVELS=$4
+COMMIT_HASH=$2
+AGENT=$3
+EVAL_LIMIT=$4
+LEVELS=$5
+
+checkout_eval_branch
 
 if [ -z "$AGENT" ]; then
   echo "Agent not specified, use default CodeActAgent"
   AGENT="CodeActAgent"
 fi
 
 if [ -z "$LEVELS" ]; then
   LEVELS="2023_level1"
   echo "Levels not specified, use default $LEVELS"
 fi
 
-# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenDevin
-# We need to track the version of Agent in the evaluation to make sure results are comparable
-AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
+get_agent_version
 
 echo "AGENT: $AGENT"
 echo "AGENT_VERSION: $AGENT_VERSION"
@@ -40,3 +45,5 @@ fi
 
 # Run the command
 eval $COMMAND
+
+checkout_original_branch

+ 6 - 4
evaluation/gorilla/README.md

@@ -15,7 +15,7 @@ Run `make setup-config` to set up the `config.toml` file if it does not exist at
 Make sure your Docker daemon is running, then run this bash script:
 
 ```bash
-bash evaluation/gorilla/scripts/run_infer.sh [model_config] [agent] [eval_limit] [hubs]
+bash evaluation/gorilla/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [hubs]
 ```
 
 where `model_config` is mandatory, while all other arguments are optional.
@@ -23,6 +23,9 @@ where `model_config` is mandatory, while all other arguments are optional.
 `model_config`, e.g. `llm`, is the config group name for your
 LLM settings, as defined in your `config.toml`.
 
+`git-version`, e.g. `head`, is the git commit hash of the OpenDevin version you would
+like to evaluate. It could also be a release tag like `0.6.2`.
+
 `agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks, defaulting
 to `CodeActAgent`.
 
@@ -33,9 +36,8 @@ By default, the script evaluates 1 instance.
 
 Note: in order to use `eval_limit`, you must also set `agent`; in order to use `hubs`, you must also set `eval_limit`.
 
-Let's say you'd like to run 10 instances using `llm` and CodeActAgent on `th` test,
-then your command would be:
+For example,
 
 ```bash
-bash evaluation/gorilla/scripts/run_infer.sh llm CodeActAgent 10 th
+bash evaluation/gorilla/scripts/run_infer.sh llm 0.6.2 CodeActAgent 10 th
 ```

+ 14 - 7
evaluation/gorilla/scripts/run_infer.sh

@@ -1,23 +1,28 @@
 #!/bin/bash
+set -eo pipefail
+
+source "evaluation/utils/version_control.sh"
+
 MODEL_CONFIG=$1
-AGENT=$2
-EVAL_LIMIT=$3
-HUBS=$4
+COMMIT_HASH=$2
+AGENT=$3
+EVAL_LIMIT=$4
+HUBS=$5
+
+checkout_eval_branch
 
 if [ -z "$AGENT" ]; then
   echo "Agent not specified, use default CodeActAgent"
   AGENT="CodeActAgent"
 fi
 
+get_agent_version
+
 if [ -z "$HUBS" ]; then
   HUBS="hf,torch,tf"
   echo "Hubs not specified, use default $HUBS"
 fi
 
-# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenDevin
-# We need to track the version of Agent in the evaluation to make sure results are comparable
-AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
-
 echo "AGENT: $AGENT"
 echo "AGENT_VERSION: $AGENT_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"
@@ -40,3 +45,5 @@ fi
 
 # Run the command
 eval $COMMAND
+
+checkout_original_branch

+ 3 - 1
evaluation/gpqa/README.md

@@ -55,11 +55,13 @@ temperature = 0.0
 'gpqa_main', 'gqpa_diamond', 'gpqa_experts', 'gpqa_extended' -- data split options
 From the root of the OpenDevin repo, run the following command:
 ```bash
-./evaluation/gpqa/scripts/run_infer.sh [model_config_name] [num_samples_eval] [data_split] [AgentClass]
+./evaluation/gpqa/scripts/run_infer.sh [model_config_name] [git-version] [num_samples_eval] [data_split] [AgentClass]
 ```
 You can replace `model_config_name` with any model you set up in `config.toml`.
 
 - `model_config_name`: The model configuration name from `config.toml` that you want to evaluate.
+- `git-version`, e.g. `head`, is the git commit hash of the OpenDevin version you would
+like to evaluate. It could also be a release tag like `0.6.2`.
 - `num_samples_eval`: Number of samples to evaluate (useful for testing and debugging).
 - `data_split`: The data split to evaluate on. Must be one of `gpqa_main`, `gqpa_diamond`, `gpqa_experts`, `gpqa_extended`. Defaults to `gpqa_diamond` as done in the paper.
 - `AgentClass`: The agent class to use for evaluation. Currently only supports `CodeActAgent` for CodeActAgent.
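
Since this hunk does not show a worked command, here is a hypothetical invocation following the argument order above (the config name and sample count are illustrative, not from the diff):

```bash
./evaluation/gpqa/scripts/run_infer.sh eval_gpt4_1106_preview 0.6.2 10 gpqa_diamond CodeActAgent
```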

+ 13 - 6
evaluation/gpqa/scripts/run_infer.sh

@@ -1,8 +1,15 @@
 #!/bin/bash
+set -eo pipefail
+
+source "evaluation/utils/version_control.sh"
+
 MODEL_CONFIG=$1
-EVAL_LIMIT=$2
-DATA_SPLIT=$3
-AGENT=$4
+COMMIT_HASH=$2
+EVAL_LIMIT=$3
+DATA_SPLIT=$4
+AGENT=$5
+
+checkout_eval_branch
 
 if [ -z "$AGENT" ]; then
   echo "Agent not specified, use default CodeActAgent ..."
@@ -15,9 +22,7 @@ if [ -z "$DATA_SPLIT" ]; then
   DATA_SPLIT="gpqa_diamond"
 fi
 
-# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenDevin
-# We need to track the version of Agent in the evaluation to make sure results are comparable
-AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
+get_agent_version
 
 echo "AGENT: $AGENT"
 echo "AGENT_VERSION: $AGENT_VERSION"
@@ -39,3 +44,5 @@ fi
 
 # Run the command
 eval $COMMAND
+
+checkout_original_branch

+ 12 - 5
evaluation/humanevalfix/scripts/run_infer.sh

@@ -1,7 +1,12 @@
 #!/bin/bash
+set -eo pipefail
+
+source "evaluation/utils/version_control.sh"
+
 MODEL_CONFIG=$1
-AGENT=$2
-EVAL_LIMIT=$3
+COMMIT_HASH=$2
+AGENT=$3
+EVAL_LIMIT=$4
 
 echo "
 ################################################################################
@@ -41,14 +46,14 @@ fi
 
 # ################################################################################
 
+checkout_eval_branch
+
 if [ -z "$AGENT" ]; then
   echo "Agent not specified, use default CodeActAgent"
   AGENT="CodeActAgent"
 fi
 
-# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenDevin
-# We need to track the version of Agent in the evaluation to make sure results are comparable
-AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
+get_agent_version
 
 echo "AGENT: $AGENT"
 echo "AGENT_VERSION: $AGENT_VERSION"
@@ -69,3 +74,5 @@ fi
 
 # Run the command
 eval $COMMAND
+
+checkout_original_branch

+ 4 - 2
evaluation/logic_reasoning/README.md

@@ -29,7 +29,9 @@ temperature = 0.0
 ```
 
 ## Run Inference on logic_reasoning
-The following code will run inference on the first example of the ProntoQA dataset with model gpt-4o.
+The following command runs inference on the first example of the ProntoQA dataset with model gpt-4o,
+using OpenDevin version 0.6.2.
+
 ```bash
-./evaluation/logic_reasoning/scripts/run_infer.sh ProntoQA gpt-4o 1
+./evaluation/logic_reasoning/scripts/run_infer.sh ProntoQA gpt-4o 0.6.2 1
 ```

+ 12 - 5
evaluation/logic_reasoning/scripts/run_infer.sh

@@ -1,19 +1,24 @@
 #!/bin/bash
+set -eo pipefail
+
+source "evaluation/utils/version_control.sh"
+
 DATASET=$1
 MODEL_CONFIG=$2
-EVAL_LIMIT=$3
-AGENT=$4
+COMMIT_HASH=$3
+EVAL_LIMIT=$4
+AGENT=$5
 
 # ################################################################################
 
+checkout_eval_branch
+
 if [ -z "$AGENT" ]; then
   echo "Agent not specified, use default CodeActAgent"
   AGENT="CodeActAgent"
 fi
 
-# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenDevin
-# We need to track the version of Agent in the evaluation to make sure results are comparable
-AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
+get_agent_version
 
 echo "AGENT: $AGENT"
 echo "AGENT_VERSION: $AGENT_VERSION"
@@ -35,3 +40,5 @@ fi
 
 # Run the command
 eval $COMMAND
+
+checkout_original_branch

+ 12 - 6
evaluation/miniwob/scripts/run_infer.sh

@@ -1,4 +1,7 @@
 #!/bin/bash
+set -eo pipefail
+
+source "evaluation/utils/version_control.sh"
 
 # configure miniwob website, change URL to yours
 export MINIWOB_URL="file:///home/fangzhex/miniwob-plusplus/miniwob/html/miniwob/"
@@ -9,18 +12,19 @@ export USE_CONCISE_ANSWER="true"
 
 
 MODEL_CONFIG=$1
-AGENT=$2
-NOTE=$3
-EVAL_LIMIT=$4
+COMMIT_HASH=$2
+AGENT=$3
+NOTE=$4
+EVAL_LIMIT=$5
+
+checkout_eval_branch
 
 if [ -z "$AGENT" ]; then
   echo "Agent not specified, use default BrowsingAgent"
   AGENT="BrowsingAgent"
 fi
 
-# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenDevin
-# We need to track the version of Agent in the evaluation to make sure results are comparable
-AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
+get_agent_version
 
 echo "AGENT: $AGENT"
 echo "AGENT_VERSION: $AGENT_VERSION"
@@ -42,3 +46,5 @@ fi
 
 # Run the command
 eval $COMMAND
+
+checkout_original_branch

+ 7 - 5
evaluation/mint/README.md

@@ -13,24 +13,26 @@ We are using the MINT dataset hosted on [Hugging Face](https://huggingface.co/da
 Following is the basic command to start the evaluation. Currently, the only agent supported with MINT is `CodeActAgent`.
 
 ```bash
-./evaluation/mint/scripts/run_infer.sh [model_config] [subset] [eval_limit]
+./evaluation/mint/scripts/run_infer.sh [model_config] [git-version] [subset] [eval_limit]
 ```
 
-where `model_config` is mandatory, while `subset` and `eval_limit` are optional.
+where `model_config` is mandatory, while the others are optional.
 
 - `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your LLM settings, as defined in your `config.toml`.
 
+- `git-version`, e.g. `head`, is the git commit hash of the OpenDevin version you would
+like to evaluate. It could also be a release tag like `0.6.2`.
+
 - `subset`, e.g. `math`, is the subset of the MINT benchmark to evaluate on, defaulting to `math`. It can be either: `math`, `gsm8k`, `mmlu`, `theoremqa`, `mbpp`,`humaneval`.
 
 - `eval_limit`, e.g. `2`, limits the evaluation to the first `eval_limit` instances, defaulting to all instances.
 
 Note: in order to use `eval_limit`, you must also set `subset`.
 
-Let's say you'd like to run 3 instances on the `gsm8k` subset using `eval_gpt4_1106_preview`,
-then your command would be:
+For example,
 
 ```bash
-./evaluation/swe_bench/scripts/run_infer.sh eval_gpt4_1106_preview gsm8k 3
+./evaluation/mint/scripts/run_infer.sh eval_gpt4_1106_preview 0.6.2 gsm8k 3
 ```
 
 ## Reference

+ 12 - 4
evaluation/mint/scripts/run_infer.sh

@@ -1,13 +1,19 @@
 #!/bin/bash
+set -eo pipefail
+
+source "evaluation/utils/version_control.sh"
 
 MODEL_CONFIG=$1
-SUBSET=$2
-EVAL_LIMIT=$3
+COMMIT_HASH=$2
+SUBSET=$3
+EVAL_LIMIT=$4
+
+checkout_eval_branch
+
 # Only 'CodeActAgent' is supported for MINT now
 AGENT="CodeActAgent"
 
-# We need to track the version of Agent in the evaluation to make sure results are comparable
-AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
+get_agent_version
 
 echo "AGENT: $AGENT"
 echo "AGENT_VERSION: $AGENT_VERSION"
@@ -36,3 +42,5 @@ fi
 
 # Run the command
 eval $COMMAND
+
+checkout_original_branch

+ 2 - 2
evaluation/ml_bench/README.md

@@ -47,8 +47,8 @@ temperature = 0.0
 To run the evaluation on the ML-Bench dataset, use the following command:
 
 ```bash
-./evaluation/ml_bench/scripts/run_infer.sh [model_config] [split] [agent] [eval_limit]
-# e.g., ./evaluation/ml_bench/scripts/run_infer.sh eval_gpt4_1106_preview full CodeActAgent 10
+./evaluation/ml_bench/scripts/run_infer.sh [model_config] [git-version] [split] [agent] [eval_limit]
+# e.g., ./evaluation/ml_bench/scripts/run_infer.sh eval_gpt4_1106_preview 0.6.2 full CodeActAgent 10
 ```
 
 You can replace `eval_gpt4_1106_preview` with any model you set up in `config.toml`.

+ 12 - 6
evaluation/ml_bench/scripts/run_infer.sh

@@ -1,9 +1,15 @@
 #!/bin/bash
+set -eo pipefail
+
+source "evaluation/utils/version_control.sh"
 
 MODEL_CONFIG=$1
-SPLIT=$2
-AGENT=$3
-EVAL_LIMIT=$4
+COMMIT_HASH=$2
+SPLIT=$3
+AGENT=$4
+EVAL_LIMIT=$5
+
+checkout_eval_branch
 
 if [ -z "$MODEL_CONFIG" ]; then
   echo "Model config not specified, use default"
@@ -15,9 +21,7 @@ if [ -z "$AGENT" ]; then
   AGENT="CodeActAgent"
 fi
 
-# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenDevin
-# We need to track the version of Agent in the evaluation to make sure results are comparable
-AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
+get_agent_version
 
 echo "AGENT: $AGENT"
 echo "AGENT_VERSION: $AGENT_VERSION"
@@ -42,3 +46,5 @@ fi
 
 # Run the command
 eval $COMMAND
+
+checkout_original_branch

+ 6 - 3
evaluation/swe_bench/README.md

@@ -82,8 +82,8 @@ If you see an error, please make sure your `config.toml` contains all
 ## Run Inference on SWE-Bench Instances
 
 ```bash
-./evaluation/swe_bench/scripts/run_infer.sh [model_config] [agent] [eval_limit]
-# e.g., ./evaluation/swe_bench/scripts/run_infer.sh eval_gpt4_1106_preview CodeActAgent 300
+./evaluation/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit]
+# e.g., ./evaluation/swe_bench/scripts/run_infer.sh eval_gpt4_1106_preview head CodeActAgent 300
 ```
 
 where `model_config` is mandatory, while `agent` and `eval_limit` are optional.
@@ -91,6 +91,9 @@ where `model_config` is mandatory, while `agent` and `eval_limit` are optional.
 `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your
 LLM settings, as defined in your `config.toml`.
 
+`git-version`, e.g. `head`, is the git commit hash of the OpenDevin version you would
+like to evaluate. It could also be a release tag like `0.6.2`.
+
 `agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks, defaulting
 to `CodeActAgent`.
 
@@ -102,7 +105,7 @@ Let's say you'd like to run 10 instances using `eval_gpt4_1106_preview` and Code
 then your command would be:
 
 ```bash
-./evaluation/swe_bench/scripts/run_infer.sh eval_gpt4_1106_preview CodeActAgent 10
+./evaluation/swe_bench/scripts/run_infer.sh eval_gpt4_1106_preview head CodeActAgent 10
 ```
 
 If you would like to specify a list of tasks you'd like to benchmark on, you could

+ 13 - 6
evaluation/swe_bench/scripts/run_infer.sh

@@ -1,8 +1,15 @@
 #!/bin/bash
+set -eo pipefail
+
+source "evaluation/utils/version_control.sh"
+
 MODEL_CONFIG=$1
-AGENT=$2
-EVAL_LIMIT=$3
-MAX_ITER=$4
+COMMIT_HASH=$2
+AGENT=$3
+EVAL_LIMIT=$4
+MAX_ITER=$5
+
+checkout_eval_branch
 
 if [ -z "$AGENT" ]; then
   echo "Agent not specified, use default CodeActAgent"
@@ -14,9 +21,7 @@ if [ -z "$MAX_ITER" ]; then
   MAX_ITER=30
 fi
 
-# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenDevin
-# We need to track the version of Agent in the evaluation to make sure results are comparable
-AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
+get_agent_version
 
 echo "AGENT: $AGENT"
 echo "AGENT_VERSION: $AGENT_VERSION"
@@ -50,3 +55,5 @@ fi
 
 # Run the command
 eval $COMMAND
+
+checkout_original_branch

+ 4 - 1
evaluation/toolqa/README.md

@@ -15,7 +15,7 @@ Run `make setup-config` to set up the `config.toml` file if it does not exist at
 Make sure your Docker daemon is running, then run this bash script:
 
 ```bash
-bash evaluation/toolqa/scripts/run_infer.sh [model_config] [agent] [eval_limit] [dataset] [hardness] [wolfram_alpha_appid]
+bash evaluation/toolqa/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [dataset] [hardness] [wolfram_alpha_appid]
 ```
 
 where `model_config` is mandatory, while all other arguments are optional.
@@ -23,6 +23,9 @@ where `model_config` is mandatory, while all other arguments are optional.
 `model_config`, e.g. `llm`, is the config group name for your
 LLM settings, as defined in your `config.toml`.
 
+`git-version`, e.g. `head`, is the git commit hash of the OpenDevin version you would
+like to evaluate. It could also be a release tag like `0.6.2`.
+
 `agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks, defaulting
 to `CodeActAgent`.
 

+ 15 - 8
evaluation/toolqa/scripts/run_infer.sh

@@ -1,10 +1,17 @@
 #!/bin/bash
+set -eo pipefail
+
+source "evaluation/utils/version_control.sh"
+
 MODEL_CONFIG=$1
-AGENT=$2
-EVAL_LIMIT=$3
-DATASET=$4
-HARDNESS=$5
-WOLFRAM_APPID=$6
+COMMIT_HASH=$2
+AGENT=$3
+EVAL_LIMIT=$4
+DATASET=$5
+HARDNESS=$6
+WOLFRAM_APPID=$7
+
+checkout_eval_branch
 
 if [ -z "$AGENT" ]; then
   echo "Agent not specified, use default CodeActAgent"
@@ -26,9 +33,7 @@ if [ -z "$WOLFRAM_APPID" ]; then
   echo "WOLFRAM_APPID not specified"
 fi
 
-# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenDevin
-# We need to track the version of Agent in the evaluation to make sure results are comparable
-AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
+get_agent_version
 
 echo "AGENT: $AGENT"
 echo "AGENT_VERSION: $AGENT_VERSION"
@@ -56,3 +61,5 @@ fi
 
 # Run the command
 eval $COMMAND
+
+checkout_original_branch

+ 34 - 0
evaluation/utils/version_control.sh

@@ -0,0 +1,34 @@
+checkout_eval_branch() {
+    if [ -z "$COMMIT_HASH" ]; then
+        echo "Commit hash not specified, use current git commit"
+        return 0
+    fi
+    echo "Start to checkout opendevin version to $COMMIT_HASH, but keep current evaluation harness"
+    if ! git diff-index --quiet HEAD --; then
+        echo "There are uncommitted changes, please stash or commit them first"
+        exit 1
+    fi
+    current_branch=$(git rev-parse --abbrev-ref HEAD)
+    echo "Current version is: $current_branch"
+    echo "Check out OpenDevin to version: $COMMIT_HASH"
+    if ! git checkout $COMMIT_HASH; then
+        echo "Failed to check out to $COMMIT_HASH"
+        exit 1
+    fi
+    echo "Revert changes in evaluation folder"
+    git checkout $current_branch -- evaluation
+}
+
+checkout_original_branch() {
+    if [ -z "$current_branch" ]; then
+        return 0
+    fi
+    echo "Checkout back to original branch $current_branch"
+    git checkout $current_branch
+}
+
+get_agent_version() {
+    # IMPORTANT: the agent's prompt changes fairly often in the rapidly evolving OpenDevin
+    # codebase, so we record the agent version with each evaluation to keep results comparable.
+    AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
+}
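
For reference, a minimal sketch of how a benchmark script is expected to wire these helpers together, distilled from the `run_infer.sh` changes above (the inference command itself is elided):

```bash
#!/bin/bash
set -eo pipefail

source "evaluation/utils/version_control.sh"

MODEL_CONFIG=$1
COMMIT_HASH=$2   # optional: commit hash or release tag, e.g. 0.6.2
AGENT=$3
EVAL_LIMIT=$4

# Switch OpenDevin to the requested version; the evaluation/ folder
# is restored from the current branch so the harness stays fixed.
checkout_eval_branch

if [ -z "$AGENT" ]; then
  echo "Agent not specified, use default CodeActAgent"
  AGENT="CodeActAgent"
fi

# Record the agent's VERSION so results remain comparable across prompt changes.
get_agent_version
echo "AGENT_VERSION: $AGENT_VERSION"

# ... build $COMMAND from $MODEL_CONFIG and $EVAL_LIMIT, then `eval $COMMAND` ...

# Return to the branch we started from.
checkout_original_branch
```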

+ 11 - 5
evaluation/webarena/scripts/run_infer.sh

@@ -1,4 +1,7 @@
 #!/bin/bash
+set -eo pipefail
+
+source "evaluation/utils/version_control.sh"
 
 # configure webarena websites and environment
 source evaluation/webarena/scripts/webarena_env.sh
@@ -8,17 +11,18 @@ export USE_NAV="false"
 export USE_CONCISE_ANSWER="true"
 
 MODEL_CONFIG=$1
-AGENT=$2
-EVAL_LIMIT=$3
+COMMIT_HASH=$2
+AGENT=$3
+EVAL_LIMIT=$4
+
+checkout_eval_branch
 
 if [ -z "$AGENT" ]; then
   echo "Agent not specified, use default BrowsingAgent"
   AGENT="BrowsingAgent"
 fi
 
-# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenDevin
-# We need to track the version of Agent in the evaluation to make sure results are comparable
-AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
+get_agent_version
 
 echo "AGENT: $AGENT"
 echo "AGENT_VERSION: $AGENT_VERSION"
@@ -40,3 +44,5 @@ fi
 
 # Run the command
 eval $COMMAND
+
+checkout_original_branch