1 year ago · 6f235937cf
--- a/evaluation/EDA/README.md
+++ b/evaluation/EDA/README.md
@@ -11,14 +11,17 @@ Create a `config.toml` file if it does not exist at the root of the workspace. P
 
				 
			
 
				 ```bash
			
 
				 export OPENAI_API_KEY="sk-XXX"; # This is required for evaluation (to simulate another party of conversation)
			
 
				-./evaluation/EDA/scripts/run_infer.sh [model_config] [agent] [dataset] [eval_limit]
			
 
				+./evaluation/EDA/scripts/run_infer.sh [model_config] [git-version] [agent] [dataset] [eval_limit]
			
 
				 ```
			
 
				 
			
 
				-where `model_config` is mandatory, while `agent`, `dataset` and `eval_limit` are optional.
			
 
				+where `model_config` is mandatory, while `git-version`, `agent`, `dataset` and `eval_limit` are optional.
			
 
				 
			
 
				 - `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your
			
 
				 LLM settings, as defined in your `config.toml`.
			
 
				 
			
 
				+- `git-version`, e.g. `HEAD`, is the git commit hash of the OpenDevin version you would
			
 
				+like to evaluate. It could also be a release tag like `0.6.2`.
			
 
				+
			
 
				 - `agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks, defaulting
			
 
				 to `CodeActAgent`.
			
 
				 
			
@@ -26,11 +29,10 @@ to `CodeActAgent`.
 
				 
			
 
				 - `eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit` instances. By default it infers all instances.
			
 
				 
			
 
				-Let's say you'd like to run 10 instances using `eval_gpt4_1106_eval_gpt4o_2024_05_13preview` and CodeActAgent,
			
 
				-then your command would be:
			
 
				+For example,
			
 
				 
			
 
				 ```bash
			
 
				-./evaluation/EDA/scripts/run_infer.sh eval_gpt4o_2024_05_13 CodeActAgent things
			
 
				+./evaluation/EDA/scripts/run_infer.sh eval_gpt4o_2024_05_13 0.6.2 CodeActAgent things
			
 
				 ```
			
 
				 
			
 
				 ## Reference
			
--- a/evaluation/EDA/scripts/run_infer.sh
+++ b/evaluation/EDA/scripts/run_infer.sh
@@ -1,14 +1,23 @@
 
				 #!/bin/bash
			
 
				+set -eo pipefail
			
 
				+
			
 
				+source "evaluation/utils/version_control.sh"
			
 
				+
			
 
				 MODEL_CONFIG=$1
			
 
				-AGENT=$2
			
 
				-DATASET=$3
			
 
				-EVAL_LIMIT=$4
			
 
				+COMMIT_HASH=$2
			
 
				+AGENT=$3
			
 
				+DATASET=$4
			
 
				+EVAL_LIMIT=$5
			
 
				+
			
 
				+checkout_eval_branch
			
 
				 
			
 
				 if [ -z "$AGENT" ]; then
			
 
				   echo "Agent not specified, use default CodeActAgent"
			
 
				   AGENT="CodeActAgent"
			
 
				 fi
			
 
				 
			
 
				+get_agent_version
			
 
				+
			
 
				 if [ -z "$DATASET" ]; then
			
 
				   echo "Dataset not specified, use default 'things'"
			
 
				   DATASET="things"
			
@@ -48,3 +57,5 @@ fi
 
				 # Run the command
			
 
				 echo $COMMAND
			
 
				 eval $COMMAND
			
 
				+
			
 
				+checkout_original_branch
			
--- a/evaluation/agent_bench/README.md
+++ b/evaluation/agent_bench/README.md
@@ -42,7 +42,7 @@ temperature = 0.0
 
				 ## Start the evaluation
			
 
				 
			
 
				 ```bash
			
 
				-./evaluation/agent_bench/scripts/run_infer.sh [model_config] [agent] [eval_limit]
			
 
				+./evaluation/agent_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit]
			
 
				 ```
			
 
				 
			
 
				 Following is the basic command to start the evaluation. Here we are only evaluating the `osbench` for now.
			
@@ -56,5 +56,5 @@ You can update the arguments in the script `evaluation/agent_bench/scripts/run_i
 
				 - `--eval-n-limit`: the number of examples to evaluate. For example, `100`.
			
 
				 
			
 
				 ```bash
			
 
				-./evaluation/agent_bench/scripts/run_infer.sh eval_gpt35_turbo CodeActAgent 1
			
 
				+./evaluation/agent_bench/scripts/run_infer.sh eval_gpt35_turbo 0.6.2 CodeActAgent 1
			
 
				 ```
			
--- a/evaluation/agent_bench/scripts/run_infer.sh
+++ b/evaluation/agent_bench/scripts/run_infer.sh
@@ -1,16 +1,21 @@
 
				 #!/bin/bash
			
 
				+set -eo pipefail
			
 
				+
			
 
				+source "evaluation/utils/version_control.sh"
			
 
				+
			
 
				 MODEL_CONFIG=$1
			
 
				-AGENT=$2
			
 
				-EVAL_LIMIT=$3
			
 
				+COMMIT_HASH=$2
			
 
				+AGENT=$3
			
 
				+EVAL_LIMIT=$4
			
 
				+
			
 
				+checkout_eval_branch
			
 
				 
			
 
				 if [ -z "$AGENT" ]; then
			
 
				   echo "Agent not specified, use default CodeActAgent"
			
 
				   AGENT="CodeActAgent"
			
 
				 fi
			
 
				 
			
 
				-# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenDevin
			
 
				-# We need to track the version of Agent in the evaluation to make sure results are comparable
			
 
				-AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
			
 
				+get_agent_version
			
 
				 
			
 
				 echo "AGENT: $AGENT"
			
 
				 echo "AGENT_VERSION: $AGENT_VERSION"
			
@@ -31,3 +36,5 @@ fi
 
				 
			
 
				 # Run the command
			
 
				 eval $COMMAND
			
 
				+
			
 
				+checkout_original_branch
			
--- a/evaluation/biocoder/README.md
+++ b/evaluation/biocoder/README.md
@@ -24,26 +24,29 @@ To reproduce this image, please see the Dockerfile_Opendevin in the `biocoder` r
 
				 
			
 
				 
			
 
				 ```bash
			
 
				-./evaluation/biocoder/scripts/run_infer.sh [model_config] [agent] [eval_limit]
			
 
				+./evaluation/biocoder/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit]
			
 
				 ```
			
 
				 
			
 
				-where `model_config` is mandatory, while `agent`, `dataset` and `eval_limit` are optional.
			
 
				+where `model_config` is mandatory, while `git-version`, `agent` and `eval_limit` are optional.
			
 
				 
			
 
				 - `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your
			
 
				 LLM settings, as defined in your `config.toml`.
			
 
				 
			
 
				+- `git-version`, e.g. `HEAD`, is the git commit hash of the OpenDevin version you would
			
 
				+like to evaluate. It could also be a release tag like `0.6.2`.
			
 
				+
			
 
				 - `agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks, defaulting
			
 
				 to `CodeActAgent`.
			
 
				 
			
 
				 - `eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit` instances. By default it infers all instances.
			
 
				 
			
 
				-Let's say you'd like to run 10 instances using `eval_gpt4_1106_eval_gpt4o_2024_05_13preview` and CodeActAgent,
			
 
				-then your command would be:
			
 
				+Let's say you'd like to run 1 instance using `eval_gpt4o_2024_05_13` and CodeActAgent
			
 
				+with OpenDevin version 0.6.2, then your command would be:
			
 
				 
			
 
				 ## Examples
			
 
				 
			
 
				 ```bash
			
 
				-./evaluation/biocoder/scripts/run_infer.sh eval_gpt4o_2024_05_13 CodeActAgent 1
			
 
				+./evaluation/biocoder/scripts/run_infer.sh eval_gpt4o_2024_05_13 0.6.2 CodeActAgent 1
			
 
				 ```
			
 
				 
			
 
				 ## Reference
			
--- a/evaluation/biocoder/scripts/run_infer.sh
+++ b/evaluation/biocoder/scripts/run_infer.sh
@@ -1,18 +1,22 @@
 
				 #!/bin/bash
			
 
				+set -eo pipefail
			
 
				+
			
 
				+source "evaluation/utils/version_control.sh"
			
 
				+
			
 
				 MODEL_CONFIG=$1
			
 
				-AGENT=$2
			
 
				-EVAL_LIMIT=$3
			
 
				+COMMIT_HASH=$2
			
 
				+AGENT=$3
			
 
				+EVAL_LIMIT=$4
			
 
				 DATASET="biocoder"
			
 
				 
			
 
				+checkout_eval_branch
			
 
				 
			
 
				 if [ -z "$AGENT" ]; then
			
 
				   echo "Agent not specified, use default CodeActAgent"
			
 
				   AGENT="CodeActAgent"
			
 
				 fi
			
 
				 
			
 
				-# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenDevin
			
 
				-# We need to track the version of Agent in the evaluation to make sure results are comparable
			
 
				-AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
			
 
				+get_agent_version
			
 
				 
			
 
				 echo "AGENT: $AGENT"
			
 
				 echo "AGENT_VERSION: $AGENT_VERSION"
			
@@ -35,3 +39,5 @@ fi
 
				 # Run the command
			
 
				 echo $COMMAND
			
 
				 eval $COMMAND
			
 
				+
			
 
				+checkout_original_branch
			
--- a/evaluation/bird/README.md
+++ b/evaluation/bird/README.md
@@ -36,11 +36,14 @@ temperature = 0.0
 
				 ## Run Inference on Bird
			
 
				 
			
 
				 ```bash
			
 
				-./evaluation/bird/scripts/run_infer.sh eval_gpt4_1106_preview
			
 
				+./evaluation/bird/scripts/run_infer.sh [model_config] [git-version]
			
 
				 ```
			
 
				 
			
 
				-You can replace `eval_gpt4_1106_preview` with any model you set up in `config.toml`.
			
 
				+- `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your
			
 
				+LLM settings, as defined in your `config.toml`.
			
 
				 
			
 
				+- `git-version`, e.g. `HEAD`, is the git commit hash of the OpenDevin version you would
			
 
				+like to evaluate. It could also be a release tag like `0.6.2`.
			
 
				 
			
 
				 ## Examples
			
 
				 
			
--- a/evaluation/bird/scripts/run_infer.sh
+++ b/evaluation/bird/scripts/run_infer.sh
@@ -1,16 +1,21 @@
 
				 #!/bin/bash
			
 
				+set -eo pipefail
			
 
				+
			
 
				+source "evaluation/utils/version_control.sh"
			
 
				+
			
 
				 MODEL_CONFIG=$1
			
 
				-AGENT=$2
			
 
				-EVAL_LIMIT=$3
			
 
				+COMMIT_HASH=$2
			
 
				+AGENT=$3
			
 
				+EVAL_LIMIT=$4
			
 
				+
			
 
				+checkout_eval_branch
			
 
				 
			
 
				 if [ -z "$AGENT" ]; then
			
 
				   echo "Agent not specified, use default CodeActAgent"
			
 
				   AGENT="CodeActAgent"
			
 
				 fi
			
 
				 
			
 
				-# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenDevin
			
 
				-# We need to track the version of Agent in the evaluation to make sure results are comparable
			
 
				-AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
			
 
				+get_agent_version
			
 
				 
			
 
				 echo "AGENT: $AGENT"
			
 
				 echo "AGENT_VERSION: $AGENT_VERSION"
			
@@ -31,3 +36,5 @@ fi
 
				 
			
 
				 # Run the command
			
 
				 eval $COMMAND
			
 
				+
			
 
				+checkout_original_branch
			
--- a/evaluation/gaia/README.md
+++ b/evaluation/gaia/README.md
@@ -13,15 +13,18 @@ Please accept the terms and make sure to have logged in on your computer by `hug
 
				 Following is the basic command to start the evaluation. Here we are evaluating on the validation set for the `2023_all` split. You can adjust `./evaluation/gaia/scripts/run_infer.sh` to change the subset you want to evaluate on.
			
 
				 
			
 
				 ```bash
			
 
				-./evaluation/gaia/scripts/run_infer.sh [model_config] [agent] [eval_limit] [gaia_subset]
			
 
				-# e.g., ./evaluation/gaia/scripts/run_infer.sh eval_gpt4_1106_preview CodeActAgent 300
			
 
				+./evaluation/gaia/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [gaia_subset]
			
 
				+# e.g., ./evaluation/gaia/scripts/run_infer.sh eval_gpt4_1106_preview 0.6.2 CodeActAgent 300
			
 
				 ```
			
 
				 
			
 
				-where `model_config` is mandatory, while `agent`, `eval_limit` and `gaia_subset` are optional.
			
 
				+where `model_config` is mandatory, while `git-version`, `agent`, `eval_limit` and `gaia_subset` are optional.
			
 
				 
			
 
				 - `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your
			
 
				 LLM settings, as defined in your `config.toml`, defaulting to `gpt-3.5-turbo`
			
 
				 
			
 
				+- `git-version`, e.g. `HEAD`, is the git commit hash of the OpenDevin version you would
			
 
				+like to evaluate. It could also be a release tag like `0.6.2`.
			
 
				+
			
 
				 - `agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks, defaulting
			
 
				 to `CodeActAgent`.
			
 
				 
			
@@ -29,11 +32,10 @@ to `CodeActAgent`.
 
				 
			
 
				 - `gaia_subset`, GAIA benchmark has multiple subsets: `2023_level1`, `2023_level2`, `2023_level3`, `2023_all`, defaulting to `2023_level1`.
			
 
				 
			
 
				-Let's say you'd like to run 10 instances using `eval_gpt4_1106_preview` and CodeActAgent,
			
 
				-then your command would be:
			
 
				+For example,
			
 
				 
			
 
				 ```bash
			
 
				-./evaluation/gaia/scripts/run_infer.sh eval_gpt4_1106_preview CodeActAgent 10
			
 
				+./evaluation/gaia/scripts/run_infer.sh eval_gpt4_1106_preview 0.6.2 CodeActAgent 10
			
 
				 ```
			
 
				 
			
 
				 ## Get score
			
--- a/evaluation/gaia/scripts/run_infer.sh
+++ b/evaluation/gaia/scripts/run_infer.sh
@@ -1,22 +1,29 @@
 
				 #!/bin/bash
			
 
				+set -eo pipefail
			
 
				+
			
 
				+source "evaluation/utils/version_control.sh"
			
 
				+
			
 
				 MODEL_CONFIG=$1
			
 
				-AGENT=$2
			
 
				-EVAL_LIMIT=$3
			
 
				-LEVELS=$4
			
 
				+COMMIT_HASH=$2
			
 
				+AGENT=$3
			
 
				+EVAL_LIMIT=$4
			
 
				+LEVELS=$5
			
 
				+
			
 
				+checkout_eval_branch
			
 
				 
			
 
				 if [ -z "$AGENT" ]; then
			
 
				   echo "Agent not specified, use default CodeActAgent"
			
 
				   AGENT="CodeActAgent"
			
 
				 fi
			
 
				 
			
 
				+get_agent_version
			
 
				+
			
 
				 if [ -z "$LEVELS" ]; then
			
 
				   LEVELS="2023_level1"
			
 
				   echo "Levels not specified, use default $LEVELS"
			
 
				 fi
			
 
				 
			
 
				-# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenDevin
			
 
				-# We need to track the version of Agent in the evaluation to make sure results are comparable
			
 
				-AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
			
 
				+get_agent_version
			
 
				 
			
 
				 echo "AGENT: $AGENT"
			
 
				 echo "AGENT_VERSION: $AGENT_VERSION"
			
@@ -40,3 +47,5 @@ fi
 
				 
			
 
				 # Run the command
			
 
				 eval $COMMAND
			
 
				+
			
 
				+checkout_original_branch
			
--- a/evaluation/gorilla/README.md
+++ b/evaluation/gorilla/README.md
@@ -15,7 +15,7 @@ Run `make setup-config` to set up the `config.toml` file if it does not exist at
 
				 Make sure your Docker daemon is running, then run this bash script:
			
 
				 
			
 
				 ```bash
			
 
				-bash evaluation/gorilla/scripts/run_infer.sh [model_config] [agent] [eval_limit] [hubs]
			
 
				+bash evaluation/gorilla/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [hubs]
			
 
				 ```
			
 
				 
			
 
				 where `model_config` is mandatory, while all other arguments are optional.
			
@@ -23,6 +23,9 @@ where `model_config` is mandatory, while all other arguments are optional.
 
				 `model_config`, e.g. `llm`, is the config group name for your
			
 
				 LLM settings, as defined in your `config.toml`.
			
 
				 
			
 
				+`git-version`, e.g. `HEAD`, is the git commit hash of the OpenDevin version you would
			
 
				+like to evaluate. It could also be a release tag like `0.6.2`.
			
 
				+
			
 
				 `agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks, defaulting
			
 
				 to `CodeActAgent`.
			
 
				 
			
@@ -33,9 +36,8 @@ By default, the script evaluates 1 instance.
 
				 
			
 
				 Note: in order to use `eval_limit`, you must also set `agent`; in order to use `hubs`, you must also set `eval_limit`.
			
 
				 
			
 
				-Let's say you'd like to run 10 instances using `llm` and CodeActAgent on `th` test,
			
 
				-then your command would be:
			
 
				+For example,
			
 
				 
			
 
				 ```bash
			
 
				-bash evaluation/gorilla/scripts/run_infer.sh llm CodeActAgent 10 th
			
 
				+bash evaluation/gorilla/scripts/run_infer.sh llm 0.6.2 CodeActAgent 10 th
			
 
				 ```
			
--- a/evaluation/gorilla/scripts/run_infer.sh
+++ b/evaluation/gorilla/scripts/run_infer.sh
@@ -1,23 +1,28 @@
 
				 #!/bin/bash
			
 
				+set -eo pipefail
			
 
				+
			
 
				+source "evaluation/utils/version_control.sh"
			
 
				+
			
 
				 MODEL_CONFIG=$1
			
 
				-AGENT=$2
			
 
				-EVAL_LIMIT=$3
			
 
				-HUBS=$4
			
 
				+COMMIT_HASH=$2
			
 
				+AGENT=$3
			
 
				+EVAL_LIMIT=$4
			
 
				+HUBS=$5
			
 
				+
			
 
				+checkout_eval_branch
			
 
				 
			
 
				 if [ -z "$AGENT" ]; then
			
 
				   echo "Agent not specified, use default CodeActAgent"
			
 
				   AGENT="CodeActAgent"
			
 
				 fi
			
 
				 
			
 
				+get_agent_version
			
 
				+
			
 
				 if [ -z "$HUBS" ]; then
			
 
				   HUBS="hf,torch,tf"
			
 
				   echo "Hubs not specified, use default $HUBS"
			
 
				 fi
			
 
				 
			
 
				-# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenDevin
			
 
				-# We need to track the version of Agent in the evaluation to make sure results are comparable
			
 
				-AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
			
 
				-
			
 
				 echo "AGENT: $AGENT"
			
 
				 echo "AGENT_VERSION: $AGENT_VERSION"
			
 
				 echo "MODEL_CONFIG: $MODEL_CONFIG"
			
@@ -40,3 +45,5 @@ fi
 
				 
			
 
				 # Run the command
			
 
				 eval $COMMAND
			
 
				+
			
 
				+checkout_original_branch
			
--- a/evaluation/gpqa/README.md
+++ b/evaluation/gpqa/README.md
@@ -55,11 +55,13 @@ temperature = 0.0
 
				 'gpqa_main', 'gpqa_diamond', 'gpqa_experts', 'gpqa_extended' -- data split options
			
 
				 From the root of the OpenDevin repo, run the following command:
			
 
				 ```bash
			
 
				-./evaluation/gpqa/scripts/run_infer.sh [model_config_name] [num_samples_eval] [data_split] [AgentClass]
			
 
				+./evaluation/gpqa/scripts/run_infer.sh [model_config_name] [git-version] [num_samples_eval] [data_split] [AgentClass]
			
 
				 ```
			
 
				 You can replace `model_config_name` with any model you set up in `config.toml`.
			
 
				 
			
 
				 - `model_config_name`: The model configuration name from `config.toml` that you want to evaluate.
			
 
				+- `git-version`, e.g. `HEAD`, is the git commit hash of the OpenDevin version you would
			
 
				+like to evaluate. It could also be a release tag like `0.6.2`.
			
 
				 - `num_samples_eval`: Number of samples to evaluate (useful for testing and debugging).
			
 
				 - `data_split`: The data split to evaluate on. Must be one of `gpqa_main`, `gpqa_diamond`, `gpqa_experts`, `gpqa_extended`. Defaults to `gpqa_diamond` as done in the paper.
			
 
				 - `AgentClass`: The agent class to use for evaluation. Currently only supports `CodeActAgent` for CodeActAgent.
			
--- a/evaluation/gpqa/scripts/run_infer.sh
+++ b/evaluation/gpqa/scripts/run_infer.sh
@@ -1,8 +1,15 @@
 
				 #!/bin/bash
			
 
				+set -eo pipefail
			
 
				+
			
 
				+source "evaluation/utils/version_control.sh"
			
 
				+
			
 
				 MODEL_CONFIG=$1
			
 
				-EVAL_LIMIT=$2
			
 
				-DATA_SPLIT=$3
			
 
				-AGENT=$4
			
 
				+COMMIT_HASH=$2
			
 
				+EVAL_LIMIT=$3
			
 
				+DATA_SPLIT=$4
			
 
				+AGENT=$5
			
 
				+
			
 
				+checkout_eval_branch
			
 
				 
			
 
				 if [ -z "$AGENT" ]; then
			
 
				   echo "Agent not specified, use default CodeActAgent ..."
			
@@ -15,9 +22,7 @@ if [ -z "$DATA_SPLIT" ]; then
 
				   DATA_SPLIT="gpqa_diamond"
			
 
				 fi
			
 
				 
			
 
				-# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenDevin
			
 
				-# We need to track the version of Agent in the evaluation to make sure results are comparable
			
 
				-AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
			
 
				+get_agent_version
			
 
				 
			
 
				 echo "AGENT: $AGENT"
			
 
				 echo "AGENT_VERSION: $AGENT_VERSION"
			
@@ -39,3 +44,5 @@ fi
 
				 
			
 
				 # Run the command
			
 
				 eval $COMMAND
			
 
				+
			
 
				+checkout_original_branch
			
--- a/evaluation/humanevalfix/scripts/run_infer.sh
+++ b/evaluation/humanevalfix/scripts/run_infer.sh
@@ -1,7 +1,12 @@
 
				 #!/bin/bash
			
 
				+set -eo pipefail
			
 
				+
			
 
				+source "evaluation/utils/version_control.sh"
			
 
				+
			
 
				 MODEL_CONFIG=$1
			
 
				-AGENT=$2
			
 
				-EVAL_LIMIT=$3
			
 
				+COMMIT_HASH=$2
			
 
				+AGENT=$3
			
 
				+EVAL_LIMIT=$4
			
 
				 
			
 
				 echo "
			
 
				 ################################################################################
			
@@ -41,14 +46,14 @@ fi
 
				 
			
 
				 # ################################################################################
			
 
				 
			
 
				+checkout_eval_branch
			
 
				+
			
 
				 if [ -z "$AGENT" ]; then
			
 
				   echo "Agent not specified, use default CodeActAgent"
			
 
				   AGENT="CodeActAgent"
			
 
				 fi
			
 
				 
			
 
				-# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenDevin
			
 
				-# We need to track the version of Agent in the evaluation to make sure results are comparable
			
 
				-AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
			
 
				+get_agent_version
			
 
				 
			
 
				 echo "AGENT: $AGENT"
			
 
				 echo "AGENT_VERSION: $AGENT_VERSION"
			
@@ -69,3 +74,5 @@ fi
 
				 
			
 
				 # Run the command
			
 
				 eval $COMMAND
			
 
				+
			
 
				+checkout_original_branch
			
--- a/evaluation/logic_reasoning/README.md
+++ b/evaluation/logic_reasoning/README.md
@@ -29,7 +29,9 @@ temperature = 0.0
 
				 ```
			
 
				 
			
 
				 ## Run Inference on logic_reasoning
			
 
				-The following code will run inference on the first example of the ProntoQA dataset with model gpt-4o.
			
 
				+The following code will run inference on the first example of the ProntoQA dataset with model gpt-4o,
			
 
				+using OpenDevin version 0.6.2.
			
 
				+
			
 
				 ```bash
			
 
				-./evaluation/logic_reasoning/scripts/run_infer.sh ProntoQA gpt-4o 1
			
 
				+./evaluation/logic_reasoning/scripts/run_infer.sh ProntoQA gpt-4o 0.6.2 1
			
 
				 ```
			
--- a/evaluation/logic_reasoning/scripts/run_infer.sh
+++ b/evaluation/logic_reasoning/scripts/run_infer.sh
@@ -1,19 +1,24 @@
 
				 #!/bin/bash
			
 
				+set -eo pipefail
			
 
				+
			
 
				+source "evaluation/utils/version_control.sh"
			
 
				+
			
 
				 DATASET=$1
			
 
				 MODEL_CONFIG=$2
			
 
				-EVAL_LIMIT=$3
			
 
				-AGENT=$4
			
 
				+COMMIT_HASH=$3
			
 
				+EVAL_LIMIT=$4
			
 
				+AGENT=$5
			
 
				 
			
 
				 # ################################################################################
			
 
				 
			
 
				+checkout_eval_branch
			
 
				+
			
 
				 if [ -z "$AGENT" ]; then
			
 
				   echo "Agent not specified, use default CodeActAgent"
			
 
				   AGENT="CodeActAgent"
			
 
				 fi
			
 
				 
			
 
				-# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenDevin
			
 
				-# We need to track the version of Agent in the evaluation to make sure results are comparable
			
 
				-AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
			
 
				+get_agent_version
			
 
				 
			
 
				 echo "AGENT: $AGENT"
			
 
				 echo "AGENT_VERSION: $AGENT_VERSION"
			
@@ -35,3 +40,5 @@ fi
 
				 
			
 
				 # Run the command
			
 
				 eval $COMMAND
			
 
				+
			
 
				+checkout_original_branch
			
--- a/evaluation/miniwob/scripts/run_infer.sh
+++ b/evaluation/miniwob/scripts/run_infer.sh
@@ -1,4 +1,7 @@
 
				 #!/bin/bash
			
 
				+set -eo pipefail
			
 
				+
			
 
				+source "evaluation/utils/version_control.sh"
			
 
				 
			
 
				 # configure miniwob website, change URL to yours
			
 
				 export MINIWOB_URL="file:///home/fangzhex/miniwob-plusplus/miniwob/html/miniwob/"
			
@@ -9,18 +12,19 @@ export USE_CONCISE_ANSWER="true"
 
				 
			
 
				 
			
 
				 MODEL_CONFIG=$1
			
 
				-AGENT=$2
			
 
				-NOTE=$3
			
 
				-EVAL_LIMIT=$4
			
 
				+COMMIT_HASH=$2
			
 
				+AGENT=$3
			
 
				+NOTE=$4
			
 
				+EVAL_LIMIT=$5
			
 
				+
			
 
				+checkout_eval_branch
			
 
				 
			
 
				 if [ -z "$AGENT" ]; then
			
 
				   echo "Agent not specified, use default BrowsingAgent"
			
 
				   AGENT="BrowsingAgent"
			
 
				 fi
			
 
				 
			
 
				-# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenDevin
			
 
				-# We need to track the version of Agent in the evaluation to make sure results are comparable
			
 
				-AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
			
 
				+get_agent_version
			
 
				 
			
 
				 echo "AGENT: $AGENT"
			
 
				 echo "AGENT_VERSION: $AGENT_VERSION"
			
@@ -42,3 +46,5 @@ fi
 
				 
			
 
				 # Run the command
			
 
				 eval $COMMAND
			
 
				+
			
 
				+checkout_original_branch
			
--- a/evaluation/mint/README.md
+++ b/evaluation/mint/README.md
@@ -13,24 +13,26 @@ We are using the MINT dataset hosted on [Hugging Face](https://huggingface.co/da
 
				 Following is the basic command to start the evaluation. Currently, the only agent supported with MINT is `CodeActAgent`.
			
 
				 
			
 
				 ```bash
			
 
				-./evaluation/mint/scripts/run_infer.sh [model_config] [subset] [eval_limit]
			
 
				+./evaluation/mint/scripts/run_infer.sh [model_config] [git-version] [subset] [eval_limit]
			
 
				 ```
			
 
				 
			
 
				-where `model_config` is mandatory, while `subset` and `eval_limit` are optional.
			
 
				+where `model_config` is mandatory, while others are optional.
			
 
				 
			
 
				 - `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your LLM settings, as defined in your `config.toml`.
			
 
				 
			
 
				+- `git-version`, e.g. `HEAD`, is the git commit hash of the OpenDevin version you would
			
 
				+like to evaluate. It could also be a release tag like `0.6.2`.
			
 
				+
			
 
				 - `subset`, e.g. `math`, is the subset of the MINT benchmark to evaluate on, defaulting to `math`. It can be either: `math`, `gsm8k`, `mmlu`, `theoremqa`, `mbpp`,`humaneval`.
			
 
				 
			
 
				 - `eval_limit`, e.g. `2`, limits the evaluation to the first `eval_limit` instances, defaulting to all instances.
			
 
				 
			
 
				 Note: in order to use `eval_limit`, you must also set `subset`.
			
 
				 
			
 
				-Let's say you'd like to run 3 instances on the `gsm8k` subset using `eval_gpt4_1106_preview`,
			
 
				-then your command would be:
			
 
				+For example,
			
 
				 
			
 
				 ```bash
			
 
				-./evaluation/swe_bench/scripts/run_infer.sh eval_gpt4_1106_preview gsm8k 3
			
 
				+./evaluation/mint/scripts/run_infer.sh eval_gpt4_1106_preview 0.6.2 gsm8k 3
			
 
				 ```
			
 
				 
			
 
				 ## Reference
			
--- a/evaluation/mint/scripts/run_infer.sh
+++ b/evaluation/mint/scripts/run_infer.sh
@@ -1,13 +1,19 @@
 
				 #!/bin/bash
			
 
				+set -eo pipefail
			
 
				+
			
 
				+source "evaluation/utils/version_control.sh"
			
 
				 
			
 
				 MODEL_CONFIG=$1
			
 
				-SUBSET=$2
			
 
				-EVAL_LIMIT=$3
			
 
				+COMMIT_HASH=$2
			
 
				+SUBSET=$3
			
 
				+EVAL_LIMIT=$4
			
 
				+
			
 
				+checkout_eval_branch
			
 
				+
			
 
				 # Only 'CodeActAgent' is supported for MINT now
			
 
				 AGENT="CodeActAgent"
			
 
				 
			
 
				-# We need to track the version of Agent in the evaluation to make sure results are comparable
			
 
				-AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
			
 
				+get_agent_version
			
 
				 
			
 
				 echo "AGENT: $AGENT"
			
 
				 echo "AGENT_VERSION: $AGENT_VERSION"
			
@@ -36,3 +42,5 @@ fi
 
				 
			
 
				 # Run the command
			
 
				 eval $COMMAND
			
 
				+
			
 
				+checkout_original_branch
			
--- a/evaluation/ml_bench/README.md
+++ b/evaluation/ml_bench/README.md
@@ -47,8 +47,8 @@ temperature = 0.0
 
				 To run the evaluation on the ML-Bench dataset, use the following command:
			
 
				 
			
 
				 ```bash
			
 
				-./evaluation/ml_bench/scripts/run_infer.sh [model_config] [split] [agent] [eval_limit]
			
 
				-# e.g., ./evaluation/ml_bench/scripts/run_infer.sh eval_gpt4_1106_preview full CodeActAgent 10
			
 
				+./evaluation/ml_bench/scripts/run_infer.sh [model_config] [git-version] [split] [agent] [eval_limit]
			
 
				+# e.g., ./evaluation/ml_bench/scripts/run_infer.sh eval_gpt4_1106_preview 0.6.2 full CodeActAgent 10
			
 
				 ```
			
 
				 
			
 
				 You can replace `eval_gpt4_1106_preview` with any model you set up in `config.toml`.
			
--- a/evaluation/ml_bench/scripts/run_infer.sh
+++ b/evaluation/ml_bench/scripts/run_infer.sh
@@ -1,9 +1,15 @@
 
				 #!/bin/bash
			
 
				+set -eo pipefail
			
 
				+
			
 
				+source "evaluation/utils/version_control.sh"
			
 
				 
			
 
				 MODEL_CONFIG=$1
			
 
				-SPLIT=$2
			
 
				-AGENT=$3
			
 
				-EVAL_LIMIT=$4
			
 
				+COMMIT_HASH=$2
			
 
				+SPLIT=$3
			
 
				+AGENT=$4
			
 
				+EVAL_LIMIT=$5
			
 
				+
			
 
				+checkout_eval_branch
			
 
				 
			
 
				 if [ -z "$MODEL_CONFIG" ]; then
			
 
				   echo "Model config not specified, use default"
			
@@ -15,9 +21,7 @@ if [ -z "$AGENT" ]; then
 
				   AGENT="CodeActAgent"
			
 
				 fi
			
 
				 
			
 
				-# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenDevin
			
 
				-# We need to track the version of Agent in the evaluation to make sure results are comparable
			
 
				-AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
			
 
				+get_agent_version
			
 
				 
			
 
				 echo "AGENT: $AGENT"
			
 
				 echo "AGENT_VERSION: $AGENT_VERSION"
			
@@ -42,3 +46,5 @@ fi
 
				 
			
 
				 # Run the command
			
 
				 eval $COMMAND
			
 
				+
			
 
				+checkout_original_branch
			
--- a/evaluation/swe_bench/README.md
+++ b/evaluation/swe_bench/README.md
@@ -82,8 +82,8 @@ If you see an error, please make sure your `config.toml` contains all
 
				 ## Run Inference on SWE-Bench Instances
			
 
				 
			
 
				 ```bash
			
 
				-./evaluation/swe_bench/scripts/run_infer.sh [model_config] [agent] [eval_limit]
			
 
				-# e.g., ./evaluation/swe_bench/scripts/run_infer.sh eval_gpt4_1106_preview CodeActAgent 300
			
 
				+./evaluation/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit]
			
 
				+# e.g., ./evaluation/swe_bench/scripts/run_infer.sh eval_gpt4_1106_preview HEAD CodeActAgent 300
			
 
				 ```
			
 
				 
			
 
				 where `model_config` is mandatory, while `agent` and `eval_limit` are optional.
			
@@ -91,6 +91,9 @@ where `model_config` is mandatory, while `agent` and `eval_limit` are optional.
 
				 `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your
			
 
				 LLM settings, as defined in your `config.toml`.
			
 
				 
			
 
				+`git-version`, e.g. `HEAD`, is the git commit hash of the OpenDevin version you would
			
 
				+like to evaluate. It could also be a release tag like `0.6.2`.
			
 
				+
			
 
				 `agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks, defaulting
			
 
				 to `CodeActAgent`.
			
 
				 
			
@@ -102,7 +105,7 @@ Let's say you'd like to run 10 instances using `eval_gpt4_1106_preview` and Code
 
				 then your command would be:
			
 
				 
			
 
				 ```bash
			
 
				-./evaluation/swe_bench/scripts/run_infer.sh eval_gpt4_1106_preview CodeActAgent 10
			
 
				+./evaluation/swe_bench/scripts/run_infer.sh eval_gpt4_1106_preview HEAD CodeActAgent 10
			
 
				 ```
			
 
				 
			
 
				 If you would like to specify a list of tasks you'd like to benchmark on, you could
			
--- a/evaluation/swe_bench/scripts/run_infer.sh
+++ b/evaluation/swe_bench/scripts/run_infer.sh
@@ -1,8 +1,15 @@
 
				 #!/bin/bash
			
 
				+set -eo pipefail
			
 
				+
			
 
				+source "evaluation/utils/version_control.sh"
			
 
				+
			
 
				 MODEL_CONFIG=$1
			
 
				-AGENT=$2
			
 
				-EVAL_LIMIT=$3
			
 
				-MAX_ITER=$4
			
 
				+COMMIT_HASH=$2
			
 
				+AGENT=$3
			
 
				+EVAL_LIMIT=$4
			
 
				+MAX_ITER=$5
			
 
				+
			
 
				+checkout_eval_branch
			
 
				 
			
 
				 if [ -z "$AGENT" ]; then
			
 
				   echo "Agent not specified, use default CodeActAgent"
			
@@ -14,9 +21,7 @@ if [ -z "$MAX_ITER" ]; then
 
				   MAX_ITER=30
			
 
				 fi
			
 
				 
			
 
				-# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenDevin
			
 
				-# We need to track the version of Agent in the evaluation to make sure results are comparable
			
 
				-AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
			
 
				+get_agent_version
			
 
				 
			
 
				 echo "AGENT: $AGENT"
			
 
				 echo "AGENT_VERSION: $AGENT_VERSION"
			
@@ -50,3 +55,5 @@ fi
 
				 
			
 
				 # Run the command
			
 
				 eval $COMMAND
			
 
				+
			
 
				+checkout_original_branch
			
--- a/evaluation/toolqa/README.md
+++ b/evaluation/toolqa/README.md
@@ -15,7 +15,7 @@ Run `make setup-config` to set up the `config.toml` file if it does not exist at
 
				 Make sure your Docker daemon is running, then run this bash script:
			
 
				 
			
 
				 ```bash
			
 
				-bash evaluation/toolqa/scripts/run_infer.sh [model_config] [agent] [eval_limit] [dataset] [hardness] [wolfram_alpha_appid]
			
 
				+bash evaluation/toolqa/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [dataset] [hardness] [wolfram_alpha_appid]
			
 
				 ```
			
 
				 
			
 
				 where `model_config` is mandatory, while all other arguments are optional.
			
@@ -23,6 +23,9 @@ where `model_config` is mandatory, while all other arguments are optional.
 
				 `model_config`, e.g. `llm`, is the config group name for your
			
 
				 LLM settings, as defined in your `config.toml`.
			
 
				 
			
 
				+`git-version`, e.g. `HEAD`, is the git commit hash of the OpenDevin version you would
			
 
				+like to evaluate. It could also be a release tag like `0.6.2`.
			
 
				+
			
 
				 `agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks, defaulting
			
 
				 to `CodeActAgent`.
			
 
				 
			
--- a/evaluation/toolqa/scripts/run_infer.sh
+++ b/evaluation/toolqa/scripts/run_infer.sh
@@ -1,10 +1,17 @@
 
				 #!/bin/bash
			
 
				+set -eo pipefail
			
 
				+
			
 
				+source "evaluation/utils/version_control.sh"
			
 
				+
			
 
				 MODEL_CONFIG=$1
			
 
				-AGENT=$2
			
 
				-EVAL_LIMIT=$3
			
 
				-DATASET=$4
			
 
				-HARDNESS=$5
			
 
				-WOLFRAM_APPID=$6
			
 
				+COMMIT_HASH=$2
			
 
				+AGENT=$3
			
 
				+EVAL_LIMIT=$4
			
 
				+DATASET=$5
			
 
				+HARDNESS=$6
			
 
				+WOLFRAM_APPID=$7
			
 
				+
			
 
				+checkout_eval_branch
			
 
				 
			
 
				 if [ -z "$AGENT" ]; then
			
 
				   echo "Agent not specified, use default CodeActAgent"
			
@@ -26,9 +33,7 @@ if [ -z "$WOLFRAM_APPID" ]; then
 
				   echo "WOLFRAM_APPID not specified"
			
 
				 fi
			
 
				 
			
 
				-# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenDevin
			
 
				-# We need to track the version of Agent in the evaluation to make sure results are comparable
			
 
				-AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
			
 
				+get_agent_version
			
 
				 
			
 
				 echo "AGENT: $AGENT"
			
 
				 echo "AGENT_VERSION: $AGENT_VERSION"
			
@@ -56,3 +61,5 @@ fi
 
				 
			
 
				 # Run the command
			
 
				 eval $COMMAND
			
 
				+
			
 
				+checkout_original_branch
			
--- a/evaluation/utils/version_control.sh
+++ b/evaluation/utils/version_control.sh
@@ -0,0 +1,34 @@
 
				+checkout_eval_branch() {  # Check out $COMMIT_HASH but keep evaluation/ from the starting revision; records the start in global current_branch.
			
 
				+    if [ -z "$COMMIT_HASH" ]; then  # Empty or unset COMMIT_HASH: leave the working tree untouched.
			
 
				+        echo "Commit hash not specified, use current git commit"
			
 
				+        return 0
			
 
				+    fi
			
 
				+    echo "Start to checkout opendevin version to $COMMIT_HASH, but keep current evaluation harness"
			
 
				+    if ! git diff-index --quiet HEAD --; then  # Non-zero status here means tracked files differ from HEAD.
			
 
				+        echo "There are uncommitted changes, please stash or commit them first"
			
 
				+        exit 1  # exit, not return: this file is sourced, so the calling script terminates.
			
 
				+    fi
			
 
				+    current_branch=$(git rev-parse --abbrev-ref HEAD)  # Not local: checkout_original_branch reads it. NOTE(review): prints "HEAD" when detached - confirm that case is handled.
			
 
				+    echo "Current version is: $current_branch"
			
 
				+    echo "Check out OpenDevin to version: $COMMIT_HASH"
			
 
				+    if ! git checkout $COMMIT_HASH; then  # NOTE(review): unquoted; assumes COMMIT_HASH has no whitespace or glob characters.
			
 
				+        echo "Failed to check out to $COMMIT_HASH"
			
 
				+        exit 1
			
 
				+    fi
			
 
				+    echo "Revert changes in evaluation folder"
			
 
				+    git checkout $current_branch -- evaluation  # Restore only the evaluation path from the starting revision.
			
 
				+}
			
 
				+
			
 
				+checkout_original_branch() {  # Switch back to the revision recorded in global current_branch, if one was recorded.
			
 
				+    if [ -z "$current_branch" ]; then  # Empty when checkout_eval_branch returned early without recording a branch.
			
 
				+        return 0
			
 
				+    fi
			
 
				+    echo "Checkout back to original branch $current_branch"
			
 
				+    git checkout $current_branch  # NOTE(review): callers use set -e, so this is not reached if an earlier command fails - confirm that is acceptable.
			
 
				+}
			
 
				+
			
 
				+get_agent_version() {  # Sets global AGENT_VERSION to "v" followed by the VERSION attribute of the agent class named by $AGENT.
			
 
				+    # IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenDevin
			
 
				+    # We need to track the version of Agent in the evaluation to make sure results are comparable
			
 
				+    AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")  # $AGENT is interpolated into the Python snippet by the shell before it runs.
			
 
				+}
			
--- a/evaluation/webarena/scripts/run_infer.sh
+++ b/evaluation/webarena/scripts/run_infer.sh
@@ -1,4 +1,7 @@
 
				 #!/bin/bash
			
 
				+set -eo pipefail
			
 
				+
			
 
				+source "evaluation/utils/version_control.sh"
			
 
				 
			
 
				 # configure webarena websites and environment
			
 
				 source evaluation/webarena/scripts/webarena_env.sh
			
@@ -8,17 +11,18 @@ export USE_NAV="false"
 
				 export USE_CONCISE_ANSWER="true"
			
 
				 
			
 
				 MODEL_CONFIG=$1
			
 
				-AGENT=$2
			
 
				-EVAL_LIMIT=$3
			
 
				+COMMIT_HASH=$2
			
 
				+AGENT=$3
			
 
				+EVAL_LIMIT=$4
			
 
				+
			
 
				+checkout_eval_branch
			
 
				 
			
 
				 if [ -z "$AGENT" ]; then
			
 
				   echo "Agent not specified, use default BrowsingAgent"
			
 
				   AGENT="BrowsingAgent"
			
 
				 fi
			
 
				 
			
 
				-# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenDevin
			
 
				-# We need to track the version of Agent in the evaluation to make sure results are comparable
			
 
				-AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
			
 
				+get_agent_version
			
 
				 
			
 
				 echo "AGENT: $AGENT"
			
 
				 echo "AGENT_VERSION: $AGENT_VERSION"
			
@@ -40,3 +44,5 @@ fi
 
				 
			
 
				 # Run the command
			
 
				 eval $COMMAND
			
 
				+
			
 
				+checkout_original_branch