Эх сурвалжийг харах

Evaluation time travel: build sandbox on the fly (#2491)

Boxuan Li 1 жил өмнө
parent
commit
feabc97aba

+ 0 - 2
evaluation/EDA/scripts/run_infer.sh

@@ -57,5 +57,3 @@ fi
 # Run the command
 echo $COMMAND
 eval $COMMAND
-
-checkout_original_branch

+ 0 - 2
evaluation/agent_bench/scripts/run_infer.sh

@@ -36,5 +36,3 @@ fi
 
 # Run the command
 eval $COMMAND
-
-checkout_original_branch

+ 0 - 2
evaluation/biocoder/scripts/run_infer.sh

@@ -39,5 +39,3 @@ fi
 # Run the command
 echo $COMMAND
 eval $COMMAND
-
-checkout_original_branch

+ 0 - 2
evaluation/bird/scripts/run_infer.sh

@@ -36,5 +36,3 @@ fi
 
 # Run the command
 eval $COMMAND
-
-checkout_original_branch

+ 0 - 2
evaluation/gaia/scripts/run_infer.sh

@@ -47,5 +47,3 @@ fi
 
 # Run the command
 eval $COMMAND
-
-checkout_original_branch

+ 0 - 2
evaluation/gorilla/scripts/run_infer.sh

@@ -45,5 +45,3 @@ fi
 
 # Run the command
 eval $COMMAND
-
-checkout_original_branch

+ 0 - 2
evaluation/gpqa/scripts/run_infer.sh

@@ -44,5 +44,3 @@ fi
 
 # Run the command
 eval $COMMAND
-
-checkout_original_branch

+ 0 - 2
evaluation/humanevalfix/scripts/run_infer.sh

@@ -74,5 +74,3 @@ fi
 
 # Run the command
 eval $COMMAND
-
-checkout_original_branch

+ 0 - 2
evaluation/logic_reasoning/scripts/run_infer.sh

@@ -40,5 +40,3 @@ fi
 
 # Run the command
 eval $COMMAND
-
-checkout_original_branch

+ 0 - 2
evaluation/miniwob/scripts/run_infer.sh

@@ -46,5 +46,3 @@ fi
 
 # Run the command
 eval $COMMAND
-
-checkout_original_branch

+ 0 - 2
evaluation/mint/scripts/run_infer.sh

@@ -42,5 +42,3 @@ fi
 
 # Run the command
 eval $COMMAND
-
-checkout_original_branch

+ 0 - 2
evaluation/ml_bench/scripts/run_infer.sh

@@ -46,5 +46,3 @@ fi
 
 # Run the command
 eval $COMMAND
-
-checkout_original_branch

+ 0 - 2
evaluation/toolqa/scripts/run_infer.sh

@@ -61,5 +61,3 @@ fi
 
 # Run the command
 eval $COMMAND
-
-checkout_original_branch

+ 20 - 0
evaluation/utils/version_control.sh

@@ -1,8 +1,16 @@
 checkout_eval_branch() {
     if [ -z "$COMMIT_HASH" ]; then
         echo "Commit hash not specified, use current git commit"
+        build_sandbox
         return 0
     fi
+
+    if git diff --quiet $COMMIT_HASH HEAD; then
+        echo "The given hash is equivalent to the current HEAD"
+        build_sandbox
+        return 0
+    fi
+
     echo "Start to checkout opendevin version to $COMMIT_HASH, but keep current evaluation harness"
     if ! git diff-index --quiet HEAD --; then
         echo "There are uncommitted changes, please stash or commit them first"
@@ -15,8 +23,20 @@ checkout_eval_branch() {
         echo "Failed to check out to $COMMIT_HASH"
         exit 1
     fi
+
     echo "Revert changes in evaluation folder"
     git checkout $current_branch -- evaluation
+
+    # Trap the EXIT signal to checkout original branch
+    trap checkout_original_branch EXIT
+
+    build_sandbox
+}
+
+build_sandbox() {
+    echo "Build sandbox locally"
+    docker build -t eval-sandbox -f containers/sandbox/Dockerfile /tmp
+    export SANDBOX_CONTAINER_IMAGE="eval-sandbox"
 }
 
 checkout_original_branch() {

+ 0 - 2
evaluation/webarena/scripts/run_infer.sh

@@ -44,5 +44,3 @@ fi
 
 # Run the command
 eval $COMMAND
-
-checkout_original_branch