1 год назад · dd1095cf6b
--- a/tests/integration/README.md
+++ b/tests/integration/README.md
@@ -122,10 +122,6 @@ folder.
 
				 
			
 
				 ### Known Issues
			
 
				 
			
 
				-Sometimes you might see transient errors like `pexpect.pxssh.ExceptionPxssh: Could not establish connection to host`.
			
 
				-The regenerate.sh script doesn't know this is a transient error and would still regenerate the test artifacts. You could simply
			
 
				-terminate the script by `ctrl+c` and rerun the script.
			
 
				-
			
 
				 The test framework cannot handle non-determinism. If anything in the prompt (including
			
 
				 observed result after executing an action) is non-deterministic (e.g. a PID), the
			
 
				 test would fail. In this case, you might want to change conftest.py to filter out
			
--- a/tests/integration/regenerate.sh
+++ b/tests/integration/regenerate.sh
@@ -5,6 +5,8 @@ set -eo pipefail
 
				 ##           CONSTANTS AND ENVIRONMENTAL VARIABLES          ##
			
 
				 ##############################################################
			
 
				 
			
 
				+TMP_FILE="${TMP_FILE:-tmp.log}"
			
 
				+
			
 
				 if [ -z $WORKSPACE_MOUNT_PATH ]; then
			
 
				   WORKSPACE_MOUNT_PATH=$(pwd)
			
 
				 fi
			
@@ -18,8 +20,10 @@ WORKSPACE_MOUNT_PATH_IN_SANDBOX="/workspace"
 
				 
			
 
				 mkdir -p $WORKSPACE_BASE
			
 
				 
			
 
				-# use environmental variable if exist, otherwise use "ssh"
			
 
				+# use environmental variable if exists, otherwise use "ssh"
			
 
				 SANDBOX_TYPE="${SANDBOX_TYPE:-ssh}"
			
 
				+# TODO: we should also test PERSIST_SANDBOX = true, once it's fixed
			
 
				+PERSIST_SANDBOX=false
			
 
				 MAX_ITERATIONS=10
			
 
				 
			
 
				 agents=("DelegatorAgent" "ManagerAgent" "BrowsingAgent" "MonologueAgent" "CodeActAgent" "PlannerAgent" "CodeActSWEAgent")
			
@@ -57,23 +61,67 @@ run_test() {
 
				   fi
			
 
				 
			
 
				   SANDBOX_TYPE=$SANDBOX_TYPE \
			
 
				+    PERSIST_SANDBOX=$PERSIST_SANDBOX \
			
 
				     WORKSPACE_BASE=$WORKSPACE_BASE \
			
 
				     WORKSPACE_MOUNT_PATH=$WORKSPACE_MOUNT_PATH \
			
 
				     WORKSPACE_MOUNT_PATH_IN_SANDBOX=$WORKSPACE_MOUNT_PATH_IN_SANDBOX \
			
 
				     MAX_ITERATIONS=$MAX_ITERATIONS \
			
 
				     AGENT=$agent \
			
 
				-    $pytest_cmd
			
 
				-    # return exit code of pytest
			
 
				-    return $?
			
 
				+    $pytest_cmd 2>&1 | tee $TMP_FILE
			
 
				+
			
 
				+  # Capture the exit code of pytest
			
 
				+  pytest_exit_code=${PIPESTATUS[0]}
			
 
				+
			
 
				+  if grep -q "docker.errors.DockerException" $TMP_FILE; then
			
 
				+    echo "Error: docker.errors.DockerException found in the output. Exiting."
			
 
				+    echo "Please check if your Docker daemon is running!"
			
 
				+    exit 1
			
 
				+  fi
			
 
				+
			
 
				+  if grep -q "tenacity.RetryError" $TMP_FILE; then
			
 
				+    echo "Error: tenacity.RetryError found in the output. Exiting."
			
 
				+    echo "This is mostly a transient error. Please retry."
			
 
				+    exit 1
			
 
				+  fi
			
 
				+
			
 
				+  if grep -q "ExceptionPxssh" $TMP_FILE; then
			
 
				+    echo "Error: ExceptionPxssh found in the output. Exiting."
			
 
				+    echo "Could not connect to sandbox via ssh. Please stop any stale docker container and retry."
			
 
				+    exit 1
			
 
				+  fi
			
 
				+
			
 
				+  if grep -q "Address already in use" $TMP_FILE; then
			
 
				+    echo "Error: Address already in use found in the output. Exiting."
			
 
				+    echo "Browsing tests need a local http server. Please check if there's any zombie process running start_http_server.py."
			
 
				+    exit 1
			
 
				+  fi
			
 
				+
			
 
				+  # Return the exit code of pytest
			
 
				+  return $pytest_exit_code
			
 
				 }
			
 
				 
			
 
				 # browsing capability needs a local http server
			
 
				 launch_http_server() {
			
 
				   poetry run python tests/integration/start_http_server.py &
			
 
				   HTTP_SERVER_PID=$!
			
 
				+  echo "Test http server launched, PID = $HTTP_SERVER_PID"
			
 
				   sleep 10
			
 
				 }
			
 
				 
			
 
				+cleanup() {
			
 
				+  echo "Cleaning up before exit..."
			
 
				+  if [ -n "$HTTP_SERVER_PID" ]; then
			
 
				+    echo "Killing HTTP server..."
			
 
				+    kill $HTTP_SERVER_PID
			
 
				+    unset HTTP_SERVER_PID
			
 
				+  fi
			
 
				+  [ -f $TMP_FILE ] && rm $TMP_FILE
			
 
				+  echo "Cleanup done!"
			
 
				+}
			
 
				+
			
 
				+# Trap the EXIT signal to run the cleanup function
			
 
				+trap cleanup EXIT
			
 
				+
			
 
				 # generate prompts again, using existing LLM responses under tests/integration/mock/[agent]/[test_name]/response_*.log
			
 
				 # this is a compromise; the prompts might be non-sense yet still pass the test, because we don't use a real LLM to
			
 
				 # respond to the prompts. The benefit is developers don't have to regenerate real responses from LLM, if they only
			
@@ -82,6 +130,7 @@ regenerate_without_llm() {
 
				   # set -x to print the command being executed
			
 
				   set -x
			
 
				   SANDBOX_TYPE=$SANDBOX_TYPE \
			
 
				+    PERSIST_SANDBOX=$PERSIST_SANDBOX \
			
 
				     WORKSPACE_BASE=$WORKSPACE_BASE \
			
 
				     WORKSPACE_MOUNT_PATH=$WORKSPACE_MOUNT_PATH \
			
 
				     WORKSPACE_MOUNT_PATH_IN_SANDBOX=$WORKSPACE_MOUNT_PATH_IN_SANDBOX \
			
@@ -110,6 +159,7 @@ regenerate_with_llm() {
 
				   echo -e "/exit\n" | \
			
 
				     DEBUG=true \
			
 
				     SANDBOX_TYPE=$SANDBOX_TYPE \
			
 
				+    PERSIST_SANDBOX=$PERSIST_SANDBOX \
			
 
				     WORKSPACE_BASE=$WORKSPACE_BASE \
			
 
				     WORKSPACE_MOUNT_PATH=$WORKSPACE_MOUNT_PATH AGENT=$agent \
			
 
				     WORKSPACE_MOUNT_PATH_IN_SANDBOX=$WORKSPACE_MOUNT_PATH_IN_SANDBOX \
			
@@ -122,12 +172,6 @@ regenerate_with_llm() {
 
				   mkdir -p tests/integration/mock/$agent/$test_name/
			
 
				   mv logs/llm/**/* tests/integration/mock/$agent/$test_name/
			
 
				 
			
 
				-  if [[ "$test_name" = "test_browse_internet" ]]; then
			
 
				-    # Terminate the HTTP server
			
 
				-    kill $HTTP_SERVER_PID
			
 
				-  fi
			
 
				-
			
 
				-
			
 
				 }
			
 
				 
			
 
				 ##############################################################