- #!/bin/bash
- set -eo pipefail
- ##############################################################
- ## CONSTANTS AND ENVIRONMENTAL VARIABLES ##
- ##############################################################
- echo -e "\n\n============================================================"
- # unset environmental variables that might disturb testing
- unset OPENAI_API_KEY
- unset SANDBOX_ENV_OPENAI_API_KEY
- unset OPENAI_BASE_URL
- unset OPENAI_MODEL
# Get the absolute path of the directory holding this script, resolving any
# chain of symlinks along the way.
get_script_dir() {
  local src="${BASH_SOURCE[0]}"
  local dir
  # Keep dereferencing while the path is still a symlink.
  while [ -h "$src" ]; do
    dir="$( cd -P "$( dirname "$src" )" && pwd )"
    src="$(readlink -f "$src" 2>/dev/null || echo "$src")"
    # A relative link target is resolved against the link's own directory.
    if [[ $src != /* ]]; then
      src="$dir/$src"
    fi
  done
  # Print the physical (symlink-free) directory of the resolved path.
  ( cd -P "$( dirname "$src" )" && pwd )
}
# Scratch file capturing pytest output of the current run (grepped later).
TMP_FILE="${TMP_FILE:-tmp.log}"
# Default the workspace root to the current directory when not provided.
if [ -z "$WORKSPACE_BASE" ]; then
  WORKSPACE_BASE=$(pwd)
fi
DEBUG=true # needed for llm logging to create mock files!
if [ -z "$LOG_TO_FILE" ]; then
  LOG_TO_FILE=true
fi
export SCRIPT_DIR=$(get_script_dir)
export PROJECT_ROOT=$(realpath "$SCRIPT_DIR/../..")
export LOG_DIR="$PROJECT_ROOT/logs"
echo "Current working directory: $(pwd)"
echo "SCRIPT_DIR: $SCRIPT_DIR"
echo "PROJECT_ROOT: $PROJECT_ROOT"
echo "LOG_DIR: $LOG_DIR"
echo "LOG_TO_FILE: $LOG_TO_FILE"
# All tests run inside a dedicated sub-directory of the workspace root.
WORKSPACE_BASE=${WORKSPACE_BASE}/_test_workspace
mkdir -p "$WORKSPACE_BASE"
chmod -R 777 "$WORKSPACE_BASE"
WORKSPACE_BASE=$(realpath "$WORKSPACE_BASE")
# Mount path defaults to the workspace itself; an explicit override gets its
# own _test_workspace sub-directory created alongside.
if [ -z "$WORKSPACE_MOUNT_PATH" ]; then
  WORKSPACE_MOUNT_PATH="$WORKSPACE_BASE"
else
  WORKSPACE_MOUNT_PATH="${WORKSPACE_MOUNT_PATH}/_test_workspace"
  mkdir -p "$WORKSPACE_MOUNT_PATH"
  chmod -R 755 "$WORKSPACE_MOUNT_PATH"
  WORKSPACE_MOUNT_PATH=$(realpath "$WORKSPACE_MOUNT_PATH")
fi
WORKSPACE_MOUNT_PATH_IN_SANDBOX="${WORKSPACE_MOUNT_PATH_IN_SANDBOX:-/workspace}"
echo "WORKSPACE_BASE: $WORKSPACE_BASE"
echo "WORKSPACE_MOUNT_PATH: $WORKSPACE_MOUNT_PATH"
echo "WORKSPACE_MOUNT_PATH_IN_SANDBOX: $WORKSPACE_MOUNT_PATH_IN_SANDBOX"
# Ensure we're in the correct directory
cd "$PROJECT_ROOT" || exit 1
mkdir -p "$WORKSPACE_BASE"
# use environmental variable if exists
TEST_RUNTIME="${TEST_RUNTIME:-eventstream}"
if [ -z "$SANDBOX_BASE_CONTAINER_IMAGE" ]; then
  SANDBOX_BASE_CONTAINER_IMAGE="nikolaik/python-nodejs:python3.12-nodejs22"
fi
# Cap on agent steps per task before the run is considered failed.
MAX_ITERATIONS=20
echo "TEST_RUNTIME: $TEST_RUNTIME"
# Agents under test; every test below runs once per agent.
agents=(
  DelegatorAgent
  ManagerAgent
  BrowsingAgent
  CodeActAgent
  PlannerAgent
  CodeActSWEAgent
)
# tasks[i] is the natural-language instruction validated by test_names[i].
tasks=(
  "Fix typos in bad.txt."
  "Write a shell script 'hello.sh' that prints 'hello'."
  "Use Jupyter IPython to write a text file containing 'hello world' to '/workspace/test.txt'."
  "Write a git commit message for the current staging area."
  #"Install and import pymsgbox==1.0.9 and print it's version in /workspace/test.txt."
  "Browse localhost:8000, and tell me the ultimate answer to life."
)
# Pytest test case validating each task above (parallel to tasks).
test_names=(
  test_edits
  test_write_simple_script
  test_ipython
  test_simple_task_rejection
  #"test_ipython_module" NOT DETERMINISTIC IN NUMBER OF LLM RESPONSES!
  test_browse_internet
)
num_of_tests=${#test_names[@]}
num_of_agents=${#agents[@]}
- ##############################################################
- ## FUNCTIONS ##
- ##############################################################
# run integration test against a specific agent & test.
# Globals read: PROJECT_ROOT, SCRIPT_DIR, test_name, agent, TEST_IN_CI,
#   TMP_FILE, plus every env var forwarded to pytest below.
# Output: pytest output is shown AND tee'd into $TMP_FILE so known fatal
#   error signatures can be grepped afterwards.
# Returns: pytest's exit code (tee's status is ignored via PIPESTATUS).
run_test() {
  # Ensure we're in the correct directory
  cd "$PROJECT_ROOT" || exit 1
  local pytest_cmd="poetry run pytest --cache-clear -vvsxx $SCRIPT_DIR/test_agent.py::$test_name"
  # Check if TEST_IN_CI is defined; if so, collect coverage.
  if [ -n "$TEST_IN_CI" ]; then
    pytest_cmd+=" --cov=openhands --cov-report=xml --cov-append"
  fi
  # All env var assignments are quoted so empty/space-containing values
  # cannot word-split the `env` invocation.
  env SCRIPT_DIR="$SCRIPT_DIR" \
    PROJECT_ROOT="$PROJECT_ROOT" \
    WORKSPACE_BASE="$WORKSPACE_BASE" \
    WORKSPACE_MOUNT_PATH="$WORKSPACE_MOUNT_PATH" \
    MAX_ITERATIONS="$MAX_ITERATIONS" \
    DEFAULT_AGENT="$agent" \
    TEST_RUNTIME="$TEST_RUNTIME" \
    DEBUG="$DEBUG" \
    LLM="$LLM" \
    LOG_TO_FILE="$LOG_TO_FILE" \
    FORCE_REGENERATE="$FORCE_REGENERATE" \
    SANDBOX_BASE_CONTAINER_IMAGE="$SANDBOX_BASE_CONTAINER_IMAGE" \
    $pytest_cmd 2>&1 | tee "$TMP_FILE"
  # Capture the exit code of pytest (first stage of the pipeline, not tee's).
  local pytest_exit_code=${PIPESTATUS[0]}
  # Known fatal failure signatures; $TMP_FILE is quoted so a path with
  # spaces cannot break the greps.
  if grep -q "docker.errors.DockerException" "$TMP_FILE"; then
    echo "Error: docker.errors.DockerException found in the output. Exiting."
    echo "Please check if your Docker daemon is running!"
    exit 1
  fi
  if grep -q "tenacity.RetryError" "$TMP_FILE"; then
    echo "Error: tenacity.RetryError found in the output. Exiting."
    echo "This is mostly a transient error. Please retry."
    exit 1
  fi
  if grep -q "ExceptionPxssh" "$TMP_FILE"; then
    echo "Error: ExceptionPxssh found in the output. Exiting."
    echo "Could not connect to sandbox via ssh. Please stop any stale docker container and retry."
    exit 1
  fi
  if grep -q "Address already in use" "$TMP_FILE"; then
    echo "Error: Address already in use found in the output. Exiting."
    echo "Browsing tests need a local http server. Please check if there's any zombie process running start_http_server.py."
    exit 1
  fi
  # Return the exit code of pytest
  return "$pytest_exit_code"
}
# browsing capability needs a local http server.
# Starts it in the background and records its PID in HTTP_SERVER_PID so
# cleanup() / the main loop can kill it later.
launch_http_server() {
  # $SCRIPT_DIR is quoted (was unquoted) so a path with spaces still works.
  poetry run python "$SCRIPT_DIR/start_http_server.py" &
  HTTP_SERVER_PID=$!
  echo "Test http server launched, PID = $HTTP_SERVER_PID"
  # Give the server a moment to start listening before tests hit it.
  sleep 5
}
# Cleanup handler registered on EXIT: kills the test http server (if any)
# and removes the pytest scratch log.
# Globals read: PROJECT_ROOT, HTTP_SERVER_PID, TMP_FILE.
cleanup() {
  # Reset cwd first so getcwd() cannot fail if the current dir vanished;
  # '|| true' keeps the trap running under set -e even if a cd fails.
  cd "$PROJECT_ROOT/tests" || true
  cd "$PROJECT_ROOT" || true
  echo "Cleaning up before exit..."
  if [ -n "$HTTP_SERVER_PID" ]; then
    echo "Killing HTTP server..."
    # Quoted PID; the server may already be gone, hence '|| true'.
    kill "$HTTP_SERVER_PID" || true
    unset HTTP_SERVER_PID
  fi
  # -f: do not fail if the file was already removed; --: guard odd names.
  [ -f "$TMP_FILE" ] && rm -f -- "$TMP_FILE"
  echo "Cleanup done!"
}
# Trap the EXIT signal to run the cleanup function.
# Setting NOTRAP skips registration, leaving artifacts in place on exit.
if [ -z "$NOTRAP" ]; then
  trap cleanup EXIT
fi
# generate prompts again, using existing LLM responses under
# tests/integration/mock/[test_runtime]_runtime/[agent]/[test_name]/response_*.log
# this is a compromise; the prompts might be non-sense yet still pass the test,
# because we don't use a real LLM to respond to the prompts. The benefit is
# developers don't have to regenerate real responses from LLM, if they only
# apply a small change to prompts.
regenerate_without_llm() {
  cd "$PROJECT_ROOT"
  # Gather the forwarded environment as an array so the invocation below
  # stays on one traced line.
  local -a test_env=(
    SCRIPT_DIR="$SCRIPT_DIR"
    PROJECT_ROOT="$PROJECT_ROOT"
    WORKSPACE_BASE="$WORKSPACE_BASE"
    WORKSPACE_MOUNT_PATH="$WORKSPACE_MOUNT_PATH"
    MAX_ITERATIONS="$MAX_ITERATIONS"
    FORCE_APPLY_PROMPTS=true
    DEFAULT_AGENT="$agent"
    TEST_RUNTIME="$TEST_RUNTIME"
    LLM="$LLM"
    DEBUG="$DEBUG"
    LOG_TO_FILE="$LOG_TO_FILE"
    FORCE_REGENERATE="$FORCE_REGENERATE"
    SANDBOX_BASE_CONTAINER_IMAGE="$SANDBOX_BASE_CONTAINER_IMAGE"
  )
  # set -x to print the command being executed
  set -x
  env "${test_env[@]}" poetry run pytest -s "$SCRIPT_DIR/test_agent.py::$test_name"
  set +x
}
# Regenerate both prompts and mock responses for $agent/$test_name by running
# the task end-to-end against a REAL LLM (costs money), then archiving the
# resulting llm logs as the new mock responses.
# Globals read: PROJECT_ROOT, WORKSPACE_BASE, SCRIPT_DIR, LOG_DIR,
#   TEST_RUNTIME, agent, test_name, task, MAX_ITERATIONS, and the env vars
#   forwarded below.
regenerate_with_llm() {
  cd "$PROJECT_ROOT"
  # Clear workspace CONTENTS. The glob must sit outside the quotes: the
  # original quoted form "$WORKSPACE_BASE/*" never expands and only removes
  # a file literally named '*'. ':?' aborts if the variable is empty.
  rm -rf "${WORKSPACE_BASE:?}"/*
  if [ -d "$SCRIPT_DIR/workspace/$test_name" ]; then
    cp -r "$SCRIPT_DIR/workspace/$test_name"/* "$WORKSPACE_BASE"
  fi
  rm -rf "$LOG_DIR"
  # Same quoted-glob fix as above for the stale mock responses.
  rm -rf "$SCRIPT_DIR/mock/${TEST_RUNTIME}_runtime/$agent/$test_name"/*
  # set -x to print the command being executed
  set -x
  # Pipe "/exit" to stdin so the interactive session terminates on its own.
  echo -e "/exit\n" | \
    env SCRIPT_DIR="$SCRIPT_DIR" \
    PROJECT_ROOT="$PROJECT_ROOT" \
    WORKSPACE_BASE="$WORKSPACE_BASE" \
    WORKSPACE_MOUNT_PATH="$WORKSPACE_MOUNT_PATH" \
    DEFAULT_AGENT="$agent" \
    RUNTIME="$TEST_RUNTIME" \
    SANDBOX_BASE_CONTAINER_IMAGE="$SANDBOX_BASE_CONTAINER_IMAGE" \
    LLM="$LLM" \
    DEBUG="$DEBUG" \
    LOG_TO_FILE="$LOG_TO_FILE" \
    FORCE_REGENERATE="$FORCE_REGENERATE" \
    poetry run python "$PROJECT_ROOT/openhands/core/main.py" \
    -i "$MAX_ITERATIONS" \
    -t "$task Do not ask me for confirmation at any point." \
    -c "$agent"
  set +x
  mkdir -p "$SCRIPT_DIR/mock/${TEST_RUNTIME}_runtime/$agent/$test_name/"
  # NOTE(review): without 'shopt -s globstar', '**' behaves like a plain '*',
  # so this only descends one directory level below logs/llm — confirm intended.
  mv "$LOG_DIR"/llm/**/* "$SCRIPT_DIR/mock/${TEST_RUNTIME}_runtime/$agent/$test_name/"
}
##############################################################
##                       MAIN PROGRAM                       ##
##############################################################
# Sanity check: the parallel arrays must line up one-to-one.
# BUG FIX: the original compared num_of_tests (defined as ${#test_names[@]})
# against ${#test_names[@]} itself — tautologically equal, so a tasks/test
# mismatch was never caught. Compare against ${#tasks[@]} instead.
if [ "$num_of_tests" -ne "${#tasks[@]}" ]; then
  echo "Every task must correspond to one test case"
  exit 1
fi
rm -rf "$LOG_DIR"
# BUG FIX: glob outside the quotes so it expands ("$WORKSPACE_BASE/*" only
# matched a file literally named '*'). ':?' aborts if the var is empty.
rm -rf "${WORKSPACE_BASE:?}"/*
# Main matrix: for each (task, test_name) pair, run every agent through up to
# five steps — run with mocks, regenerate prompts without LLM, rerun,
# regenerate with a real LLM, rerun again — failing hard if the final rerun
# still does not pass.
for ((i = 0; i < num_of_tests; i++)); do
  task=${tasks[i]}
  test_name=${test_names[i]}
  # skip other tests if only one test is specified
  if [[ -n "$ONLY_TEST_NAME" && "$ONLY_TEST_NAME" != "$test_name" ]]; then
    continue
  fi
  # Browsing tests need the local http server up before any agent starts.
  if [ "$test_name" = "test_browse_internet" ]; then
    launch_http_server
  fi
  for ((j = 0; j < num_of_agents; j++)); do
    agent=${agents[j]}
    # skip other agents if only one agent is specified
    if [[ -n "$ONLY_TEST_AGENT" && "$ONLY_TEST_AGENT" != "$agent" ]]; then
      continue
    fi
    echo -e "\n============================================================"
    echo -e "======== STEP 1: Running $test_name for $agent"
    echo -e "============================================================\n\n"
    # reset dir so getcwd() shouldn't fail
    cd "$PROJECT_ROOT/tests"
    cd "$PROJECT_ROOT"
    # Clean workspace CONTENTS, then seed with the test's fixture files.
    # BUG FIX: glob outside the quotes ("$WORKSPACE_BASE/*" never expanded,
    # matching only a file literally named '*').
    rm -rf "${WORKSPACE_BASE:?}"/*
    if [ -d "$SCRIPT_DIR/workspace/$test_name" ]; then
      cp -r "$SCRIPT_DIR/workspace/$test_name"/* "$WORKSPACE_BASE"
    fi
    if [ "$TEST_ONLY" ]; then
      # TEST_ONLY mode: fail immediately instead of regenerating.
      set -e
    else
      # Temporarily disable 'exit on error'
      set +e
    fi
    TEST_STATUS=1
    if [ -z "$FORCE_REGENERATE" ]; then
      run_test
      TEST_STATUS=$?
    fi
    # Re-enable 'exit on error'
    set -e
    if [[ $TEST_STATUS -ne 0 ]]; then
      if [ "$FORCE_USE_LLM" ]; then
        echo -e "\n============================================================"
        echo -e "======== FORCE_USE_LLM, skipping step 2 & 3"
        echo -e "============================================================\n\n"
      elif [ ! -d "$SCRIPT_DIR/mock/${TEST_RUNTIME}_runtime/$agent/$test_name" ]; then
        echo -e "\n============================================================"
        echo -e "======== No existing mock responses for ${TEST_RUNTIME}_runtime/$agent/$test_name, skipping step 2 & 3"
        echo -e "============================================================\n\n"
      else
        echo -e "\n============================================================"
        echo -e "======== STEP 2: $test_name failed, regenerating prompts for $agent WITHOUT money cost"
        echo -e "============================================================\n\n"
        # Temporarily disable 'exit on error'
        set +e
        regenerate_without_llm
        echo -e "\n============================================================"
        echo -e "======== STEP 3: $test_name prompts regenerated for $agent, rerun test again to verify"
        echo -e "============================================================\n\n\n"
        run_test
        TEST_STATUS=$?
        # Re-enable 'exit on error'
        set -e
      fi
      if [[ $TEST_STATUS -ne 0 ]]; then
        echo -e "\n============================================================"
        if [ "$FORCE_USE_LLM" ]; then
          echo -e "======== STEP 4: $test_name REGENERATION for $agent WITH money cost"
        else
          echo -e "======== STEP 4: $test_name failed, regenerating prompts and responses for $agent WITH money cost"
        fi
        echo -e "============================================================\n\n\n"
        regenerate_with_llm
        echo -e "\n============================================================"
        echo -e "======== STEP 5: $test_name prompts and responses regenerated for $agent, rerun test again to verify"
        echo -e "============================================================\n\n\n"
        cd "$PROJECT_ROOT/tests"
        cd "$PROJECT_ROOT"
        # Temporarily disable 'exit on error'
        set +e
        run_test
        TEST_STATUS=$?
        # Re-enable 'exit on error'
        set -e
        if [[ $TEST_STATUS -ne 0 ]]; then
          echo -e "\n\n============================================================"
          echo -e "========== $test_name for $agent RERUN FAILED"
          echo -e "============================================================"
          echo -e "There are multiple possibilities:"
          echo -e "  1. The agent is unable to finish the task within $MAX_ITERATIONS steps."
          echo -e "  2. The agent thinks itself has finished the task, but fails the validation in the test code."
          echo -e "  3. There is something non-deterministic in the prompt."
          echo -e "  4. There is a bug in this script, or in OpenHands code."
          echo -e "NOTE: Some of the above problems could sometimes be fixed by a retry (with a more powerful LLM)."
          echo -e "      You could also consider improving the agent, increasing MAX_ITERATIONS, or skipping this test for this agent."
          echo -e "============================================================\n\n"
          exit 1
        else
          echo -e "\n\n============================================================"
          echo -e "========$test_name for $agent RERUN PASSED"
          echo -e "============================================================\n\n\n"
          sleep 1
        fi
      else
        echo -e "\n\n============================================================"
        echo -e "========$test_name for $agent RERUN PASSED"
        echo -e "============================================================\n\n\n"
        sleep 1
      fi
    else
      echo -e "\n\n============================================================"
      echo -e "\n========== $test_name for $agent PASSED"
      echo -e "\n============================================================\n\n\n"
      sleep 1
    fi
  done
  # Tear down the http server once the browsing test finished for all agents;
  # PID quoted, '|| true' tolerates an already-dead server.
  if [ "$test_name" = "test_browse_internet" ]; then
    kill "$HTTP_SERVER_PID" || true
  fi
done
- rm -rf "$LOG_DIR"
- rm -rf "$WORKSPACE_BASE"
- echo "Done!"
- cd "$PROJECT_ROOT"