regenerate.sh 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407
  1. #!/bin/bash
  # Abort on the first failing command, and treat a pipeline as failed when any
  # of its stages fails.
  set -o errexit -o pipefail

  ##############################################################
  ##       CONSTANTS AND ENVIRONMENTAL VARIABLES              ##
  ##############################################################

  echo -e "\n\n============================================================"

  # Drop OpenAI-related variables inherited from the caller's environment so
  # they cannot disturb testing.
  unset OPENAI_API_KEY SANDBOX_ENV_OPENAI_API_KEY OPENAI_BASE_URL OPENAI_MODEL
  #######################################
  # Print the absolute, physical directory that contains this script,
  # following the script path through any chain of symbolic links.
  # Globals:   BASH_SOURCE (read)
  # Arguments: none
  # Outputs:   the directory path on stdout
  # Returns:   0 on success, non-zero if a link cannot be read or a directory
  #            cannot be entered
  #######################################
  get_script_dir() {
    local source="${BASH_SOURCE[0]}"
    local dir target
    while [[ -h "$source" ]]; do
      dir="$(cd -P "$(dirname "$source")" && pwd)" || return 1
      # Prefer the fully resolved path. Where `readlink -f` is unavailable or
      # fails, fall back to reading a single link level; echoing the unchanged
      # link back would make this loop spin forever.
      if ! target="$(readlink -f "$source" 2>/dev/null)"; then
        target="$(readlink "$source")" || return 1
      fi
      # A relative link target is relative to the directory holding the link.
      if [[ "$target" != /* ]]; then
        target="$dir/$target"
      fi
      source="$target"
    done
    # Subshell keeps the caller's working directory untouched.
    (cd -P "$(dirname "$source")" && pwd)
  }
  # Scratch file that receives a copy of the pytest output; override via TMP_FILE.
  TMP_FILE="${TMP_FILE:-tmp.log}"

  # Default the workspace root to the directory the script was started from.
  if [[ -z "${WORKSPACE_BASE:-}" ]]; then
    WORKSPACE_BASE=$(pwd)
  fi

  DEBUG=true # needed for llm logging to create mock files!

  if [[ -z "${LOG_TO_FILE:-}" ]]; then
    LOG_TO_FILE=true
  fi

  # Assign first and export afterwards: `export VAR=$(cmd)` always succeeds and
  # would hide a failing command. An empty SCRIPT_DIR must never get through,
  # because PROJECT_ROOT would then resolve to "/" and LOG_DIR (which is later
  # removed with `rm -rf`) would point at the filesystem root.
  SCRIPT_DIR=$(get_script_dir)
  if [[ -z "$SCRIPT_DIR" ]]; then
    echo "Error: could not determine the script directory." >&2
    exit 1
  fi
  PROJECT_ROOT=$(realpath "$SCRIPT_DIR/../..")
  LOG_DIR="$PROJECT_ROOT/logs"
  export SCRIPT_DIR PROJECT_ROOT LOG_DIR

  echo "Current working directory: $(pwd)"
  echo "SCRIPT_DIR: $SCRIPT_DIR"
  echo "PROJECT_ROOT: $PROJECT_ROOT"
  echo "LOG_DIR: $LOG_DIR"
  echo "LOG_TO_FILE: $LOG_TO_FILE"
  # All test files live in a dedicated "_test_workspace" sub-directory, which is
  # created, opened up to every user, and then canonicalised.
  WORKSPACE_BASE="${WORKSPACE_BASE}/_test_workspace"
  mkdir -p "$WORKSPACE_BASE"
  chmod -R 777 "$WORKSPACE_BASE"
  WORKSPACE_BASE="$(realpath "$WORKSPACE_BASE")"

  # A caller-supplied mount path gets its own "_test_workspace" sub-directory;
  # without one, the mount path is simply the workspace itself.
  if [[ -n "$WORKSPACE_MOUNT_PATH" ]]; then
    WORKSPACE_MOUNT_PATH="${WORKSPACE_MOUNT_PATH}/_test_workspace"
    mkdir -p "$WORKSPACE_MOUNT_PATH"
    chmod -R 755 "$WORKSPACE_MOUNT_PATH"
    WORKSPACE_MOUNT_PATH="$(realpath "$WORKSPACE_MOUNT_PATH")"
  else
    WORKSPACE_MOUNT_PATH="$WORKSPACE_BASE"
  fi

  : "${WORKSPACE_MOUNT_PATH_IN_SANDBOX:=/workspace}"

  echo "WORKSPACE_BASE: $WORKSPACE_BASE"
  echo "WORKSPACE_MOUNT_PATH: $WORKSPACE_MOUNT_PATH"
  echo "WORKSPACE_MOUNT_PATH_IN_SANDBOX: $WORKSPACE_MOUNT_PATH_IN_SANDBOX"

  # Everything below runs from the project root.
  cd "$PROJECT_ROOT" || exit 1
  mkdir -p "$WORKSPACE_BASE"
  # Runtime and sandbox image: values from the environment win, otherwise the
  # defaults below are used.
  : "${TEST_RUNTIME:=eventstream}"
  : "${SANDBOX_BASE_CONTAINER_IMAGE:=nikolaik/python-nodejs:python3.12-nodejs22}"

  MAX_ITERATIONS=20

  echo "TEST_RUNTIME: $TEST_RUNTIME"

  # Agents that every test is run against.
  agents=(
    "DelegatorAgent"
    "ManagerAgent"
    "BrowsingAgent"
    "CodeActAgent"
    "PlannerAgent"
    "CodeActSWEAgent"
  )

  # Task prompts; tasks[i] is the prompt used for test_names[i].
  tasks=(
    "Fix typos in bad.txt."
    "Write a shell script 'hello.sh' that prints 'hello'."
    "Use Jupyter IPython to write a text file containing 'hello world' to '/workspace/test.txt'."
    "Write a git commit message for the current staging area."
    #"Install and import pymsgbox==1.0.9 and print it's version in /workspace/test.txt."
    "Browse localhost:8000, and tell me the ultimate answer to life."
  )

  # Test function names, in the same order as the prompts above.
  test_names=(
    "test_edits"
    "test_write_simple_script"
    "test_ipython"
    "test_simple_task_rejection"
    #"test_ipython_module" NOT DETERMINISTIC IN NUMBER OF LLM RESPONSES!
    "test_browse_internet"
  )

  num_of_tests="${#test_names[@]}"
  num_of_agents="${#agents[@]}"
  90. ##############################################################
  91. ## FUNCTIONS ##
  92. ##############################################################
  #######################################
  # Exit the script with a hint when the captured test output contains a known
  # fatal marker.
  # Globals:   TMP_FILE (read)
  # Arguments: $1 - literal text to look for in the captured output
  #            $2 - advice printed after the error line
  # Outputs:   two lines on stdout when the marker is found
  # Returns:   0 when the marker is absent; exits 1 when it is present
  #######################################
  _abort_if_logged() {
    local pattern=$1
    local hint=$2
    # -F: the markers are literal text, not regular expressions.
    if grep -qF -- "$pattern" "$TMP_FILE"; then
      echo "Error: $pattern found in the output. Exiting."
      echo "$hint"
      exit 1
    fi
  }

  #######################################
  # Run the integration test for the current agent & test through pytest.
  # Globals:   PROJECT_ROOT, SCRIPT_DIR, WORKSPACE_BASE, WORKSPACE_MOUNT_PATH,
  #            MAX_ITERATIONS, TEST_RUNTIME, DEBUG, LLM, LOG_TO_FILE,
  #            FORCE_REGENERATE, SANDBOX_BASE_CONTAINER_IMAGE, TEST_IN_CI,
  #            TMP_FILE, agent, test_name (read); pytest_exit_code (written)
  # Arguments: none
  # Outputs:   pytest output on stdout, also copied to $TMP_FILE
  # Returns:   the exit code of pytest; exits 1 on a known fatal marker
  #######################################
  run_test() {
    # Ensure we're in the correct directory
    cd "$PROJECT_ROOT" || exit 1

    # Built as an array so a path containing whitespace stays one argument.
    local -a pytest_cmd=(
      poetry run pytest --cache-clear -vvsxx
      "$SCRIPT_DIR/test_agent.py::$test_name"
    )

    # Coverage is only collected when TEST_IN_CI is defined.
    if [[ -n "${TEST_IN_CI:-}" ]]; then
      pytest_cmd+=(--cov=openhands --cov-report=xml --cov-append)
    fi

    env SCRIPT_DIR="$SCRIPT_DIR" \
      PROJECT_ROOT="$PROJECT_ROOT" \
      WORKSPACE_BASE="$WORKSPACE_BASE" \
      WORKSPACE_MOUNT_PATH="$WORKSPACE_MOUNT_PATH" \
      MAX_ITERATIONS="$MAX_ITERATIONS" \
      DEFAULT_AGENT="$agent" \
      TEST_RUNTIME="$TEST_RUNTIME" \
      DEBUG="$DEBUG" \
      LLM="$LLM" \
      LOG_TO_FILE="$LOG_TO_FILE" \
      FORCE_REGENERATE="$FORCE_REGENERATE" \
      SANDBOX_BASE_CONTAINER_IMAGE="$SANDBOX_BASE_CONTAINER_IMAGE" \
      "${pytest_cmd[@]}" 2>&1 | tee "$TMP_FILE"

    # Must be read immediately: PIPESTATUS is overwritten by the next command.
    pytest_exit_code=${PIPESTATUS[0]}

    _abort_if_logged "docker.errors.DockerException" \
      "Please check if your Docker daemon is running!"
    _abort_if_logged "tenacity.RetryError" \
      "This is mostly a transient error. Please retry."
    _abort_if_logged "ExceptionPxssh" \
      "Could not connect to sandbox via ssh. Please stop any stale docker container and retry."
    _abort_if_logged "Address already in use" \
      "Browsing tests need a local http server. Please check if there's any zombie process running start_http_server.py."

    # Return the exit code of pytest
    return "$pytest_exit_code"
  }
  #######################################
  # Start the local HTTP server that the browsing test needs, in the
  # background, and give it a few seconds to come up.
  # Globals:   SCRIPT_DIR (read); HTTP_SERVER_PID (written)
  # Arguments: none
  # Outputs:   one status line with the server PID on stdout
  #######################################
  launch_http_server() {
    # Quoted so a script directory containing whitespace stays one argument.
    poetry run python "$SCRIPT_DIR/start_http_server.py" &
    HTTP_SERVER_PID=$!
    echo "Test http server launched, PID = $HTTP_SERVER_PID"
    sleep 5
  }
  #######################################
  # Exit handler: stop the HTTP server if one was started and delete the
  # captured-output file. Every step is best-effort so that one failing step
  # never prevents the remaining ones from running.
  # Globals:   PROJECT_ROOT, TMP_FILE (read); HTTP_SERVER_PID (read, unset)
  # Arguments: none
  # Outputs:   progress messages on stdout
  #######################################
  cleanup() {
    # Step through two known directories to reset the working directory.
    # `|| true` matters: under `set -e` a failing cd would otherwise abort the
    # handler before the server is killed.
    cd "$PROJECT_ROOT/tests" || true
    cd "$PROJECT_ROOT" || true

    echo "Cleaning up before exit..."
    if [[ -n "${HTTP_SERVER_PID:-}" ]]; then
      echo "Killing HTTP server..."
      # The server may already be gone; that is not an error here.
      kill "$HTTP_SERVER_PID" 2>/dev/null || true
      unset HTTP_SERVER_PID
    fi

    if [[ -f "$TMP_FILE" ]]; then
      rm -f -- "$TMP_FILE"
    fi
    echo "Cleanup done!"
  }
  # Run cleanup whenever the script exits, unless the caller opted out by
  # setting NOTRAP to a non-empty value.
  [[ -n "$NOTRAP" ]] || trap cleanup EXIT
  #######################################
  # Regenerate the prompts by replaying the LLM responses already stored under
  # tests/integration/mock/[test_runtime]_runtime/[agent]/[test_name]/response_*.log
  #
  # This is a trade-off: no real LLM answers the new prompts, so they could be
  # nonsense and the test would still pass. In exchange, a developer who only
  # made a small prompt change does not need to regenerate real LLM responses.
  # Globals:   PROJECT_ROOT, SCRIPT_DIR, WORKSPACE_BASE, WORKSPACE_MOUNT_PATH,
  #            MAX_ITERATIONS, TEST_RUNTIME, LLM, DEBUG, LOG_TO_FILE,
  #            FORCE_REGENERATE, SANDBOX_BASE_CONTAINER_IMAGE, agent,
  #            test_name (read)
  # Arguments: none
  #######################################
  regenerate_without_llm() {
    cd "$PROJECT_ROOT"

    # Environment handed to pytest; FORCE_APPLY_PROMPTS is the only entry that
    # is not simply forwarded from this script's own variables.
    local -a replay_env=(
      SCRIPT_DIR="$SCRIPT_DIR"
      PROJECT_ROOT="$PROJECT_ROOT"
      WORKSPACE_BASE="$WORKSPACE_BASE"
      WORKSPACE_MOUNT_PATH="$WORKSPACE_MOUNT_PATH"
      MAX_ITERATIONS="$MAX_ITERATIONS"
      FORCE_APPLY_PROMPTS=true
      DEFAULT_AGENT="$agent"
      TEST_RUNTIME="$TEST_RUNTIME"
      LLM="$LLM"
      DEBUG="$DEBUG"
      LOG_TO_FILE="$LOG_TO_FILE"
      FORCE_REGENERATE="$FORCE_REGENERATE"
      SANDBOX_BASE_CONTAINER_IMAGE="$SANDBOX_BASE_CONTAINER_IMAGE"
    )

    # Trace only the pytest invocation itself.
    set -x
    env "${replay_env[@]}" \
      poetry run pytest -s "$SCRIPT_DIR/test_agent.py::$test_name"
    set +x
  }
  #######################################
  # Run the agent on the current task through openhands/core/main.py and store
  # the resulting LLM logs as the mock files for this runtime/agent/test.
  # Globals:   PROJECT_ROOT, SCRIPT_DIR, WORKSPACE_BASE, WORKSPACE_MOUNT_PATH,
  #            LOG_DIR, TEST_RUNTIME, SANDBOX_BASE_CONTAINER_IMAGE, LLM, DEBUG,
  #            LOG_TO_FILE, FORCE_REGENERATE, MAX_ITERATIONS, agent, task,
  #            test_name (read)
  # Arguments: none
  #######################################
  regenerate_with_llm() {
    local mock_dir="$SCRIPT_DIR/mock/${TEST_RUNTIME}_runtime/$agent/$test_name"

    cd "$PROJECT_ROOT" || exit 1

    # Empty the workspace. The glob must sit OUTSIDE the quotes: a quoted "/*"
    # is a literal file name and removes nothing. The guard keeps an empty
    # WORKSPACE_BASE from ever expanding to "/*".
    if [[ -n "$WORKSPACE_BASE" && -d "$WORKSPACE_BASE" ]]; then
      rm -rf -- "$WORKSPACE_BASE"/*
    fi
    # Seed the workspace with the fixture files of this test, if there are any.
    if [[ -d "$SCRIPT_DIR/workspace/$test_name" ]]; then
      cp -r "$SCRIPT_DIR/workspace/$test_name"/* "$WORKSPACE_BASE"
    fi

    # Start from clean logs and drop the previously recorded mock files.
    rm -rf "$LOG_DIR"
    if [[ -d "$mock_dir" ]]; then
      rm -rf -- "$mock_dir"/*
    fi

    # set -x to print the command being executed
    set -x
    # "/exit" is fed on stdin to the agent process.
    echo -e "/exit\n" |
      env SCRIPT_DIR="$SCRIPT_DIR" \
        PROJECT_ROOT="$PROJECT_ROOT" \
        WORKSPACE_BASE="$WORKSPACE_BASE" \
        WORKSPACE_MOUNT_PATH="$WORKSPACE_MOUNT_PATH" \
        DEFAULT_AGENT="$agent" \
        RUNTIME="$TEST_RUNTIME" \
        SANDBOX_BASE_CONTAINER_IMAGE="$SANDBOX_BASE_CONTAINER_IMAGE" \
        LLM="$LLM" \
        DEBUG="$DEBUG" \
        LOG_TO_FILE="$LOG_TO_FILE" \
        FORCE_REGENERATE="$FORCE_REGENERATE" \
        poetry run python "$PROJECT_ROOT/openhands/core/main.py" \
        -i "$MAX_ITERATIONS" \
        -t "$task Do not ask me for confirmation at any point." \
        -c "$agent"
    set +x

    mkdir -p "$mock_dir/"
    # NOTE(review): globstar is not enabled in this script, so "**" matches a
    # single directory level here, i.e. files at $LOG_DIR/llm/<dir>/<file>.
    mv "$LOG_DIR"/llm/**/* "$mock_dir/"
  }
  217. ##############################################################
  218. ## MAIN PROGRAM ##
  219. ##############################################################
  # tasks[i] is the prompt for test_names[i], so both tables must have the same
  # length. Compare the two arrays directly: num_of_tests is derived from
  # test_names and would always compare equal to it.
  if [[ "${#tasks[@]}" -ne "${#test_names[@]}" ]]; then
    echo "Every task must correspond to one test case"
    exit 1
  fi

  # Start from clean logs and an empty workspace. The glob sits outside the
  # quotes so it really expands; the guard keeps an empty WORKSPACE_BASE from
  # ever expanding to "/*".
  rm -rf "$LOG_DIR"
  if [[ -n "$WORKSPACE_BASE" && -d "$WORKSPACE_BASE" ]]; then
    rm -rf -- "$WORKSPACE_BASE"/*
  fi
  # For every selected test and agent:
  #   STEP 1   run the test against the existing mock files
  #   STEP 2/3 on failure, regenerate the prompts without an LLM and rerun
  #   STEP 4/5 if it still fails, regenerate with a real LLM and rerun
  # ONLY_TEST_NAME / ONLY_TEST_AGENT restrict the run to one test / one agent.
  for ((i = 0; i < num_of_tests; i++)); do
    task=${tasks[i]}
    test_name=${test_names[i]}

    # skip other tests if only one test is specified
    if [[ -n "$ONLY_TEST_NAME" && "$ONLY_TEST_NAME" != "$test_name" ]]; then
      continue
    fi

    if [[ "$test_name" == "test_browse_internet" ]]; then
      launch_http_server
    fi

    for ((j = 0; j < num_of_agents; j++)); do
      agent=${agents[j]}

      # skip other agents if only one agent is specified
      if [[ -n "$ONLY_TEST_AGENT" && "$ONLY_TEST_AGENT" != "$agent" ]]; then
        continue
      fi

      echo -e "\n============================================================"
      echo -e "======== STEP 1: Running $test_name for $agent"
      echo -e "============================================================\n\n"

      # reset dir so getcwd() shouldn't fail
      cd "$PROJECT_ROOT/tests"
      cd "$PROJECT_ROOT"

      # Empty the workspace. The glob must sit outside the quotes, otherwise
      # "/*" is a literal file name and nothing is removed.
      if [[ -n "$WORKSPACE_BASE" && -d "$WORKSPACE_BASE" ]]; then
        rm -rf -- "$WORKSPACE_BASE"/*
      fi
      if [[ -d "$SCRIPT_DIR/workspace/$test_name" ]]; then
        cp -r "$SCRIPT_DIR/workspace/$test_name"/* "$WORKSPACE_BASE"
      fi

      if [[ -n "$TEST_ONLY" ]]; then
        # With TEST_ONLY set, a failing test ends the script right away.
        set -e
      else
        # Temporarily disable 'exit on error'
        set +e
      fi

      # Counts as failed unless the test is actually run and passes; with
      # FORCE_REGENERATE set, the test run is skipped altogether.
      TEST_STATUS=1
      if [[ -z "$FORCE_REGENERATE" ]]; then
        run_test
        TEST_STATUS=$?
      fi
      # Re-enable 'exit on error'
      set -e

      if [[ $TEST_STATUS -ne 0 ]]; then
        if [[ -n "$FORCE_USE_LLM" ]]; then
          echo -e "\n============================================================"
          echo -e "======== FORCE_USE_LLM, skipping step 2 & 3"
          echo -e "============================================================\n\n"
        elif [[ ! -d "$SCRIPT_DIR/mock/${TEST_RUNTIME}_runtime/$agent/$test_name" ]]; then
          echo -e "\n============================================================"
          echo -e "======== No existing mock responses for ${TEST_RUNTIME}_runtime/$agent/$test_name, skipping step 2 & 3"
          echo -e "============================================================\n\n"
        else
          echo -e "\n============================================================"
          echo -e "======== STEP 2: $test_name failed, regenerating prompts for $agent WITHOUT money cost"
          echo -e "============================================================\n\n"

          # Temporarily disable 'exit on error'
          set +e
          regenerate_without_llm

          echo -e "\n============================================================"
          echo -e "======== STEP 3: $test_name prompts regenerated for $agent, rerun test again to verify"
          echo -e "============================================================\n\n\n"
          run_test
          TEST_STATUS=$?
          # Re-enable 'exit on error'
          set -e
        fi

        if [[ $TEST_STATUS -ne 0 ]]; then
          echo -e "\n============================================================"
          if [[ -n "$FORCE_USE_LLM" ]]; then
            echo -e "======== STEP 4: $test_name REGENERATION for $agent WITH money cost"
          else
            echo -e "======== STEP 4: $test_name failed, regenerating prompts and responses for $agent WITH money cost"
          fi
          echo -e "============================================================\n\n\n"

          regenerate_with_llm

          echo -e "\n============================================================"
          echo -e "======== STEP 5: $test_name prompts and responses regenerated for $agent, rerun test again to verify"
          echo -e "============================================================\n\n\n"

          cd "$PROJECT_ROOT/tests"
          cd "$PROJECT_ROOT"

          # Temporarily disable 'exit on error'
          set +e
          run_test
          TEST_STATUS=$?
          # Re-enable 'exit on error'
          set -e

          if [[ $TEST_STATUS -ne 0 ]]; then
            echo -e "\n\n============================================================"
            echo -e "========== $test_name for $agent RERUN FAILED"
            echo -e "============================================================"
            echo -e "There are multiple possibilities:"
            echo -e " 1. The agent is unable to finish the task within $MAX_ITERATIONS steps."
            echo -e " 2. The agent thinks itself has finished the task, but fails the validation in the test code."
            echo -e " 3. There is something non-deterministic in the prompt."
            echo -e " 4. There is a bug in this script, or in OpenHands code."
            echo -e "NOTE: Some of the above problems could sometimes be fixed by a retry (with a more powerful LLM)."
            echo -e " You could also consider improving the agent, increasing MAX_ITERATIONS, or skipping this test for this agent."
            echo -e "============================================================\n\n"
            exit 1
          else
            echo -e "\n\n============================================================"
            echo -e "========$test_name for $agent RERUN PASSED"
            echo -e "============================================================\n\n\n"
            sleep 1
          fi
        else
          echo -e "\n\n============================================================"
          echo -e "========$test_name for $agent RERUN PASSED"
          echo -e "============================================================\n\n\n"
          sleep 1
        fi
      else
        echo -e "\n\n============================================================"
        echo -e "\n========== $test_name for $agent PASSED"
        echo -e "\n============================================================\n\n\n"
        sleep 1
      fi
    done

    if [[ "$test_name" == "test_browse_internet" ]]; then
      if [[ -n "${HTTP_SERVER_PID:-}" ]]; then
        kill "$HTTP_SERVER_PID" || true
        # Forget the PID so the exit handler does not signal it a second time.
        unset HTTP_SERVER_PID
      fi
    fi
  done
  # Final tidy-up: remove the log directory first, then the test workspace,
  # report completion, and finish in the project root.
  for _leftover in "$LOG_DIR" "$WORKSPACE_BASE"; do
    rm -rf "$_leftover"
  done
  echo "Done!"
  cd "$PROJECT_ROOT"