regenerate.sh 9.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304
  #!/bin/bash
  set -eo pipefail

  ##############################################################
  ##           CONSTANTS AND ENVIRONMENTAL VARIABLES          ##
  ##############################################################

  # Unset environment variables that might disturb testing.
  unset OPENAI_API_KEY
  unset SANDBOX_ENV_OPENAI_API_KEY
  unset OPENAI_BASE_URL
  unset OPENAI_MODEL

  # File that receives a copy of each pytest run's output (see run_test).
  TMP_FILE="${TMP_FILE:-tmp.log}"

  # Fall back to the current directory when the caller did not provide a
  # workspace location. The expansions are quoted so that paths containing
  # whitespace neither break the test nor get split into several words.
  if [[ -z "${WORKSPACE_MOUNT_PATH:-}" ]]; then
    WORKSPACE_MOUNT_PATH="$(pwd)"
  fi
  if [[ -z "${WORKSPACE_BASE:-}" ]]; then
    WORKSPACE_BASE="$(pwd)"
  fi

  # All test artifacts live in a dedicated sub-directory of the workspace.
  WORKSPACE_MOUNT_PATH+="/_test_workspace"
  WORKSPACE_BASE+="/_test_workspace"
  WORKSPACE_MOUNT_PATH_IN_SANDBOX="/workspace"

  echo "WORKSPACE_BASE: $WORKSPACE_BASE"
  echo "WORKSPACE_MOUNT_PATH: $WORKSPACE_MOUNT_PATH"
  echo "WORKSPACE_MOUNT_PATH_IN_SANDBOX: $WORKSPACE_MOUNT_PATH_IN_SANDBOX"

  mkdir -p -- "$WORKSPACE_BASE"

  # Use the SANDBOX_TYPE environment variable if it is set, otherwise "ssh".
  SANDBOX_BOX_TYPE="${SANDBOX_TYPE:-ssh}"
  # TODO: we should also test PERSIST_SANDBOX = true, once it's fixed
  PERSIST_SANDBOX=false
  MAX_ITERATIONS=15

  # NOTE: the values above are intentionally NOT marked readonly; they are
  # re-assigned as per-command environment prefixes further down the script.

  # Agents to exercise; every test below is run against each of them.
  agents=(
    "DelegatorAgent"
    "ManagerAgent"
    "BrowsingAgent"
    "MonologueAgent"
    "CodeActAgent"
    "PlannerAgent"
    "CodeActSWEAgent"
  )

  # tasks[i] is the instruction handed to the agent for test_names[i];
  # the two arrays are parallel and must stay the same length.
  tasks=(
    "Fix typos in bad.txt."
    "Write a shell script 'hello.sh' that prints 'hello'."
    "Use Jupyter IPython to write a text file containing 'hello world' to '/workspace/test.txt'."
    "Write a git commit message for the current staging area."
    "Install and import pymsgbox==1.0.9 and print it's version in /workspace/test.txt."
    "Browse localhost:8000, and tell me the ultimate answer to life."
  )
  test_names=(
    "test_edits"
    "test_write_simple_script"
    "test_ipython"
    "test_simple_task_rejection"
    "test_ipython_module"
    "test_browse_internet"
  )

  num_of_tests=${#test_names[@]}
  num_of_agents=${#agents[@]}
  57. ##############################################################
  58. ## FUNCTIONS ##
  59. ##############################################################
  #######################################
  # Run one integration test against one agent.
  # Globals (read):
  #   test_name, agent, TEST_IN_CI, TMP_FILE, SANDBOX_BOX_TYPE,
  #   PERSIST_SANDBOX, WORKSPACE_BASE, WORKSPACE_MOUNT_PATH,
  #   WORKSPACE_MOUNT_PATH_IN_SANDBOX, MAX_ITERATIONS
  # Outputs:
  #   pytest's stdout and stderr (merged) are printed and also saved to
  #   $TMP_FILE.
  # Returns:
  #   The exit status of pytest.
  # Exits:
  #   Terminates the whole script with status 1 when the captured output
  #   contains one of the known environment-level error markers below.
  #######################################
  run_test() {
    local pytest_exit_code
    local idx

    # Build the command as an array so each argument stays a single word.
    local -a pytest_cmd=(
      poetry run pytest -s "./tests/integration/test_agent.py::${test_name}"
    )
    # Collect coverage only when TEST_IN_CI is defined and non-empty.
    if [[ -n "${TEST_IN_CI:-}" ]]; then
      pytest_cmd+=(--cov=agenthub --cov=opendevin --cov-report=xml --cov-append)
    fi

    SANDBOX_BOX_TYPE="$SANDBOX_BOX_TYPE" \
      PERSIST_SANDBOX="$PERSIST_SANDBOX" \
      WORKSPACE_BASE="$WORKSPACE_BASE" \
      WORKSPACE_MOUNT_PATH="$WORKSPACE_MOUNT_PATH" \
      WORKSPACE_MOUNT_PATH_IN_SANDBOX="$WORKSPACE_MOUNT_PATH_IN_SANDBOX" \
      MAX_ITERATIONS="$MAX_ITERATIONS" \
      DEFAULT_AGENT="$agent" \
      "${pytest_cmd[@]}" 2>&1 | tee -- "$TMP_FILE"
    # Must be read immediately: PIPESTATUS is overwritten by the next command.
    pytest_exit_code=${PIPESTATUS[0]}

    # Error markers that abort the whole run, paired with the hint printed
    # for each one (markers[i] <-> hints[i]).
    local -a markers=(
      "docker.errors.DockerException"
      "tenacity.RetryError"
      "ExceptionPxssh"
      "Address already in use"
    )
    local -a hints=(
      "Please check if your Docker daemon is running!"
      "This is mostly a transient error. Please retry."
      "Could not connect to sandbox via ssh. Please stop any stale docker container and retry."
      "Browsing tests need a local http server. Please check if there's any zombie process running start_http_server.py."
    )
    for idx in "${!markers[@]}"; do
      # -F: the markers are literal strings, not regular expressions.
      if grep -qF -- "${markers[idx]}" "$TMP_FILE"; then
        echo "Error: ${markers[idx]} found in the output. Exiting."
        echo "${hints[idx]}"
        exit 1
      fi
    done

    # Return the exit code of pytest
    return "$pytest_exit_code"
  }
  #######################################
  # Start the local HTTP server needed for the browsing capability.
  # Globals (written):
  #   HTTP_SERVER_PID - PID of the background job that was started
  # Outputs:
  #   One line on stdout announcing the PID.
  #######################################
  launch_http_server() {
    # Run the server script as a background job and remember its PID so it
    # can be signalled later.
    poetry run python tests/integration/start_http_server.py &
    HTTP_SERVER_PID="$!"
    printf 'Test http server launched, PID = %s\n' "${HTTP_SERVER_PID}"
    # Fixed pause before continuing -- presumably to give the server time to
    # start; nothing here checks that it is actually ready.
    sleep 10
  }
  #######################################
  # Exit handler: stop the HTTP server (if one was started) and remove the
  # temporary pytest log.
  # Globals:
  #   HTTP_SERVER_PID (read, then unset), TMP_FILE (read)
  # Outputs:
  #   Progress messages on stdout.
  #######################################
  cleanup() {
    echo "Cleaning up before exit..."
    if [[ -n "${HTTP_SERVER_PID:-}" ]]; then
      echo "Killing HTTP server..."
      # The server may already have exited. A failing kill must not abort
      # the handler (the script runs with 'set -e'), otherwise the temp file
      # below would be left behind.
      kill "$HTTP_SERVER_PID" 2>/dev/null || true
      unset HTTP_SERVER_PID
    fi
    # Only a regular file is removed, and the path is quoted so that a
    # TMP_FILE containing whitespace is handled as a single path.
    if [[ -f "${TMP_FILE:-}" ]]; then
      rm -f -- "$TMP_FILE"
    fi
    echo "Cleanup done!"
  }

  # Trap the EXIT signal to run the cleanup function
  trap cleanup EXIT
  #######################################
  # Regenerate the prompts only, replaying the LLM responses already stored
  # under tests/integration/mock/[agent]/[test_name]/response_*.log.
  #
  # This is a trade-off: because no real LLM answers the new prompts, they
  # may be nonsensical and the test can still pass. In exchange, developers
  # who only tweak prompts slightly do not have to pay for fresh responses.
  #
  # Globals (read):
  #   test_name, agent, SANDBOX_BOX_TYPE, PERSIST_SANDBOX, WORKSPACE_BASE,
  #   WORKSPACE_MOUNT_PATH, WORKSPACE_MOUNT_PATH_IN_SANDBOX, MAX_ITERATIONS
  #######################################
  regenerate_without_llm() {
    # Resolve the pytest node id before tracing starts, so that only the
    # actual invocation shows up in the xtrace output.
    local node_id="./tests/integration/test_agent.py::${test_name}"

    # Echo the command being executed.
    set -x
    SANDBOX_BOX_TYPE="${SANDBOX_BOX_TYPE}" \
      PERSIST_SANDBOX="${PERSIST_SANDBOX}" \
      WORKSPACE_BASE="${WORKSPACE_BASE}" \
      WORKSPACE_MOUNT_PATH="${WORKSPACE_MOUNT_PATH}" \
      WORKSPACE_MOUNT_PATH_IN_SANDBOX="${WORKSPACE_MOUNT_PATH_IN_SANDBOX}" \
      MAX_ITERATIONS="${MAX_ITERATIONS}" \
      FORCE_APPLY_PROMPTS=true \
      DEFAULT_AGENT="${agent}" \
      poetry run pytest -s "${node_id}"
    set +x
  }
  #######################################
  # Regenerate prompts AND responses by running the agent on the task, then
  # store the produced LLM logs as the new mock data for this agent/test.
  # Globals (read):
  #   test_name, agent, task, SANDBOX_BOX_TYPE, PERSIST_SANDBOX,
  #   WORKSPACE_BASE, WORKSPACE_MOUNT_PATH, WORKSPACE_MOUNT_PATH_IN_SANDBOX,
  #   MAX_ITERATIONS
  # Globals (written):
  #   HTTP_SERVER_PID (set by launch_http_server, unset again before return)
  # Side effects:
  #   Empties $WORKSPACE_BASE, removes ./logs, and replaces the contents of
  #   tests/integration/mock/$agent/$test_name/.
  #######################################
  regenerate_with_llm() {
    local mock_dir="tests/integration/mock/${agent}/${test_name}"
    local fixture_dir="tests/integration/workspace/${test_name}"
    local server_started=false

    if [[ "$test_name" == "test_browse_internet" ]]; then
      launch_http_server
      server_started=true
    fi

    # Reset the workspace. ':?' aborts instead of expanding to '/*' should
    # WORKSPACE_BASE ever be empty; the quotes keep paths with spaces intact.
    rm -rf -- "${WORKSPACE_BASE:?}"/*
    if [[ -d "$fixture_dir" ]]; then
      cp -r "$fixture_dir"/* "$WORKSPACE_BASE"
    fi

    rm -rf logs
    rm -rf -- "$mock_dir"/*

    # set -x to print the command being executed
    set -x
    echo -e "/exit\n" | \
      DEBUG=true \
      SANDBOX_BOX_TYPE="$SANDBOX_BOX_TYPE" \
      PERSIST_SANDBOX="$PERSIST_SANDBOX" \
      WORKSPACE_BASE="$WORKSPACE_BASE" \
      WORKSPACE_MOUNT_PATH="$WORKSPACE_MOUNT_PATH" AGENT="$agent" \
      WORKSPACE_MOUNT_PATH_IN_SANDBOX="$WORKSPACE_MOUNT_PATH_IN_SANDBOX" \
      poetry run python ./opendevin/core/main.py \
      -i "$MAX_ITERATIONS" \
      -t "$task Do not ask me for confirmation at any point." \
      -c "$agent"
    set +x

    # Stop the server started above once the agent run is over. Without this,
    # every further agent that regenerates the browsing test would launch
    # another server and overwrite HTTP_SERVER_PID, leaving the earlier
    # processes running with nothing left to kill them.
    # NOTE(review): assumes later pytest runs do not depend on this server;
    # step 1 already runs the test before any server is launched here.
    if [[ "$server_started" == true && -n "${HTTP_SERVER_PID:-}" ]]; then
      kill "$HTTP_SERVER_PID" 2>/dev/null || true
      wait "$HTTP_SERVER_PID" 2>/dev/null || true
      unset HTTP_SERVER_PID
    fi

    mkdir -p -- "$mock_dir"
    # globstar is not enabled, so '**' matches exactly one directory level.
    mv logs/llm/**/* "$mock_dir"/
  }
  164. ##############################################################
  165. ## MAIN PROGRAM ##
  166. ##############################################################
  # tasks[] and test_names[] are parallel arrays; refuse to run when they
  # have drifted apart. (Comparing the two array lengths directly -- the
  # cached num_of_tests is itself derived from test_names and would always
  # match.)
  if [[ "${#tasks[@]}" -ne "${#test_names[@]}" ]]; then
    echo "Every task must correspond to one test case"
    exit 1
  fi

  # Start from a clean slate. ':?' guards against expanding to '/*' should
  # WORKSPACE_BASE ever be empty.
  rm -rf logs
  rm -rf -- "${WORKSPACE_BASE:?}"/*

  for ((i = 0; i < num_of_tests; i++)); do
    task=${tasks[i]}
    test_name=${test_names[i]}

    # skip other tests if only one test is specified
    if [[ -n "$ONLY_TEST_NAME" && "$ONLY_TEST_NAME" != "$test_name" ]]; then
      continue
    fi

    for ((j = 0; j < num_of_agents; j++)); do
      agent=${agents[j]}

      # skip other agents if only one agent is specified
      if [[ -n "$ONLY_TEST_AGENT" && "$ONLY_TEST_AGENT" != "$agent" ]]; then
        continue
      fi

      echo -e "\n\n\n\n========STEP 1: Running $test_name for $agent========\n\n\n\n"
      # Reset the workspace and seed it with this test's fixture files.
      rm -rf -- "${WORKSPACE_BASE:?}"/*
      if [[ -d "tests/integration/workspace/$test_name" ]]; then
        cp -r "tests/integration/workspace/$test_name"/* "$WORKSPACE_BASE"
      fi

      if [[ "$TEST_ONLY" == true ]]; then
        # In test-only mode a failing test aborts the script right away.
        set -e
      else
        # Temporarily disable 'exit on error'
        set +e
      fi

      # Assume failure so that FORCE_REGENERATE skips straight to regeneration.
      TEST_STATUS=1
      if [[ -z "${FORCE_REGENERATE:-}" ]]; then
        run_test
        TEST_STATUS=$?
      fi
      # Re-enable 'exit on error'
      set -e

      if [[ $TEST_STATUS -ne 0 ]]; then

        if [[ "$FORCE_USE_LLM" == true ]]; then
          echo -e "\n\n\n\n========FORCE_USE_LLM, skipping step 2 & 3========\n\n\n\n"
        elif [[ ! -d "tests/integration/mock/$agent/$test_name" ]]; then
          echo -e "\n\n\n\n========No existing mock responses for $agent/$test_name, skipping step 2 & 3========\n\n\n\n"
        else
          echo -e "\n\n\n\n========STEP 2: $test_name failed, regenerating prompts for $agent WITHOUT money cost========\n\n\n\n"

          # Temporarily disable 'exit on error'
          set +e
          regenerate_without_llm

          echo -e "\n\n\n\n========STEP 3: $test_name prompts regenerated for $agent, rerun test again to verify========\n\n\n\n"
          run_test
          TEST_STATUS=$?
          # Re-enable 'exit on error'
          set -e
        fi

        if [[ $TEST_STATUS -ne 0 ]]; then
          echo -e "\n\n\n\n========STEP 4: $test_name failed, regenerating prompts and responses for $agent WITH money cost========\n\n\n\n"

          regenerate_with_llm

          echo -e "\n\n\n\n========STEP 5: $test_name prompts and responses regenerated for $agent, rerun test again to verify========\n\n\n\n"
          # Temporarily disable 'exit on error'
          set +e
          run_test
          TEST_STATUS=$?
          # Re-enable 'exit on error'
          set -e

          if [[ $TEST_STATUS -ne 0 ]]; then
            echo -e "\n\n\n\n========$test_name for $agent RERUN FAILED========\n\n\n\n"
            echo -e "There are multiple possibilities:"
            echo -e " 1. The agent is unable to finish the task within $MAX_ITERATIONS steps."
            echo -e " 2. The agent thinks itself has finished the task, but fails the validation in the test code."
            echo -e " 3. There is something non-deterministic in the prompt."
            echo -e " 4. There is a bug in this script, or in OpenDevin code."
            echo -e "NOTE: Some of the above problems could sometimes be fixed by a retry (with a more powerful LLM)."
            echo -e " You could also consider improving the agent, increasing MAX_ITERATIONS, or skipping this test for this agent."
            exit 1
          else
            echo -e "\n\n\n\n========$test_name for $agent RERUN PASSED========\n\n\n\n"
            sleep 1
          fi
        else
          echo -e "\n\n\n\n========$test_name for $agent RERUN PASSED========\n\n\n\n"
          sleep 1
        fi
      else
        echo -e "\n\n\n\n========$test_name for $agent PASSED========\n\n\n\n"
        sleep 1
      fi
    done
  done

  # Remove everything the run produced.
  rm -rf logs
  rm -rf -- "${WORKSPACE_BASE:?}"
  echo "Done!"