regenerate.sh 7.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244
  1. #!/bin/bash
  2. set -eo pipefail
  3. ##############################################################
  4. ## CONSTANTS AND ENVIRONMENTAL VARIABLES ##
  5. ##############################################################
  6. if [ -z $WORKSPACE_MOUNT_PATH ]; then
  7. WORKSPACE_MOUNT_PATH=$(pwd)
  8. fi
  9. if [ -z $WORKSPACE_BASE ]; then
  10. WORKSPACE_BASE=$(pwd)
  11. fi
  12. WORKSPACE_MOUNT_PATH+="/_test_workspace"
  13. WORKSPACE_BASE+="/_test_workspace"
  14. WORKSPACE_MOUNT_PATH_IN_SANDBOX="/workspace"
  15. mkdir -p $WORKSPACE_BASE
  16. # use environmental variable if exist, otherwise use "ssh"
  17. SANDBOX_TYPE="${SANDBOX_TYPE:-ssh}"
  18. MAX_ITERATIONS=10
  19. agents=("DelegatorAgent" "ManagerAgent" "BrowsingAgent" "MonologueAgent" "CodeActAgent" "PlannerAgent")
  20. tasks=(
  21. "Fix typos in bad.txt."
  22. "Write a shell script 'hello.sh' that prints 'hello'."
  23. "Use Jupyter IPython to write a text file containing 'hello world' to '/workspace/test.txt'."
  24. "Write a git commit message for the current staging area."
  25. "Install and import pymsgbox==1.0.9 and print it's version in /workspace/test.txt."
  26. "Browse localhost:8000, and tell me the ultimate answer to life."
  27. )
  28. test_names=(
  29. "test_edits"
  30. "test_write_simple_script"
  31. "test_ipython"
  32. "test_simple_task_rejection"
  33. "test_ipython_module"
  34. "test_browse_internet"
  35. )
  36. num_of_tests=${#test_names[@]}
  37. num_of_agents=${#agents[@]}
  38. ##############################################################
  39. ## FUNCTIONS ##
  40. ##############################################################
  41. # run integration test against a specific agent & test
  42. run_test() {
  43. local pytest_cmd="poetry run pytest -s ./tests/integration/test_agent.py::$test_name"
  44. # Check if TEST_IN_CI is defined
  45. if [ -n "$TEST_IN_CI" ]; then
  46. pytest_cmd+=" --cov=agenthub --cov=opendevin --cov-report=xml --cov-append"
  47. fi
  48. SANDBOX_TYPE=$SANDBOX_TYPE \
  49. WORKSPACE_BASE=$WORKSPACE_BASE \
  50. WORKSPACE_MOUNT_PATH=$WORKSPACE_MOUNT_PATH \
  51. WORKSPACE_MOUNT_PATH_IN_SANDBOX=$WORKSPACE_MOUNT_PATH_IN_SANDBOX \
  52. MAX_ITERATIONS=$MAX_ITERATIONS \
  53. AGENT=$agent \
  54. $pytest_cmd
  55. # return exit code of pytest
  56. return $?
  57. }
  58. # browsing capability needs a local http server
  59. launch_http_server() {
  60. poetry run python tests/integration/start_http_server.py &
  61. HTTP_SERVER_PID=$!
  62. sleep 10
  63. }
  64. # generate prompts again, using existing LLM responses under tests/integration/mock/[agent]/[test_name]/response_*.log
  65. # this is a compromise; the prompts might be non-sense yet still pass the test, because we don't use a real LLM to
  66. # respond to the prompts. The benefit is developers don't have to regenerate real responses from LLM, if they only
  67. # apply a small change to prompts.
  68. regenerate_without_llm() {
  69. # set -x to print the command being executed
  70. set -x
  71. SANDBOX_TYPE=$SANDBOX_TYPE \
  72. WORKSPACE_BASE=$WORKSPACE_BASE \
  73. WORKSPACE_MOUNT_PATH=$WORKSPACE_MOUNT_PATH \
  74. WORKSPACE_MOUNT_PATH_IN_SANDBOX=$WORKSPACE_MOUNT_PATH_IN_SANDBOX \
  75. MAX_ITERATIONS=$MAX_ITERATIONS \
  76. FORCE_APPLY_PROMPTS=true \
  77. AGENT=$agent \
  78. poetry run pytest -s ./tests/integration/test_agent.py::$test_name
  79. set +x
  80. }
  81. regenerate_with_llm() {
  82. if [[ "$test_name" = "test_browse_internet" ]]; then
  83. launch_http_server
  84. fi
  85. rm -rf $WORKSPACE_BASE
  86. mkdir -p $WORKSPACE_BASE
  87. if [ -d "tests/integration/workspace/$test_name" ]; then
  88. cp -r tests/integration/workspace/$test_name/* $WORKSPACE_BASE
  89. fi
  90. rm -rf logs
  91. rm -rf tests/integration/mock/$agent/$test_name/*
  92. # set -x to print the command being executed
  93. set -x
  94. echo -e "/exit\n" | \
  95. DEBUG=true \
  96. SANDBOX_TYPE=$SANDBOX_TYPE \
  97. WORKSPACE_BASE=$WORKSPACE_BASE \
  98. WORKSPACE_MOUNT_PATH=$WORKSPACE_MOUNT_PATH AGENT=$agent \
  99. WORKSPACE_MOUNT_PATH_IN_SANDBOX=$WORKSPACE_MOUNT_PATH_IN_SANDBOX \
  100. poetry run python ./opendevin/core/main.py \
  101. -i $MAX_ITERATIONS \
  102. -t "$task Do not ask me for confirmation at any point." \
  103. -c $agent
  104. set +x
  105. mkdir -p tests/integration/mock/$agent/$test_name/
  106. mv logs/llm/**/* tests/integration/mock/$agent/$test_name/
  107. if [[ "$test_name" = "test_browse_internet" ]]; then
  108. # Terminate the HTTP server
  109. kill $HTTP_SERVER_PID
  110. fi
  111. }
  112. ##############################################################
  113. ## MAIN PROGRAM ##
  114. ##############################################################
  115. if [ "$num_of_tests" -ne "${#test_names[@]}" ]; then
  116. echo "Every task must correspond to one test case"
  117. exit 1
  118. fi
  119. rm -rf logs
  120. rm -rf $WORKSPACE_BASE
  121. for ((i = 0; i < num_of_tests; i++)); do
  122. task=${tasks[i]}
  123. test_name=${test_names[i]}
  124. # skip other tests if only one test is specified
  125. if [[ -n "$ONLY_TEST_NAME" && "$ONLY_TEST_NAME" != "$test_name" ]]; then
  126. continue
  127. fi
  128. for ((j = 0; j < num_of_agents; j++)); do
  129. agent=${agents[j]}
  130. # skip other agents if only one agent is specified
  131. if [[ -n "$ONLY_TEST_AGENT" && "$ONLY_TEST_AGENT" != "$agent" ]]; then
  132. continue
  133. fi
  134. echo -e "\n\n\n\n========STEP 1: Running $test_name for $agent========\n\n\n\n"
  135. rm -rf $WORKSPACE_BASE
  136. mkdir $WORKSPACE_BASE
  137. if [ -d "tests/integration/workspace/$test_name" ]; then
  138. cp -r "tests/integration/workspace/$test_name"/* $WORKSPACE_BASE
  139. fi
  140. if [ "$TEST_ONLY" = true ]; then
  141. set -e
  142. else
  143. # Temporarily disable 'exit on error'
  144. set +e
  145. fi
  146. TEST_STATUS=1
  147. if [ -z $SKIP_TEST ]; then
  148. run_test
  149. TEST_STATUS=$?
  150. fi
  151. # Re-enable 'exit on error'
  152. set -e
  153. if [[ $TEST_STATUS -ne 0 ]]; then
  154. if [ "$FORCE_USE_LLM" = true ]; then
  155. echo -e "\n\n\n\n========FORCE_USE_LLM, skipping step 2 & 3========\n\n\n\n"
  156. elif [ ! -d "tests/integration/mock/$agent/$test_name" ]; then
  157. echo -e "\n\n\n\n========No existing mock responses for $agent/$test_name, skipping step 2 & 3========\n\n\n\n"
  158. else
  159. echo -e "\n\n\n\n========STEP 2: $test_name failed, regenerating prompts for $agent WITHOUT money cost========\n\n\n\n"
  160. # Temporarily disable 'exit on error'
  161. set +e
  162. regenerate_without_llm
  163. echo -e "\n\n\n\n========STEP 3: $test_name prompts regenerated for $agent, rerun test again to verify========\n\n\n\n"
  164. run_test
  165. TEST_STATUS=$?
  166. # Re-enable 'exit on error'
  167. set -e
  168. fi
  169. if [[ $TEST_STATUS -ne 0 ]]; then
  170. echo -e "\n\n\n\n========STEP 4: $test_name failed, regenerating prompts and responses for $agent WITH money cost========\n\n\n\n"
  171. regenerate_with_llm
  172. echo -e "\n\n\n\n========STEP 5: $test_name prompts and responses regenerated for $agent, rerun test again to verify========\n\n\n\n"
  173. # Temporarily disable 'exit on error'
  174. set +e
  175. run_test
  176. TEST_STATUS=$?
  177. # Re-enable 'exit on error'
  178. set -e
  179. if [[ $TEST_STATUS -ne 0 ]]; then
  180. echo -e "\n\n\n\n========$test_name for $agent RERUN FAILED========\n\n\n\n"
  181. echo -e "There are multiple possibilities:"
  182. echo -e " 1. The agent is unable to finish the task within $MAX_ITERATIONS steps."
  183. echo -e " 2. The agent thinks itself has finished the task, but fails the validation in the test code."
  184. echo -e " 3. There is something non-deterministic in the prompt."
  185. echo -e " 4. There is a bug in this script, or in OpenDevin code."
  186. echo -e "NOTE: Some of the above problems could sometimes be fixed by a retry (with a more powerful LLM)."
  187. echo -e " You could also consider improving the agent, increasing MAX_ITERATIONS, or skipping this test for this agent."
  188. exit 1
  189. else
  190. echo -e "\n\n\n\n========$test_name for $agent RERUN PASSED========\n\n\n\n"
  191. sleep 1
  192. fi
  193. else
  194. echo -e "\n\n\n\n========$test_name for $agent RERUN PASSED========\n\n\n\n"
  195. sleep 1
  196. fi
  197. else
  198. echo -e "\n\n\n\n========$test_name for $agent PASSED========\n\n\n\n"
  199. sleep 1
  200. fi
  201. done
  202. done
  203. rm -rf logs
  204. rm -rf $WORKSPACE_BASE
  205. echo "Done!"