  1. #!/bin/bash
  2. set -eo pipefail
  3. ##############################################################
  4. ## CONSTANTS AND ENVIRONMENTAL VARIABLES ##
  5. ##############################################################
  6. # unset environmental variables that might disturb testing
  7. unset OPENAI_API_KEY
  8. unset SANDBOX_ENV_OPENAI_API_KEY
  9. unset OPENAI_BASE_URL
  10. unset OPENAI_MODEL
  11. # Get the absolute path of the script directory
  12. get_script_dir() {
  13. local source="${BASH_SOURCE[0]}"
  14. while [ -h "$source" ]; do
  15. local dir="$( cd -P "$( dirname "$source" )" && pwd )"
  16. source="$(readlink "$source")"
  17. [[ $source != /* ]] && source="$dir/$source"
  18. done
  19. echo "$( cd -P "$( dirname "$source" )" && pwd )"
  20. }
  21. TMP_FILE="${TMP_FILE:-tmp.log}"
  22. if [ -z $WORKSPACE_MOUNT_PATH ]; then
  23. WORKSPACE_MOUNT_PATH=$(pwd)
  24. fi
  25. if [ -z $WORKSPACE_BASE ]; then
  26. WORKSPACE_BASE=$(pwd)
  27. fi
  28. export SCRIPT_DIR=$(get_script_dir)
  29. export PROJECT_ROOT=$(realpath "$SCRIPT_DIR/../..")
  30. WORKSPACE_BASE=${WORKSPACE_BASE}/_test_workspace
  31. mkdir -p $WORKSPACE_BASE
  32. chmod -R 777 $WORKSPACE_BASE
  33. WORKSPACE_BASE=$(realpath $WORKSPACE_BASE)
  34. WORKSPACE_MOUNT_PATH=${WORKSPACE_MOUNT_PATH}/_test_workspace
  35. mkdir -p $WORKSPACE_MOUNT_PATH
  36. chmod -R 777 $WORKSPACE_MOUNT_PATH
  37. WORKSPACE_MOUNT_PATH=$(realpath $WORKSPACE_MOUNT_PATH)
  38. echo "Current working directory: $(pwd)"
  39. echo "SCRIPT_DIR: $SCRIPT_DIR"
  40. echo "PROJECT_ROOT: $PROJECT_ROOT"
  41. echo "WORKSPACE_BASE: $WORKSPACE_BASE"
  42. echo "WORKSPACE_MOUNT_PATH: $WORKSPACE_MOUNT_PATH"
  43. # Ensure we're in the correct directory
  44. cd "$PROJECT_ROOT" || exit 1
  45. mkdir -p $WORKSPACE_BASE
  46. # use environmental variable if exists, otherwise use "ssh"
  47. TEST_RUNTIME="${TEST_RUNTIME:-eventstream}" # can be server or eventstream
  48. # TODO: set this as default after ServerRuntime is deprecated
  49. if [ "$TEST_RUNTIME" == "eventstream" ] && [ -z "$SANDBOX_CONTAINER_IMAGE" ]; then
  50. SANDBOX_CONTAINER_IMAGE="nikolaik/python-nodejs:python3.11-nodejs22"
  51. fi
  52. MAX_ITERATIONS=15
  53. echo "TEST_RUNTIME: $TEST_RUNTIME"
  54. agents=(
  55. "DelegatorAgent"
  56. "ManagerAgent"
  57. "BrowsingAgent"
  58. "CodeActAgent"
  59. "PlannerAgent"
  60. "CodeActSWEAgent"
  61. )
  62. tasks=(
  63. "Fix typos in bad.txt."
  64. "Write a shell script 'hello.sh' that prints 'hello'."
  65. "Use Jupyter IPython to write a text file containing 'hello world' to '/workspace/test.txt'."
  66. "Write a git commit message for the current staging area."
  67. "Install and import pymsgbox==1.0.9 and print it's version in /workspace/test.txt."
  68. "Browse localhost:8000, and tell me the ultimate answer to life."
  69. )
  70. test_names=(
  71. "test_edits"
  72. "test_write_simple_script"
  73. "test_ipython"
  74. "test_simple_task_rejection"
  75. "test_ipython_module"
  76. "test_browse_internet"
  77. )
  78. num_of_tests=${#test_names[@]}
  79. num_of_agents=${#agents[@]}
  80. ##############################################################
  81. ## FUNCTIONS ##
  82. ##############################################################
  83. # run integration test against a specific agent & test
  84. run_test() {
  85. # Ensure we're in the correct directory
  86. cd "$PROJECT_ROOT" || exit 1
  87. local pytest_cmd="poetry run pytest --cache-clear -vvsxx $SCRIPT_DIR/test_agent.py::$test_name"
  88. # Check if TEST_IN_CI is defined
  89. if [ -n "$TEST_IN_CI" ]; then
  90. pytest_cmd+=" --cov=agenthub --cov=opendevin --cov-report=xml --cov-append"
  91. fi
  92. env SCRIPT_DIR="$SCRIPT_DIR" \
  93. PROJECT_ROOT="$PROJECT_ROOT" \
  94. WORKSPACE_BASE=$WORKSPACE_BASE \
  95. WORKSPACE_MOUNT_PATH=$WORKSPACE_MOUNT_PATH \
  96. MAX_ITERATIONS=$MAX_ITERATIONS \
  97. DEFAULT_AGENT=$agent \
  98. TEST_RUNTIME="$TEST_RUNTIME" \
  99. SANDBOX_CONTAINER_IMAGE="$SANDBOX_CONTAINER_IMAGE" \
  100. $pytest_cmd 2>&1 | tee $TMP_FILE
  101. # Capture the exit code of pytest
  102. pytest_exit_code=${PIPESTATUS[0]}
  103. if grep -q "docker.errors.DockerException" $TMP_FILE; then
  104. echo "Error: docker.errors.DockerException found in the output. Exiting."
  105. echo "Please check if your Docker daemon is running!"
  106. exit 1
  107. fi
  108. if grep -q "tenacity.RetryError" $TMP_FILE; then
  109. echo "Error: tenacity.RetryError found in the output. Exiting."
  110. echo "This is mostly a transient error. Please retry."
  111. exit 1
  112. fi
  113. if grep -q "ExceptionPxssh" $TMP_FILE; then
  114. echo "Error: ExceptionPxssh found in the output. Exiting."
  115. echo "Could not connect to sandbox via ssh. Please stop any stale docker container and retry."
  116. exit 1
  117. fi
  118. if grep -q "Address already in use" $TMP_FILE; then
  119. echo "Error: Address already in use found in the output. Exiting."
  120. echo "Browsing tests need a local http server. Please check if there's any zombie process running start_http_server.py."
  121. exit 1
  122. fi
  123. # Return the exit code of pytest
  124. return $pytest_exit_code
  125. }
  126. # browsing capability needs a local http server
  127. launch_http_server() {
  128. poetry run python $SCRIPT_DIR/start_http_server.py &
  129. HTTP_SERVER_PID=$!
  130. echo "Test http server launched, PID = $HTTP_SERVER_PID"
  131. sleep 10
  132. }
  133. cleanup() {
  134. echo "Cleaning up before exit..."
  135. if [ -n "$HTTP_SERVER_PID" ]; then
  136. echo "Killing HTTP server..."
  137. kill $HTTP_SERVER_PID || true
  138. unset HTTP_SERVER_PID
  139. fi
  140. [ -f $TMP_FILE ] && rm $TMP_FILE
  141. echo "Cleanup done!"
  142. }
  143. # Trap the EXIT signal to run the cleanup function
  144. trap cleanup EXIT
  145. # generate prompts again, using existing LLM responses under tests/integration/mock/[test_runtime]_runtime/[agent]/[test_name]/response_*.log
  146. # this is a compromise; the prompts might be non-sense yet still pass the test, because we don't use a real LLM to
  147. # respond to the prompts. The benefit is developers don't have to regenerate real responses from LLM, if they only
  148. # apply a small change to prompts.
  149. regenerate_without_llm() {
  150. # set -x to print the command being executed
  151. set -x
  152. env SCRIPT_DIR="$SCRIPT_DIR" \
  153. PROJECT_ROOT="$PROJECT_ROOT" \
  154. WORKSPACE_BASE=$WORKSPACE_BASE \
  155. WORKSPACE_MOUNT_PATH=$WORKSPACE_MOUNT_PATH \
  156. MAX_ITERATIONS=$MAX_ITERATIONS \
  157. FORCE_APPLY_PROMPTS=true \
  158. DEFAULT_AGENT=$agent \
  159. TEST_RUNTIME="$TEST_RUNTIME" \
  160. SANDBOX_CONTAINER_IMAGE="$SANDBOX_CONTAINER_IMAGE" \
  161. poetry run pytest -s $SCRIPT_DIR/test_agent.py::$test_name
  162. set +x
  163. }
  164. regenerate_with_llm() {
  165. if [[ "$test_name" = "test_browse_internet" ]]; then
  166. launch_http_server
  167. fi
  168. rm -rf $WORKSPACE_BASE/*
  169. if [ -d "$SCRIPT_DIR/workspace/$test_name" ]; then
  170. cp -r "$SCRIPT_DIR/workspace/$test_name"/* $WORKSPACE_BASE
  171. fi
  172. rm -rf logs
  173. rm -rf "$SCRIPT_DIR/mock/${TEST_RUNTIME}_runtime/$agent/$test_name/*"
  174. # set -x to print the command being executed
  175. set -x
  176. echo -e "/exit\n" | \
  177. env SCRIPT_DIR="$SCRIPT_DIR" \
  178. PROJECT_ROOT="$PROJECT_ROOT" \
  179. DEBUG=true \
  180. WORKSPACE_BASE=$WORKSPACE_BASE \
  181. WORKSPACE_MOUNT_PATH=$WORKSPACE_MOUNT_PATH \
  182. DEFAULT_AGENT=$agent \
  183. RUNTIME="$TEST_RUNTIME" \
  184. SANDBOX_CONTAINER_IMAGE="$SANDBOX_CONTAINER_IMAGE" \
  185. poetry run python "$PROJECT_ROOT/opendevin/core/main.py" \
  186. -i $MAX_ITERATIONS \
  187. -t "$task Do not ask me for confirmation at any point." \
  188. -c $agent
  189. set +x
  190. mkdir -p "$SCRIPT_DIR/mock/${TEST_RUNTIME}_runtime/$agent/$test_name/"
  191. mv logs/llm/**/* "$SCRIPT_DIR/mock/${TEST_RUNTIME}_runtime/$agent/$test_name/"
  192. kill $HTTP_SERVER_PID || true
  193. }
  194. ##############################################################
  195. ## MAIN PROGRAM ##
  196. ##############################################################
  197. if [ "$num_of_tests" -ne "${#test_names[@]}" ]; then
  198. echo "Every task must correspond to one test case"
  199. exit 1
  200. fi
  201. rm -rf logs
  202. rm -rf $WORKSPACE_BASE/*
# Outer loop over tests; inner loop over agents. For each pair:
#   STEP 1: run the existing mock-backed test.
#   STEP 2/3: on failure, regenerate prompts from existing mock responses
#             (no LLM cost) and re-run.
#   STEP 4/5: if still failing, regenerate prompts AND responses with a
#             real LLM (costs money) and re-run; a final failure aborts.
for ((i = 0; i < num_of_tests; i++)); do
  task=${tasks[i]}
  test_name=${test_names[i]}
  # skip other tests if only one test is specified
  if [[ -n "$ONLY_TEST_NAME" && "$ONLY_TEST_NAME" != "$test_name" ]]; then
    continue
  fi
  for ((j = 0; j < num_of_agents; j++)); do
    agent=${agents[j]}
    # skip other agents if only one agent is specified
    if [[ -n "$ONLY_TEST_AGENT" && "$ONLY_TEST_AGENT" != "$agent" ]]; then
      continue
    fi
    echo -e "\n\n\n\n========STEP 1: Running $test_name for $agent========\n\n\n\n"
    # Reset the workspace, seeding it with any test-specific fixture files.
    rm -rf $WORKSPACE_BASE/*
    if [ -d "$SCRIPT_DIR/workspace/$test_name" ]; then
      cp -r "$SCRIPT_DIR/workspace/$test_name"/* $WORKSPACE_BASE
    fi
    # In TEST_ONLY mode any failure is fatal; otherwise tolerate a failing
    # first run and fall through to the regeneration steps below.
    if [ "$TEST_ONLY" = true ]; then
      set -e
    else
      # Temporarily disable 'exit on error'
      set +e
    fi
    TEST_STATUS=1
    # NOTE(review): $FORCE_REGENERATE is unquoted here — works for empty or
    # simple values, but would break on a value containing spaces.
    if [ -z $FORCE_REGENERATE ]; then
      run_test
      TEST_STATUS=$?
    fi
    # Re-enable 'exit on error'
    set -e
    if [[ $TEST_STATUS -ne 0 ]]; then
      if [ "$FORCE_USE_LLM" = true ]; then
        echo -e "\n\n\n\n========FORCE_USE_LLM, skipping step 2 & 3========\n\n\n\n"
      elif [ ! -d "$SCRIPT_DIR/mock/${TEST_RUNTIME}_runtime/$agent/$test_name" ]; then
        echo -e "\n\n\n\n========No existing mock responses for ${TEST_RUNTIME}_runtime/$agent/$test_name, skipping step 2 & 3========\n\n\n\n"
      else
        echo -e "\n\n\n\n========STEP 2: $test_name failed, regenerating prompts for $agent WITHOUT money cost========\n\n\n\n"
        # Temporarily disable 'exit on error'
        set +e
        regenerate_without_llm
        echo -e "\n\n\n\n========STEP 3: $test_name prompts regenerated for $agent, rerun test again to verify========\n\n\n\n"
        run_test
        TEST_STATUS=$?
        # Re-enable 'exit on error'
        set -e
      fi
      if [[ $TEST_STATUS -ne 0 ]]; then
        echo -e "\n\n\n\n========STEP 4: $test_name failed, regenerating prompts and responses for $agent WITH money cost========\n\n\n\n"
        regenerate_with_llm
        echo -e "\n\n\n\n========STEP 5: $test_name prompts and responses regenerated for $agent, rerun test again to verify========\n\n\n\n"
        # Temporarily disable 'exit on error'
        set +e
        run_test
        TEST_STATUS=$?
        # Re-enable 'exit on error'
        set -e
        if [[ $TEST_STATUS -ne 0 ]]; then
          echo -e "\n\n\n\n========$test_name for $agent RERUN FAILED========\n\n\n\n"
          echo -e "There are multiple possibilities:"
          echo -e " 1. The agent is unable to finish the task within $MAX_ITERATIONS steps."
          echo -e " 2. The agent thinks itself has finished the task, but fails the validation in the test code."
          echo -e " 3. There is something non-deterministic in the prompt."
          echo -e " 4. There is a bug in this script, or in OpenDevin code."
          echo -e "NOTE: Some of the above problems could sometimes be fixed by a retry (with a more powerful LLM)."
          echo -e " You could also consider improving the agent, increasing MAX_ITERATIONS, or skipping this test for this agent."
          exit 1
        else
          echo -e "\n\n\n\n========$test_name for $agent RERUN PASSED========\n\n\n\n"
          sleep 1
        fi
      else
        echo -e "\n\n\n\n========$test_name for $agent RERUN PASSED========\n\n\n\n"
        sleep 1
      fi
    else
      echo -e "\n\n\n\n========$test_name for $agent PASSED========\n\n\n\n"
      sleep 1
    fi
  done
done
# Final cleanup of artifacts (the EXIT trap handles TMP_FILE and the server).
rm -rf logs
rm -rf $WORKSPACE_BASE
echo "Done!"