regenerate.sh 7.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212
  1. #!/bin/bash
  2. set -eo pipefail
  3. ##############################################################
  4. ## CONSTANTS AND ENVIRONMENTAL VARIABLES ##
  5. ##############################################################
  6. if [ -z $WORKSPACE_MOUNT_PATH ]; then
  7. WORKSPACE_MOUNT_PATH=$(pwd)
  8. fi
  9. if [ -z $WORKSPACE_BASE ]; then
  10. WORKSPACE_BASE=$(pwd)
  11. fi
  12. WORKSPACE_MOUNT_PATH+="/_test_workspace"
  13. WORKSPACE_BASE+="/_test_workspace"
  14. WORKSPACE_MOUNT_PATH_IN_SANDBOX="/workspace"
  15. SANDBOX_TYPE="ssh"
  16. MAX_ITERATIONS=10
  17. agents=("MonologueAgent" "CodeActAgent" "PlannerAgent" "SWEAgent")
  18. tasks=(
  19. "Fix typos in bad.txt."
  20. "Write a shell script 'hello.sh' that prints 'hello'."
  21. "Use Jupyter IPython to write a text file containing 'hello world' to '/workspace/test.txt'."
  22. "Install and import pymsgbox==1.0.9 and print it's version in /workspace/test.txt."
  23. )
  24. test_names=(
  25. "test_edits"
  26. "test_write_simple_script"
  27. "test_ipython"
  28. "test_ipython_module"
  29. )
  30. num_of_tests=${#test_names[@]}
  31. num_of_agents=${#agents[@]}
  32. ##############################################################
  33. ## FUNCTIONS ##
  34. ##############################################################
  35. # run integration test against a specific agent & test
  36. run_test() {
  37. SANDBOX_TYPE=$SANDBOX_TYPE \
  38. WORKSPACE_BASE=$WORKSPACE_BASE \
  39. WORKSPACE_MOUNT_PATH=$WORKSPACE_MOUNT_PATH \
  40. WORKSPACE_MOUNT_PATH_IN_SANDBOX=$WORKSPACE_MOUNT_PATH_IN_SANDBOX \
  41. MAX_ITERATIONS=$MAX_ITERATIONS \
  42. AGENT=$agent \
  43. poetry run pytest -s ./tests/integration/test_agent.py::$test_name
  44. # return exit code of pytest
  45. return $?
  46. }
  47. # generate prompts again, using existing LLM responses under tests/integration/mock/[agent]/[test_name]/response_*.log
  48. # this is a compromise; the prompts might be non-sense yet still pass the test, because we don't use a real LLM to
  49. # respond to the prompts. The benefit is developers don't have to regenerate real responses from LLM, if they only
  50. # apply a small change to prompts.
  51. regenerate_without_llm() {
  52. # set -x to print the command being executed
  53. set -x
  54. SANDBOX_TYPE=$SANDBOX_TYPE \
  55. WORKSPACE_BASE=$WORKSPACE_BASE \
  56. WORKSPACE_MOUNT_PATH=$WORKSPACE_MOUNT_PATH \
  57. WORKSPACE_MOUNT_PATH_IN_SANDBOX=$WORKSPACE_MOUNT_PATH_IN_SANDBOX \
  58. MAX_ITERATIONS=$MAX_ITERATIONS \
  59. FORCE_APPLY_PROMPTS=true \
  60. AGENT=$agent \
  61. poetry run pytest -s ./tests/integration/test_agent.py::$test_name
  62. set +x
  63. }
  64. regenerate_with_llm() {
  65. rm -rf $WORKSPACE_BASE
  66. mkdir -p $WORKSPACE_BASE
  67. if [ -d "tests/integration/workspace/$test_name" ]; then
  68. cp -r tests/integration/workspace/$test_name/* $WORKSPACE_BASE
  69. fi
  70. rm -rf logs
  71. rm -rf tests/integration/mock/$agent/$test_name/*
  72. # set -x to print the command being executed
  73. set -x
  74. echo -e "/exit\n" | \
  75. DEBUG=true \
  76. SANDBOX_TYPE=$SANDBOX_TYPE \
  77. WORKSPACE_BASE=$WORKSPACE_BASE \
  78. WORKSPACE_MOUNT_PATH=$WORKSPACE_MOUNT_PATH AGENT=$agent \
  79. WORKSPACE_MOUNT_PATH_IN_SANDBOX=$WORKSPACE_MOUNT_PATH_IN_SANDBOX \
  80. poetry run python ./opendevin/core/main.py \
  81. -i $MAX_ITERATIONS \
  82. -t "$task Do not ask me for confirmation at any point." \
  83. -c $agent
  84. set +x
  85. mkdir -p tests/integration/mock/$agent/$test_name/
  86. mv logs/llm/**/* tests/integration/mock/$agent/$test_name/
  87. }
  88. ##############################################################
  89. ## MAIN PROGRAM ##
  90. ##############################################################
  91. if [ "$num_of_tests" -ne "${#test_names[@]}" ]; then
  92. echo "Every task must correspond to one test case"
  93. exit 1
  94. fi
  95. rm -rf logs
  96. rm -rf $WORKSPACE_BASE
  97. for ((i = 0; i < num_of_tests; i++)); do
  98. task=${tasks[i]}
  99. test_name=${test_names[i]}
  100. # skip other tests if only one test is specified
  101. if [[ -n "$ONLY_TEST_NAME" && "$ONLY_TEST_NAME" != "$test_name" ]]; then
  102. continue
  103. fi
  104. for ((j = 0; j < num_of_agents; j++)); do
  105. agent=${agents[j]}
  106. # skip other agents if only one agent is specified
  107. if [[ -n "$ONLY_TEST_AGENT" && "$ONLY_TEST_AGENT" != "$agent" ]]; then
  108. continue
  109. fi
  110. echo -e "\n\n\n\n========STEP 1: Running $test_name for $agent========\n\n\n\n"
  111. rm -rf $WORKSPACE_BASE
  112. mkdir $WORKSPACE_BASE
  113. if [ -d "tests/integration/workspace/$test_name" ]; then
  114. cp -r tests/integration/workspace/$test_name/* $WORKSPACE_BASE
  115. fi
  116. if [ "$TEST_ONLY" = true ]; then
  117. set -e
  118. else
  119. # Temporarily disable 'exit on error'
  120. set +e
  121. fi
  122. TEST_STATUS=1
  123. if [ -z $SKIP_TEST ]; then
  124. run_test
  125. TEST_STATUS=$?
  126. fi
  127. # Re-enable 'exit on error'
  128. set -e
  129. if [[ $TEST_STATUS -ne 0 ]]; then
  130. if [ "$FORCE_USE_LLM" = true ]; then
  131. echo -e "\n\n\n\n========FORCE_USE_LLM, skipping step 2 & 3========\n\n\n\n"
  132. elif [ ! -d "tests/integration/mock/$agent/$test_name" ]; then
  133. echo -e "\n\n\n\n========No existing mock responses for $agent/$test_name, skipping step 2 & 3========\n\n\n\n"
  134. else
  135. echo -e "\n\n\n\n========STEP 2: $test_name failed, regenerating prompts for $agent WITHOUT money cost========\n\n\n\n"
  136. regenerate_without_llm
  137. echo -e "\n\n\n\n========STEP 3: $test_name prompts regenerated for $agent, rerun test again to verify========\n\n\n\n"
  138. # Temporarily disable 'exit on error'
  139. set +e
  140. run_test
  141. TEST_STATUS=$?
  142. # Re-enable 'exit on error'
  143. set -e
  144. fi
  145. if [[ $TEST_STATUS -ne 0 ]]; then
  146. echo -e "\n\n\n\n========STEP 4: $test_name failed, regenerating prompts and responses for $agent WITH money cost========\n\n\n\n"
  147. regenerate_with_llm
  148. echo -e "\n\n\n\n========STEP 5: $test_name prompts and responses regenerated for $agent, rerun test again to verify========\n\n\n\n"
  149. # Temporarily disable 'exit on error'
  150. set +e
  151. run_test
  152. TEST_STATUS=$?
  153. # Re-enable 'exit on error'
  154. set -e
  155. if [[ $TEST_STATUS -ne 0 ]]; then
  156. echo -e "\n\n\n\n========$test_name for $agent RERUN FAILED========\n\n\n\n"
  157. echo -e "There are multiple possibilities:"
  158. echo -e " 1. The agent is unable to finish the task within $MAX_ITERATIONS steps."
  159. echo -e " 2. The agent thinks itself has finished the task, but fails the validation in the test code."
  160. echo -e " 3. There is something non-deterministic in the prompt."
  161. echo -e " 4. There is a bug in this script, or in OpenDevin code."
  162. echo -e "NOTE: Some of the above problems could sometimes be fixed by a retry (with a more powerful LLM)."
  163. echo -e " You could also consider improving the agent, increasing MAX_ITERATIONS, or skipping this test for this agent."
  164. exit 1
  165. else
  166. echo -e "\n\n\n\n========$test_name for $agent RERUN PASSED========\n\n\n\n"
  167. sleep 1
  168. fi
  169. else
  170. echo -e "\n\n\n\n========$test_name for $agent RERUN PASSED========\n\n\n\n"
  171. sleep 1
  172. fi
  173. else
  174. echo -e "\n\n\n\n========$test_name for $agent PASSED========\n\n\n\n"
  175. sleep 1
  176. fi
  177. done
  178. done
  179. rm -rf logs
  180. rm -rf $WORKSPACE_BASE
  181. echo "Done!"