regenerate.sh 4.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157
  1. #!/bin/bash
  2. set -eo pipefail
  3. run_test() {
  4. SANDBOX_TYPE=$SANDBOX_TYPE \
  5. WORKSPACE_BASE=$WORKSPACE_BASE \
  6. REMIND_ITERATIONS=$remind_iterations \
  7. MAX_ITERATIONS=$MAX_ITERATIONS \
  8. WORKSPACE_MOUNT_PATH=$WORKSPACE_MOUNT_PATH \
  9. AGENT=$agent \
  10. poetry run pytest -s ./tests/integration/test_agent.py::$test_name
  11. # return exit code of pytest
  12. return $?
  13. }
  14. if [ -z $WORKSPACE_MOUNT_PATH ]; then
  15. WORKSPACE_MOUNT_PATH=$(pwd)
  16. fi
  17. if [ -z $WORKSPACE_BASE ]; then
  18. WORKSPACE_BASE=$(pwd)
  19. fi
  20. WORKSPACE_MOUNT_PATH+="/_test_workspace"
  21. WORKSPACE_BASE+="/_test_workspace"
  22. SANDBOX_TYPE="ssh"
  23. MAX_ITERATIONS=10
  24. agents=("MonologueAgent" "CodeActAgent" "PlannerAgent" "SWEAgent")
  25. remind_iterations_config=(false true false false)
  26. tasks=(
  27. "Fix typos in bad.txt."
  28. "Write a shell script 'hello.sh' that prints 'hello'."
  29. "Use Jupyter IPython to write a text file containing 'hello world' to '/workspace/test.txt'."
  30. )
  31. test_names=(
  32. "test_edits"
  33. "test_write_simple_script"
  34. "test_ipython"
  35. )
  36. num_of_tests=${#test_names[@]}
  37. num_of_agents=${#agents[@]}
  38. if [ "$num_of_agents" -ne "${#remind_iterations_config[@]}" ]; then
  39. echo "Every agent must have its own remind_iterations_config"
  40. exit 1
  41. fi
  42. if [ "$num_of_tests" -ne "${#test_names[@]}" ]; then
  43. echo "Every task must correspond to one test case"
  44. exit 1
  45. fi
  46. rm -rf logs
  47. rm -rf $WORKSPACE_BASE
  48. for ((i = 0; i < num_of_tests; i++)); do
  49. task=${tasks[i]}
  50. test_name=${test_names[i]}
  51. # skip other tests if only one test is specified
  52. if [[ -n "$ONLY_TEST_NAME" && "$ONLY_TEST_NAME" != "$test_name" ]]; then
  53. continue
  54. fi
  55. for ((j = 0; j < num_of_agents; j++)); do
  56. agent=${agents[j]}
  57. # skip other agents if only one agent is specified
  58. if [[ -n "$ONLY_TEST_AGENT" && "$ONLY_TEST_AGENT" != "$agent" ]]; then
  59. continue
  60. fi
  61. echo -e "\n\n\n\n========Running $test_name for $agent========\n\n\n\n"
  62. rm -rf $WORKSPACE_BASE
  63. mkdir $WORKSPACE_BASE
  64. if [ -d "tests/integration/workspace/$test_name" ]; then
  65. cp -r tests/integration/workspace/$test_name/* $WORKSPACE_BASE
  66. fi
  67. if [ "$TEST_ONLY" = true ]; then
  68. set -e
  69. else
  70. # Temporarily disable 'exit on error'
  71. set +e
  72. fi
  73. run_test
  74. TEST_STATUS=$?
  75. # Re-enable 'exit on error'
  76. set -e
  77. if [[ $TEST_STATUS -ne 0 ]]; then
  78. echo -e "\n\n\n\n========$test_name failed, regenerating test data for $agent========\n\n\n\n"
  79. sleep 1
  80. rm -rf $WORKSPACE_BASE
  81. mkdir -p $WORKSPACE_BASE
  82. if [ -d "tests/integration/workspace/$test_name" ]; then
  83. cp -r tests/integration/workspace/$test_name/* $WORKSPACE_BASE
  84. fi
  85. rm -rf logs
  86. rm -rf tests/integration/mock/$agent/$test_name/*
  87. # set -x to print the command being executed
  88. set -x
  89. echo -e "/exit\n" | \
  90. SANDBOX_TYPE=$SANDBOX_TYPE \
  91. WORKSPACE_BASE=$WORKSPACE_BASE \
  92. DEBUG=true \
  93. WORKSPACE_MOUNT_PATH=$WORKSPACE_MOUNT_PATH AGENT=$agent \
  94. REMIND_ITERATIONS=$remind_iterations \
  95. WORKSPACE_MOUNT_PATH=$WORKSPACE_MOUNT_PATH \
  96. AGENT=$agent \
  97. poetry run python ./opendevin/core/main.py \
  98. -i $MAX_ITERATIONS \
  99. -t "$task Do not ask me for confirmation at any point." \
  100. -c $agent
  101. set +x
  102. mkdir -p tests/integration/mock/$agent/$test_name/
  103. mv logs/llm/**/* tests/integration/mock/$agent/$test_name/
  104. echo -e "\n\n\n\n========$test_name test data regenerated for $agent, rerun test again to verify========\n\n\n\n"
  105. # Temporarily disable 'exit on error'
  106. set +e
  107. run_test
  108. TEST_STATUS=$?
  109. # Re-enable 'exit on error'
  110. set -e
  111. if [[ $TEST_STATUS -ne 0 ]]; then
  112. echo -e "\n\n\n\n========$test_name for $agent RERUN FAILED========\n\n\n\n"
  113. echo -e "There are multiple possibilities:"
  114. echo -e " 1. The agent is unable to finish the task within $MAX_ITERATIONS steps."
  115. echo -e " 2. The agent thinks itself has finished the task, but fails the validation in the test code."
  116. echo -e " 3. There is something non-deterministic in the prompt."
  117. echo -e " 4. There is a bug in this script, or in OpenDevin code."
  118. echo -e "NOTE: Some of the above problems could sometimes be fixed by a retry (with a more powerful LLM)."
  119. echo -e " You could also consider improving the agent, increasing MAX_ITERATIONS, or skipping this test for this agent."
  120. exit 1
  121. else
  122. echo -e "\n\n\n\n========$test_name for $agent RERUN PASSED========\n\n\n\n"
  123. sleep 1
  124. fi
  125. else
  126. echo -e "\n\n\n\n========$test_name for $agent PASSED========\n\n\n\n"
  127. sleep 1
  128. fi
  129. done
  130. done
  131. rm -rf logs
  132. rm -rf $WORKSPACE_BASE
  133. echo "Done!"