# run_infer.sh 2.6 KB

  #!/bin/bash
  #
  # Usage:
  #   run_infer.sh MODEL_CONFIG COMMIT_HASH AGENT EVAL_LIMIT MAX_ITER \
  #                NUM_WORKERS DATASET SPLIT N_RUNS
  #
  # Every positional argument may be omitted or passed as an empty string;
  # defaults for the ones that have them are applied further down.

  # Abort on the first failing command, and treat a pipeline as failed when
  # any of its stages fails.
  set -eo pipefail

  # The path is relative, so the script has to be started from the directory
  # that contains evaluation/.
  source "evaluation/utils/version_control.sh"

  MODEL_CONFIG=$1 # forwarded to run_infer.py as --llm-config
  COMMIT_HASH=$2  # not read in this script; presumably consumed by the sourced helpers -- TODO confirm
  AGENT=$3        # forwarded as --agent-cls
  EVAL_LIMIT=$4   # optional; forwarded as --eval-n-limit when non-empty
  MAX_ITER=$5     # forwarded as --max-iterations
  NUM_WORKERS=$6  # forwarded as --eval-num-workers
  DATASET=$7      # forwarded as --dataset
  SPLIT=$8        # forwarded as --split
  N_RUNS=$9       # number of times the evaluation is repeated
  # Fill in defaults for every setting that was left empty, announcing each
  # fallback on stdout, then print the effective configuration.

  if [[ -z "$NUM_WORKERS" ]]; then
    NUM_WORKERS=1
    echo "Number of workers not specified, use default $NUM_WORKERS"
  fi

  # Not defined in this script; presumably provided by the sourced
  # evaluation/utils/version_control.sh -- TODO confirm.
  checkout_eval_branch

  if [[ -z "$AGENT" ]]; then
    echo "Agent not specified, use default CodeActAgent"
    AGENT="CodeActAgent"
  fi

  if [[ -z "$MAX_ITER" ]]; then
    echo "MAX_ITER not specified, use default 100"
    MAX_ITER=100
  fi

  # USE_INSTANCE_IMAGE and RUN_WITH_BROWSING are not positional arguments;
  # they are taken from the caller's environment.
  if [[ -z "$USE_INSTANCE_IMAGE" ]]; then
    echo "USE_INSTANCE_IMAGE not specified, use default true"
    USE_INSTANCE_IMAGE=true
  fi

  if [[ -z "$RUN_WITH_BROWSING" ]]; then
    echo "RUN_WITH_BROWSING not specified, use default false"
    RUN_WITH_BROWSING=false
  fi

  if [[ -z "$DATASET" ]]; then
    echo "DATASET not specified, use default princeton-nlp/SWE-bench_Lite"
    DATASET="princeton-nlp/SWE-bench_Lite"
  fi

  if [[ -z "$SPLIT" ]]; then
    echo "SPLIT not specified, use default test"
    SPLIT="test"
  fi

  # Export the two environment-driven switches so child processes see the
  # effective values, including any default applied above.
  export USE_INSTANCE_IMAGE="$USE_INSTANCE_IMAGE"
  echo "USE_INSTANCE_IMAGE: $USE_INSTANCE_IMAGE"
  export RUN_WITH_BROWSING="$RUN_WITH_BROWSING"
  echo "RUN_WITH_BROWSING: $RUN_WITH_BROWSING"

  # Not defined in this script; AGENT_VERSION is read right after the call,
  # so this helper is expected to set it -- TODO confirm.
  get_agent_version

  echo "AGENT: $AGENT"
  echo "AGENT_VERSION: $AGENT_VERSION"
  echo "MODEL_CONFIG: $MODEL_CONFIG"
  echo "DATASET: $DATASET"
  echo "SPLIT: $SPLIT"
  # Build EVAL_NOTE, the label that identifies this evaluation run:
  #   <AGENT_VERSION>[-no-hint][-with-browsing][-<EXP_NAME>]

  # Default to NOT use Hint
  if [[ -z "$USE_HINT_TEXT" ]]; then
    export USE_HINT_TEXT=false
  fi
  echo "USE_HINT_TEXT: $USE_HINT_TEXT"

  EVAL_NOTE="$AGENT_VERSION"

  # if not using Hint, add -no-hint to the eval note
  if [[ "$USE_HINT_TEXT" == "false" ]]; then
    EVAL_NOTE="${EVAL_NOTE}-no-hint"
  fi

  if [[ "$RUN_WITH_BROWSING" == "true" ]]; then
    EVAL_NOTE="${EVAL_NOTE}-with-browsing"
  fi

  # EXP_NAME is an optional, caller-supplied environment variable.
  if [[ -n "$EXP_NAME" ]]; then
    EVAL_NOTE="${EVAL_NOTE}-${EXP_NAME}"
  fi
  #######################################
  # Run one inference pass of evaluation/swe_bench/run_infer.py via poetry.
  # Globals (read):
  #   AGENT, MODEL_CONFIG, MAX_ITER, NUM_WORKERS, DATASET, SPLIT
  #   EVAL_LIMIT (optional; adds --eval-n-limit when non-empty)
  # Arguments:
  #   $1 - eval note, forwarded as --eval-note
  # Outputs:
  #   Prints "EVAL_LIMIT: <value>" to stdout when EVAL_LIMIT is set, followed
  #   by whatever the launched command prints.
  # Returns:
  #   Exit status of the launched command.
  #######################################
  run_eval() {
    local eval_note=$1

    # The command is kept as an argv array and executed directly instead of
    # being concatenated into a string and passed to eval: every value reaches
    # run_infer.py as exactly one argument, with no word-splitting, globbing
    # or re-interpretation of shell metacharacters.
    local -a cmd=(
      poetry run python evaluation/swe_bench/run_infer.py
      --agent-cls "$AGENT"
      --llm-config "$MODEL_CONFIG"
      --max-iterations "$MAX_ITER"
      --eval-num-workers "$NUM_WORKERS"
      --eval-note "$eval_note"
      --dataset "$DATASET"
      --split "$SPLIT"
    )

    if [[ -n "$EVAL_LIMIT" ]]; then
      echo "EVAL_LIMIT: $EVAL_LIMIT"
      cmd+=(--eval-n-limit "$EVAL_LIMIT")
    fi

    # Run the command
    "${cmd[@]}"
  }
  unset SANDBOX_ENV_GITHUB_TOKEN # prevent the agent from using the github token to push

  # Repeat the evaluation N_RUNS times (default: once).
  if [[ -z "$N_RUNS" ]]; then
    N_RUNS=1
    echo "N_RUNS not specified, use default $N_RUNS"
  fi

  # The word-splitting of seq's output is intentional: it yields one loop
  # iteration per generated number.
  for i in $(seq 1 "$N_RUNS"); do
    # Each run gets its own note so runs can be told apart.
    current_eval_note="$EVAL_NOTE-run_$i"
    echo "EVAL_NOTE: $current_eval_note"
    # Quoted so the note reaches run_eval as a single argument.
    run_eval "$current_eval_note"
  done

  # Not defined in this script; presumably provided by the sourced
  # evaluation/utils/version_control.sh -- TODO confirm.
  # NOTE(review): the script runs under `set -e`, so a failing run_eval exits
  # before this line is reached; confirm whether that is acceptable.
  checkout_original_branch