run_infer.sh 1.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263
  1. #!/bin/bash
  2. set -eo pipefail
  3. source "evaluation/utils/version_control.sh"
  4. MODEL_CONFIG=$1
  5. COMMIT_HASH=$2
  6. AGENT=$3
  7. EVAL_LIMIT=$4
  8. DATASET=$5
  9. HARDNESS=$6
  10. WOLFRAM_APPID=$7
  11. checkout_eval_branch
  12. if [ -z "$AGENT" ]; then
  13. echo "Agent not specified, use default CodeActAgent"
  14. AGENT="CodeActAgent"
  15. fi
  16. if [ -z "$DATASET" ]; then
  17. DATASET="flight"
  18. echo "Dataset not specified, use default $DATASET"
  19. fi
  20. if [ -z "$HARDNESS" ]; then
  21. HARDNESS="easy"
  22. echo "Hardness not specified, use default $HARDNESS"
  23. fi
  24. if [ -z "$WOLFRAM_APPID" ]; then
  25. WOLFRAM_APPID="YOUR_WOLFRAMALPHA_APPID"
  26. echo "WOLFRAM_APPID not specified"
  27. fi
  28. get_agent_version
  29. echo "AGENT: $AGENT"
  30. echo "AGENT_VERSION: $AGENT_VERSION"
  31. echo "MODEL_CONFIG: $MODEL_CONFIG"
  32. echo "DATASET: $DATASET"
  33. echo "HARDNESS: $HARDNESS"
  34. echo "WOLFRAM_APPID: $WOLFRAM_APPID"
  35. COMMAND="poetry run python evaluation/toolqa/run_infer.py \
  36. --agent-cls $AGENT \
  37. --llm-config $MODEL_CONFIG \
  38. --max-iterations 30 \
  39. --dataset $DATASET \
  40. --hardness $HARDNESS \
  41. --wolfram_alpha_appid $WOLFRAM_APPID\
  42. --data-split validation \
  43. --max-chars 10000000 \
  44. --eval-num-workers 1 \
  45. --eval-note ${AGENT_VERSION}_${LEVELS}"
  46. if [ -n "$EVAL_LIMIT" ]; then
  47. echo "EVAL_LIMIT: $EVAL_LIMIT"
  48. COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
  49. fi
  50. # Run the command
  51. eval $COMMAND