run_infer.sh 2.7 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980
  1. #!/bin/bash
  2. set -eo pipefail
  3. source "evaluation/utils/version_control.sh"
  4. MODEL_CONFIG=$1
  5. COMMIT_HASH=$2
  6. AGENT=$3
  7. EVAL_LIMIT=$4
  8. NUM_WORKERS=$5
  9. if [ -z "$NUM_WORKERS" ]; then
  10. NUM_WORKERS=1
  11. echo "Number of workers not specified, use default $NUM_WORKERS"
  12. fi
  13. echo "
  14. ################################################################################
  15. !!!WARNING!!!
  16. ################################################################################
  17. The "code_eval" metric executes untrusted model-generated code in Python.
  18. Although it is highly unlikely that model-generated code will do something
  19. overtly malicious in response to this test suite, model-generated code may act
  20. destructively due to a lack of model capability or alignment.
  21. Users are strongly encouraged to sandbox this evaluation suite so that it
  22. does not perform destructive actions on their host or network. For more
  23. information on how OpenAI sandboxes its code, see the paper \"Evaluating Large
  24. Language Models Trained on Code\" (https://arxiv.org/abs/2107.03374).
  25. Once you have read this disclaimer and taken appropriate precautions,
  26. set the environment variable HF_ALLOW_CODE_EVAL="1". Within Python you can to this
  27. with:
  28. >>> import os
  29. >>> os.environ[\"HF_ALLOW_CODE_EVAL\"] = \"1\"
  30. ################################################################################
  31. "
  32. echo "WARNING: You are about to enable the execution of untrusted model-generated code by setting the environment variable HF_ALLOW_CODE_EVAL to '1'."
  33. echo "It is highly unlikely that model-generated code will do something overtly malicious in response to this test suite, however, it may act destructively due to a lack of model capability or alignment."
  34. echo "Please confirm that you have read the disclaimer, taken the necessary precautions, and wish to proceed (y/n):"
  35. read user_input
  36. if [ "$user_input" = "y" ]; then
  37. export HF_ALLOW_CODE_EVAL="1"
  38. echo "Environment variable HF_ALLOW_CODE_EVAL has been set to '1'."
  39. else
  40. echo "Operation aborted. Environment variable HF_ALLOW_CODE_EVAL has not been set."
  41. exit 1
  42. fi
  43. # ################################################################################
  44. checkout_eval_branch
  45. if [ -z "$AGENT" ]; then
  46. echo "Agent not specified, use default CodeActAgent"
  47. AGENT="CodeActAgent"
  48. fi
  49. get_agent_version
  50. echo "AGENT: $AGENT"
  51. echo "AGENT_VERSION: $AGENT_VERSION"
  52. echo "MODEL_CONFIG: $MODEL_CONFIG"
  53. COMMAND="poetry run python evaluation/humanevalfix/run_infer.py \
  54. --agent-cls $AGENT \
  55. --llm-config $MODEL_CONFIG \
  56. --max-iterations 10 \
  57. --eval-num-workers $NUM_WORKERS \
  58. --eval-note $AGENT_VERSION"
  59. if [ -n "$EVAL_LIMIT" ]; then
  60. echo "EVAL_LIMIT: $EVAL_LIMIT"
  61. COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
  62. fi
  63. # Run the command
  64. eval $COMMAND