  1. #!/bin/bash
  2. set -eo pipefail
  3. source "evaluation/utils/version_control.sh"
  4. MODEL_CONFIG=$1
  5. COMMIT_HASH=$2
  6. AGENT=$3
  7. EVAL_LIMIT=$4
  8. echo "
  9. ################################################################################
  10. !!!WARNING!!!
  11. ################################################################################
  12. The "code_eval" metric executes untrusted model-generated code in Python.
  13. Although it is highly unlikely that model-generated code will do something
  14. overtly malicious in response to this test suite, model-generated code may act
  15. destructively due to a lack of model capability or alignment.
  16. Users are strongly encouraged to sandbox this evaluation suite so that it
  17. does not perform destructive actions on their host or network. For more
  18. information on how OpenAI sandboxes its code, see the paper \"Evaluating Large
  19. Language Models Trained on Code\" (https://arxiv.org/abs/2107.03374).
  20. Once you have read this disclaimer and taken appropriate precautions,
  21. set the environment variable HF_ALLOW_CODE_EVAL="1". Within Python you can to this
  22. with:
  23. >>> import os
  24. >>> os.environ[\"HF_ALLOW_CODE_EVAL\"] = \"1\"
  25. ################################################################################
  26. "
  27. echo "WARNING: You are about to enable the execution of untrusted model-generated code by setting the environment variable HF_ALLOW_CODE_EVAL to '1'."
  28. echo "It is highly unlikely that model-generated code will do something overtly malicious in response to this test suite, however, it may act destructively due to a lack of model capability or alignment."
  29. echo "Please confirm that you have read the disclaimer, taken the necessary precautions, and wish to proceed (y/n):"
  30. read user_input
  31. if [ "$user_input" = "y" ]; then
  32. export HF_ALLOW_CODE_EVAL="1"
  33. echo "Environment variable HF_ALLOW_CODE_EVAL has been set to '1'."
  34. else
  35. echo "Operation aborted. Environment variable HF_ALLOW_CODE_EVAL has not been set."
  36. exit 1
  37. fi
  38. # ################################################################################
  39. checkout_eval_branch
  40. if [ -z "$AGENT" ]; then
  41. echo "Agent not specified, use default CodeActAgent"
  42. AGENT="CodeActAgent"
  43. fi
  44. get_agent_version
  45. echo "AGENT: $AGENT"
  46. echo "AGENT_VERSION: $AGENT_VERSION"
  47. echo "MODEL_CONFIG: $MODEL_CONFIG"
  48. COMMAND="poetry run python evaluation/humanevalfix/run_infer.py \
  49. --agent-cls $AGENT \
  50. --llm-config $MODEL_CONFIG \
  51. --max-iterations 10 \
  52. --max-chars 10000000 \
  53. --eval-num-workers 1 \
  54. --eval-note $AGENT_VERSION"
  55. if [ -n "$EVAL_LIMIT" ]; then
  56. echo "EVAL_LIMIT: $EVAL_LIMIT"
  57. COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
  58. fi
  59. # Run the command
  60. eval $COMMAND