# eval-runner.yml — GitHub Actions workflow that runs model evaluations (integration tests + SWE-Bench)

---
  1. name: Run Evaluation
  2. on:
  3. pull_request:
  4. types: [labeled]
  5. schedule:
  6. - cron: "0 1 * * *" # Run daily at 1 AM UTC
  7. workflow_dispatch:
  8. inputs:
  9. reason:
  10. description: "Reason for manual trigger"
  11. required: true
  12. default: ""
  13. env:
  14. N_PROCESSES: 32 # Global configuration for number of parallel processes for evaluation
  15. jobs:
  16. run-evaluation:
  17. if: github.event.label.name == 'eval-this' || github.event_name != 'pull_request'
  18. runs-on: ubuntu-latest
  19. permissions:
  20. contents: "read"
  21. id-token: "write"
  22. pull-requests: "write"
  23. issues: "write"
  24. strategy:
  25. matrix:
  26. python-version: ["3.12"]
  27. steps:
  28. - name: Checkout repository
  29. uses: actions/checkout@v4
  30. - name: Install poetry via pipx
  31. run: pipx install poetry
  32. - name: Set up Python
  33. uses: actions/setup-python@v5
  34. with:
  35. python-version: ${{ matrix.python-version }}
  36. cache: "poetry"
  37. - name: Comment on PR if 'eval-this' label is present
  38. if: github.event_name == 'pull_request' && github.event.label.name == 'eval-this'
  39. uses: KeisukeYamashita/create-comment@v1
  40. with:
  41. unique: false
  42. comment: |
  43. Hi! I started running the evaluation on your PR. You will receive a comment with the results shortly.
  44. - name: Install Python dependencies using Poetry
  45. run: poetry install
  46. - name: Configure config.toml for evaluation
  47. env:
  48. DEEPSEEK_API_KEY: ${{ secrets.DEEPSEEK_LLM_API_KEY }}
  49. run: |
  50. echo "[llm.eval]" > config.toml
  51. echo "model = \"deepseek/deepseek-chat\"" >> config.toml
  52. echo "api_key = \"$DEEPSEEK_API_KEY\"" >> config.toml
  53. echo "temperature = 0.0" >> config.toml
  54. - name: Run integration test evaluation
  55. env:
  56. ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }}
  57. RUNTIME: remote
  58. SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev
  59. EVAL_DOCKER_IMAGE_PREFIX: us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images
  60. run: |
  61. poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES
  62. # get evaluation report
  63. REPORT_FILE=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/deepseek-chat_maxiter_10_N* -name "report.md" -type f | head -n 1)
  64. echo "REPORT_FILE: $REPORT_FILE"
  65. echo "INTEGRATION_TEST_REPORT<<EOF" >> $GITHUB_ENV
  66. cat $REPORT_FILE >> $GITHUB_ENV
  67. echo >> $GITHUB_ENV
  68. echo "EOF" >> $GITHUB_ENV
  69. - name: Run SWE-Bench evaluation
  70. env:
  71. ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }}
  72. RUNTIME: remote
  73. SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev
  74. EVAL_DOCKER_IMAGE_PREFIX: us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images
  75. run: |
  76. poetry run ./evaluation/swe_bench/scripts/run_infer.sh llm.eval HEAD CodeActAgent 300 30 $N_PROCESSES "princeton-nlp/SWE-bench_Lite" test
  77. OUTPUT_FOLDER=$(find evaluation/evaluation_outputs/outputs/princeton-nlp__SWE-bench_Lite-test/CodeActAgent -name "deepseek-chat_maxiter_50_N_*-no-hint-run_1" -type d | head -n 1)
  78. echo "OUTPUT_FOLDER for SWE-bench evaluation: $OUTPUT_FOLDER"
  79. poetry run ./evaluation/swe_bench/scripts/eval_infer_remote.sh $OUTPUT_FOLDER/output.jsonl $N_PROCESSES "princeton-nlp/SWE-bench_Lite" test
  80. poetry run ./evaluation/swe_bench/scripts/eval/summarize_outputs.py $OUTPUT_FOLDER/output.jsonl > summarize_outputs.log 2>&1
  81. echo "SWEBENCH_REPORT<<EOF" >> $GITHUB_ENV
  82. cat summarize_outputs.log >> $GITHUB_ENV
  83. echo "EOF" >> $GITHUB_ENV
  84. - name: Create tar.gz of evaluation outputs
  85. run: |
  86. TIMESTAMP=$(date +'%y-%m-%d-%H-%M')
  87. tar -czvf evaluation_outputs_${TIMESTAMP}.tar.gz evaluation/evaluation_outputs/outputs
  88. - name: Upload evaluation results as artifact
  89. uses: actions/upload-artifact@v4
  90. id: upload_results_artifact
  91. with:
  92. name: evaluation-outputs
  93. path: evaluation_outputs_*.tar.gz
  94. - name: Get artifact URL
  95. run: echo "ARTIFACT_URL=${{ steps.upload_results_artifact.outputs.artifact-url }}" >> $GITHUB_ENV
  96. - name: Authenticate to Google Cloud
  97. uses: 'google-github-actions/auth@v2'
  98. with:
  99. credentials_json: ${{ secrets.GCP_RESEARCH_OBJECT_CREATOR_SA_KEY }}
  100. - name: Set timestamp and trigger reason
  101. run: |
  102. echo "TIMESTAMP=$(date +'%Y-%m-%d-%H-%M')" >> $GITHUB_ENV
  103. if [[ "${{ github.event_name }}" == "pull_request" ]]; then
  104. echo "TRIGGER_REASON=pr-${{ github.event.pull_request.number }}" >> $GITHUB_ENV
  105. elif [[ "${{ github.event_name }}" == "schedule" ]]; then
  106. echo "TRIGGER_REASON=schedule" >> $GITHUB_ENV
  107. else
  108. echo "TRIGGER_REASON=manual-${{ github.event.inputs.reason }}" >> $GITHUB_ENV
  109. fi
  110. - name: Upload evaluation results to Google Cloud Storage
  111. uses: 'google-github-actions/upload-cloud-storage@v2'
  112. with:
  113. path: 'evaluation/evaluation_outputs/outputs'
  114. destination: 'openhands-oss-eval-results/${{ env.TIMESTAMP }}-${{ env.TRIGGER_REASON }}'
  115. - name: Comment with evaluation results and artifact link
  116. id: create_comment
  117. uses: KeisukeYamashita/create-comment@v1
  118. with:
  119. number: ${{ github.event_name == 'pull_request' && github.event.pull_request.number || 4504 }}
  120. unique: false
  121. comment: |
  122. Trigger by: ${{ github.event_name == 'pull_request' && format('Pull Request (eval-this label on PR #{0})', github.event.pull_request.number) || github.event_name == 'schedule' && 'Daily Schedule' || format('Manual Trigger: {0}', github.event.inputs.reason) }}
  123. Commit: ${{ github.sha }}
  124. **SWE-Bench Evaluation Report**
  125. ${{ env.SWEBENCH_REPORT }}
  126. ---
  127. **Integration Tests Evaluation Report**
  128. ${{ env.INTEGRATION_TEST_REPORT }}
  129. ---
  130. You can download the full evaluation outputs [here](${{ env.ARTIFACT_URL }}).
  131. - name: Post to a Slack channel
  132. id: slack
  133. uses: slackapi/slack-github-action@v1.27.0
  134. with:
  135. channel-id: 'C07SVQSCR6F'
  136. slack-message: "*Evaluation Trigger:* ${{ github.event_name == 'pull_request' && format('Pull Request (eval-this label on PR #{0})', github.event.pull_request.number) || github.event_name == 'schedule' && 'Daily Schedule' || format('Manual Trigger: {0}', github.event.inputs.reason) }}\n\nLink to summary: [here](https://github.com/${{ github.repository }}/issues/${{ github.event_name == 'pull_request' && github.event.pull_request.number || 4504 }}#issuecomment-${{ steps.create_comment.outputs.comment-id }})"
  137. env:
  138. SLACK_BOT_TOKEN: ${{ secrets.EVAL_NOTIF_SLACK_BOT_TOKEN }}