# Runs the integration-test and SWE-Bench evaluations, then posts the results
# to GitHub (PR or fallback issue) and Slack.
name: Run Evaluation

on:
  pull_request:
    types: [labeled]
  schedule:
    - cron: "0 1 * * *" # Run daily at 1 AM UTC
  workflow_dispatch:
    inputs:
      reason:
        description: "Reason for manual trigger"
        required: true
        default: ""

env:
  N_PROCESSES: 32 # Global configuration for number of parallel processes for evaluation

jobs:
  run-evaluation:
    # Run on PRs only when the 'eval-this' label is applied; always run for
    # scheduled and manual triggers.
    if: github.event.label.name == 'eval-this' || github.event_name != 'pull_request'
    runs-on: ubuntu-latest
    permissions:
      contents: "read"
      id-token: "write"
      pull-requests: "write"
      issues: "write"
    strategy:
      matrix:
        python-version: ["3.12"]
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Install poetry via pipx
        run: pipx install poetry

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
          cache: "poetry"

      - name: Comment on PR if 'eval-this' label is present
        if: github.event_name == 'pull_request' && github.event.label.name == 'eval-this'
        uses: KeisukeYamashita/create-comment@v1
        with:
          unique: false
          comment: |
            Hi! I started running the evaluation on your PR. You will receive a comment with the results shortly.

      - name: Install Python dependencies using Poetry
        run: poetry install

      - name: Configure config.toml for evaluation
        env:
          DEEPSEEK_API_KEY: ${{ secrets.DEEPSEEK_LLM_API_KEY }}
        run: |
          echo "[llm.eval]" > config.toml
          echo "model = \"deepseek/deepseek-chat\"" >> config.toml
          echo "api_key = \"$DEEPSEEK_API_KEY\"" >> config.toml
          echo "temperature = 0.0" >> config.toml

      - name: Run integration test evaluation
        env:
          ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }}
          RUNTIME: remote
          SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev
          EVAL_DOCKER_IMAGE_PREFIX: us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images
        run: |
          poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES

          # Get the evaluation report and export it as a multiline env var
          # using the GITHUB_ENV heredoc syntax (VAR<<EOF ... EOF).
          REPORT_FILE=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/deepseek-chat_maxiter_10_N* -name "report.md" -type f | head -n 1)
          echo "REPORT_FILE: $REPORT_FILE"
          echo "INTEGRATION_TEST_REPORT<<EOF" >> $GITHUB_ENV
          cat $REPORT_FILE >> $GITHUB_ENV
          echo >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

      - name: Run SWE-Bench evaluation
        env:
          ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }}
          RUNTIME: remote
          SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev
          EVAL_DOCKER_IMAGE_PREFIX: us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images
        run: |
          poetry run ./evaluation/swe_bench/scripts/run_infer.sh llm.eval HEAD CodeActAgent 300 30 $N_PROCESSES "princeton-nlp/SWE-bench_Lite" test

          OUTPUT_FOLDER=$(find evaluation/evaluation_outputs/outputs/princeton-nlp__SWE-bench_Lite-test/CodeActAgent -name "deepseek-chat_maxiter_50_N_*-no-hint-run_1" -type d | head -n 1)
          echo "OUTPUT_FOLDER for SWE-bench evaluation: $OUTPUT_FOLDER"
          poetry run ./evaluation/swe_bench/scripts/eval_infer_remote.sh $OUTPUT_FOLDER/output.jsonl $N_PROCESSES "princeton-nlp/SWE-bench_Lite" test
          poetry run ./evaluation/swe_bench/scripts/eval/summarize_outputs.py $OUTPUT_FOLDER/output.jsonl > summarize_outputs.log 2>&1

          # Export the summary as a multiline env var for the results comment.
          echo "SWEBENCH_REPORT<<EOF" >> $GITHUB_ENV
          cat summarize_outputs.log >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

      - name: Create tar.gz of evaluation outputs
        run: |
          TIMESTAMP=$(date +'%y-%m-%d-%H-%M')
          tar -czvf evaluation_outputs_${TIMESTAMP}.tar.gz evaluation/evaluation_outputs/outputs

      - name: Upload evaluation results as artifact
        uses: actions/upload-artifact@v4
        id: upload_results_artifact
        with:
          name: evaluation-outputs
          path: evaluation_outputs_*.tar.gz
      - name: Get artifact URL
        run: echo "ARTIFACT_URL=${{ steps.upload_results_artifact.outputs.artifact-url }}" >> $GITHUB_ENV

      - name: Authenticate to Google Cloud
        uses: 'google-github-actions/auth@v2'
        with:
          credentials_json: ${{ secrets.GCP_RESEARCH_OBJECT_CREATOR_SA_KEY }}

      - name: Set timestamp and trigger reason
        run: |
          echo "TIMESTAMP=$(date +'%Y-%m-%d-%H-%M')" >> $GITHUB_ENV
          if [[ "${{ github.event_name }}" == "pull_request" ]]; then
            echo "TRIGGER_REASON=pr-${{ github.event.pull_request.number }}" >> $GITHUB_ENV
          elif [[ "${{ github.event_name }}" == "schedule" ]]; then
            echo "TRIGGER_REASON=schedule" >> $GITHUB_ENV
          else
            echo "TRIGGER_REASON=manual-${{ github.event.inputs.reason }}" >> $GITHUB_ENV
          fi

      - name: Upload evaluation results to Google Cloud Storage
        uses: 'google-github-actions/upload-cloud-storage@v2'
        with:
          path: 'evaluation/evaluation_outputs/outputs'
          destination: 'openhands-oss-eval-results/${{ env.TIMESTAMP }}-${{ env.TRIGGER_REASON }}'

      - name: Comment with evaluation results and artifact link
        id: create_comment
        uses: KeisukeYamashita/create-comment@v1
        with:
          # Post to the PR when triggered by a label; otherwise post to issue #4504.
          number: ${{ github.event_name == 'pull_request' && github.event.pull_request.number || 4504 }}
          unique: false
          comment: |
            Triggered by: ${{ github.event_name == 'pull_request' && format('Pull Request (eval-this label on PR #{0})', github.event.pull_request.number) || github.event_name == 'schedule' && 'Daily Schedule' || format('Manual Trigger: {0}', github.event.inputs.reason) }}
            Commit: ${{ github.sha }}

            **SWE-Bench Evaluation Report**
            ${{ env.SWEBENCH_REPORT }}

            ---

            **Integration Tests Evaluation Report**
            ${{ env.INTEGRATION_TEST_REPORT }}

            ---

            You can download the full evaluation outputs [here](${{ env.ARTIFACT_URL }}).

      - name: Post to a Slack channel
        id: slack
        uses: slackapi/slack-github-action@v1.27.0
        with:
          channel-id: 'C07SVQSCR6F'
          slack-message: "*Evaluation Trigger:* ${{ github.event_name == 'pull_request' && format('Pull Request (eval-this label on PR #{0})', github.event.pull_request.number) || github.event_name == 'schedule' && 'Daily Schedule' || format('Manual Trigger: {0}', github.event.inputs.reason) }}\n\nLink to summary: [here](https://github.com/${{ github.repository }}/issues/${{ github.event_name == 'pull_request' && github.event.pull_request.number || 4504 }}#issuecomment-${{ steps.create_comment.outputs.comment-id }})"
        env:
          SLACK_BOT_TOKEN: ${{ secrets.EVAL_NOTIF_SLACK_BOT_TOKEN }}