|
|
@@ -0,0 +1,158 @@
|
|
|
+name: Run Integration Tests
|
|
|
+
|
|
|
+on:
|
|
|
+ pull_request:
|
|
|
+ types: [labeled]
|
|
|
+ workflow_dispatch:
|
|
|
+ inputs:
|
|
|
+ reason:
|
|
|
+ description: 'Reason for manual trigger'
|
|
|
+ required: true
|
|
|
+ default: ''
|
|
|
+ schedule:
|
|
|
+ - cron: '30 22 * * *' # Runs at 10:30pm UTC every day
|
|
|
+
|
|
|
+env:
|
|
|
+ N_PROCESSES: 10 # Global configuration for number of parallel processes for evaluation
|
|
|
+
|
|
|
+jobs:
|
|
|
+ run-integration-tests:
|
|
|
+ if: github.event.label.name == 'integration-test' || github.event_name == 'workflow_dispatch' || github.event_name == 'schedule'
|
|
|
+ runs-on: ubuntu-latest
|
|
|
+ permissions:
|
|
|
+ contents: "read"
|
|
|
+ id-token: "write"
|
|
|
+ pull-requests: "write"
|
|
|
+ issues: "write"
|
|
|
+ strategy:
|
|
|
+ matrix:
|
|
|
+ python-version: ["3.12"]
|
|
|
+ steps:
|
|
|
+ - name: Checkout repository
|
|
|
+ uses: actions/checkout@v4
|
|
|
+
|
|
|
+ - name: Install poetry via pipx
|
|
|
+ run: pipx install poetry
|
|
|
+
|
|
|
+ - name: Set up Python
|
|
|
+ uses: actions/setup-python@v5
|
|
|
+ with:
|
|
|
+ python-version: ${{ matrix.python-version }}
|
|
|
+ cache: "poetry"
|
|
|
+
|
|
|
+ - name: Comment on PR if 'integration-test' label is present
|
|
|
+ if: github.event_name == 'pull_request' && github.event.label.name == 'integration-test'
|
|
|
+ uses: KeisukeYamashita/create-comment@v1
|
|
|
+ with:
|
|
|
+ unique: false
|
|
|
+ comment: |
|
|
|
+ Hi! I started running the integration tests on your PR. You will receive a comment with the results shortly.
|
|
|
+
|
|
|
+ - name: Install Python dependencies using Poetry
|
|
|
+ run: poetry install --without evaluation,llama-index
|
|
|
+
|
|
|
+ - name: Configure config.toml for testing with Haiku
|
|
|
+ env:
|
|
|
+ LLM_MODEL: "litellm_proxy/claude-3-5-haiku-20241022"
|
|
|
+ LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
|
|
|
+ LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
|
|
|
+ run: |
|
|
|
+ echo "[llm.eval]" > config.toml
|
|
|
+ echo "model = \"$LLM_MODEL\"" >> config.toml
|
|
|
+ echo "api_key = \"$LLM_API_KEY\"" >> config.toml
|
|
|
+ echo "base_url = \"$LLM_BASE_URL\"" >> config.toml
|
|
|
+ echo "temperature = 0.0" >> config.toml
|
|
|
+
|
|
|
+ - name: Build environment
|
|
|
+ run: make build
|
|
|
+
|
|
|
+ - name: Run integration test evaluation for Haiku
|
|
|
+ env:
|
|
|
+ SANDBOX_FORCE_REBUILD_RUNTIME: True
|
|
|
+ run: |
|
|
|
+ poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES '' 'haiku_run'
|
|
|
+
|
|
|
+ # get integration tests report
|
|
|
+ REPORT_FILE_HAIKU=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/*haiku*_maxiter_10_N* -name "report.md" -type f | head -n 1)
|
|
|
+ echo "REPORT_FILE: $REPORT_FILE_HAIKU"
|
|
|
+ echo "INTEGRATION_TEST_REPORT_HAIKU<<EOF" >> $GITHUB_ENV
|
|
|
+ cat $REPORT_FILE_HAIKU >> $GITHUB_ENV
|
|
|
+ echo >> $GITHUB_ENV
|
|
|
+ echo "EOF" >> $GITHUB_ENV
|
|
|
+
|
|
|
+ - name: Wait a little bit
|
|
|
+ run: sleep 10
|
|
|
+
|
|
|
+ - name: Configure config.toml for testing with DeepSeek
|
|
|
+ env:
|
|
|
+ LLM_MODEL: "litellm_proxy/deepseek-chat"
|
|
|
+ LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
|
|
|
+ LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
|
|
|
+ run: |
|
|
|
+ echo "[llm.eval]" > config.toml
|
|
|
+ echo "model = \"$LLM_MODEL\"" >> config.toml
|
|
|
+ echo "api_key = \"$LLM_API_KEY\"" >> config.toml
|
|
|
+ echo "base_url = \"$LLM_BASE_URL\"" >> config.toml
|
|
|
+ echo "temperature = 0.0" >> config.toml
|
|
|
+
|
|
|
+ - name: Run integration test evaluation for DeepSeek
|
|
|
+ env:
|
|
|
+ SANDBOX_FORCE_REBUILD_RUNTIME: True
|
|
|
+ run: |
|
|
|
+ poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES '' 'deepseek_run'
|
|
|
+
|
|
|
+ # get integration tests report
|
|
|
+ REPORT_FILE_DEEPSEEK=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/deepseek*_maxiter_10_N* -name "report.md" -type f | head -n 1)
|
|
|
+ echo "REPORT_FILE: $REPORT_FILE_DEEPSEEK"
|
|
|
+ echo "INTEGRATION_TEST_REPORT_DEEPSEEK<<EOF" >> $GITHUB_ENV
|
|
|
+ cat $REPORT_FILE_DEEPSEEK >> $GITHUB_ENV
|
|
|
+ echo >> $GITHUB_ENV
|
|
|
+ echo "EOF" >> $GITHUB_ENV
|
|
|
+
|
|
|
+ - name: Create archive of evaluation outputs
|
|
|
+ run: |
|
|
|
+ TIMESTAMP=$(date +'%y-%m-%d-%H-%M')
|
|
|
+ cd evaluation/evaluation_outputs/outputs # Change to the outputs directory
|
|
|
+ tar -czvf ../../../integration_tests_${TIMESTAMP}.tar.gz integration_tests/CodeActAgent/* # Only include the actual result directories
|
|
|
+
|
|
|
+ - name: Upload evaluation results as artifact
|
|
|
+ uses: actions/upload-artifact@v4
|
|
|
+ id: upload_results_artifact
|
|
|
+ with:
|
|
|
+ name: integration-test-outputs-${{ github.run_id }}-${{ github.run_attempt }}
|
|
|
+ path: integration_tests_*.tar.gz
|
|
|
+
|
|
|
+ - name: Get artifact URLs
|
|
|
+ run: |
|
|
|
+ echo "ARTIFACT_URL=${{ steps.upload_results_artifact.outputs.artifact-url }}" >> $GITHUB_ENV
|
|
|
+
|
|
|
+ - name: Set timestamp and trigger reason
|
|
|
+ run: |
|
|
|
+ echo "TIMESTAMP=$(date +'%Y-%m-%d-%H-%M')" >> $GITHUB_ENV
|
|
|
+ if [[ "${{ github.event_name }}" == "pull_request" ]]; then
|
|
|
+ echo "TRIGGER_REASON=pr-${{ github.event.pull_request.number }}" >> $GITHUB_ENV
|
|
|
+ elif [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
|
|
|
+ echo "TRIGGER_REASON=manual-${{ github.event.inputs.reason }}" >> $GITHUB_ENV
|
|
|
+ else
|
|
|
+ echo "TRIGGER_REASON=nightly-scheduled" >> $GITHUB_ENV
|
|
|
+ fi
|
|
|
+
|
|
|
+ - name: Comment with results and artifact link
|
|
|
+ id: create_comment
|
|
|
+ uses: KeisukeYamashita/create-comment@v1
|
|
|
+ with:
|
|
|
+ # if triggered by PR, use PR number, otherwise use 5077 as fallback issue number for manual triggers
|
|
|
+ number: ${{ github.event_name == 'pull_request' && github.event.pull_request.number || 5077 }}
|
|
|
+ unique: false
|
|
|
+ comment: |
|
|
|
+ Trigger by: ${{ github.event_name == 'pull_request' && format('Pull Request (integration-test label on PR #{0})', github.event.pull_request.number) || (github.event_name == 'workflow_dispatch' && format('Manual Trigger: {0}', github.event.inputs.reason)) || 'Nightly Scheduled Run' }}
|
|
|
+ Commit: ${{ github.sha }}
|
|
|
+ **Integration Tests Report (Haiku)**
|
|
|
+ Haiku LLM Test Results:
|
|
|
+ ${{ env.INTEGRATION_TEST_REPORT_HAIKU }}
|
|
|
+ ---
|
|
|
+ **Integration Tests Report (DeepSeek)**
|
|
|
+ DeepSeek LLM Test Results:
|
|
|
+ ${{ env.INTEGRATION_TEST_REPORT_DEEPSEEK }}
|
|
|
+ ---
|
|
|
+ Download evaluation outputs (includes both Haiku and DeepSeek results): [Download](${{ steps.upload_results_artifact.outputs.artifact-url }})
|