
Fix issue #5076: Integration test github action (#5077)

Co-authored-by: Engel Nyst <enyst@users.noreply.github.com>
OpenHands 1 year ago
parent
commit
f0ca2239f3

+ 1 - 22
.github/workflows/eval-runner.yml

@@ -1,4 +1,4 @@
-name: Run Evaluation
+name: Run SWE-Bench Evaluation
 
 on:
   pull_request:
@@ -58,24 +58,6 @@ jobs:
           echo "api_key = \"$DEEPSEEK_API_KEY\"" >> config.toml
           echo "temperature = 0.0" >> config.toml
 
-      - name: Run integration test evaluation
-        env:
-          ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }}
-          RUNTIME: remote
-          SANDBOX_REMOTE_RUNTIME_API_URL: https://runtime.eval.all-hands.dev
-          EVAL_DOCKER_IMAGE_PREFIX: us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images
-
-        run: |
-          poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES
-
-          # get evaluation report
-          REPORT_FILE=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/deepseek-chat_maxiter_10_N* -name "report.md" -type f | head -n 1)
-          echo "REPORT_FILE: $REPORT_FILE"
-          echo "INTEGRATION_TEST_REPORT<<EOF" >> $GITHUB_ENV
-          cat $REPORT_FILE >> $GITHUB_ENV
-          echo >> $GITHUB_ENV
-          echo "EOF" >> $GITHUB_ENV
-
       - name: Run SWE-Bench evaluation
         env:
           ALLHANDS_API_KEY: ${{ secrets.ALLHANDS_EVAL_RUNTIME_API_KEY }}
@@ -143,9 +125,6 @@ jobs:
               **SWE-Bench Evaluation Report**
               ${{ env.SWEBENCH_REPORT }}
               ---
-              **Integration Tests Evaluation Report**
-              ${{ env.INTEGRATION_TEST_REPORT }}
-              ---
               You can download the full evaluation outputs [here](${{ env.ARTIFACT_URL }}).
 
       - name: Post to a Slack channel
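The step removed above exported its report through the GitHub Actions multiline environment-file syntax, and the new integration-runner workflow below reuses the same pattern twice. A minimal Python sketch of that mechanism, with the report path a hypothetical stand-in:

import os

# Append a multiline value to the step's environment file using the
# "NAME<<DELIMITER" heredoc syntax that GitHub Actions parses; later
# steps in the job then see it as the INTEGRATION_TEST_REPORT variable.
report_path = 'report.md'  # hypothetical path, for illustration only
with open(os.environ['GITHUB_ENV'], 'a') as env_file:
    env_file.write('INTEGRATION_TEST_REPORT<<EOF\n')
    with open(report_path) as report:
        env_file.write(report.read())
    env_file.write('\nEOF\n')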

+ 158 - 0
.github/workflows/integration-runner.yml

@@ -0,0 +1,158 @@
+name: Run Integration Tests
+
+on:
+  pull_request:
+    types: [labeled]
+  workflow_dispatch:
+    inputs:
+      reason:
+        description: 'Reason for manual trigger'
+        required: true
+        default: ''
+  schedule:
+    - cron: '30 22 * * *'  # Runs at 10:30pm UTC every day
+
+env:
+  N_PROCESSES: 10 # Global configuration for number of parallel processes for evaluation
+
+jobs:
+  run-integration-tests:
+    if: github.event.label.name == 'integration-test' || github.event_name == 'workflow_dispatch' || github.event_name == 'schedule'
+    runs-on: ubuntu-latest
+    permissions:
+      contents: "read"
+      id-token: "write"
+      pull-requests: "write"
+      issues: "write"
+    strategy:
+      matrix:
+        python-version: ["3.12"]
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Install poetry via pipx
+        run: pipx install poetry
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+          cache: "poetry"
+
+      - name: Comment on PR if 'integration-test' label is present
+        if: github.event_name == 'pull_request' && github.event.label.name == 'integration-test'
+        uses: KeisukeYamashita/create-comment@v1
+        with:
+          unique: false
+          comment: |
+            Hi! I started running the integration tests on your PR. You will receive a comment with the results shortly.
+
+      - name: Install Python dependencies using Poetry
+        run: poetry install --without evaluation,llama-index
+
+      - name: Configure config.toml for testing with Haiku
+        env:
+          LLM_MODEL: "litellm_proxy/claude-3-5-haiku-20241022"
+          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+          LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
+        run: |
+          echo "[llm.eval]" > config.toml
+          echo "model = \"$LLM_MODEL\"" >> config.toml
+          echo "api_key = \"$LLM_API_KEY\"" >> config.toml
+          echo "base_url = \"$LLM_BASE_URL\"" >> config.toml
+          echo "temperature = 0.0" >> config.toml
+
+      - name: Build environment
+        run: make build
+
+      - name: Run integration test evaluation for Haiku
+        env:
+          SANDBOX_FORCE_REBUILD_RUNTIME: True
+        run: |
+          poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES '' 'haiku_run'
+
+          # get integration tests report
+          REPORT_FILE_HAIKU=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/*haiku*_maxiter_10_N* -name "report.md" -type f | head -n 1)
+          echo "REPORT_FILE: $REPORT_FILE_HAIKU"
+          echo "INTEGRATION_TEST_REPORT_HAIKU<<EOF" >> $GITHUB_ENV
+          cat $REPORT_FILE_HAIKU >> $GITHUB_ENV
+          echo >> $GITHUB_ENV
+          echo "EOF" >> $GITHUB_ENV
+
+      - name: Wait a little bit
+        run: sleep 10
+
+      - name: Configure config.toml for testing with DeepSeek
+        env:
+          LLM_MODEL: "litellm_proxy/deepseek-chat"
+          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
+          LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
+        run: |
+          echo "[llm.eval]" > config.toml
+          echo "model = \"$LLM_MODEL\"" >> config.toml
+          echo "api_key = \"$LLM_API_KEY\"" >> config.toml
+          echo "base_url = \"$LLM_BASE_URL\"" >> config.toml
+          echo "temperature = 0.0" >> config.toml
+
+      - name: Run integration test evaluation for DeepSeek
+        env:
+          SANDBOX_FORCE_REBUILD_RUNTIME: True
+        run: |
+          poetry run ./evaluation/integration_tests/scripts/run_infer.sh llm.eval HEAD CodeActAgent '' $N_PROCESSES '' 'deepseek_run'
+
+          # get integration tests report
+          REPORT_FILE_DEEPSEEK=$(find evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent/deepseek*_maxiter_10_N* -name "report.md" -type f | head -n 1)
+          echo "REPORT_FILE: $REPORT_FILE_DEEPSEEK"
+          echo "INTEGRATION_TEST_REPORT_DEEPSEEK<<EOF" >> $GITHUB_ENV
+          cat $REPORT_FILE_DEEPSEEK >> $GITHUB_ENV
+          echo >> $GITHUB_ENV
+          echo "EOF" >> $GITHUB_ENV
+
+      - name: Create archive of evaluation outputs
+        run: |
+          TIMESTAMP=$(date +'%y-%m-%d-%H-%M')
+          cd evaluation/evaluation_outputs/outputs  # Change to the outputs directory
+          tar -czvf ../../../integration_tests_${TIMESTAMP}.tar.gz integration_tests/CodeActAgent/*  # Only include the actual result directories
+
+      - name: Upload evaluation results as artifact
+        uses: actions/upload-artifact@v4
+        id: upload_results_artifact
+        with:
+          name: integration-test-outputs-${{ github.run_id }}-${{ github.run_attempt }}
+          path: integration_tests_*.tar.gz
+
+      - name: Get artifact URLs
+        run: |
+          echo "ARTIFACT_URL=${{ steps.upload_results_artifact.outputs.artifact-url }}" >> $GITHUB_ENV
+
+      - name: Set timestamp and trigger reason
+        run: |
+          echo "TIMESTAMP=$(date +'%Y-%m-%d-%H-%M')" >> $GITHUB_ENV
+          if [[ "${{ github.event_name }}" == "pull_request" ]]; then
+            echo "TRIGGER_REASON=pr-${{ github.event.pull_request.number }}" >> $GITHUB_ENV
+          elif [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
+            echo "TRIGGER_REASON=manual-${{ github.event.inputs.reason }}" >> $GITHUB_ENV
+          else
+            echo "TRIGGER_REASON=nightly-scheduled" >> $GITHUB_ENV
+          fi
+
+      - name: Comment with results and artifact link
+        id: create_comment
+        uses: KeisukeYamashita/create-comment@v1
+        with:
+          # if triggered by PR, use PR number, otherwise use 5077 as fallback issue number for manual triggers
+          number: ${{ github.event_name == 'pull_request' && github.event.pull_request.number || 5077 }}
+          unique: false
+          comment: |
+              Triggered by: ${{ github.event_name == 'pull_request' && format('Pull Request (integration-test label on PR #{0})', github.event.pull_request.number) || (github.event_name == 'workflow_dispatch' && format('Manual Trigger: {0}', github.event.inputs.reason)) || 'Nightly Scheduled Run' }}
+              Commit: ${{ github.sha }}
+              **Integration Tests Report (Haiku)**
+              Haiku LLM Test Results:
+              ${{ env.INTEGRATION_TEST_REPORT_HAIKU }}
+              ---
+              **Integration Tests Report (DeepSeek)**
+              DeepSeek LLM Test Results:
+              ${{ env.INTEGRATION_TEST_REPORT_DEEPSEEK }}
+              ---
+              Download evaluation outputs (includes both Haiku and DeepSeek results): [Download](${{ steps.upload_results_artifact.outputs.artifact-url }})
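Each evaluation step above locates its report with find ... -name "report.md" | head -n 1, which yields an empty string (and a later cat failure) when no report was produced. A rough Python equivalent of that lookup, with the glob pattern assumed from the workflow's output layout:

from pathlib import Path

# Mirrors `find .../CodeActAgent/*haiku*_maxiter_10_N* -name "report.md" -type f | head -n 1`:
# take the first matching report, or None when the run produced nothing.
base = Path('evaluation/evaluation_outputs/outputs/integration_tests/CodeActAgent')
reports = sorted(p for p in base.glob('*haiku*_maxiter_10_N*/**/report.md') if p.is_file())
report_file = reports[0] if reports else None
print(f'REPORT_FILE: {report_file}')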

+ 14 - 3
evaluation/integration_tests/run_infer.py

@@ -48,13 +48,19 @@ def get_config(
             # use default base_container_image
             enable_auto_lint=True,
             use_host_network=False,
-            timeout=100,
+            timeout=300,
+            # Add platform to the sandbox config to solve issue 4401
+            platform='linux/amd64',
             api_key=os.environ.get('ALLHANDS_API_KEY', None),
             remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
+            keep_runtime_alive=False,
+            remote_runtime_init_timeout=3600,
         ),
         # do not mount workspace
         workspace_base=None,
         workspace_mount_path=None,
+        # debug
+        debug=True,
     )
     config.set_llm_config(
         update_llm_config_for_completions_logging(
@@ -129,7 +135,12 @@ def process_instance(
     # # result evaluation
     # # =============================================
 
-    histories = [event_to_dict(event) for event in state.history]
+    histories = state.history
+
+    # some basic checks
+    logger.info(f'Total events in history: {len(histories)}')
+    assert len(histories) > 0, 'History should not be empty'
+
     test_result: TestResult = test_class.verify_result(runtime, histories)
     metrics = state.metrics.get() if state.metrics else None
 
@@ -139,7 +150,7 @@ def process_instance(
         instance=instance.to_dict(),
         instruction=instruction,
         metadata=metadata,
-        history=histories,
+        history=[event_to_dict(event) for event in histories],
         metrics=metrics,
         error=state.last_error if state and state.last_error else None,
         test_result=test_result.model_dump(),
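
The reordering above fixes a type mismatch: the test classes' verify_result (see t05/t06 below) runs isinstance checks against MessageAction and friends, which always fail once events have been converted to dicts. A compressed sketch of the corrected flow, with names taken from the surrounding diff:

# verify_result consumes Event objects directly; event_to_dict conversion
# is deferred until the EvalOutput record is serialized.
histories = state.history  # list[Event], no longer list[dict]
assert len(histories) > 0, 'History should not be empty'
test_result = test_class.verify_result(runtime, histories)
serialized_history = [event_to_dict(event) for event in histories]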

+ 23 - 11
evaluation/integration_tests/tests/t05_simple_browsing.py

@@ -108,6 +108,8 @@ class Test(BaseIntegrationTest):
 
     @classmethod
     def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult:
+        from openhands.core.logger import openhands_logger as logger
+
         # check if the "The answer is OpenHands is all you need!" is in any message
         message_actions = [
             event
@@ -116,19 +118,29 @@ class Test(BaseIntegrationTest):
                 event, (MessageAction, AgentFinishAction, AgentDelegateObservation)
             )
         ]
+        logger.debug(f'Total message-like events: {len(message_actions)}')
+
         for event in message_actions:
-            if isinstance(event, AgentDelegateObservation):
-                content = event.content
-            elif isinstance(event, AgentFinishAction):
-                content = event.outputs.get('content', '')
-            elif isinstance(event, MessageAction):
-                content = event.content
-            else:
-                raise ValueError(f'Unknown event type: {type(event)}')
+            try:
+                if isinstance(event, AgentDelegateObservation):
+                    content = event.content
+                elif isinstance(event, AgentFinishAction):
+                    content = event.outputs.get('content', '')
+                elif isinstance(event, MessageAction):
+                    content = event.content
+                else:
+                    logger.warning(f'Unexpected event type: {type(event)}')
+                    continue
 
-            if 'OpenHands is all you need!' in content:
-                return TestResult(success=True)
+                if 'OpenHands is all you need!' in content:
+                    return TestResult(success=True)
+            except Exception as e:
+                logger.error(f'Error processing event: {e}')
+
+        logger.debug(
+            f'Total messages: {len(message_actions)}. Messages: {message_actions}'
+        )
         return TestResult(
             success=False,
-            reason=f'The answer is not found in any message. Total messages: {len(message_actions)}. Messages: {message_actions}',
+            reason=f'The answer is not found in any message. Total messages: {len(message_actions)}.',
         )

+ 29 - 17
evaluation/integration_tests/tests/t06_github_pr_browsing.py

@@ -14,7 +14,9 @@ class Test(BaseIntegrationTest):
 
     @classmethod
     def verify_result(cls, runtime: Runtime, histories: list[Event]) -> TestResult:
-        # check if the "The answer is OpenHands is all you need!" is in any message
+        from openhands.core.logger import openhands_logger as logger
+
+        # check if the license information is in any message
         message_actions = [
             event
             for event in histories
@@ -22,23 +24,33 @@ class Test(BaseIntegrationTest):
                 event, (MessageAction, AgentFinishAction, AgentDelegateObservation)
             )
         ]
+        logger.info(f'Total message-like events: {len(message_actions)}')
+
         for event in message_actions:
-            if isinstance(event, AgentDelegateObservation):
-                content = event.content
-            elif isinstance(event, AgentFinishAction):
-                content = event.outputs.get('content', '')
-            elif isinstance(event, MessageAction):
-                content = event.content
-            else:
-                raise ValueError(f'Unknown event type: {type(event)}')
-
-            if (
-                'non-commercial' in content
-                or 'MIT' in content
-                or 'Apache 2.0' in content
-            ):
-                return TestResult(success=True)
+            try:
+                if isinstance(event, AgentDelegateObservation):
+                    content = event.content
+                elif isinstance(event, AgentFinishAction):
+                    content = event.outputs.get('content', '')
+                elif isinstance(event, MessageAction):
+                    content = event.content
+                else:
+                    logger.warning(f'Unexpected event type: {type(event)}')
+                    continue
+
+                if (
+                    'non-commercial' in content
+                    or 'MIT' in content
+                    or 'Apache 2.0' in content
+                ):
+                    return TestResult(success=True)
+            except Exception as e:
+                logger.error(f'Error processing event: {e}')
+
+        logger.debug(
+            f'Total messages: {len(message_actions)}. Messages: {message_actions}'
+        )
         return TestResult(
             success=False,
-            reason=f'The answer is not found in any message. Total messages: {len(message_actions)}. Messages: {message_actions}',
+            reason=f'The answer is not found in any message. Total messages: {len(message_actions)}.',
         )