1 an în urmă · 7fc57650f3
--- a/evaluation/biocoder/README.md
+++ b/evaluation/biocoder/README.md
@@ -0,0 +1,59 @@
 
				+# BioCoder Evaluation with OpenDevin
			
 
				+
			
 
				+Implements evaluation of agents on the BioCoder benchmark, introduced in [BioCoder: A Benchmark for Bioinformatics Code Generation with Large Language Models](https://arxiv.org/abs/2308.16458). Please see [here](https://github.com/bigcode-project/bigcode-evaluation-harness/blob/main/bigcode_eval/tasks/humanevalpack.py) for the reference implementation used in the paper.
			
 
				+
			
 
				+## Setup Environment
			
 
				+
			
 
				+Please follow [this document](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md) to set up a local development environment for OpenDevin.
			
 
				+
			
 
				+
			
 
				+## Configure OpenDevin and your LLM
			
 
				+Create a `config.toml` file if it does not exist at the root of the workspace. Please check [README.md](../../README.md) for how to set this up.
			
 
				+
			
 
				+## BioCoder Docker Image
			
 
				+In the opendevin branch of the Biocoder repository, we have slightly modified our original Docker image to work with the OpenDevin environment. In the Docker image are testing scripts (`/testing/start_test_opendevin.py` and aux files in `/testing_files/`) to assist with evaluation. Additionally, we have installed all dependencies, including OpenJDK, mamba (with Python 3.6), and many system libraries. Notably, we have **not** packaged all repositories into the image, so they are downloaded at runtime.
			
 
				+
			
 
				+**Before first execution, pull our Docker image with the following command**
			
 
				+```bash
			
 
				+docker pull public.ecr.aws/i5g0m1f6/eval_biocoder:v1.0
			
 
				+```
			
 
				+
			
 
				+To reproduce this image, please see the Dockerfile_Opendevin in the `biocoder` repository.
			
 
				+
			
 
				+## Start the evaluation
			
 
				+
			
 
				+
			
 
				+```bash
			
 
				+./evaluation/biocoder/scripts/run_infer.sh [model_config] [agent] [eval_limit]
			
 
				+```
			
 
				+
			
 
				+where `model_config` is mandatory, while `agent` and `eval_limit` are optional.
			
 
				+
			
 
				+- `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your
			
 
				+LLM settings, as defined in your `config.toml`.
			
 
				+
			
 
				+- `agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks, defaulting
			
 
				+to `CodeActAgent`.
			
 
				+
			
 
				+- `eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit` instances. By default it infers all instances.
			
 
				+
			
 
				+Let's say you'd like to run 1 instance using `eval_gpt4o_2024_05_13` and CodeActAgent,
			
 
				+then your command would be:
			
 
				+
			
 
				+## Examples
			
 
				+
			
 
				+```bash
			
 
				+./evaluation/biocoder/scripts/run_infer.sh eval_gpt4o_2024_05_13 CodeActAgent 1
			
 
				+```
			
 
				+
			
 
				+## Reference
			
 
				+```
			
 
				+@misc{tang2024biocoder,
			
 
				+      title={BioCoder: A Benchmark for Bioinformatics Code Generation with Large Language Models},
			
 
				+      author={Xiangru Tang and Bill Qian and Rick Gao and Jiakang Chen and Xinyun Chen and Mark Gerstein},
			
 
				+      year={2024},
			
 
				+      eprint={2308.16458},
			
 
				+      archivePrefix={arXiv},
			
 
				+      primaryClass={cs.LG}
			
 
				+}
			
 
				+```
			
--- a/evaluation/biocoder/biocoder_env_box.py
+++ b/evaluation/biocoder/biocoder_env_box.py
@@ -0,0 +1,396 @@
 
				+import json
			
 
				+import os
			
 
				+import re
			
 
				+import sys
			
 
				+from collections import defaultdict
			
 
				+from dataclasses import dataclass
			
 
				+
			
 
				+from datasets import load_dataset
			
 
				+
			
 
				+from opendevin.core.config import config
			
 
				+from opendevin.core.logger import opendevin_logger as logger
			
 
				+from opendevin.runtime.docker.ssh_box import DockerSSHBox
			
 
				+from opendevin.runtime.plugins import (
			
 
				+    JupyterRequirement,
			
 
				+    PluginRequirement,
			
 
				+    SWEAgentCommandsRequirement,
			
 
				+)
			
 
				+
			
 
				+BIOCODER_BENCH_CONTAINER_IMAGE = 'public.ecr.aws/i5g0m1f6/eval_biocoder:v1.0'
			
 
				+
			
 
				+
			
 
				+@dataclass
			
 
				+class BiocoderData:
			
 
				+    filePath: str
			
 
				+    numLines: int
			
 
				+    lineStart: int
			
 
				+    lineEnd: int
			
 
				+    signature: str
			
 
				+    comment: str
			
 
				+    content: str
			
 
				+    repository: str
			
 
				+    promptSummaryOnly: str
			
 
				+    contextCode: str
			
 
				+    goldenCode: str
			
 
				+    test_case_id: str
			
 
				+    language: str
			
 
				+
			
 
				+    def to_dict(self):
			
 
				+        return {
			
 
				+            'filePath': self.filePath,
			
 
				+            'numLines': self.numLines,
			
 
				+            'lineStart': self.lineStart,
			
 
				+            'lineEnd': self.lineEnd,
			
 
				+            'signature': self.signature,
			
 
				+            'comment': self.comment,
			
 
				+            'content': self.content,
			
 
				+            'repository': self.repository,
			
 
				+            'promptSummaryOnly': self.promptSummaryOnly,
			
 
				+            'contextCode': self.contextCode,
			
 
				+            'goldenCode': self.goldenCode,
			
 
				+            'test_case_id': self.test_case_id,
			
 
				+            'language': self.language,
			
 
				+        }
			
 
				+
			
 
				+
			
 
				+def get_likely_indent_size(array_of_tabs) -> int:
			
 
				+    sizes = defaultdict(int)
			
 
				+
			
 
				+    for i in range(len(array_of_tabs) - 1):
			
 
				+        diff = array_of_tabs[i + 1] - array_of_tabs[i]
			
 
				+        if diff > 0:
			
 
				+            sizes[diff] += 1
			
 
				+    if len(sizes) == 0:
			
 
				+        return 4
			
 
				+    return int(max(sizes, key=sizes.get))
			
 
				+
			
 
				+
			
 
				+class BiocoderSSHBox(DockerSSHBox):
			
 
    def __init__(
        self,
        container_image: str,
        timeout: int = 120,
        sid: str | None = None,
        biocoder_instance_id: str | None = None,
        biocoder_instance: BiocoderData | None = None,
        skip_workspace_mount: bool = True,
        sandbox_plugins: list[PluginRequirement] = [],  # noqa: B006
        biocoder_cache_folder: str = 'biocoder_cache',
        workspace_dir_name: str | None = None,
    ):
        """Create a sandbox bound to a single BioCoder instance.

        Args:
            container_image: image handed to ``DockerSSHBox``; must not be None.
            timeout: forwarded to ``DockerSSHBox``.
            sid: forwarded to ``DockerSSHBox``.
            biocoder_instance_id: id of the benchmark instance; required.
            biocoder_instance: the benchmark record the box works on.
            skip_workspace_mount: when true, ``volumes`` drops the workspace bind.
            sandbox_plugins: plugins passed to ``init_plugins``.
            biocoder_cache_folder: workspace sub-folder used to exchange files
                with ``/testing_files``.
            workspace_dir_name: stored on the instance as-is.

        Raises:
            ValueError: if ``biocoder_instance_id`` is None.
            AssertionError: if ``container_image`` is None.
        """
        if biocoder_instance_id is None:
            raise ValueError('biocoder_instance_id must be provided')

        # Which benchmark instance this box serves.
        self.biocoder_instance_id = biocoder_instance_id
        self.biocoder_instance = biocoder_instance

        # Workspace layout; the config values are snapshotted at construction time.
        self.skip_workspace_mount = skip_workspace_mount
        self.biocoder_cache_folder = biocoder_cache_folder
        self.workspace_dir_name = workspace_dir_name
        self.workspace_base = config.workspace_base
        self.workspace_mount_path = config.workspace_mount_path

        # Recorded by remove_code(); read back by get_changed_code().
        self.first_line_after_removed = None

        # Host-side paths of the harness files, assigned by get_box_for_instance().
        self.context_path = self.generated_path = self.golden_path = None

        assert container_image is not None, (
            'container_image is required for BiocoderBenchSSHBox!'
        )
        super().__init__(container_image, timeout, sid)
        self.init_plugins(sandbox_plugins)
			
 
				+
			
 
    @property
    def volumes(self):
        """Volume mapping of the parent box, minus the workspace bind if requested.

        When ``skip_workspace_mount`` is set, every entry whose ``'bind'`` equals
        ``sandbox_workspace_dir`` is filtered out; otherwise the parent's mapping
        is returned unchanged.
        """
        inherited = super().volumes
        if not self.skip_workspace_mount:
            return inherited
        return {
            host_path: spec
            for host_path, spec in inherited.items()
            if spec['bind'] != self.sandbox_workspace_dir
        }
			
 
				+
			
 
    def get_target_filepath(self):
        """Return the path of the file that contains the target function.

        The path is ``<workspace_mount_path>/<repo name>/<filePath>``, where the
        repo name is the second '/'-separated component of ``repository``.
        """
        instance = self.biocoder_instance
        repo_dir = instance.repository.split('/')[1]
        return os.path.join(self.workspace_mount_path, repo_dir, instance.filePath)
			
 
				+
			
 
    def get_changed_code(self, include_signature=False):
        """Read the (agent-edited) function text back from the target file.

        The file is split on newlines and lines are collected starting at list
        index ``lineStart`` (or ``lineStart - 1``, i.e. the signature line, when
        ``include_signature`` is true). Collection stops just before the first
        line whose stripped text equals the stripped sentinel recorded by
        ``remove_code`` in ``first_line_after_removed``.

        Args:
            include_signature: also return the line preceding the body.

        Returns:
            The collected lines joined with newlines.

        If no sentinel has been recorded, a warning is logged and everything up
        to the end of the file is returned; previously this case crashed with
        ``AttributeError`` on ``None.strip()`` right after the warning.
        """
        target_filepath = self.get_target_filepath()

        sentinel = self.first_line_after_removed
        if sentinel is None:
            logger.warning('First line after removed is None')
        else:
            sentinel = sentinel.strip()

        # Clamp at 0 so an unexpected lineStart of 0 cannot wrap to the last line.
        start = max(self.biocoder_instance.lineStart - (1 if include_signature else 0), 0)

        with open(target_filepath, 'r') as f:
            lines = f.read().split('\n')

        selected_lines = []
        for line in lines[start:]:
            if sentinel is not None and line.strip() == sentinel:
                break
            selected_lines.append(line)
        return '\n'.join(selected_lines)
			
 
				+
			
 
    def copy_changed_code(self):
        """Write the edited function to ``generated_path`` and sync it to /testing_files.

        The code (signature included) is read first, then written to the host-side
        generated file, and finally the whole cache folder is copied into
        ``/testing_files`` inside the sandbox.
        """
        # Read before opening the output so a read failure does not truncate it.
        generated_source = self.get_changed_code(include_signature=True)
        with open(self.generated_path, 'w') as out:
            out.write(generated_source)

        self.execute_and_check(
            f'cp -r /workspace/{self.biocoder_cache_folder}/* /testing_files',
            'Failed to copy the files',
        )
			
 
				+
			
 
    def remove_code(self):
        """Replace the target function body with a TODO placeholder.

        The target file is split on newlines; everything between list indices
        ``lineStart`` and ``lineEnd`` is replaced by one comment line
        (indented one guessed indent step deeper than the signature) followed by
        two empty lines, and the file is rewritten in place. The sentinel later
        used by ``get_changed_code`` is stored in ``first_line_after_removed``.

        Raises:
            KeyError: if the instance language has no known comment marker.
        """
        # Line-comment markers. Covers every language get_box_for_instance has a
        # file extension for, so those instances no longer fail with KeyError.
        comment_prefix = {
            'python': '#',
            'java': '//',
            'c': '//',
            'cpp': '//',
            'javascript': '//',
            'typescript': '//',
        }

        def get_indent_size(s: str) -> int:
            # Width of the leading whitespace run.
            return len(re.match(r'\s*', s).group())

        target_filepath = self.get_target_filepath()
        line_start = self.biocoder_instance.lineStart
        line_end = self.biocoder_instance.lineEnd

        with open(target_filepath, 'r') as f:
            lines = f.read().split('\n')

        signature_line = lines[line_start - 1]
        indent_size = get_likely_indent_size([get_indent_size(line) for line in lines])
        comment_indent_size = get_indent_size(signature_line) + indent_size
        placeholder = f"{' '*comment_indent_size+comment_prefix[self.biocoder_instance.language.lower()]}TODO: replace with your code here"
        lines = lines[:line_start] + [placeholder] + ([''] * 2) + lines[line_end:]

        # NOTE(review): the scan starts on the placeholder line itself, which is
        # never blank, so the sentinel is always the placeholder. If the intent
        # was "first non-blank line after the removed body", the start index
        # should probably be line_start + 1 -- confirm before changing, since
        # it affects what get_changed_code returns.
        first_line_after_removed_index = line_start
        # Bounds check comes first so running off the end cannot raise IndexError.
        while (
            first_line_after_removed_index < len(lines)
            and len(lines[first_line_after_removed_index].strip()) == 0
        ):
            first_line_after_removed_index += 1
        if first_line_after_removed_index < len(lines):
            self.first_line_after_removed = lines[first_line_after_removed_index]
        else:
            # Nothing but blank lines remained; there is no sentinel to record.
            self.first_line_after_removed = None

        with open(target_filepath, 'w') as f:
            f.write('\n'.join(lines))
			
 
				+
			
 
				+        # with open(target_filepath, 'r') as f:
			
 
				+        #     print("="*10+"MODIFIED"+"="*10)
			
 
				+        #     print(f.read())
			
 
				+
			
 
    def execute_and_check(self, cmd: str, error_msg: str) -> tuple[int, str]:
        """Run ``cmd`` in the sandbox and terminate the process if it fails.

        Args:
            cmd: command passed to ``self.execute``.
            error_msg: message logged at error level on a non-zero exit code.

        Returns:
            ``(exit_code, output)`` of the command; only reached when the exit
            code is 0.

        Raises:
            SystemExit: with status 1 when the command exits non-zero.
        """
        exit_code, output = self.execute(cmd)
        if exit_code == 0:
            return exit_code, output

        logger.error(error_msg)
        sys.exit(1)
			
 
				+
			
 
    @classmethod
    def get_box_for_instance(
        cls,
        instance,
        workspace_dir_name=None,
        skip_workspace_mount: bool = False,
        workspace_mount_path: str | None = None,
        sandbox_plugins: list[PluginRequirement] = [],  # noqa: B006
    ) -> 'BiocoderSSHBox':
        """Build a sandbox for ``instance`` and prepare its workspace.

        Steps:
          1. Point ``config.workspace_base`` / ``config.workspace_mount_path`` at a
             per-instance directory while the box is constructed, then restore
             them (``config.enable_auto_lint`` is switched on and left on).
          2. Write ``context.*``, ``generated.*``, ``golden.*`` and
             ``testcase_biocoder.json`` into the cache folder on the host.
             ``generated.*`` initially holds the golden code.
          3. Inside the sandbox: download and unzip the repository archive, copy
             the cache folder to ``/testing_files`` and open up permissions on
             ``/workspace``.

        Args:
            instance: benchmark record (``BiocoderData``-like attributes).
            workspace_dir_name: directory under ``config.workspace_base``; derived
                from repository, test case id and pid when None.
            skip_workspace_mount: forwarded to the constructor.
            workspace_mount_path: accepted for interface compatibility; this
                method does not read it.
            sandbox_plugins: forwarded to the constructor.

        Returns:
            The initialised sandbox.

        Raises:
            KeyError: if ``instance.language`` has no known file extension.
            SystemExit: if any setup command in the sandbox fails
                (see ``execute_and_check``).
        """
        import shlex  # local import: only needed to quote the download URL

        if workspace_dir_name is None:
            workspace_dir_name = f'{instance.repository}__{instance.test_case_id[:10]}__{os.getpid()}'.replace(
                '/', '__'
            )

        workspace_base = str(os.path.join(config.workspace_base, workspace_dir_name))
        old_workspace_base = config.workspace_base
        old_workspace_mount_path = config.workspace_mount_path

        try:
            # The constructor snapshots these config values, so they are
            # overridden only for the duration of the construction.
            config.workspace_base = workspace_base
            config.workspace_mount_path = workspace_base

            # linting python after editing helps LLM fix indentations
            config.enable_auto_lint = True

            # create folder for transferring files back/forth
            biocoder_cache_folder = 'biocoder_cache'
            cache_dir = os.path.join(workspace_base, biocoder_cache_folder)
            os.makedirs(cache_dir, exist_ok=True)

            file_ext = {
                'python': 'py',
                'java': 'java',
                'c': 'c',
                'cpp': 'cpp',
                'javascript': 'js',
                'typescript': 'ts',
            }[instance.language.lower()]

            context_path = os.path.join(cache_dir, 'context.' + file_ext)
            generated_path = os.path.join(cache_dir, 'generated.' + file_ext)
            golden_path = os.path.join(cache_dir, 'golden.' + file_ext)

            with open(context_path, 'w') as f:
                f.write(instance.contextCode)
            # The generated file starts out as a copy of the golden code.
            with open(generated_path, 'w') as f:
                f.write(instance.goldenCode)
            with open(golden_path, 'w') as f:
                f.write(instance.goldenCode)

            testcase_json = {
                'test_case_id': instance.test_case_id,
                'num_cases': 1000,
                'language': instance.language.lower(),
            }
            with open(os.path.join(cache_dir, 'testcase_biocoder.json'), 'w') as f:
                f.write(json.dumps(testcase_json, indent=4))

            sandbox = cls(
                container_image=BIOCODER_BENCH_CONTAINER_IMAGE,
                biocoder_instance_id=instance.test_case_id,
                biocoder_instance=instance,
                skip_workspace_mount=skip_workspace_mount,
                sandbox_plugins=sandbox_plugins,
                biocoder_cache_folder=biocoder_cache_folder,
                workspace_dir_name=workspace_dir_name,
            )
        finally:
            config.workspace_base = old_workspace_base
            config.workspace_mount_path = old_workspace_mount_path

        sandbox.context_path = context_path
        sandbox.generated_path = generated_path
        sandbox.golden_path = golden_path

        logger.info(f'SSH box started for instance {instance.test_case_id}.')
        # cd to the workspace
        exit_code, output = sandbox.execute_and_check(
            'cd /workspace', 'Failed to cd to workspace'
        )
        logger.info(f'cd to workspace: {output}')

        # download repository archive
        repository_url = f"https://biocoder.lilbillbiscuit.com/repos/{instance.repository.split('/')[1]}.zip"
        # The repository name comes from dataset content, so quote it before it
        # reaches the shell. Ordinary URLs pass through shlex.quote unchanged.
        exit_code, output = sandbox.execute_and_check(
            'wget -O repo.zip ' + shlex.quote(repository_url),
            'Failed to download the repository',
        )
        logger.info(f'Downloaded the repository: {output}')
        exit_code, output = sandbox.execute_and_check(
            'unzip -o -q repo.zip', 'Failed to unzip the repository'
        )
        logger.info(f'Unzipped the repository: {output}')

        # copy the context, generated and golden files to the /testing_files folder
        sandbox.execute_and_check(
            f'cp -r /workspace/{biocoder_cache_folder}/* /testing_files',
            'Failed to copy the files',
        )

        # chmod 777
        sandbox.execute_and_check(
            'chmod -R 777 /workspace',
            'Failed to chmod the files',
        )

        return sandbox
			
 
				+
			
 
				+
			
 
				+if __name__ == '__main__':
			
 
				+    biocoder_dataset = load_dataset('Lilbillbiscuit/biocoder_public')
			
 
				+    EXAMPLE_INSTANCE = biocoder_dataset['test'][0]
			
 
				+    EXAMPLE_INSTANCE = BiocoderData(**EXAMPLE_INSTANCE)
			
 
				+
			
 
				+    sandbox = BiocoderSSHBox.get_box_for_instance(
			
 
				+        instance=EXAMPLE_INSTANCE,
			
 
				+        workspace_mount_path='/home/ubuntu/OpenDevinBioCoder/workspace',
			
 
				+        skip_workspace_mount=False,
			
 
				+        sandbox_plugins=[JupyterRequirement(), SWEAgentCommandsRequirement()],
			
 
				+    )
			
 
				+
			
 
				+    # PRE TEST
			
 
				+    exit_code, output = sandbox.execute_and_check(
			
 
				+        'cd /testing',
			
 
				+        'Failed to cd /testing',
			
 
				+    )
			
 
				+    logger.info(f'cd $REPO_PATH: {output}')
			
 
				+
			
 
				+    exit_code, output = sandbox.execute_and_check(
			
 
				+        'whoami',
			
 
				+        'Failed to run whoami',
			
 
				+    )
			
 
				+    logger.info(f'whoami: {output}')
			
 
				+
			
 
				+    # TEST
			
 
				+    exit_code, output = sandbox.execute(
			
 
				+        '/home/devin/mambaforge/bin/mamba run -n test python3 /testing/start_test_opendevin.py'
			
 
				+    )
			
 
				+    assert exit_code == 0, 'Expected exit code 0 (this should have passed)'
			
 
				+    logger.info(f'$TEST_CMD:\n{output}')
			
 
				+
			
 
				+    exit_code, output = sandbox.execute_and_check(
			
 
				+        'cat /testing_files/results_biocoder.json', 'Failed to read the result file'
			
 
				+    )
			
 
				+
			
 
				+    print(output)
			
 
				+    json_obj = json.loads(output)
			
 
				+    if json_obj['result'] == 'pass':
			
 
				+        print('PASS')
			
 
				+    else:
			
 
				+        print('FAIL')
			
 
				+
			
 
				+    bg_cmd = sandbox.execute_in_background(
			
 
				+        "while true; do echo 'dot ' && sleep 10; done"
			
 
				+    )
			
 
				+
			
 
				+    sys.stdout.flush()
			
 
				+    try:
			
 
				+        while True:
			
 
				+            try:
			
 
				+                user_input = input('>>> ')
			
 
				+            except EOFError:
			
 
				+                logger.info('Exiting...')
			
 
				+                break
			
 
				+            if user_input.lower() == 'exit':
			
 
				+                logger.info('Exiting...')
			
 
				+                break
			
 
				+            if user_input.lower() == 'kill':
			
 
				+                sandbox.kill_background(bg_cmd.pid)
			
 
				+                logger.info('Background process killed')
			
 
				+                continue
			
 
				+            exit_code, output = sandbox.execute(user_input)
			
 
				+            logger.info('exit code: %d', exit_code)
			
 
				+            logger.info(output)
			
 
				+            if bg_cmd.pid in sandbox.background_commands:
			
 
				+                logs = sandbox.read_logs(bg_cmd.pid)
			
 
				+                logger.info('background logs: %s', logs)
			
 
				+            sys.stdout.flush()
			
 
				+    except KeyboardInterrupt:
			
 
				+        logger.info('Exiting...')
			
 
				+    sandbox.close()
			
--- a/evaluation/biocoder/run_infer.py
+++ b/evaluation/biocoder/run_infer.py
@@ -0,0 +1,393 @@
 
				+import asyncio
			
 
				+import json
			
 
				+import logging
			
 
				+import multiprocessing as mp
			
 
				+import os
			
 
				+import pathlib
			
 
				+import subprocess
			
 
				+import time
			
 
				+from concurrent.futures import ProcessPoolExecutor
			
 
				+
			
 
				+import pandas as pd
			
 
				+from datasets import load_dataset
			
 
				+from tqdm import tqdm
			
 
				+
			
 
				+import agenthub
			
 
				+from evaluation.biocoder.biocoder_env_box import BiocoderData, BiocoderSSHBox
			
 
				+from opendevin.controller.state.state import State
			
 
				+from opendevin.core.config import args, config, get_llm_config_arg
			
 
				+from opendevin.core.logger import get_console_handler
			
 
				+from opendevin.core.logger import opendevin_logger as logger
			
 
				+from opendevin.core.main import main
			
 
				+from opendevin.events.action import MessageAction
			
 
				+from opendevin.events.serialization.event import event_to_dict
			
 
				+
			
 
				+
			
 
				+def cleanup():
			
 
				+    print('Cleaning up child processes...')
			
 
				+    for process in mp.active_children():
			
 
				+        print(f'Terminating child process: {process.name}')
			
 
				+        process.terminate()
			
 
				+        process.join()
			
 
				+
			
 
				+
			
 
				+def codeact_user_response(state: State) -> str:
			
 
				+    msg = (
			
 
				+        'Please continue working on the task on whatever approach you think is suitable.\n'
			
 
				+        'If you think you have modified the code in a way that fixes the issue, please run the following command: <execute_bash> exit </execute_bash>.\n'
			
 
				+        'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP OR USE THE INTERNET TO SOLVE THIS TASK.\n'
			
 
				+    )
			
 
				+    if state.history:
			
 
				+        user_msgs = [
			
 
				+            action
			
 
				+            for action, _ in state.history
			
 
				+            if isinstance(action, MessageAction) and action.source == 'user'
			
 
				+        ]
			
 
				+        if len(user_msgs) >= 2:
			
 
				+            # let the agent know that it can give up when it has tried 3 times
			
 
				+            return (
			
 
				+                msg
			
 
				+                + 'If you want to give up, run: <execute_bash> exit </execute_bash>.\n'
			
 
				+            )
			
 
				+    return msg
			
 
				+
			
 
				+
			
 
def monologue_user_response(state: State) -> str:
    """Fake-user callback registered for MonologueAgent; it always raises.

    Args:
        state: controller state (unused).

    Raises:
        NotImplementedError: unconditionally, because this agent is not expected
            to ask for a user response.
    """
    raise NotImplementedError('MonologueAgent should never ask for user responses.')
			
 
				+
			
 
				+
			
 
# Agent class name -> callback that fabricates the "user" reply for that agent.
# Looked up with .get(agent_class), so an unlisted agent yields None.
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
    'CodeActAgent': codeact_user_response,
    'MonologueAgent': monologue_user_response,
}

# Agent class name -> text appended to the task instruction.
# Looked up with .get(agent_class, ''), so an unlisted agent gets no suffix.
AGENT_CLS_TO_INST_SUFFIX = {
    'CodeActAgent': 'When you think you have fixed the issue through code changes, please run the following command: <execute_bash> exit </execute_bash>.\n'
}
			
 
				+
			
 
				+
			
 
				+def get_test_result(instance, sandbox, workspace_dir_name):
			
 
				+    test_result = {'result': {}, 'metadata': {}}
			
 
				+    try:
			
 
				+        code = sandbox.get_changed_code(include_signature=True)
			
 
				+        sandbox.copy_changed_code()
			
 
				+        test_result['metadata']['1_copy_change_success'] = True
			
 
				+        test_result['metadata']['1_copy_change_code'] = code
			
 
				+    except Exception:
			
 
				+        logger.error('Error fetching changed code for this instance')
			
 
				+        test_result['metadata']['1_copy_change_success'] = False
			
 
				+        test_result['metadata']['1_copy_change_code'] = None
			
 
				+
			
 
				+    exit_code, output = sandbox.execute_and_check(
			
 
				+        'cd /testing',
			
 
				+        'Failed to cd /testing',
			
 
				+    )
			
 
				+    logger.info(f'cd $REPO_PATH: {output}')
			
 
				+
			
 
				+    exit_code, output = sandbox.execute_and_check(
			
 
				+        'whoami',
			
 
				+        'Failed to run whoami',
			
 
				+    )
			
 
				+    logger.info(f'whoami: {output}')
			
 
				+
			
 
				+    exit_code, output = sandbox.execute(
			
 
				+        '/home/devin/mambaforge/bin/mamba run -n test python3 /testing/start_test_opendevin.py'
			
 
				+    )
			
 
				+    logger.info(f'$TEST_CMD:\n{output}')
			
 
				+
			
 
				+    exit_code, output = sandbox.execute_and_check(
			
 
				+        'cat /testing_files/results_biocoder.json', 'Failed to read the result file'
			
 
				+    )
			
 
				+    if exit_code == 0:
			
 
				+        test_result['metadata']['2_run_test_success'] = True
			
 
				+        test_result['metadata']['2_run_test_result'] = str(output)
			
 
				+    else:
			
 
				+        test_result['metadata']['2_run_test_success'] = False
			
 
				+        test_result['metadata']['2_run_test_result'] = str(output)
			
 
				+    json_obj = json.loads(output)
			
 
				+    test_result['result'] = json_obj['result']
			
 
				+
			
 
				+    return test_result
			
 
				+
			
 
				+
			
 
				+def process_instance(
			
 
				+    instance,
			
 
				+    agent_class,
			
 
				+    metadata,
			
 
				+    skip_workspace_mount,
			
 
				+    eval_output_dir,
			
 
				+    reset_logger: bool = True,
			
 
				+):
			
 
				+    instance = BiocoderData(**instance)
			
 
				+    print(instance)
			
 
				+    workspace_dir_name = (
			
 
				+        f'{instance.repository}__{instance.test_case_id[:10]}__{os.getpid()}'.replace(
			
 
				+            '/', '__'
			
 
				+        )
			
 
				+    )
			
 
				+    workspace_mount_path = os.path.join(config.workspace_base, workspace_dir_name)
			
 
				+    # create process-specific workspace dir
			
 
				+    # if `not skip_workspace_mount` - we will create a workspace directory for EACH process
			
 
				+    # so that different agent don't interfere with each other.
			
 
				+    if not skip_workspace_mount:
			
 
				+        workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
			
 
				+        pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
			
 
				+
			
 
				+    # Setup the logger properly, so you can run multi-processing to parallize the evaluation
			
 
				+    if reset_logger:
			
 
				+        # Set up logger
			
 
				+        log_file = os.path.join(
			
 
				+            eval_output_dir, 'logs', f'instance_{instance.test_case_id}.log'
			
 
				+        )
			
 
				+        # Remove all existing handlers from logger
			
 
				+        for handler in logger.handlers[:]:
			
 
				+            logger.removeHandler(handler)
			
 
				+        # add back the console handler to print ONE line
			
 
				+        logger.addHandler(get_console_handler())
			
 
				+        logger.info(
			
 
				+            f'Starting evaluation for instance {instance.test_case_id}.\nHint: run "tail -f {log_file}" to see live logs in a seperate shell'
			
 
				+        )
			
 
				+        # Remove all existing handlers from logger
			
 
				+        for handler in logger.handlers[:]:
			
 
				+            logger.removeHandler(handler)
			
 
				+        file_handler = logging.FileHandler(log_file)
			
 
				+        file_handler.setFormatter(
			
 
				+            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
			
 
				+        )
			
 
				+        logger.addHandler(file_handler)
			
 
				+
			
 
				+    if not skip_workspace_mount:
			
 
				+        logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
			
 
				+
			
 
				+    # NOTE: this is something special we do for SWE-Bench due to the reason described in the previous section
			
 
				+    # You can omit this if you don't need to setup specialized sandbox
			
 
				+    workspace_dir_name = f'{instance.repository}__{instance.test_case_id[:10]}'.replace(
			
 
				+        '/', '__'
			
 
				+    )
			
 
				+    sandbox = BiocoderSSHBox.get_box_for_instance(
			
 
				+        instance,
			
 
				+        workspace_dir_name,
			
 
				+        skip_workspace_mount=False,
			
 
				+        workspace_mount_path=workspace_mount_path,
			
 
				+        sandbox_plugins=agenthub.Agent.get_cls(agent_class).sandbox_plugins,
			
 
				+    )
			
 
				+
			
 
				+    sandbox.remove_code()
			
 
				+
			
 
				+    # Prepare instruction
			
 
				+    instruction = (
			
 
				+        f'Please complete the function "{instance.signature}" in the file /workspace/{instance.repository.split("/")[1]}/{instance.filePath}.\n'
			
 
				+        f'The environment has been set up for you to start working. You may assume all necessary tools are installed.\n'
			
 
				+        f'To complete the task, you must directly modify the file and fill in the function, keeping in mind that the function signature is on line {instance.lineStart-1}\n\n'
			
 
				+        f'The function should do the following:\n'
			
 
				+        f'{instance.promptSummaryOnly}\n\n'
			
 
				+    )
			
 
				+
			
 
				+    instruction += (
			
 
				+        'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
			
 
				+        'You should NOT modify any other files other than the file intended. This means that you should NOT write any test cases.\n'
			
 
				+        'You may need context from other files in the repository to complete this task.'
			
 
				+        'Do NOT add any import statements or change anything else other than the writing the function body.\n'
			
 
				+        'You do not need to run the code to check if it works. \n'
			
 
				+        'Make sure to include proper formatting in Java and Python, including correct braces and/or indentation.\n'
			
 
				+    )
			
 
				+
			
 
				+    # instruction = (
			
 
				+    #     f'In the file {instance.filePath}, there is a function with a signature and without a body. Your job is to complete the function, according to the given instructions. When you complete the function, respond with the function body, and nothing else.'
			
 
				+    #     'The repository has cloned for you to start working. You are not allowed to run any bash commands, just modify the files. \n\n'
			
 
				+    #     '# Problem Statement\n'
			
 
				+    #     'Complete the following function signature:\n\n'
			
 
				+    #     f'{instance.signature}'
			
 
				+    #     'The function should do the following:\n\n'
			
 
				+    #     f'{instance.promptSummaryOnly}\n\n'
			
 
				+    # )
			
 
				+    #
			
 
				+    # instruction += (
			
 
				+    #     'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
			
 
				+    #     'You should NOT modify any other files other than the file intended. This means that you should NOT write any test cases.\n'
			
 
				+    #     'Do NOT add any import statements or change anything else other than the writing the function body.\n'
			
 
				+    #     'You do not need to run the code to check if it works. The system will automatically check the correctness of your code.\n'
			
 
				+    #     'Make sure to include proper formatting in Java and Python, including correct braces and/or indentation.\n'
			
 
				+    # )
			
 
				+
			
 
				+    # NOTE: You can actually set slightly different instructions for different agents
			
 
				+    instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
			
 
				+
			
 
				+    # Here's how you can run the agent (similar to the `main` function) and get the final task state
			
 
				+    state: State = asyncio.run(
			
 
				+        main(
			
 
				+            instruction,
			
 
				+            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(agent_class),
			
 
				+            sandbox=sandbox,
			
 
				+        )
			
 
				+    )
			
 
				+
			
 
				+    test_result = get_test_result(instance, sandbox, workspace_dir_name)
			
 
				+
			
 
				+    if state is None:
			
 
				+        raise ValueError('State should not be None.')
			
 
				+    metrics = state.metrics.get() if state.metrics else None
			
 
				+
			
 
				+    # Save the output
			
 
				+    output = {
			
 
				+        'test_case_id': instance.test_case_id,
			
 
				+        'biocoder_instance': instance.to_dict(),
			
 
				+        'instruction': instruction,
			
 
				+        'generated': test_result['metadata']['1_copy_change_code'],
			
 
				+        'metadata': metadata,
			
 
				+        'history': [
			
 
				+            (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
			
 
				+        ],
			
 
				+        'metrics': metrics,
			
 
				+        'error': state.error if state and state.error else None,
			
 
				+        'test_result': test_result,
			
 
				+    }
			
 
				+
			
 
				+    # Close the sandbox
			
 
				+    sandbox.close()
			
 
				+    return output
			
 
				+
			
 
				+
			
 
				+if __name__ == '__main__':
			
 
				+    # NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
			
 
				+    # so we don't need to manage file uploading to OpenDevin's repo
			
 
				+    dataset = load_dataset('lilbillbiscuit/biocoder_public')
			
 
				+    biocoder_tests = dataset['test'].to_pandas()
			
 
				+
			
 
				+    # Check https://github.com/OpenDevin/OpenDevin/blob/main/evaluation/swe_bench/README.md#configure-opendevin-and-your-llm
			
 
				+    # for details of how to set `llm_config`
			
 
				+    if args.llm_config:
			
 
				+        specified_llm_config = get_llm_config_arg(args.llm_config)
			
 
				+        if specified_llm_config:
			
 
				+            config.llm = specified_llm_config
			
 
				+    logger.info(f'Config for evaluation: {config}')
			
 
				+
			
 
				+    # TEST METADATA
			
 
				+    agent_class = args.agent_cls
			
 
				+    assert (
			
 
				+        agent_class in AGENT_CLS_TO_FAKE_USER_RESPONSE_FN
			
 
				+    ), f'Unsupported agent class: {agent_class}'
			
 
				+    model_name = config.llm.model.split('/')[-1]
			
 
				+    max_iterations = args.max_iterations
			
 
				+    eval_note = ''
			
 
				+    if args.eval_note is not None:
			
 
				+        eval_note += '_N_' + args.eval_note
			
 
				+    eval_output_dir = os.path.join(
			
 
				+        args.eval_output_dir,
			
 
				+        'biocoder',
			
 
				+        agent_class,
			
 
				+        model_name + '_maxiter_' + str(max_iterations) + eval_note,
			
 
				+    )
			
 
				+
			
 
				+    eval_output_dir = str(eval_output_dir)
			
 
				+
			
 
				+    pathlib.Path(eval_output_dir).mkdir(parents=True, exist_ok=True)
			
 
				+    pathlib.Path(os.path.join(eval_output_dir, 'logs')).mkdir(
			
 
				+        parents=True, exist_ok=True
			
 
				+    )
			
 
				+    logger.info(f'Using evaluation output directory: {eval_output_dir}')
			
 
				+
			
 
				+    metadata = {
			
 
				+        'agent_class': agent_class,
			
 
				+        'model_name': model_name,
			
 
				+        'max_iterations': max_iterations,
			
 
				+        'eval_output_dir': eval_output_dir,
			
 
				+        'start_time': time.strftime('%Y-%m-%d %H:%M:%S'),
			
 
				+        # get the commit id of the current repo for reproducibility
			
 
				+        'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD'])
			
 
				+        .decode('utf-8')
			
 
				+        .strip(),
			
 
				+    }
			
 
				+    logger.info(f'Metadata: {metadata}')
			
 
				+    with open(os.path.join(eval_output_dir, 'metadata.json'), 'w') as f:
			
 
				+        json.dump(metadata, f)
			
 
				+
			
 
				+    # LIMIT EVALUATION
			
 
				+    eval_n_limit = args.eval_n_limit
			
 
				+    if eval_n_limit:
			
 
				+        biocoder_tests = biocoder_tests.head(eval_n_limit)
			
 
				+        logger.info(f'Limiting evaluation to first {eval_n_limit} instances.')
			
 
				+
			
 
				+    # OUTPUT FILE
			
 
				+    output_file = os.path.join(eval_output_dir, 'output.jsonl')
			
 
				+    logger.info(f'Writing evaluation output to {output_file}')
			
 
				+    finished_test_case_ids = set()
			
 
				+    if os.path.exists(output_file):
			
 
				+        with open(output_file, 'r') as f:
			
 
				+            for line in f:
			
 
				+                data = json.loads(line)
			
 
				+                finished_test_case_ids.add(data['test_case_id'])
			
 
				+        logger.warning(
			
 
				+            f'Output file {output_file} already exists. Loaded {len(finished_test_case_ids)} finished instances.'
			
 
				+        )
			
 
				+    output_fp = open(output_file, 'a')
			
 
				+
			
 
				+    logger.info(
			
 
				+        f'Evaluation started with Agent {agent_class}, model {model_name}, max iterations {max_iterations}.'
			
 
				+    )
			
 
				+
			
 
				+    # =============================================
			
 
				+    # filter out finished instances
			
 
				+    new_biocoder_tests = []
			
 
				+    for idx, instance in biocoder_tests.iterrows():
			
 
				+        if instance.test_case_id in finished_test_case_ids:
			
 
				+            logger.info(
			
 
				+                f'Skipping instance {instance.test_case_id} as it is already finished.'
			
 
				+            )
			
 
				+            continue
			
 
				+        new_biocoder_tests.append(instance)
			
 
				+
			
 
				+    biocoder_tests = pd.DataFrame(new_biocoder_tests)
			
 
				+    logger.info(
			
 
				+        f'Finished instances: {len(finished_test_case_ids)}, Remaining instances: {len(biocoder_tests)}'
			
 
				+    )
			
 
				+    # =============================================
			
 
				+
			
 
				+    pbar = tqdm(total=len(biocoder_tests))
			
 
				+
			
 
				+    # This function tracks the progress AND writes the output to a JSONL file
			
 
				+    def update_progress(future):
			
 
				+        pbar.update(1)
			
 
				+        output = future.result()
			
 
				+        pbar.set_description(f'Instance {output["test_case_id"]}')
			
 
				+        pbar.set_postfix_str(f'Test Result: {output["test_result"]}')
			
 
				+        logger.info(
			
 
				+            f'Finished evaluation for instance {output["test_case_id"]}: {output["test_result"]}'
			
 
				+        )
			
 
				+        output_fp.write(json.dumps(output) + '\n')
			
 
				+        output_fp.flush()
			
 
				+
			
 
				+    # This sets up multi-processing
			
 
				+    num_workers = args.eval_num_workers
			
 
				+    logger.info(f'Using {num_workers} workers for evaluation.')
			
 
				+
			
 
				+    # This is SWE-Bench specific - CodeActAgent doesn't require mounted workspace to work
			
 
				+    skip_workspace_mount = agent_class == 'CodeActAgent'
			
 
				+    logger.info(f'Skipping workspace mount: {skip_workspace_mount}')
			
 
				+
			
 
				+    try:
			
 
				+        with ProcessPoolExecutor(num_workers) as executor:
			
 
				+            futures = []
			
 
				+            # This is how we perform multi-processing
			
 
				+            for row_idx, instance in biocoder_tests.iterrows():
			
 
				+                future = executor.submit(
			
 
				+                    process_instance,
			
 
				+                    instance,
			
 
				+                    agent_class,
			
 
				+                    metadata,
			
 
				+                    skip_workspace_mount,
			
 
				+                    eval_output_dir,
			
 
				+                    reset_logger=bool(num_workers > 1),
			
 
				+                )
			
 
				+                future.add_done_callback(update_progress)
			
 
				+                futures.append(future)
			
 
				+
			
 
				+            # Wait for all futures to complete
			
 
				+            for future in futures:
			
 
				+                future.result()
			
 
				+    except KeyboardInterrupt:
			
 
				+        print('KeyboardInterrupt received. Cleaning up...')
			
 
				+        cleanup()
			
 
				+
			
 
				+    output_fp.close()
			
 
				+    logger.info('Evaluation finished.')
			
--- a/evaluation/biocoder/scripts/run_infer.sh
+++ b/evaluation/biocoder/scripts/run_infer.sh
@@ -0,0 +1,37 @@
 
				+#!/bin/bash
			
 
				+MODEL_CONFIG=$1
			
 
				+AGENT=$2
			
 
				+EVAL_LIMIT=$3
			
 
				+DATASET="biocoder"
			
 
				+
			
 
				+
			
 
				+if [ -z "$AGENT" ]; then
			
 
				+  echo "Agent not specified, use default CodeActAgent"
			
 
				+  AGENT="CodeActAgent"
			
 
				+fi
			
 
				+
			
 
				+# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenDevin
			
 
				+# We need to track the version of Agent in the evaluation to make sure results are comparable
			
 
				+AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
			
 
				+
			
 
				+echo "AGENT: $AGENT"
			
 
				+echo "AGENT_VERSION: $AGENT_VERSION"
			
 
				+echo "MODEL_CONFIG: $MODEL_CONFIG"
			
 
				+echo "DATASET: $DATASET"
			
 
				+
			
 
				+COMMAND="poetry run python evaluation/biocoder/run_infer.py \
			
 
				+  --agent-cls $AGENT \
			
 
				+  --llm-config $MODEL_CONFIG \
			
 
				+  --max-iterations 10 \
			
 
				+  --max-chars 10000000 \
			
 
				+  --eval-num-workers 1 \
			
 
				+  --eval-note ${AGENT_VERSION}_${DATASET}"
			
 
				+
			
 
				+if [ -n "$EVAL_LIMIT" ]; then
			
 
				+  echo "EVAL_LIMIT: $EVAL_LIMIT"
			
 
				+  COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
			
 
				+fi
			
 
				+
			
 
				+# Run the command
			
 
				+echo $COMMAND
			
 
				+eval $COMMAND
			
--- a/evaluation/swe_bench/run_infer.py
+++ b/evaluation/swe_bench/run_infer.py
@@ -471,7 +471,7 @@ if __name__ == '__main__':
 
				     def update_progress(future):
			
 
				         pbar.update(1)
			
 
				         output = future.result()
			
 
				-        pbar.set_description(f'Instance {output["instance_id"]}')
			
 
				+        pbar.set_description(f'Instance {output["instance_id"][:10]}')
			
 
				         pbar.set_postfix_str(f'Test Result: {output["test_result"]["result"]}')
			
 
				         logger.info(
			
 
				             f'Finished evaluation for instance {output["instance_id"]}: {output["test_result"]["result"]}'