
Support MINT benchmark (MATH, GSM8K subset) (#1955)

* setup boilerplate and README

* setup test script and load dataset

* add temporary integration that works

* refactor code

* add solution evaluation through 'fake_user_response_fn'

* finish integrating MATH subset

* Update evaluation/mint/run_infer.py

* Update evaluation/mint/run_infer.sh

* Update opendevin/core/main.py

* remove redundant templates, add eval_note, update README

* use <execute_ipython> tag instead of <execute>

* hardcode AGENT option for run_infer.sh

* Update evaluation/mint/task.py

Co-authored-by: Yufan Song <33971064+yufansong@users.noreply.github.com>

* fix: no message returned when the task succeeds

* change message to make the agent exit

* import bash abstractmethod

* install all required packages inside sandbox before the agent runs, adjust prompt

* add subset eval folder separation and test for gsm8k

* fix bug in Reasoning task result check, add requirements.txt

* Fix syntax error in evaluation/mint/run_infer.py

* update README, add default values for `SUBSET` and `EVAL_LIMIT`

---------

Co-authored-by: Yufan Song <33971064+yufansong@users.noreply.github.com>
Co-authored-by: yufansong <yufan@risingwave-labs.com>
Co-authored-by: Boxuan Li <liboxuan@connect.hku.hk>
Ryan H. Tran 1 year ago
Parent
Commit
9434bcce48

+ 1 - 0
evaluation/README.md

@@ -16,6 +16,7 @@ all the preprocessing/evaluation/analysis scripts.
 - HumanEvalFix: [`evaluation/humanevalfix`](./humanevalfix)
 - GAIA: [`evaluation/gaia`](./gaia)
 - Entity deduction Arena (EDA): [`evaluation/EDA`](./EDA)
+- MINT: [`evaluation/mint`](./mint)
 
 ### Result Visualization
 

+ 1 - 0
evaluation/mint/.gitignore

@@ -0,0 +1 @@
+!requirements.txt

+ 45 - 0
evaluation/mint/README.md

@@ -0,0 +1,45 @@
+# MINT Benchmark
+
+This folder contains the evaluation harness for the [MINT benchmark](https://arxiv.org/abs/2309.10691), which evaluates LLMs' ability to solve tasks with multi-turn interactions.
+
+## Configure OpenDevin and LM
+
+If it does not already exist, create a `config.toml` file at the root of the workspace. Please check [README.md](../../README.md) for how to set this up.
+
+## Start the evaluation
+
+We are using the MINT dataset hosted on [Hugging Face](https://huggingface.co/datasets/ryanhoangt/xingyaoww-mint-bench).
+
+The following is the basic command to start the evaluation. Currently, the only agent supported for MINT is `CodeActAgent`.
+
+```bash
+./evaluation/mint/scripts/run_infer.sh [model_config] [subset] [eval_limit]
+```
+
+where `model_config` is mandatory, while `subset` and `eval_limit` are optional.
+
+- `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your LLM settings, as defined in your `config.toml`.
+
+- `subset`, e.g. `math`, is the subset of the MINT benchmark to evaluate on, defaulting to `math`.
+
+- `eval_limit`, e.g. `2`, limits the evaluation to the first `eval_limit` instances, defaulting to all instances.
+
+Note: in order to use `eval_limit`, you must also set `subset`.
+
+Let's say you'd like to run 3 instances on the `gsm8k` subset using `eval_gpt4_1106_preview`,
+then your command would be:
+
+```bash
+./evaluation/mint/scripts/run_infer.sh eval_gpt4_1106_preview gsm8k 3
+```
+## Reference
+```
+@misc{wang2024mint,
+    title={MINT: Evaluating LLMs in Multi-turn Interaction with Tools and Language Feedback},
+    author={Xingyao Wang and Zihan Wang and Jiateng Liu and Yangyi Chen and Lifan Yuan and Hao Peng and Heng Ji},
+    year={2024},
+    eprint={2309.10691},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL}
+}
+```
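
The dataset referenced in the README can be inspected directly with the same Hugging Face call that `run_infer.py` uses; a minimal sketch (the field names follow what `ReasoningTask(**instance)` expects):

```python
# Minimal sketch: peek at the MINT subset the harness evaluates on.
# Same dataset name and call as in evaluation/mint/run_infer.py.
from datasets import load_dataset

ds = load_dataset('ryanhoangt/xingyaoww-mint-bench', name='math', split='test')
print(len(ds))
print(ds[0])  # each instance should carry 'id', 'prompt', and 'reference' fields
```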

+ 5 - 0
evaluation/mint/config_variables.py

@@ -0,0 +1,5 @@
+TASK_INFO_MAP = {
+    # === Reasoning ===
+    'gsm8k': {'class': 'ReasoningTask', 'type': 'reasoning'},
+    'math': {'class': 'ReasoningTask', 'type': 'reasoning'},
+}
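
`TASK_INFO_MAP` maps each supported subset to its task class name. `run_infer.py` currently hardcodes `ReasoningTask`, so the following is only a sketch of how the map could drive class resolution, assuming it runs from inside `evaluation/mint` (the harness relies on the same flat imports):

```python
# Illustrative sketch (not part of the diff): resolve the Task subclass for a
# subset via TASK_INFO_MAP instead of hardcoding ReasoningTask.
import task as task_module
from config_variables import TASK_INFO_MAP


def resolve_task_class(subset: str):
    info = TASK_INFO_MAP[subset]                # e.g. {'class': 'ReasoningTask', 'type': 'reasoning'}
    return getattr(task_module, info['class'])  # -> task.ReasoningTask


print(resolve_task_class('gsm8k').__name__)  # ReasoningTask
```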

+ 82 - 0
evaluation/mint/datatypes.py

@@ -0,0 +1,82 @@
+import enum
+from typing import Any, Dict, Optional, Tuple
+
+
+class TaskState:
+    def __init__(
+        self,
+        finished: bool = False,
+        success: bool = False,
+        agent_action_count: Optional[Dict[str, int]] = None,
+        terminate_reason: Optional[str] = None,
+        latest_output: Optional[Dict[str, Any]] = None,
+    ):
+        self.finished = finished
+        self.success = success
+        self.agent_action_count: Dict[str, int] = agent_action_count or {
+            'propose_solution': 0,
+            'use_tool': 0,
+            'invalid_action': 0,
+        }
+        self.terminate_reason = terminate_reason
+        self.latest_output = latest_output
+
+    def to_dict(self) -> Dict[str, Any]:
+        return {
+            'finished': self.finished,
+            'success': self.success,
+            'agent_action_count': self.agent_action_count,
+            'terminate_reason': self.terminate_reason,
+            'latest_output': self.latest_output,
+        }
+
+
+class ParseError(Exception):
+    pass
+
+
+class FeedbackType(enum.Enum):
+    FEEDBACK_WITH_GT = 'feedback_with_gt'
+    FEEDBACK_WO_GT = 'feedback_wo_gt'
+    NO_FEEDBACK = 'no_feedback'
+
+
+class StepOutput:
+    def __init__(
+        self,
+        observation: Optional[str] = None,
+        success: bool = False,
+        extra: Optional[Dict[str, Any]] = None,
+        turn_info: Optional[Tuple[int, int]] = None,
+    ):
+        self.observation: str = observation
+        self.success: bool = success
+        self.extra: Dict[str, Any] = extra
+        self.turn_info = turn_info
+
+    def __repr__(self) -> str:
+        return self.observation
+
+    def to_str(self) -> str:
+        output = 'Observation:\n'
+        if self.observation is not None:
+            output += self.observation + '\n'
+        else:
+            if not self.success:
+                output += 'Your answer is wrong.\n'
+
+        if self.turn_info is not None:
+            n_steps_left, n_propose_solution_left = self.turn_info
+            output += 'You have {} steps left and {} chances to propose solution left.\n'.format(
+                n_steps_left, n_propose_solution_left
+            )
+            if n_steps_left <= 1:
+                output += 'You should take the last step to propose a solution.\n'
+
+        return output
+
+    def to_dict(self) -> Dict[str, Any]:
+        return {
+            'observation': self.observation,
+            'success': self.success,
+        }
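
To see the exact feedback string a wrong proposal produces, here is a minimal sketch that builds a `StepOutput` the same way `SimplifiedEnv.step` does:

```python
# Minimal sketch: the observation sent back to the agent after a wrong answer,
# with 3 steps and 1 solution proposal remaining.
from datatypes import StepOutput

out = StepOutput(observation=None, success=False, turn_info=(3, 1))
print(out.to_str())
# Observation:
# Your answer is wrong.
# You have 3 steps left and 1 chances to propose solution left.
```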

+ 119 - 0
evaluation/mint/env.py

@@ -0,0 +1,119 @@
+import re
+import traceback
+from typing import Dict, Optional
+
+from datatypes import ParseError, StepOutput, TaskState
+from task import Task
+
+from opendevin.controller.state.state import State
+
+
+class SimplifiedEnv:
+    INVALID_INPUT_MESSAGE = (
+        "I don't understand your input. \n"
+        'If you want to execute code, please use <execute_ipython> YOUR_CODE_HERE </execute_ipython>.\n'
+        'If you want to give me an answer, please use <solution> YOUR_SOLUTION_HERE </solution>.\n'
+        'For example: The answer to the question is <solution> 42 </solution>. \n'
+    )
+
+    def __init__(self, agent_state: State, task: Task, task_config: Dict[str, int]):
+        self.agent_state = agent_state
+        self.task = task
+        self.task_state = TaskState()
+        self.task_config = task_config
+
+    def step(self, lm_message: str):
+        observation = self.handle_propose_solution(lm_message)
+
+        self.check_max_iteration()
+
+        turn_info = (
+            self.task_config['max_iterations'] - self.agent_state.iteration,
+            self.task_config['max_propose_solution']
+            - self.task_state.agent_action_count['propose_solution'],
+        )
+
+        output = StepOutput(
+            observation=observation,
+            success=self.task_state.success,
+            turn_info=turn_info,
+        )
+
+        self.log_output(output)
+        return self.task_state
+
+    def handle_propose_solution(self, lm_message: str) -> Optional[str]:
+        """Propose answer to check the task success.
+
+        It might set self.state.finished = True if the task is successful.
+        """
+        self.task_state.agent_action_count['propose_solution'] += 1
+        try:
+            parsed = self.parse_propose_solution(lm_message)
+            task_success = self.check_task_success(parsed['answer'])
+            if task_success:
+                self.task_state.finished = True
+                self.task_state.success = True
+                self.task_state.terminate_reason = 'task_success'
+                # NOTE: we should not return from this function yet, because we still need to log the output;
+                # setting state.finished = True will terminate the episode
+        except ParseError:
+            return SimplifiedEnv.INVALID_INPUT_MESSAGE
+        except Exception:
+            error_traceback = traceback.format_exc()
+            return f'{error_traceback}'
+
+    def parse_propose_solution(self, lm_message: str) -> dict:
+        """Define the parsing logic."""
+        lm_output = '\n' + lm_message + '\n'
+
+        answer = '\n'.join(
+            [
+                i.strip()
+                for i in re.findall(r'<solution>(.*?)</solution>', lm_output, re.DOTALL)
+            ]
+        )
+        if answer == '':
+            raise ParseError('No answer found.')
+
+        return {'answer': answer}
+
+    def log_output(self, output: StepOutput) -> None:
+        if self.task_state.finished:
+            return
+
+        content = output.to_str()
+        # self.state.history.append({"role": "user", "content": content})
+        self.task_state.latest_output = output.to_dict()
+        self.task_state.latest_output['content'] = content
+
+    def check_task_success(self, answer: str) -> bool:
+        # log_message.info(f"STUDENT ANSWER: [{answer}]")
+        # log_message.info(f"REFERENCE ANSWER: [{self.task.reference}]")
+        return self.task.success(answer)
+
+    def check_max_iteration(self):
+        """Check if the agent has reached the max iteration limit.
+
+        It might set self.state.finished = True if the agent has reached the max iteration limit.
+        """
+        if self.task_state.finished:
+            # ignore if the episode is already finished (e.g., task success)
+            return
+
+        if (
+            # the number of proposed solutions has reached max_propose_solution
+            self.task_state.agent_action_count['propose_solution']
+            >= self.task_config['max_propose_solution']
+        ):
+            self.task_state.finished = True
+            self.task_state.success = False
+            self.task_state.terminate_reason = 'max_propose_steps'
+        elif (
+            # total actions (propose_solution + use_tool + invalid_action) have reached max_iterations
+            sum(self.task_state.agent_action_count.values())
+            >= self.task_config['max_iterations']
+        ):
+            self.task_state.finished = True
+            self.task_state.success = False
+            self.task_state.terminate_reason = 'max_iterations'
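
The `<solution>` extraction is a plain regex over the model's message. Since constructing `SimplifiedEnv` needs a live agent `State`, here is a standalone sketch that mirrors `parse_propose_solution`:

```python
# Standalone sketch mirroring SimplifiedEnv.parse_propose_solution: pull the
# answer out of <solution>...</solution> tags, or raise ParseError if absent.
import re

from datatypes import ParseError


def parse_propose_solution(lm_message: str) -> dict:
    lm_output = '\n' + lm_message + '\n'
    answer = '\n'.join(
        i.strip()
        for i in re.findall(r'<solution>(.*?)</solution>', lm_output, re.DOTALL)
    )
    if answer == '':
        raise ParseError('No answer found.')
    return {'answer': answer}


print(parse_propose_solution('The answer is <solution> 42 </solution>.'))  # {'answer': '42'}
```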

+ 48 - 0
evaluation/mint/in_context_examples/reasoning/with_tool.txt

@@ -0,0 +1,48 @@
+Task:
+Evaluate $\int\int_C (x+y)^2 dx,dy$, where C is the circle defined by $x^2 + y^2 = M$, and M is the birth year of Li Bai.
+Output format requirement: Round to three decimal places, using half up rounding rule.
+
+Assistant:
+<thought> To solve this problem, let's start by determining Li Bai's birth year. </thought>
+<execute_ipython> print(wikipedia_search('Li Bai')) </execute_ipython>
+
+Observation:
+Page: Li Bai
+Summary: Li Bai (Chinese: 李白; pinyin: Lǐ Bái, 701–762), also pronounced as Li Bo, courtesy name Taibai (Chinese: 太白), was a Chinese poet, acclaimed from his own time to the present as a brilliant and romantic figure who took traditional poetic forms to new heights.
+
+Assistant:
+<thought>
+From the wiki_search result, we can see that Li Bai was born in 701, thus M = 701.
+To evaluate the double integral of $(x+y)^2$ over the region $C$, where $C$ is the circle defined by $x^2 + y^2 = 701$, we should use polar coordinates.
+In polar coordinates, $x = r\cos(\theta)$ and $y = r\sin(\theta)$, and the differential area element $dx,dy$ becomes $r,dr,d\theta$.
+$\int\int_C (x+y)^2 dx,dy = \int_0^{2\pi}\int_0^{\sqrt{701}} (r\cos(\theta) + r\sin(\theta))^2r,dr,d\theta$
+Let's use the SymPy package to calculate this.
+</thought>
+<execute_ipython>
+from sympy import symbols, integrate, cos, sin, pi
+
+r, theta = symbols('r theta')
+
+# the integrand
+integrand = (r*cos(theta) + r*sin(theta))**2 * r
+
+# perform the double integration
+double_integral = integrate(integrate(integrand, (r, 0, 701 ** 0.5)), (theta, 0, 2*pi))
+print(double_integral)
+</execute_ipython>
+
+Observation:
+Out[2]: 245700.5*pi
+
+Assistant:
+<thought> Let's round this result to three decimal places using half up rounding rule. </thought>
+<execute_ipython>
+import math
+print(round(245700.5*math.pi, 3))
+</execute_ipython>
+
+Observation:
+771890.886
+
+Assistant:
+The answer is <solution> 771890.886 </solution>.

+ 25 - 0
evaluation/mint/prompts/__init__.py

@@ -0,0 +1,25 @@
+import os
+
+from utils import load_file
+
+PROMPT_DIR = os.path.dirname(__file__)
+TEMPLATE_WITH_TOOL = load_file(os.path.join(PROMPT_DIR, 'template_with_tool.txt'))
+
+
+class PromptTemplate:
+    """A prompt template."""
+
+    def __init__(self, template: str):
+        self.template: str = template
+
+    def __call__(self, **kwargs) -> str:
+        return self.template.format(**kwargs)
+
+
+class ToolPromptTemplate(PromptTemplate):
+    def __init__(self, use_tool: bool):
+        if use_tool:
+            template = TEMPLATE_WITH_TOOL
+        else:
+            raise NotImplementedError('Evaluation without tool is not supported yet.')
+        super().__init__(template)
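
`ToolPromptTemplate` is a thin wrapper over `str.format`, so building the instruction boils down to filling four named fields. A sketch with placeholder values (the real ones come from `run_infer.py` and the in-context example file):

```python
# Illustrative sketch: render the agent instruction. The values below are
# placeholders; run_infer.py supplies the real metadata and example text.
from prompts import ToolPromptTemplate

instruction = ToolPromptTemplate(use_tool=True)(
    max_total_steps=5,
    max_propose_solution=2,
    in_context_example='<contents of in_context_examples/reasoning/with_tool.txt>',
    task_prompt='Task:\nWhat is 2 + 2?',
)
print(instruction)
```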

+ 19 - 0
evaluation/mint/prompts/template_with_tool.txt

@@ -0,0 +1,19 @@
+You are a helpful assistant assigned with the task of problem-solving.
+To solve the task, you can only interact with the interactive Python (Jupyter Notebook) environment using <execute_ipython> tag. Other tools cannot be used.
+At each turn, you should first provide your step-by-step thinking for solving the task. Your thought process should be enclosed using "<thought>" tag, for example: <thought> I need to print "Hello World!" </thought>.
+
+After that, you have two options:
+1) Interact with a Python programming environment and receive the corresponding output.
+2) Directly provide a solution by sending your answer to user through message that adheres to the required format for the given task. Your solution should be enclosed using "<solution>" tag, for example: The answer is <solution> A </solution>.
+Either you choose to interact with the Python environment or provide a solution, you need to send a message to the user to evaluate your response and provide feedback.
+
+You have {max_total_steps} chances to interact with the environment or propose a solution. You can only propose a solution {max_propose_solution} times.
+
+---
+
+{in_context_example}
+
+---
+
+# Problem statement:
+{task_prompt}

+ 32 - 0
evaluation/mint/requirements.txt

@@ -0,0 +1,32 @@
+pre-commit
+openai
+datasets
+backoff
+charset-normalizer==3.1.0
+# Alfworld
+pandas==1.4.4
+opencv-python
+networkx
+tqdm
+vocab
+revtok
+Click
+ai2thor==2.1.0
+transformers
+tokenizers
+scipy==1.10.1
+ipython
+matplotlib
+cython
+nltk
+gym==0.15.4
+pipreqs
+pyyaml
+pytz
+visdom
+sympy
+pycocotools
+seaborn
+google-generativeai
+python-dateutil
+statsmodels

+ 357 - 0
evaluation/mint/run_infer.py

@@ -0,0 +1,357 @@
+import asyncio
+import functools
+import json
+import logging
+import multiprocessing as mp
+import os
+import pathlib
+import subprocess
+import time
+from concurrent.futures import ProcessPoolExecutor
+from typing import Dict
+
+from datasets import load_dataset
+from datatypes import TaskState
+from env import SimplifiedEnv
+from prompts import ToolPromptTemplate
+from task import ReasoningTask, Task
+from tqdm import tqdm
+
+from evaluation.swe_bench.swe_env_box import DockerSSHBox
+from opendevin.controller.state.state import State
+from opendevin.core.config import config, get_llm_config_arg, get_parser
+from opendevin.core.logger import get_console_handler
+from opendevin.core.logger import opendevin_logger as logger
+from opendevin.core.main import main
+from opendevin.events.serialization.event import event_to_dict
+
+
+def cleanup():
+    print('Cleaning up child processes...')
+    for process in mp.active_children():
+        print(f'Terminating child process: {process.name}')
+        process.terminate()
+        process.join()
+
+
+def codeact_user_response(state: State, task: Task, task_config: Dict[str, int]):
+    logger.info(f'Gold reference: {task.reference}')
+    logger.info(f'Task config: {task_config}')
+
+    env = SimplifiedEnv(
+        agent_state=state,
+        task=task,
+        task_config=task_config,
+    )
+    last_action, _ = state.history[-1]
+    result_state: TaskState = env.step(last_action.message)
+    state.task_state = result_state
+
+    if not result_state.latest_output:
+        if result_state.success:
+            msg = 'Your answer is correct. Please EXIT using the following command: <execute_bash> exit </execute_bash>.'
+        else:
+            msg = 'Something went wrong! No output from the model.'
+    else:
+        msg = result_state.latest_output['content']
+
+    logger.info('User response: ' + msg)
+    return msg
+
+
+def monologue_user_response(state: State) -> str:
+    raise NotImplementedError('MonologueAgent should never ask for user responses.')
+
+
+AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
+    'CodeActAgent': codeact_user_response,
+    'MonologueAgent': monologue_user_response,
+}
+
+AGENT_CLS_TO_INST_SUFFIX = {
+    'CodeActAgent': '\nIMPORTANT: When your answer is confirmed by the user to be correct, you can exit using the following command: <execute_bash> exit </execute_bash>.\n'
+}
+
+
+def process_instance(
+    instance: Task,
+    agent_class,
+    metadata,
+    skip_workspace_mount,
+    eval_output_dir,
+    reset_logger: bool = True,
+):
+    workspace_mount_path = os.path.join(config.workspace_mount_path, '_eval_workspace')
+    # create process-specific workspace dir
+    # if `not skip_workspace_mount` - we will create a workspace directory for EACH process
+    # so that different agents don't interfere with each other.
+    if not skip_workspace_mount:
+        workspace_mount_path = os.path.join(workspace_mount_path, str(os.getpid()))
+        pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
+
+    # Set up the logger properly, so you can run multi-processing to parallelize the evaluation
+    if reset_logger:
+        # Set up logger
+        log_file = os.path.join(
+            eval_output_dir, 'logs', f'instance_{instance.task_id}.log'
+        )
+        # Remove all existing handlers from logger
+        for handler in logger.handlers[:]:
+            logger.removeHandler(handler)
+        # add back the console handler to print ONE line
+        logger.addHandler(get_console_handler())
+        logger.info(
+            f'Starting evaluation for instance {instance.task_id}.\nHint: run "tail -f {log_file}" to see live logs in a separate shell'
+        )
+        # Remove all existing handlers from logger
+        for handler in logger.handlers[:]:
+            logger.removeHandler(handler)
+        file_handler = logging.FileHandler(log_file)
+        file_handler.setFormatter(
+            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
+        )
+        logger.addHandler(file_handler)
+
+    if not skip_workspace_mount:
+        logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
+
+    sandbox = DockerSSHBox()
+
+    requirements_host_src = 'evaluation/mint/requirements.txt'
+    requirements_sandbox_dest = '/opendevin/plugins/mint/requirements.txt'
+    sandbox.copy_to(
+        host_src=requirements_host_src,
+        sandbox_dest=requirements_sandbox_dest,
+        recursive=False,
+    )
+    logger.info(
+        f'Copied files from [{requirements_host_src}] to [{requirements_sandbox_dest}] inside sandbox.'
+    )
+    exit_code, output = sandbox.execute(f'pip install -r {requirements_sandbox_dest}')
+
+    # Prepare instruction
+    instruction = ToolPromptTemplate(use_tool=True)(
+        max_total_steps=metadata['max_iterations'],
+        max_propose_solution=metadata['max_propose_solution'],
+        in_context_example=instance.in_context_example(
+            use_tool=True, with_feedback=False
+        ),
+        task_prompt='Task:\n' + instance.prompt,
+    )
+    instruction += 'IMPORTANT: You should ONLY interact with the environment provided to you or provide the solution inside <solution> tag AND NEVER ASK FOR HUMAN HELP.\n'
+
+    # NOTE: You can actually set slightly different instruction for different agents
+    instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
+
+    # Here's how you can run the agent (similar to the `main` function) and get the final task state
+    fake_user_response_fn = functools.partial(
+        AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(agent_class),
+        task=instance,
+        task_config={
+            'max_iterations': metadata['max_iterations'],
+            'max_propose_solution': metadata['max_propose_solution'],
+        },
+    )
+
+    state: State = asyncio.run(
+        main(
+            instruction,
+            fake_user_response_fn=fake_user_response_fn,
+            sandbox=sandbox,
+        )
+    )
+
+    if state is None:
+        raise ValueError('State should not be None.')
+
+    logger.info('Msgs: ' + str(state.history))
+
+    task_state: TaskState = state.task_state
+    logger.info('Task state: ' + str(task_state.to_dict()))
+
+    # Save the output
+    output = {
+        'id': instance.task_id,
+        'instance': instance.to_dict(),
+        'instruction': instruction,
+        'metadata': metadata,
+        'history': [
+            (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
+        ],
+        'error': state.error if state and state.error else None,
+        'test_result': task_state.success,
+    }
+
+    # Close the sandbox
+    sandbox.close()
+
+    return output
+
+
+if __name__ == '__main__':
+    parser = get_parser()
+
+    parser.add_argument(
+        '--subset',
+        default='math',
+        choices=['math', 'gsm8k'],
+        type=str,
+        help='subset of the dataset to be used',
+    )
+    parser.add_argument(
+        '--max-propose-solution',
+        default=2,
+        type=int,
+        help='maximum number of times the agent can propose a solution',
+    )
+
+    args, _ = parser.parse_known_args()
+
+    # NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
+    # so we don't need to manage file uploading to OpenDevin's repo
+    mint_dataset = load_dataset(
+        'ryanhoangt/xingyaoww-mint-bench', name=args.subset, split='test'
+    )
+    logger.info(f'Evaluating MINT - {args.subset} subset')
+
+    # Check https://github.com/OpenDevin/OpenDevin/blob/main/evaluation/swe_bench/README.md#configure-opendevin-and-your-llm
+    # for details of how to set `llm_config`
+    if args.llm_config:
+        specified_llm_config = get_llm_config_arg(args.llm_config)
+        if specified_llm_config:
+            config.llm = specified_llm_config
+    logger.info(f'Config for evaluation: {config}')
+
+    # TEST METADATA
+    agent_class = args.agent_cls
+    assert (
+        agent_class in AGENT_CLS_TO_FAKE_USER_RESPONSE_FN
+    ), f'Unsupported agent class: {agent_class}'
+    model_name = config.llm.model.split('/')[-1]
+    max_iterations = args.max_iterations
+    eval_note = ''
+    if args.eval_note is not None:
+        eval_note += '_N_' + args.eval_note
+    eval_output_dir = os.path.join(
+        args.eval_output_dir,
+        'mint',
+        agent_class,
+        model_name + '_maxiter_' + str(max_iterations) + eval_note,
+        args.subset,
+    )
+
+    pathlib.Path(eval_output_dir).mkdir(parents=True, exist_ok=True)
+    pathlib.Path(os.path.join(eval_output_dir, 'logs')).mkdir(
+        parents=True, exist_ok=True
+    )
+    logger.info(f'Using evaluation output directory: {eval_output_dir}')
+
+    metadata = {
+        'agent_class': agent_class,
+        'model_name': model_name,
+        'max_iterations': max_iterations,
+        'max_propose_solution': args.max_propose_solution,
+        'eval_output_dir': eval_output_dir,
+        'start_time': time.strftime('%Y-%m-%d %H:%M:%S'),
+        # get the commit id of the current repo for reproducibility
+        'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD'])
+        .decode('utf-8')
+        .strip(),
+    }
+    logger.info(f'Metadata: {metadata}')
+    with open(os.path.join(eval_output_dir, 'metadata.json'), 'w') as f:
+        json.dump(metadata, f)
+
+    # LIMIT EVALUATION
+    eval_n_limit = args.eval_n_limit
+    if eval_n_limit:
+        mint_dataset = mint_dataset.select(range(eval_n_limit))
+        logger.info(f'Limiting evaluation to first {eval_n_limit} instances.')
+
+    # OUTPUT FILE
+    output_file = os.path.join(eval_output_dir, 'output.jsonl')
+    logger.info(f'Writing evaluation output to {output_file}')
+    finished_instance_ids = set()
+    if os.path.exists(output_file):
+        with open(output_file, 'r') as f:
+            for line in f:
+                data = json.loads(line)
+                finished_instance_ids.add(data['id'])
+        logger.warning(
+            f'Output file {output_file} already exists. Loaded {len(finished_instance_ids)} finished instances.'
+        )
+    output_fp = open(output_file, 'a')
+
+    logger.info(
+        f'Evaluation started with Agent {agent_class}, model {model_name}, max iterations {max_iterations}, max propose solution {args.max_propose_solution}.'
+    )
+
+    # =============================================
+    # filter out finished instances
+    task_class = ReasoningTask
+    new_mint_tests: list[ReasoningTask] = []
+    for instance in mint_dataset:
+        if instance['id'] in finished_instance_ids:
+            logger.info(
+                f'Skipping instance {instance["id"]} as it is already finished.'
+            )
+            continue
+        # convert to Task object
+        instance = ReasoningTask(**instance)
+        new_mint_tests.append(instance)
+
+    mint_dataset = new_mint_tests
+    logger.info(
+        f'Finished instances: {len(finished_instance_ids)}, Remaining instances: {len(mint_dataset)}'
+    )
+    # =============================================
+
+    pbar = tqdm(total=len(mint_dataset))
+
+    # This function tracks the progress AND writes the output to a JSONL file
+    def update_progress(future):
+        pbar.update(1)
+        output = future.result()
+        # logger.info('Output: ', output)
+        # pbar.set_description(f'Instance {output["instance_id"]}')
+        # pbar.set_postfix_str(f'Test Result: {output["test_result"]["result"]}')
+        # logger.info(
+        #     f'Finished evaluation for instance {output["instance_id"]}: {output["test_result"]["result"]}'
+        # )
+        output_fp.write(json.dumps(output) + '\n')
+        output_fp.flush()
+
+    # This sets the multi-processing
+    num_workers = args.eval_num_workers
+    logger.info(f'Using {num_workers} workers for evaluation.')
+
+    # As in the SWE-Bench harness, CodeActAgent doesn't require a mounted workspace to work
+    skip_workspace_mount = agent_class == 'CodeActAgent'
+    logger.info(f'Skipping workspace mount: {skip_workspace_mount}')
+
+    try:
+        with ProcessPoolExecutor(num_workers) as executor:
+            futures = []
+            # This is how we perform multi-processing
+            for instance in mint_dataset:
+                future = executor.submit(
+                    process_instance,
+                    instance,
+                    agent_class,
+                    metadata,
+                    skip_workspace_mount,
+                    eval_output_dir,
+                    reset_logger=bool(num_workers > 1),
+                )
+                future.add_done_callback(update_progress)
+                futures.append(future)
+
+            # Wait for all futures to complete
+            for future in futures:
+                future.result()
+    except KeyboardInterrupt:
+        print('KeyboardInterrupt received. Cleaning up...')
+        cleanup()
+
+    output_fp.close()
+    logger.info('Evaluation finished.')
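
Each finished instance is appended to `output.jsonl`, with `test_result` holding the boolean success flag, so a completed (or partial) run can be summarized in a few lines. The path below is a placeholder; use the directory printed in the "Writing evaluation output to ..." log line:

```python
# Sketch: compute accuracy from an output.jsonl written by run_infer.py.
import json

output_file = 'path/to/eval_output_dir/output.jsonl'  # placeholder, adjust to your run

with open(output_file) as f:
    results = [json.loads(line) for line in f]

solved = sum(1 for r in results if r['test_result'])
print(f'{solved}/{len(results)} solved')
```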

+ 37 - 0
evaluation/mint/scripts/run_infer.sh

@@ -0,0 +1,37 @@
+#!/bin/bash
+
+MODEL_CONFIG=$1
+SUBSET=$2
+EVAL_LIMIT=$3
+# Only 'CodeActAgent' is supported for MINT now
+AGENT="CodeActAgent"
+
+# We need to track the version of Agent in the evaluation to make sure results are comparable
+AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
+
+echo "AGENT: $AGENT"
+echo "AGENT_VERSION: $AGENT_VERSION"
+
+export PYTHONPATH=$(pwd)
+
+COMMAND="poetry run python ./evaluation/mint/run_infer.py \
+    --max-iterations 5 \
+    --max-propose-solution 2 \
+    --eval-note $AGENT_VERSION"
+
+if [ -n "$SUBSET" ]; then
+  echo "SUBSET: $SUBSET"
+  COMMAND="$COMMAND --subset $SUBSET"
+# otherwise default to the math subset
+else
+  echo "SUBSET: math"
+  COMMAND="$COMMAND --subset math"
+fi
+
+if [ -n "$EVAL_LIMIT" ]; then
+  echo "EVAL_LIMIT: $EVAL_LIMIT"
+  COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
+fi
+
+# Run the command
+eval $COMMAND

+ 121 - 0
evaluation/mint/task.py

@@ -0,0 +1,121 @@
+import json
+import logging
+import os
+from abc import ABC, abstractmethod
+from typing import List, Optional, Tuple
+
+from utils import load_file
+
+LOGGER = logging.getLogger('MINT')
+
+
+class Task(ABC):
+    """Base class for a task instance."""
+
+    task_name: str = 'base'
+    in_context_example_dir = os.path.join(
+        os.path.dirname(os.path.abspath(__file__)),
+        'in_context_examples',
+    )
+
+    def __init__(self, **kwargs) -> None:
+        if 'loaded_history' in kwargs:
+            self.loaded_history = kwargs['loaded_history']
+        else:
+            self.loaded_history = None
+        # pre-load the in-context example
+        task_dir = os.path.join(self.in_context_example_dir, self.task_name)
+        self._in_context_example = {
+            'with_tool': load_file(os.path.join(task_dir, 'with_tool.txt')),
+        }
+        self.metadata = {}
+
+    @property
+    def task_id(self) -> str:
+        """Return the task id."""
+        assert hasattr(self, '_id'), 'Task does not have an id.'
+        return self._id
+
+    def in_context_example(
+        self, use_tool: bool = True, with_feedback: bool = False
+    ) -> str:
+        """Return the in-context example for the task."""
+        if use_tool and not with_feedback:
+            return self._in_context_example['with_tool']
+        else:
+            raise NotImplementedError
+
+    @property
+    def prompt(self) -> str:
+        """Return the task prompt."""
+        assert hasattr(self, '_prompt'), 'Task does not have a prompt.'
+        return self._prompt
+
+    @property
+    def reference(self) -> str:
+        """Return the reference solution for the task."""
+        assert hasattr(self, '_reference'), 'Task does not have a reference solution.'
+        return self._reference
+
+    @abstractmethod
+    def extract_answer(self, solution: str) -> Optional[str]:
+        """Extract the answer from the given solution."""
+        pass
+
+    @abstractmethod
+    def success(self, solution: str) -> bool:
+        """This checks whether the given solution can complete the current task.
+
+        Can be used to provide binary feedback.
+        """
+        answer = self.extract_answer(solution)
+        return answer == self.reference
+
+    @classmethod
+    def load_tasks(cls, path: str) -> Tuple[List['Task'], int]:
+        """Load all the tasks from a given jsonl file."""
+        assert path.endswith('.jsonl') or path.endswith('.json')
+        with open(path, 'r') as f:
+            tasks = [cls(**json.loads(line)) for line in f.readlines()]
+        LOGGER.info(f'Loaded {len(tasks)} tasks from {path}')
+        return tasks, len(tasks)
+
+    def to_dict(self) -> dict:
+        """Convert the task to a dictionary."""
+        return {
+            'task_name': self.task_name,
+            'task_id': self.task_id,
+            'prompt': self.prompt,
+            'reference': self.reference,
+            'metadata': self.metadata,
+        }
+
+
+class ReasoningTask(Task):
+    task_name = 'reasoning'
+
+    def __init__(self, id: str, prompt: str, reference: str, **kwargs):
+        super().__init__(**kwargs)
+        self._id = id
+        self._prompt = prompt.strip()
+        self._reference = str(reference).strip().lower()
+
+    def extract_answer(self, solution: str) -> Optional[str]:
+        """Extract the answer from the given solution."""
+        return solution.lower().strip()
+
+    def compare_w_digits(self, reference: str, answer: str) -> bool:
+        """Compare the reference and answer with digits."""
+        # if both reference and answer can be converted to floats by float()
+        try:
+            float(reference)
+            float(answer)
+            return abs(float(reference) - float(answer)) <= 0.05 * abs(float(reference))
+        except ValueError:
+            return reference in answer
+        except Exception:
+            raise ValueError(f'Cannot compare {reference} and {answer}')
+
+    def success(self, solution: str) -> bool:
+        answer = self.extract_answer(solution)
+        return self.compare_w_digits(self._reference, answer)
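
`ReasoningTask.success` accepts a numeric answer within 5% of the reference and falls back to a substring check when either side cannot be parsed as a float. A small sketch of what that means in practice (instantiating the task reads the `in_context_examples/reasoning/with_tool.txt` file added above):

```python
# Sketch: ReasoningTask's answer check with a numeric reference.
from task import ReasoningTask

t = ReasoningTask(id='demo-1', prompt='What is 698 + 2?', reference='700')
print(t.success('700'))  # True  (exact match)
print(t.success('715'))  # True  (within the 5% relative tolerance)
print(t.success('800'))  # False (outside the tolerance)
```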

+ 10 - 0
evaluation/mint/utils.py

@@ -0,0 +1,10 @@
+import functools
+
+
+# use cache to avoid loading the same file multiple times,
+# which can lead to a 'too many open files' error
+@functools.lru_cache(maxsize=128)
+def load_file(filepath: str) -> str:
+    with open(filepath, 'r') as f:
+        content = f.read()
+    return content