
Support Entity-Deduction-Arena (EDA) Benchmark (#1931)

* adding draft evaluation code for EDA, using ChatGPT as the temporary agent for now

* Update README.md

* Delete frontend/package.json

* revert the irrelevant changes

* revert package.json

* use chatgpt as the codeactagent

* integrate with opendevin

* Update evaluation/EDA/README.md

* Update evaluation/EDA/README.md

* Use poetry to manage packages

* integrate with opendevin

* minor update

* minor update

* update poetry

* update README

* clean-up infer scripts

* add run_infer script and improve readme

* log final success and final message & ground truth

---------

Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Xingyao Wang <xingyao6@illinois.edu>
Co-authored-by: yufansong <yufan@risingwave-labs.com>
Co-authored-by: Boxuan Li <liboxuan@connect.hku.hk>
Yizhe Zhang 1 year ago
parent
commit
0c829cd067

+ 44 - 0
evaluation/EDA/README.md

@@ -0,0 +1,44 @@
+# EDA Evaluation
+
+This folder contains the evaluation harness for evaluating agents on the Entity-Deduction-Arena (EDA) benchmark, from the paper [Probing the Multi-turn Planning Capabilities of LLMs via 20 Question Games](https://arxiv.org/abs/2310.01468), presented at the ACL 2024 main conference.
+
+## Configure OpenDevin and your LLM
+
+Create a `config.toml` file if it does not exist at the root of the workspace. Please check [README.md](../../README.md) for how to set this up.
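+
+For illustration, a minimal LLM config group might look like the following (a sketch — the group name and field values are examples, not the authoritative schema; see the main README):
+
+```toml
+[eval_gpt4_1106_preview]
+model = "gpt-4-1106-preview"
+api_key = "sk-XXX"
+temperature = 0.0
+```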
+
+## Start the evaluation
+
+```bash
+export OPENAI_API_KEY="sk-XXX"; # This is required for evaluation (to simulate the other party in the conversation)
+./evaluation/EDA/scripts/run_infer.sh [model_config] [agent] [dataset] [eval_limit]
+```
+
+where `model_config` is mandatory, while `agent`, `dataset` and `eval_limit` are optional.
+
+- `model_config`, e.g. `eval_gpt4_1106_preview`, is the config group name for your
+LLM settings, as defined in your `config.toml`.
+
+- `agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks, defaulting
+to `CodeActAgent`.
+
+- `dataset`: There are two tasks in this evaluation. Specify `dataset` to test on either the `things` or the `celebs` task (defaults to `things`).
+
+- `eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit` instances. By default, it evaluates all instances.
+
+Let's say you'd like to run 10 instances of the `things` task using `eval_gpt4o_2024_05_13` and CodeActAgent,
+then your command would be:
+
+```bash
+./evaluation/EDA/scripts/run_infer.sh eval_gpt4o_2024_05_13 CodeActAgent things 10
+```
+
+## Reference
+```
+@inproceedings{zhang2023entity,
+  title={Probing the Multi-turn Planning Capabilities of LLMs via 20 Question Games},
+  author={Zhang, Yizhe and Lu, Jiarui and Jaitly, Navdeep},
+  booktitle={ACL},
+  year={2024}
+}
+```

+ 413 - 0
evaluation/EDA/game.py

@@ -0,0 +1,413 @@
+import json
+import logging
+import os
+import re
+from typing import Optional
+
+import openai
+import requests.exceptions
+import torch
+from openai import OpenAI
+from retry import retry
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+LOGGER = logging.getLogger(__name__)
+
+
+def load_model(path):
+    print('Loading model...')
+    tokenizer = AutoTokenizer.from_pretrained(path, use_fast=False)
+    print('Tokenizer loaded.')
+    model = AutoModelForCausalLM.from_pretrained(
+        path, low_cpu_mem_usage=True, torch_dtype=torch.float16
+    ).cuda()
+    print('Model loaded.')
+    # model.half().cuda()
+    return model, tokenizer
+
+
+class Q20Game:
+    def __init__(
+        self,
+        item: str,
+        answerer_model: str = 'gpt-3.5-turbo-0613',
+        guesser_model: str = 'gpt-3.5-turbo-0613',
+        num_turns: int = 20,
+        temperature: float = 0.8,
+        openai_api: bool = True,
+        openai_api_key: Optional[str] = None,
+        guesser_kargs: Optional[dict] = None,  # avoid a shared mutable default
+    ) -> None:
+        self.item = item
+        self.answerer_model = answerer_model
+        self.guesser_model = guesser_model
+        self.num_turns = num_turns
+        self.temperature = temperature
+        self.openai_api = openai_api
+        self.guesser_kargs = guesser_kargs if guesser_kargs is not None else {}
+        self.vicuna_prompt = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions."
+        self.first_user_utterance = (
+            'Your task is to ask a series of questions to deduce the entity '
+            "that I'm thinking of with as few queries as possible. "
+            "Only ask questions that can be answered by 'yes', 'no' or 'maybe'. "
+            'Do not ask for hints. Make your question brief with no line breaks. '
+            'Now start asking a question.'
+        )
+        self.guesser_win = False
+        self.curr_turn = 0
+        if openai_api_key is not None:
+            openai.api_key = openai_api_key
+
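+        # Non-GPT model names are assumed to be served from a local OpenAI-compatible endpoint.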
+        if isinstance(answerer_model, str) and not answerer_model.startswith('gpt'):
+            self.user_api_base = 'http://0.0.0.0:8000/v1'
+        else:
+            self.user_api_base = 'https://api.openai.com/v1'
+
+        if isinstance(guesser_model, str) and not guesser_model.startswith('gpt'):
+            self.guesser_api_base = 'http://0.0.0.0:8000/v1'
+        else:
+            self.guesser_api_base = 'https://api.openai.com/v1'
+
+        self.guesser_messages = []
+
+    def confusion_matrix(self, path):
+        self.reset()
+        with open(path) as f:
+            raw_messages = json.load(f)
+            self.item = path.split('/')[-1].split('_')[0]
+            roles = ['assistant', 'user']
+            for i, message in enumerate(raw_messages):
+                self.guesser_messages.append(
+                    {'role': roles[i % 2], 'content': message['content']}
+                )
+
+        self.guesser_messages = self.guesser_messages[:-2]
+        self.guesser_messages[-1]['content'] = (
+            self.guesser_messages[-1]['content'] + " You must guess now, what's it?"
+        )
+        guesser_msg = self.guesser(self.guesser_messages)
+        self.guesser_messages.append(guesser_msg)
+        guesser_question = guesser_msg['content'].strip()
+        self.guesser_messages[-1]['content'] = (
+            self.guesser_messages[-1]['content'] + ' Is it right?'
+        )
+        usr_msg = self.answerer(guesser_question)
+        self.guesser_messages.append(
+            {'role': 'user', 'content': f"{usr_msg['content'].strip()}"}
+        )
+
+        if 'bingo' in self.guesser_messages[-1]['content'].lower():
+            self.guesser_win = True
+            return True
+
+        return False
+
+    @retry(
+        (
+            openai.APITimeoutError,
+            requests.exceptions.ReadTimeout,
+            openai.RateLimitError,
+            openai.APIError,
+            requests.exceptions.HTTPError,
+            openai.APIConnectionError,
+        ),
+        tries=5,
+        delay=0.5,
+        backoff=0.5,
+        max_delay=2,
+        logger=LOGGER,
+    )
+    def guesser(self, messages):
+        if not (isinstance(self.guesser_model, str) and self.guesser_model.startswith('gpt')):  # hf model
+            if isinstance(self.guesser_model, str):
+                # Load the HF model once; later calls reuse the loaded instance.
+                self.guesser_model, self.guesser_tokenizer = load_model(self.guesser_model)
+
+            # """Wraps hf's `generate` adding some specific method's defaults"""
+            assert not self.openai_api
+            prompt = self.dialog_history() + ' ASSISTANT:'
+            input_ids = torch.tensor(
+                [self.guesser_tokenizer.encode(prompt, add_special_tokens=True)]
+            )  # TODO check if huggingface is using the same format.
+            input_ids = input_ids.to(self.guesser_model.base_model.device)
+            attention_mask = None
+
+            with torch.no_grad():
+                gen = self.guesser_model.generate(
+                    input_ids=input_ids,
+                    attention_mask=attention_mask,
+                    **self.guesser_kargs,
+                )
+                gen_str = (
+                    self.guesser_tokenizer.decode(gen[0][input_ids[0].shape[0] :])
+                    .split('</s>')[0]
+                    .split('USER')[0]
+                    .lstrip()
+                    .strip()
+                )
+
+                return {
+                    'role': 'assistant',
+                    'content': gen_str,
+                }
+        else:
+            # Pass base_url explicitly; assigning openai.api_base has no effect with the v1 client.
+            client = OpenAI(api_key=openai.api_key, base_url=self.guesser_api_base)
+            response = client.chat.completions.create(
+                model=self.guesser_model,
+                messages=messages,
+                max_tokens=64,
+                n=1,
+                stop=None,
+                temperature=self.temperature,
+            )
+            return {
+                'role': 'assistant',
+                'content': response.choices[0].message.to_dict()['content'].strip(),
+            }
+
+    def dialog_history(self):
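+        # Render the transcript in Vicuna's "USER: ... ASSISTANT: ..." format for HF guesser models.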
+        history = self.vicuna_prompt + ' '
+        for item in self.guesser_messages:
+            if item['role'].upper() == 'USER':
+                history += 'USER: ' + item['content']
+            elif item['role'].upper() == 'ASSISTANT':
+                history += ' ' + 'ASSISTANT: ' + item['content'] + '</s>'
+        return history
+
+    def preprocess_response(self, response):
+        # Strip indirect phrasings so the answerer sees a direct question.
+        response = re.sub(r'the entity you are thinking of', 'it', response)
+        response = re.sub(r"the entity you're thinking of", 'it', response)
+        response = re.sub(r" you're thinking of", '', response)
+        response = re.sub(r' you are thinking of', '', response)
+        # Record the normalized guess in the same dict format used elsewhere
+        # in guesser_messages (appending a bare string would break num_yes()).
+        self.guesser_messages.append({'role': 'assistant', 'content': response})
+        return response
+
+    def judge_winner(self, response):
+        guesser_question = response.strip()
+
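+        # On the final turn, append a confirmation so the answerer judges this as a guess.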
+        if self.curr_turn == self.num_turns - 1:
+            guesser_question += ' Is it right?'
+        # ask for answer
+        usr_msg = self.answerer(guesser_question)
+
+        if 'bingo' in usr_msg['content'].lower():
+            self.guesser_win = True
+            return True, ''
+
+        return False, usr_msg['content'].strip()
+
+    def generate_user_response(self, response):
+        response = self.preprocess_response(response)
+        bingo, answer_reply = self.judge_winner(response)
+        if bingo:
+            return 'Bingo! You got it. Quit now: run <execute_bash> exit </execute_bash>.\n'
+        if self.curr_turn == self.num_turns - 2:
+            answer_reply += " You must guess now, what's it?"
+        return answer_reply
+
+    def game_play(self, user_mode=False):
+        self.reset()
+        # print(f"Item: {self.item}")
+        for t in range(self.num_turns):
+            # System asking a question
+            if not user_mode:  # covers both False and None
+                guesser_msg = self.guesser(self.guesser_messages)
+                guesser_msg['content'] = re.sub(
+                    r'the entity you are thinking of', 'it', guesser_msg['content']
+                )
+                guesser_msg['content'] = re.sub(
+                    r"the entity you're thinking of", 'it', guesser_msg['content']
+                )
+                guesser_msg['content'] = re.sub(
+                    r" you're thinking of", '', guesser_msg['content']
+                )
+                guesser_msg['content'] = re.sub(
+                    r' you are thinking of', '', guesser_msg['content']
+                )
+            else:
+                user_q = input(
+                    f'Type in your question for turn {t+1}. (e.g. Is it a living thing?)\n'
+                )
+                guesser_msg = {'role': 'assistant', 'content': user_q}
+            self.guesser_messages.append(guesser_msg)
+            guesser_question = guesser_msg['content'].strip()
+
+            if t == self.num_turns - 1:
+                self.guesser_messages[-1]['content'] = (
+                    self.guesser_messages[-1]['content'] + ' Is it right?'
+                )
+
+            usr_msg = self.answerer(guesser_question)
+            self.guesser_messages.append(
+                {'role': 'user', 'content': f"{usr_msg['content'].strip()}"}
+            )
+
+            if 'bingo' in usr_msg['content'].lower():
+                self.guesser_win = True
+                return True
+
+            if t == self.num_turns - 2:
+                self.guesser_messages[-1]['content'] = (
+                    self.guesser_messages[-1]['content']
+                    + " You must guess now, what's it?"
+                )
+
+        return False
+
+    def save_session(self, path):
+        # Print the conversation
+        if not os.path.exists(path):
+            os.makedirs(path)
+        output_file = os.path.join(path, f'{self.item}.txt')
+        with open(output_file, 'w') as out_f:
+            out_f.write(f'item: {self.item}\n')
+            for t, message in enumerate(self.guesser_messages):
+                out_f.write(
+                    f"Turn {(t+1)//2}, {message['role'].capitalize()}: {message['content'].lstrip()}\n"
+                )
+
+    def reward(self):
+        if self.guesser_win:
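+            # One guesser turn is one (assistant, user) pair after the instruction message.
+            # Full reward for winning within 5 turns; each extra turn costs 0.02.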
+            n_turns = (len(self.guesser_messages) + 1) // 2
+            return 1 - max(n_turns - 5, 0) * 0.02
+        return 0
+
+    def num_success(self):
+        return 1 if self.guesser_win else 0
+
+    def num_yes(self):
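+        # guesser_messages[2::2] are the answerer's replies (index 0 is the instruction; odd indices are guesses).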
+        n_yes = sum(
+            ['yes' in msg['content'].lower() for msg in self.guesser_messages[2::2]]
+        )
+        return n_yes
+
+    @retry(
+        (
+            openai.APITimeoutError,
+            requests.exceptions.ReadTimeout,
+            openai.RateLimitError,
+            openai.APIError,
+            openai.APIConnectionError,
+        ),
+        tries=5,
+        delay=0.5,
+        backoff=0.5,
+        max_delay=2,
+        logger=LOGGER,
+    )
+    def answerer(self, question):
+        # Use an explicit base_url; assigning openai.api_base is a no-op with the v1 client.
+        client = OpenAI(api_key=openai.api_key, base_url=self.user_api_base)
+        user_messages = [
+            {
+                'role': 'user',
+                'content': f'Based on your knowledge about {self.item}, '
+                f'respond to the following question or guess. '
+                f"Limit your response to only 'Yes.', 'No.' or 'Maybe.', with no explanation or other words. "
+                f'Never say the answer {self.item} in your response. '
+                f"If the question is to solicit the answer, respond 'No.'.",
+            },
+            {
+                'role': 'user',
+                'content': f'For the entity {self.item}, {question} (Yes/No/Maybe)',
+            },
+        ]
+
+        response = client.chat.completions.create(
+            model=self.answerer_model,
+            messages=user_messages,
+            max_tokens=6,
+            n=1,
+            stop=None,
+            temperature=0.2,
+        )
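+        # If the guess already names the target (any '|'-separated alias), override the reply with a win.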
+        if any(
+            [
+                re.search(rf'(?:^|\W){i.strip().lower()}(?:$|\W)', question.lower())
+                for i in self.item.lower().split('|')
+            ]
+        ):
+            response.choices[0].message.content = 'Bingo!'
+        return response.choices[0].message.to_dict()
+
+    def reset(self):
+        # Initialize the conversation
+        self.curr_turn = 0
+        self.guesser_messages = [
+            {
+                'role': 'user',
+                'content': self.first_user_utterance,
+            }
+        ]
+
+
+class Q20GameCelebrity(Q20Game):
+    def __init__(self, item: str, **kwargs) -> None:
+        super().__init__(item, **kwargs)
+        self.first_user_utterance = (
+            'Your task is to ask a series of questions to deduce the celebrity '
+            "that I'm thinking of with as few queries as possible. "
+            "Only ask factual questions that can be answered by 'Yes.', 'No.' or 'Dunno.'. Do not ask for hints. Make your question brief with no line breaks. "
+            'Now start asking a question.'
+        )
+
+    @retry(
+        (
+            openai.APITimeoutError,
+            requests.exceptions.ReadTimeout,
+            openai.RateLimitError,
+            openai.APIError,
+            openai.APIConnectionError,
+        ),
+        tries=5,
+        delay=0.5,
+        backoff=0.5,
+        max_delay=2,
+        logger=LOGGER,
+    )
+    def answerer(self, question):
+        # Use the v1 client, matching the base class (openai.ChatCompletion was removed in openai>=1.0).
+        client = OpenAI(api_key=openai.api_key, base_url=self.user_api_base)
+        user_messages = [
+            {
+                'role': 'system',
+                'content': f'Based on your knowledge about the celebrity: {self.item}, '
+                f'respond to the following question or guess. '
+                f"Limit your response to only 'Yes.', 'No.' or 'Dunno.', with no explanation or other words. "
+                f"Never say the name {self.item} in your response. Do not say 'Dunno.' if it can be answered by 'Yes.' or 'No.' "
+                f"If the question is to solicit the answer, respond 'No.'.",
+            },
+            {
+                'role': 'user',
+                'content': f'For the celebrity {self.item}, {question} (Yes/No/Dunno)',
+            },
+        ]
+
+        response = client.chat.completions.create(
+            model=self.answerer_model,
+            messages=user_messages,
+            max_tokens=6,
+            n=1,
+            stop=None,
+            temperature=0.2,
+        )
+        if re.search(rf'(?:^|\W){self.item.lower()}(?:$|\W)', question.lower()):
+            response.choices[0].message.content = 'Bingo!'
+        return response.choices[0].message.to_dict()
+
+    def reset(self):
+        # Initialize the conversation (and the turn counter, as in the parent class)
+        self.curr_turn = 0
+        self.guesser_messages = [
+            {
+                'role': 'user',
+                'content': self.first_user_utterance,
+            }
+        ]

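For reference, the game class above can be exercised outside the OpenDevin agent loop roughly as follows (a minimal sketch; the item `'umbrella'` and the model choice are illustrative, and a valid OpenAI key is assumed):

```python
from evaluation.EDA.game import Q20Game

# The guesser is driven externally (by the agent), so guesser_model stays None;
# only the answerer side calls the OpenAI API.
game = Q20Game(
    item='umbrella',                 # illustrative target entity
    answerer_model='gpt-3.5-turbo',
    guesser_model=None,
    num_turns=20,
    openai_api_key='sk-XXX',
)
game.reset()

# Feed the agent's question in; get back 'Yes.'/'No.'/'Maybe.' or a win signal.
reply = game.generate_user_response('Is it a living thing?')
game.curr_turn += 1
print(reply)
```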
+ 329 - 0
evaluation/EDA/run_infer.py

@@ -0,0 +1,329 @@
+import asyncio
+import json
+import logging
+import multiprocessing as mp
+import os
+import pathlib
+import subprocess
+import time
+from concurrent.futures import ProcessPoolExecutor
+
+# import huggingface_hub
+from datasets import load_dataset
+from tqdm import tqdm
+
+from evaluation.EDA.game import Q20Game, Q20GameCelebrity
+
+# from evaluation.EDA.scorer import question_scorer
+from opendevin.controller.state.state import State
+from opendevin.core.config import config, get_llm_config_arg, get_parser
+from opendevin.core.logger import get_console_handler
+from opendevin.core.logger import opendevin_logger as logger
+from opendevin.core.main import main
+from opendevin.events.action import MessageAction
+from opendevin.events.serialization.event import event_to_dict
+
+game = None
+
+
+def cleanup():
+    print('Cleaning up child processes...')
+    for process in mp.active_children():
+        print(f'Terminating child process: {process.name}')
+        process.terminate()
+        process.join()
+
+
+def codeact_user_response(state: State) -> str:
+    global game
+    model_guess = ''
+    if state.history:
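+        # Walk the history backwards to find the agent's most recent message (its latest question or guess).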
+        for act, _ in reversed(state.history):
+            if isinstance(act, MessageAction) and act.source == 'agent':
+                model_guess = act.content
+                break
+    msg = game.generate_user_response(model_guess)
+    game.curr_turn += 1
+    logger.info(f'Model guess: {model_guess}')
+    logger.info(f'Answerer response: {msg}')
+    return msg
+
+
+def monologue_user_response(state: State) -> str:
+    raise NotImplementedError('MonologueAgent should never ask for user responses.')
+
+
+AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
+    'CodeActAgent': codeact_user_response,
+    'MonologueAgent': monologue_user_response,
+}
+
+AGENT_CLS_TO_INST_SUFFIX = {
+    'CodeActAgent': 'When you think you have solved the question, please first send your answer to user through message and then exit.\n'
+}
+
+
+def process_instance(instance, agent_class, metadata, reset_logger: bool = True):
+    # Set up the logger properly, so you can run multi-processing to parallelize the evaluation
+    eval_output_dir = metadata['eval_output_dir']
+    if reset_logger:
+        # Set up logger
+        log_file = os.path.join(
+            eval_output_dir, 'logs', f'instance_{instance["text"].strip()}.log'
+        )
+        # Remove all existing handlers from logger
+        for handler in logger.handlers[:]:
+            logger.removeHandler(handler)
+        # add back the console handler to print ONE line
+        logger.addHandler(get_console_handler())
+        logger.info(
+            f'Starting evaluation for instance {instance["text"].strip()}.\nLOG:   tail -f {log_file}'
+        )
+        # Remove all existing handlers from logger
+        for handler in logger.handlers[:]:
+            logger.removeHandler(handler)
+        file_handler = logging.FileHandler(log_file)
+        file_handler.setFormatter(
+            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
+        )
+        logger.addHandler(file_handler)
+
+    # Prepare instruction
+    _game_class = {'things': Q20Game, 'celebs': Q20GameCelebrity}
+
+    guesser_kargs = {
+        'max_new_tokens': 64,
+        'temperature': 0.8,
+        'repetition_penalty': 1.0,
+        'do_sample': True,
+    }  # no penalty
+
+    # Use codeactagent as guesser_model
+    global game
+    game = _game_class[metadata['dataset']](
+        item=instance['text'].strip(),
+        answerer_model=metadata['answerer_model'],
+        guesser_model=None,
+        num_turns=metadata['max_iterations'],
+        openai_api_key=metadata['openai_api'],
+        guesser_kargs=guesser_kargs,
+    )
+
+    instruction = f'{game.first_user_utterance}'
+    logger.info(f'Instruction: {instruction}')
+
+    # instruction += 'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
+    # NOTE: You can actually set slightly different instruction for different agents
+    instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
+
+    # Here's how you can run the agent (similar to the `main` function) and get the final task state
+
+    state: State = asyncio.run(
+        main(
+            instruction,
+            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(agent_class),
+        )
+    )
+    # ======= Attempt to evaluate the agent's edits =======
+    # If you are working on a simpler benchmark that only evaluates the final model output (e.g., in a MessageAction),
+    # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
+
+    if state is None:
+        raise ValueError('State should not be None.')
+
+    final_message = ''
+    for act, _ in reversed(state.history):
+        if isinstance(act, MessageAction) and act.source == 'agent':
+            final_message = act.content
+            break
+
+    logger.info(f'Final message: {final_message} | Ground truth: {instance["text"]}')
+    test_result = game.reward()
+
+    # Save the output
+    output = {
+        'instance_id': instance['text'].strip(),
+        'instance': instance,
+        'instruction': instruction,
+        'metadata': metadata,
+        'history': [
+            (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
+        ],
+        'error': state.error if state and state.error else None,
+        'test_result': {
+            'success': test_result,
+            'final_message': final_message,
+            'ground_truth': instance['text'],
+        },
+    }
+
+    return output
+
+
+if __name__ == '__main__':
+    parser = get_parser()
+    parser.add_argument(
+        '--answerer_model', '-a', default='gpt-3.5-turbo', help='answerer model'
+    )
+    parser.add_argument(
+        '--dataset',
+        default='things',
+        choices=['things', 'celebs'],
+        type=str,
+        help='dataset to be used',
+    )
+    parser.add_argument(
+        '--OPENAI_API_KEY', type=str, required=True, help='Your OpenAI API key'
+    )
+    parser.add_argument(
+        '--data-split',
+        default='test',
+        type=str,
+        help='data split, eg, test',
+    )
+    args, _ = parser.parse_known_args()
+    if args.directory:
+        config.workspace_base = os.path.abspath(args.directory)
+        print(f'Setting workspace base to {config.workspace_base}')
+    # NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
+    # so we don't need to manage file uploading to OpenDevin's repo
+    eda_dataset = load_dataset(
+        'yizheapple/entity-deduction-arena', name=args.dataset, split=args.data_split
+    )
+    logger.info(
+        f'Evaluating Entity Deduction Arena {args.dataset} {args.data_split} split'
+    )
+
+    # Check https://github.com/OpenDevin/OpenDevin/blob/main/evaluation/swe_bench/README.md#configure-opendevin-and-your-llm
+    # for details of how to set `llm_config`
+    if args.llm_config:
+        specified_llm_config = get_llm_config_arg(args.llm_config)
+        if specified_llm_config:
+            config.llm = specified_llm_config
+    logger.info(f'Config for evaluation: {config}')
+
+    # TEST METADATA
+    agent_class = args.agent_cls
+    assert (
+        agent_class in AGENT_CLS_TO_FAKE_USER_RESPONSE_FN
+    ), f'Unsupported agent class: {agent_class}'
+    model_name = config.llm.model.split('/')[-1]
+    max_iterations = args.max_iterations
+    eval_note = ''
+    if args.eval_note is not None:
+        eval_note += '_N_' + args.eval_note
+    eval_output_dir = os.path.join(
+        args.eval_output_dir,
+        'eda',
+        agent_class,
+        model_name + '_maxiter_' + str(max_iterations) + eval_note,
+    )
+
+    pathlib.Path(eval_output_dir).mkdir(parents=True, exist_ok=True)
+    pathlib.Path(os.path.join(eval_output_dir, 'logs')).mkdir(
+        parents=True, exist_ok=True
+    )
+    logger.info(f'Using evaluation output directory: {eval_output_dir}')
+
+    metadata = {
+        'dataset': args.dataset,
+        'data_split': args.data_split,
+        'answerer_model': args.answerer_model,
+        'agent_class': agent_class,
+        'openai_api': args.OPENAI_API_KEY,
+        'model_name': model_name,
+        'max_iterations': max_iterations,
+        'eval_output_dir': eval_output_dir,
+        'start_time': time.strftime('%Y-%m-%d %H:%M:%S'),
+        # get the commit id of the current repo for reproducibility
+        'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD'])
+        .decode('utf-8')
+        .strip(),
+    }
+    logger.info(f'Metadata: {metadata}')
+    with open(os.path.join(eval_output_dir, 'metadata.json'), 'w') as f:
+        json.dump(metadata, f)
+
+    # LIMIT EVALUATION
+    eval_n_limit = args.eval_n_limit
+    if eval_n_limit:
+        eda_dataset = eda_dataset.select(list(range(eval_n_limit)))
+        logger.info(f'Limiting evaluation to first {eval_n_limit} instances.')
+
+    # OUTPUT FILE
+    output_file = os.path.join(eval_output_dir, 'output.jsonl')
+    logger.info(f'Writing evaluation output to {output_file}')
+    finished_items = set()
+    if os.path.exists(output_file):
+        with open(output_file, 'r') as f:
+            for line in f:
+                data = json.loads(line)
+                finished_items.add(data['instance_id'])
+        logger.warning(
+            f'Output file {output_file} already exists. Loaded {len(finished_items)} finished instances.'
+        )
+    output_fp = open(output_file, 'a')
+
+    logger.info(
+        f'Evaluation started with Agent {agent_class}, model {model_name}, max iterations {max_iterations}.'
+    )
+
+    # =============================================
+    # filter out finished instances
+    new_eda_dataset = []
+    for instance in eda_dataset:
+        if instance['text'].strip() in finished_items:
+            logger.info(
+                f'Skipping instance {instance["text"].strip()} as it is already finished.'
+            )
+            continue
+        new_eda_dataset.append(instance)
+
+    eda_dataset = new_eda_dataset
+    logger.info(
+        f'Finished instances: {len(finished_items)}, Remaining instances: {len(eda_dataset)}'
+    )
+    # =============================================
+
+    pbar = tqdm(total=len(eda_dataset))
+
+    # This function tracks the progress AND writes the output to a JSONL file
+    def update_progress(future):
+        pbar.update(1)
+        output = future.result()
+        pbar.set_description(f'Instance {output["instance_id"]}')
+        pbar.set_postfix_str(f'Test Result: {output["test_result"]}')
+        logger.info(
+            f'Finished evaluation for instance {output["instance_id"]}: {output["test_result"]}'
+        )
+        output_fp.write(json.dumps(output) + '\n')
+        output_fp.flush()
+
+    # This sets the multi-processing
+    num_workers = args.eval_num_workers
+    logger.info(f'Using {num_workers} workers for evaluation.')
+
+    try:
+        with ProcessPoolExecutor(num_workers) as executor:
+            futures = []
+            # This is how we perform multi-processing
+            for instance in eda_dataset:
+                future = executor.submit(
+                    process_instance,
+                    instance,
+                    agent_class,
+                    metadata,
+                    reset_logger=bool(num_workers > 1),
+                )
+                future.add_done_callback(update_progress)
+                futures.append(future)
+
+            # Wait for all futures to complete
+            for future in futures:
+                future.result()
+    except KeyboardInterrupt:
+        print('KeyboardInterrupt received. Cleaning up...')
+        cleanup()
+
+    output_fp.close()
+    logger.info('Evaluation finished.')

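Each line of `output.jsonl` stores `test_result.success`, which is the float returned by `game.reward()`. Aggregating results afterwards can therefore look roughly like this (a sketch; the path is hypothetical and should point at the file written under `eval_output_dir`):

```python
import json

path = 'output.jsonl'  # hypothetical: the file written by run_infer.py
rewards = []
with open(path) as f:
    for line in f:
        record = json.loads(line)
        # reward() returns 0 on failure, else 1 minus 0.02 per turn beyond 5.
        rewards.append(record['test_result']['success'])

solved = sum(r > 0 for r in rewards)
print(f'{solved}/{len(rewards)} solved, mean reward {sum(rewards) / len(rewards):.3f}')
```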
+ 49 - 0
evaluation/EDA/scripts/run_infer.sh

@@ -0,0 +1,49 @@
+#!/bin/bash
+MODEL_CONFIG=$1
+AGENT=$2
+DATASET=$3
+EVAL_LIMIT=$4
+
+if [ -z "$AGENT" ]; then
+  echo "Agent not specified, use default CodeActAgent"
+  AGENT="CodeActAgent"
+fi
+
+if [ -z "$DATASET" ]; then
+  echo "Dataset not specified, use default 'things'"
+  DATASET="things"
+fi
+
+# check if OPENAI_API_KEY is set
+if [ -z "$OPENAI_API_KEY" ]; then
+  echo "OPENAI_API_KEY is not set, please set it to run the script"
+  exit 1
+fi
+
+# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenDevin
+# We need to track the version of Agent in the evaluation to make sure results are comparable
+AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
+
+echo "AGENT: $AGENT"
+echo "AGENT_VERSION: $AGENT_VERSION"
+echo "MODEL_CONFIG: $MODEL_CONFIG"
+echo "DATASET: $DATASET"
+
+COMMAND="poetry run python evaluation/EDA/run_infer.py \
+  --agent-cls $AGENT \
+  --llm-config $MODEL_CONFIG \
+  --dataset $DATASET \
+  --data-split test \
+  --max-iterations 20 \
+  --OPENAI_API_KEY $OPENAI_API_KEY \
+  --max-chars 10000000 \
+  --eval-num-workers 1 \
+  --eval-note ${AGENT_VERSION}_${DATASET}"
+
+if [ -n "$EVAL_LIMIT" ]; then
+  echo "EVAL_LIMIT: $EVAL_LIMIT"
+  COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
+fi
+
+# Run the command
+eval $COMMAND

+ 1 - 0
evaluation/README.md

@@ -15,6 +15,7 @@ all the preprocessing/evaluation/analysis scripts.
 - SWE-Bench: [`evaluation/swe_bench`](./swe_bench)
 - HumanEvalFix: [`evaluation/humanevalfix`](./humanevalfix)
 - GAIA: [`evaluation/gaia`](./gaia)
+- Entity Deduction Arena (EDA): [`evaluation/EDA`](./EDA)
 
 ### Result Visualization
 

+ 28 - 6
poetry.lock

@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand.
 
 [[package]]
 name = "aenum"
@@ -1169,6 +1169,17 @@ tests = ["Pillow (>=6.2.1)", "absl-py", "apache-beam (>=2.26.0)", "elasticsearch
 torch = ["torch"]
 vision = ["Pillow (>=6.2.1)"]
 
+[[package]]
+name = "decorator"
+version = "5.1.1"
+description = "Decorators for Humans"
+optional = false
+python-versions = ">=3.5"
+files = [
+    {file = "decorator-5.1.1-py3-none-any.whl", hash = "sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186"},
+    {file = "decorator-5.1.1.tar.gz", hash = "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330"},
+]
+
 [[package]]
 name = "deprecated"
 version = "1.2.14"
@@ -3093,13 +3104,9 @@ files = [
     {file = "lxml-5.2.2-cp36-cp36m-win_amd64.whl", hash = "sha256:edcfa83e03370032a489430215c1e7783128808fd3e2e0a3225deee278585196"},
     {file = "lxml-5.2.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:28bf95177400066596cdbcfc933312493799382879da504633d16cf60bba735b"},
     {file = "lxml-5.2.2-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3a745cc98d504d5bd2c19b10c79c61c7c3df9222629f1b6210c0368177589fb8"},
-    {file = "lxml-5.2.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1b590b39ef90c6b22ec0be925b211298e810b4856909c8ca60d27ffbca6c12e6"},
     {file = "lxml-5.2.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b336b0416828022bfd5a2e3083e7f5ba54b96242159f83c7e3eebaec752f1716"},
-    {file = "lxml-5.2.2-cp37-cp37m-manylinux_2_28_aarch64.whl", hash = "sha256:c2faf60c583af0d135e853c86ac2735ce178f0e338a3c7f9ae8f622fd2eb788c"},
     {file = "lxml-5.2.2-cp37-cp37m-manylinux_2_28_x86_64.whl", hash = "sha256:4bc6cb140a7a0ad1f7bc37e018d0ed690b7b6520ade518285dc3171f7a117905"},
-    {file = "lxml-5.2.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:7ff762670cada8e05b32bf1e4dc50b140790909caa8303cfddc4d702b71ea184"},
     {file = "lxml-5.2.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:57f0a0bbc9868e10ebe874e9f129d2917750adf008fe7b9c1598c0fbbfdde6a6"},
-    {file = "lxml-5.2.2-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:a6d2092797b388342c1bc932077ad232f914351932353e2e8706851c870bca1f"},
     {file = "lxml-5.2.2-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:60499fe961b21264e17a471ec296dcbf4365fbea611bf9e303ab69db7159ce61"},
     {file = "lxml-5.2.2-cp37-cp37m-win32.whl", hash = "sha256:d9b342c76003c6b9336a80efcc766748a333573abf9350f4094ee46b006ec18f"},
     {file = "lxml-5.2.2-cp37-cp37m-win_amd64.whl", hash = "sha256:b16db2770517b8799c79aa80f4053cd6f8b716f21f8aca962725a9565ce3ee40"},
@@ -5497,6 +5504,21 @@ requests = ">=2.0.0"
 [package.extras]
 rsa = ["oauthlib[signedtoken] (>=3.0.0)"]
 
+[[package]]
+name = "retry"
+version = "0.9.2"
+description = "Easy to use retry decorator."
+optional = false
+python-versions = "*"
+files = [
+    {file = "retry-0.9.2-py2.py3-none-any.whl", hash = "sha256:ccddf89761fa2c726ab29391837d4327f819ea14d244c232a1d24c67a2f98606"},
+    {file = "retry-0.9.2.tar.gz", hash = "sha256:f8bfa8b99b69c4506d6f5bd3b0aabf77f98cdb17f3c9fc3f5ca820033336fba4"},
+]
+
+[package.dependencies]
+decorator = ">=3.4.2"
+py = ">=1.4.26,<2.0.0"
+
 [[package]]
 name = "rich"
 version = "13.7.1"
@@ -7526,4 +7548,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.11"
-content-hash = "141771396f59fc23d52623ada07e4b89272ca781e5a2072f98ebccdf3f18a43b"
+content-hash = "70be72e8064824ea756bf2543c8588e266a980e0e6dbc1fc50eecfb365c707d9"

+ 1 - 0
pyproject.toml

@@ -69,6 +69,7 @@ concurrency = ["gevent"]
 [tool.poetry.group.evaluation.dependencies]
 streamlit = "*"
 whatthepatch = "*"
+retry = "*"
 evaluate = "*"
 
 [build-system]