1 жил өмнө · 82d4d25b09
--- a/evaluation/toolqa/README.md
+++ b/evaluation/toolqa/README.md
@@ -0,0 +1,45 @@
 
				+# ToolQA Evaluation with OpenDevin
			
 
				+
			
 
				+This folder contains an evaluation harness we built on top of the original [ToolQA](https://github.com/night-chen/ToolQA) ([paper](https://arxiv.org/pdf/2306.13304)).
			
 
				+
			
 
				+## Setup Environment
			
 
				+
			
 
				+Please follow [this document](https://github.com/OpenDevin/OpenDevin/blob/main/Development.md) to set up a local development environment for OpenDevin.
			
 
				+
			
 
				+## Configure OpenDevin and your LLM
			
 
				+
			
 
				+Run `make setup-config` to set up the `config.toml` file if it does not exist at the root of the workspace.
			
 
				+
			
 
				+## Run Inference on ToolQA Instances
			
 
				+
			
 
				+Make sure your Docker daemon is running, then run this bash script:
			
 
				+
			
 
				+```bash
			
 
				+bash evaluation/toolqa/scripts/run_infer.sh [model_config] [agent] [eval_limit] [dataset] [hardness] [wolfram_alpha_appid]
			
 
				+```
			
 
				+
			
 
				+where `model_config` is mandatory, while all other arguments are optional.
			
 
				+
			
 
				+`model_config`, e.g. `llm`, is the config group name for your
			
 
				+LLM settings, as defined in your `config.toml`.
			
 
				+
			
 
				+`agent`, e.g. `CodeActAgent`, is the name of the agent for benchmarks, defaulting
			
 
				+to `CodeActAgent`.
			
 
				+
			
 
				+`eval_limit`, e.g. `10`, limits the evaluation to the first `eval_limit` instances.
			
 
				+By default, the script evaluates 1 instance.
			
 
				+
			
 
				+`dataset` is the ToolQA dataset to evaluate. You can choose from `agenda`, `airbnb`, `coffee`, `dblp`, `flight`, `gsm8k`, `scirex`, `yelp`. The default is `flight`.
			
 
				+
			
 
				+`hardness`, the hardness to evaluate. You could choose from `easy` and `hard`. The default is `easy`.
			
 
				+
			
 
				+`wolfram_alpha_appid` is an optional argument. When given `wolfram_alpha_appid`, the agent will be able to access Wolfram Alpha's APIs.
			
 
				+
			
 
				+Note: in order to use `eval_limit`, you must also set `agent`; in order to use `dataset`, you must also set `eval_limit`; in order to use `hardness`, you must also set `dataset`.
			
 
				+
			
 
				+Let's say you'd like to run 10 instances using `llm` and CodeActAgent on `coffee` `easy` test,
			
 
				+then your command would be:
			
 
				+
			
 
				+```bash
			
 
				+bash evaluation/toolqa/scripts/run_infer.sh llm CodeActAgent 10 coffee easy
			
 
				+```
			
--- a/evaluation/toolqa/run_infer.py
+++ b/evaluation/toolqa/run_infer.py
@@ -0,0 +1,353 @@
 
				+import asyncio
			
 
				+import json
			
 
				+import logging
			
 
				+import multiprocessing as mp
			
 
				+import os
			
 
				+import pathlib
			
 
				+import subprocess
			
 
				+import time
			
 
				+from concurrent.futures import ProcessPoolExecutor
			
 
				+
			
 
				+from tqdm import tqdm
			
 
				+from utils import download_data, download_tools, encode_question, eval_answer, get_data
			
 
				+
			
 
				+from opendevin.controller.state.state import State
			
 
				+from opendevin.core.config import config, get_llm_config_arg, get_parser
			
 
				+from opendevin.core.logger import get_console_handler
			
 
				+from opendevin.core.logger import opendevin_logger as logger
			
 
				+from opendevin.core.main import main
			
 
				+from opendevin.events.action import MessageAction
			
 
				+from opendevin.events.serialization.event import event_to_dict
			
 
				+
			
 
				+
			
 
				+def cleanup():
			
 
				+    print('Cleaning up child processes...')
			
 
				+    for process in mp.active_children():
			
 
				+        print(f'Terminating child process: {process.name}')
			
 
				+        process.terminate()
			
 
				+        process.join()
			
 
				+
			
 
				+
			
 
				+def codeact_user_response(state: State) -> str:
			
 
				+    msg = (
			
 
				+        'Please continue working on the task on whatever approach you think is suitable.\n'
			
 
				+        'When you think you finished the task, respond with `Finish[answer]` where you include your answer in `[]`\n'
			
 
				+        'IMPORTANT: YOU SHOULD NEVER ASK FOR HUMAN HELP OR USE THE INTERNET TO SOLVE THIS TASK.\n'
			
 
				+    )
			
 
				+    if state.history:
			
 
				+        user_msgs = [
			
 
				+            action
			
 
				+            for action, _ in state.history
			
 
				+            if isinstance(action, MessageAction) and action.source == 'user'
			
 
				+        ]
			
 
				+        if len(user_msgs) >= 2:
			
 
				+            # let the agent know that it can give up when it has tried 3 times
			
 
				+            return (
			
 
				+                msg
			
 
				+                + 'If you want to give up, run: <execute_bash> exit </execute_bash>.\n'
			
 
				+            )
			
 
				+    return msg
			
 
				+
			
 
				+
			
 
				+def monologue_user_response(state: State) -> str:
			
 
				+    raise NotImplementedError('MonologueAgent should never ask for user responses.')
			
 
				+
			
 
				+
			
 
				+AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
			
 
				+    'CodeActAgent': codeact_user_response,
			
 
				+    'MonologueAgent': monologue_user_response,
			
 
				+}
			
 
				+
			
 
				+AGENT_CLS_TO_INST_SUFFIX = {
			
 
				+    'CodeActAgent': 'When you think you have completed the request, please run the following command: <execute_bash> exit </execute_bash>.\n'
			
 
				+}
			
 
				+
			
 
				+
			
 
				+def process_instance(task, agent_class, metadata, reset_logger: bool = True):
			
 
				+    # create process-specific workspace dir
			
 
				+    # we will create a workspace directory for EACH process
			
 
				+    # so that different agent don't interfere with each other.
			
 
				+    workspace_mount_path = config.workspace_mount_path
			
 
				+    pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
			
 
				+
			
 
				+    # Setup the logger properly, so you can run multi-processing to parallize the evaluation
			
 
				+    eval_output_dir = metadata['eval_output_dir']
			
 
				+    qid = task['qid']
			
 
				+    question = task['question']
			
 
				+    answer = task['answer']
			
 
				+    if reset_logger:
			
 
				+        # Set up logger
			
 
				+        log_file = os.path.join(eval_output_dir, 'logs', f'instance_{qid}.log')
			
 
				+        # Remove all existing handlers from logger
			
 
				+        for handler in logger.handlers[:]:
			
 
				+            logger.removeHandler(handler)
			
 
				+        # add back the console handler to print ONE line
			
 
				+        logger.addHandler(get_console_handler())
			
 
				+        logger.info(
			
 
				+            f'Starting evaluation for instance {qid}.\nHint: run "tail -f {log_file}" to see live logs in a seperate shell'
			
 
				+        )
			
 
				+        # Remove all existing handlers from logger
			
 
				+        for handler in logger.handlers[:]:
			
 
				+            logger.removeHandler(handler)
			
 
				+        file_handler = logging.FileHandler(log_file)
			
 
				+        file_handler.setFormatter(
			
 
				+            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
			
 
				+        )
			
 
				+        logger.addHandler(file_handler)
			
 
				+    logger.info(f'Process-specific workspace mounted at {workspace_mount_path}')
			
 
				+
			
 
				+    # Prepare instruction
			
 
				+    instruction = encode_question(question)
			
 
				+    instruction += 'IMPORTANT: You should ONLY interact with the environment provided to you AND NEVER ASK FOR HUMAN HELP.\n'
			
 
				+    # NOTE: You can actually set slightly different instruction for different agents
			
 
				+    instruction += AGENT_CLS_TO_INST_SUFFIX.get(agent_class, '')
			
 
				+    # logger.info(f'Instruction:\n{instruction}', extra={'msg_type': 'OBSERVATION'})
			
 
				+
			
 
				+    # Here's how you can run the agent (similar to the `main` function) and get the final task state
			
 
				+    state: State = asyncio.run(
			
 
				+        main(
			
 
				+            instruction,
			
 
				+            fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(agent_class),
			
 
				+        )
			
 
				+    )
			
 
				+    # ======= Attempt to evaluate the agent's edits =======
			
 
				+    # If you are working on simplier benchmark that only evaluates the final model output (e.g., in a MessageAction)
			
 
				+    # You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
			
 
				+
			
 
				+    if state is None:
			
 
				+        raise ValueError('State should not be None.')
			
 
				+
			
 
				+    model_answer_raw = ''
			
 
				+    for act, _ in reversed(state.history):
			
 
				+        if isinstance(act, MessageAction) and act.source == 'agent':
			
 
				+            model_answer_raw = act.content
			
 
				+            break
			
 
				+    # attempt to parse model_answer
			
 
				+    correct = eval_answer(str(model_answer_raw), str(answer))
			
 
				+    metrics = state.metrics.get() if state.metrics else None
			
 
				+    logger.info(f'Final message: {model_answer_raw} | Correctness: {correct}')
			
 
				+    # Save the output
			
 
				+    output = {
			
 
				+        'qid': qid,
			
 
				+        'text': model_answer_raw,
			
 
				+        'correct': correct,
			
 
				+        'answer_id': 'None',
			
 
				+        'model_id': metadata['model_name'],
			
 
				+        'metadata': metadata,
			
 
				+        'history': [
			
 
				+            (event_to_dict(action), event_to_dict(obs)) for action, obs in state.history
			
 
				+        ],
			
 
				+        'metrics': metrics,
			
 
				+        'error': state.error if state and state.error else None,
			
 
				+    }
			
 
				+    return output
			
 
				+
			
 
				+
			
 
				+if __name__ == '__main__':
			
 
				+    parser = get_parser()
			
 
				+    parser.add_argument(
			
 
				+        '--dataset',
			
 
				+        type=str,
			
 
				+        help='Which dataset to evaluate from ToolQA. ToolQA contains 8 datasets, namely agenda, airbnb, coffee, dblp, flight, gsm8k, scirex, yelp. For example, the default is --dataset flight.',
			
 
				+        default='flight',
			
 
				+    )
			
 
				+    parser.add_argument(
			
 
				+        '--hardness',
			
 
				+        type=str,
			
 
				+        help='Which level of difficulty to evaluate from ToolQA. ToolQA contains 2 levels of hardness, namely easy and hard. For example, the default is --hardness easy.',
			
 
				+        default='easy',
			
 
				+    )
			
 
				+    parser.add_argument(
			
 
				+        '--wolfram_alpha_appid',
			
 
				+        type=str,
			
 
				+        help='wolfram alpha appid to use for wolfram alpha related tests',
			
 
				+        default='YOUR_WOLFRAMALPHA_APPID',
			
 
				+    )
			
 
				+    args, _ = parser.parse_known_args()
			
 
				+    if args.directory:
			
 
				+        config.workspace_base = os.path.abspath(args.directory)
			
 
				+        print(f'Setting workspace base to {config.workspace_base}')
			
 
				+    # Check https://github.com/OpenDevin/OpenDevin/blob/main/evaluation/swe_bench/README.md#configure-opendevin-and-your-llm
			
 
				+    # for details of how to set `llm_config`
			
 
				+    if args.llm_config:
			
 
				+        specified_llm_config = get_llm_config_arg(args.llm_config)
			
 
				+        if specified_llm_config:
			
 
				+            config.llm = specified_llm_config
			
 
				+    logger.info(f'Config for evaluation: {config}')
			
 
				+    agent_class = args.agent_cls
			
 
				+    assert (
			
 
				+        agent_class in AGENT_CLS_TO_FAKE_USER_RESPONSE_FN
			
 
				+    ), f'Unsupported agent class: {agent_class}'
			
 
				+    model_name = config.llm.model.split('/')[-1]
			
 
				+    max_iterations = args.max_iterations
			
 
				+    eval_note = ''
			
 
				+    if args.eval_note is not None:
			
 
				+        eval_note += '_N_' + args.eval_note
			
 
				+    eval_output_dir = os.path.join(
			
 
				+        args.eval_output_dir,
			
 
				+        'toolqa',
			
 
				+        agent_class,
			
 
				+        model_name + '_maxiter_' + str(max_iterations) + eval_note,
			
 
				+    )
			
 
				+    pathlib.Path(eval_output_dir).mkdir(parents=True, exist_ok=True)
			
 
				+    pathlib.Path(os.path.join(eval_output_dir, 'logs')).mkdir(
			
 
				+        parents=True, exist_ok=True
			
 
				+    )
			
 
				+    logger.info(f'Using evaluation output directory: {eval_output_dir}')
			
 
				+
			
 
				+    dataset = ''
			
 
				+    hardness = ''
			
 
				+    dataset_choices = [
			
 
				+        'agenda',
			
 
				+        'airbnb',
			
 
				+        'coffee',
			
 
				+        'dblp',
			
 
				+        'flight',
			
 
				+        'gsm8k',
			
 
				+        'scirex',
			
 
				+        'yelp',
			
 
				+        'genda',
			
 
				+    ]
			
 
				+    if args.dataset in dataset_choices:
			
 
				+        dataset = args.dataset
			
 
				+    else:
			
 
				+        raise ValueError(
			
 
				+            'Please choose from agenda, airbnb, coffee, dblp, flight, gsm8k, scirex, yelp for dataset.'
			
 
				+        )
			
 
				+    if args.hardness == 'easy':
			
 
				+        hardness = 'easy'
			
 
				+    elif args.hardness == 'hard':
			
 
				+        hardness = 'hard'
			
 
				+    else:
			
 
				+        raise ValueError('Please choose from easy and hard for hardness.')
			
 
				+
			
 
				+    logger.info(f'Evaluating ToolQA {dataset} {hardness} test')
			
 
				+    # workspace_mount_path = os.path.join(config.workspace_mount_path, '_eval_workspace')
			
 
				+    workspace_mount_path = config.workspace_mount_path
			
 
				+    pathlib.Path(workspace_mount_path).mkdir(parents=True, exist_ok=True)
			
 
				+    toolqa_test = get_data(dataset, hardness)
			
 
				+    toolqa_data_path = download_data(workspace_mount_path)
			
 
				+    toolqa_tool_path = download_tools(workspace_mount_path, args.wolfram_alpha_appid)
			
 
				+
			
 
				+    # TEST METADATA
			
 
				+    metadata = {
			
 
				+        'dataset': dataset,
			
 
				+        'hardness': hardness,
			
 
				+        'agent_class': agent_class,
			
 
				+        'model_name': model_name,
			
 
				+        'max_iterations': max_iterations,
			
 
				+        'eval_output_dir': eval_output_dir,
			
 
				+        'start_time': time.strftime('%Y-%m-%d %H:%M:%S'),
			
 
				+        # get the commit id of current repo for reproduciblity
			
 
				+        'git_commit': subprocess.check_output(['git', 'rev-parse', 'HEAD'])
			
 
				+        .decode('utf-8')
			
 
				+        .strip(),
			
 
				+    }
			
 
				+    logger.info(f'Metadata: {metadata}')
			
 
				+    with open(
			
 
				+        os.path.join(eval_output_dir, f'metadata_{dataset}_{hardness}.json'), 'w'
			
 
				+    ) as f:
			
 
				+        json.dump(metadata, f)
			
 
				+    # LIMIT EVALUATION
			
 
				+    eval_n_limit = args.eval_n_limit
			
 
				+    if eval_n_limit:
			
 
				+        toolqa_test = toolqa_test[:eval_n_limit]
			
 
				+        logger.info(
			
 
				+            f'Limiting evaluation to a total of first {eval_n_limit} instances.'
			
 
				+        )
			
 
				+    output_file = os.path.join(
			
 
				+        eval_output_dir, f'output_{model_name}_{dataset}_{hardness}.jsonl'
			
 
				+    )
			
 
				+    logger.info(f'Writing evaluation output to {output_file}')
			
 
				+    finished_task_ids = set()
			
 
				+    if os.path.exists(output_file):
			
 
				+        with open(output_file, 'r') as f:
			
 
				+            for line in f:
			
 
				+                task = json.loads(line)
			
 
				+                finished_task_ids.add(task['qid'])
			
 
				+        logger.warning(
			
 
				+            f'Output file {output_file} already exists. Loaded {len(finished_task_ids)} finished instances.'
			
 
				+        )
			
 
				+    output_fp = open(output_file, 'a')
			
 
				+    logger.info(
			
 
				+        f'Evaluation started with Agent {agent_class}, model {model_name}, max iterations {max_iterations}.'
			
 
				+    )
			
 
				+    # =============================================
			
 
				+    # filter out finished instances
			
 
				+    new_toolqa_test = []
			
 
				+    for task in toolqa_test:
			
 
				+        qid = task['qid']
			
 
				+        if qid in finished_task_ids:
			
 
				+            logger.info(f'Skipping instance {qid} as it is already finished.')
			
 
				+            continue
			
 
				+        new_toolqa_test.append(task)
			
 
				+    finished_task_number = len(finished_task_ids)
			
 
				+    toolqa_test = new_toolqa_test
			
 
				+    logger.info(
			
 
				+        f'Finished instances: {finished_task_number}, Remaining instances: {len(toolqa_test)}'
			
 
				+    )
			
 
				+
			
 
				+    # =============================================
			
 
				+    pbar = tqdm(total=len(toolqa_test))
			
 
				+
			
 
				+    # This function tracks the progress AND write the output to a JSONL file
			
 
				+    def update_progress(future):
			
 
				+        pbar.update(1)
			
 
				+        output = future.result()
			
 
				+        pbar.set_description(f'Instance {output["qid"]}')
			
 
				+        pbar.set_postfix_str(f'Test Result: {output["correct"]}')
			
 
				+        logger.info(
			
 
				+            f'Finished evaluation for instance {output["qid"]}: {output["correct"]}'
			
 
				+        )
			
 
				+        output_fp.write(json.dumps(output) + '\n')
			
 
				+        output_fp.flush()
			
 
				+        finished_task_ids.add(output['qid'])
			
 
				+
			
 
				+    # This sets the multi-processing
			
 
				+    num_workers = args.eval_num_workers
			
 
				+    logger.info(f'Using {num_workers} workers for evaluation.')
			
 
				+    try:
			
 
				+        with ProcessPoolExecutor(num_workers) as executor:
			
 
				+            futures = []
			
 
				+            # This is how we perform multi-processing
			
 
				+            for task in toolqa_test:
			
 
				+                try:
			
 
				+                    future = executor.submit(
			
 
				+                        process_instance,
			
 
				+                        task,
			
 
				+                        agent_class,
			
 
				+                        metadata,
			
 
				+                        reset_logger=bool(num_workers > 1),
			
 
				+                    )
			
 
				+                    future.add_done_callback(update_progress)
			
 
				+                    futures.append(future)
			
 
				+                except Exception:
			
 
				+                    continue
			
 
				+            # Wait for all futures to complete
			
 
				+            for future in futures:
			
 
				+                try:
			
 
				+                    future.result()
			
 
				+                except Exception:
			
 
				+                    continue
			
 
				+    except KeyboardInterrupt:
			
 
				+        logger.info('KeyboardInterrupt received. Cleaning up...')
			
 
				+        cleanup()
			
 
				+    output_fp.close()
			
 
				+    total_correct = 0
			
 
				+    output = []
			
 
				+    with open(output_file, 'r') as f:
			
 
				+        for line in f:
			
 
				+            data = json.loads(line)
			
 
				+            output.append(data)
			
 
				+            if data['qid'] in finished_task_ids:
			
 
				+                if str(data['correct']).lower() == 'true':
			
 
				+                    total_correct += 1
			
 
				+    # sort all output by question_id
			
 
				+    output = sorted(output, key=lambda x: x['qid'])
			
 
				+    with open(output_file, 'w') as f:
			
 
				+        for dat in output:
			
 
				+            f.write(json.dumps(dat) + '\n')
			
 
				+            f.flush()
			
 
				+    logger.info(
			
 
				+        f'Evaluation finished for {dataset}-{hardness}. Total: {len(toolqa_test)+finished_task_number}; Correct: {total_correct}; Accuracy: {total_correct / (len(toolqa_test)+finished_task_number)}'
			
 
				+    )
			
--- a/evaluation/toolqa/scripts/run_infer.sh
+++ b/evaluation/toolqa/scripts/run_infer.sh
@@ -0,0 +1,58 @@
 
				+#!/bin/bash
			
 
				+MODEL_CONFIG=$1
			
 
				+AGENT=$2
			
 
				+EVAL_LIMIT=$3
			
 
				+DATASET=$4
			
 
				+HARDNESS=$5
			
 
				+WOLFRAM_APPID=$6
			
 
				+
			
 
				+if [ -z "$AGENT" ]; then
			
 
				+  echo "Agent not specified, use default CodeActAgent"
			
 
				+  AGENT="CodeActAgent"
			
 
				+fi
			
 
				+
			
 
				+if [ -z "$DATASET" ]; then
			
 
				+  DATASET="flight"
			
 
				+  echo "Dataset not specified, use default $DATASET"
			
 
				+fi
			
 
				+
			
 
				+if [ -z "$HARDNESS" ]; then
			
 
				+  HARDNESS="easy"
			
 
				+  echo "Hardness not specified, use default $HARDNESS"
			
 
				+fi
			
 
				+
			
 
				+if [ -z "$WOLFRAM_APPID" ]; then
			
 
				+  WOLFRAM_APPID="YOUR_WOLFRAMALPHA_APPID"
			
 
				+  echo "WOLFRAM_APPID not specified"
			
 
				+fi
			
 
				+
			
 
				+# IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenDevin
			
 
				+# We need to track the version of Agent in the evaluation to make sure results are comparable
			
 
				+AGENT_VERSION=v$(poetry run python -c "import agenthub; from opendevin.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
			
 
				+
			
 
				+echo "AGENT: $AGENT"
			
 
				+echo "AGENT_VERSION: $AGENT_VERSION"
			
 
				+echo "MODEL_CONFIG: $MODEL_CONFIG"
			
 
				+echo "DATASET: $DATASET"
			
 
				+echo "HARDNESS: $HARDNESS"
			
 
				+echo "WOLFRAM_APPID: $WOLFRAM_APPID"
			
 
				+
			
 
				+COMMAND="poetry run python evaluation/toolqa/run_infer.py \
			
 
				+  --agent-cls $AGENT \
			
 
				+  --llm-config $MODEL_CONFIG \
			
 
				+  --max-iterations 30 \
			
 
				+  --dataset $DATASET \
			
 
				+  --hardness $HARDNESS \
			
 
				+  --wolfram_alpha_appid $WOLFRAM_APPID\
			
 
				+  --data-split validation \
			
 
				+  --max-chars 10000000 \
			
 
				+  --eval-num-workers 1 \
			
 
				+  --eval-note ${AGENT_VERSION}_${LEVELS}"
			
 
				+
			
 
				+if [ -n "$EVAL_LIMIT" ]; then
			
 
				+  echo "EVAL_LIMIT: $EVAL_LIMIT"
			
 
				+  COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
			
 
				+fi
			
 
				+
			
 
				+# Run the command
			
 
				+eval $COMMAND
			
--- a/evaluation/toolqa/utils.py
+++ b/evaluation/toolqa/utils.py
@@ -0,0 +1,112 @@
 
				+import json
			
 
				+import os
			
 
				+import re
			
 
				+import string
			
 
				+import zipfile
			
 
				+
			
 
				+import gdown
			
 
				+import requests
			
 
				+
			
 
				+
			
 
				+def download_data(dir):
			
 
				+    data_path = os.path.join(dir, 'data/external_corpus')
			
 
				+    if os.path.exists(data_path):
			
 
				+        return data_path
			
 
				+    url = 'https://drive.google.com/uc?id=1zRbHzPW2x4dDcfmphBWlan8cxUCRNmqk'
			
 
				+    zip_path = os.path.join(dir, 'data.zip')
			
 
				+    gdown.download(url, zip_path, quiet=False)
			
 
				+    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
			
 
				+        zip_ref.extractall(os.path.join(dir, 'data'))
			
 
				+    if os.path.exists(zip_path):
			
 
				+        os.remove(zip_path)
			
 
				+    return data_path
			
 
				+
			
 
				+
			
 
				+def download_tools(dir, wolfram_alpha_appid='YOUR_WOLFRAMALPHA_APPID'):
			
 
				+    tool_path = os.path.join(dir, 'tools')
			
 
				+    if os.path.exists(tool_path):
			
 
				+        return tool_path
			
 
				+    os.mkdir(tool_path)
			
 
				+    tools = [
			
 
				+        'code/sql_interpreter.py',
			
 
				+        'graph/graphtools.py',
			
 
				+        'math/calculator.py',
			
 
				+        'table/mysql_db_create.py',
			
 
				+        'table/tabtools.py',
			
 
				+        'text/agenda_retriever.py',
			
 
				+        'text/scirex_retriever.py',
			
 
				+    ]
			
 
				+    for tool in tools:
			
 
				+        url = f'https://raw.githubusercontent.com/night-chen/ToolQA/main/benchmark/ReAct/code/tools/{tool}'
			
 
				+        response = requests.get(url)
			
 
				+        output_file = os.path.join(tool_path, tool.split('/')[1])
			
 
				+        with open(output_file, 'wb') as f:
			
 
				+            f.write(response.content)
			
 
				+    with open(os.path.join(tool_path, 'calculator.py'), 'r') as f:
			
 
				+        content = f.read()
			
 
				+    new_content = content.replace('YOUR_WOLFRAMALPHA_APPID', wolfram_alpha_appid)
			
 
				+    with open(os.path.join(tool_path, 'calculator.py'), 'w') as f:
			
 
				+        f.write(new_content)
			
 
				+    with open(os.path.join(tool_path, 'agenda_retriever.py'), 'r') as f:
			
 
				+        content = f.read()
			
 
				+    new_content = content.replace('/<YOUR_OWN_PATH>/ToolQA/', '')
			
 
				+    with open(os.path.join(tool_path, 'agenda_retriever.py'), 'w') as f:
			
 
				+        f.write(new_content)
			
 
				+    with open(os.path.join(tool_path, 'mysql_db_create.py'), 'r') as f:
			
 
				+        content = f.read()
			
 
				+    new_content = content.replace('/<YOUR_OWN_PATH>/ToolQA/', '')
			
 
				+    with open(os.path.join(tool_path, 'mysql_db_create.py'), 'w') as f:
			
 
				+        f.write(new_content)
			
 
				+    with open(os.path.join(tool_path, 'scirex_retriever.py'), 'r') as f:
			
 
				+        content = f.read()
			
 
				+    new_content = content.replace('/<YOUR_OWN_PATH>/ToolQA/', '')
			
 
				+    with open(os.path.join(tool_path, 'scirex_retriever.py'), 'w') as f:
			
 
				+        f.write(new_content)
			
 
				+
			
 
				+
			
 
				+def get_data(dataset, hardness):
			
 
				+    data = []
			
 
				+    url = f'https://raw.githubusercontent.com/night-chen/ToolQA/main/data/questions/{hardness}/{dataset}-{hardness}.jsonl'
			
 
				+    url = requests.get(url)
			
 
				+    if url.status_code == 200:
			
 
				+        lines = url.text.splitlines()
			
 
				+        for line in lines:
			
 
				+            data.append(json.loads(line))
			
 
				+    return data
			
 
				+
			
 
				+
			
 
				+REACT_INSTRUCTION = """Use tools in the tools directory to solve the task: {question}
			
 
				+You could use all tools which are under the tools/ directory and all the data under the data/ directory.
			
 
				+When you think you finished the task, respond with `Finish[answer]` where you include your answer in `[]`.
			
 
				+IMPORTANT: Make sure that in your final answer, you should not print any additional text/instructions other than the actual answer, which should be a word or a simple phrase.
			
 
				+"""
			
 
				+
			
 
				+
			
 
				+def encode_question(question):
			
 
				+    return REACT_INSTRUCTION.format(question=question)
			
 
				+
			
 
				+
			
 
				+# imported from https://github.com/night-chen/ToolQA/tree/main/benchmark/ReAct/code/agents_chatgpt.py
			
 
				+def normalize_answer(s):
			
 
				+    def remove_articles(text):
			
 
				+        return re.sub(r'\b(a|an|the|usd)\b', ' ', text)
			
 
				+
			
 
				+    def white_space_fix(text):
			
 
				+        return ' '.join(text.split())
			
 
				+
			
 
				+    def remove_punc(text):
			
 
				+        exclude = set(string.punctuation)
			
 
				+        return ''.join(ch for ch in text if ch not in exclude)
			
 
				+
			
 
				+    def lower(text):
			
 
				+        return text.lower()
			
 
				+
			
 
				+    return white_space_fix(remove_articles(remove_punc(lower(s))))
			
 
				+
			
 
				+
			
 
				+def eval_answer(pred, answer):
			
 
				+    pattern = r'Finish\[(.*?)\]'
			
 
				+    match = re.search(pattern, pred)
			
 
				+    if match:
			
 
				+        pred = match.group(1)
			
 
				+    return normalize_answer(pred) == normalize_answer(answer)