1 жил өмнө · 7b6ae3638e
--- a/evaluation/swe_bench/BUILD_TESTBED_AND_ENV.md
+++ b/evaluation/swe_bench/BUILD_TESTBED_AND_ENV.md
@@ -1,39 +0,0 @@
 
				-# Pre-build Testbed and Env
			
 
				-
			
 
				-In the original SWE-Bench implementation, conda environment for evaluation is typically installed from scratch while evaluating on a particular instance. This poses several challenges:
			
 
				-
			
 
				-- Efficiency: most of the evaluation time is wasted on downloading packages
			
 
				-- Stability: setup could fail due to bad internet connectivity
			
 
				-- Reliability: it is possible that an instance is considered failed not because the agent did badly, but because the environment setup failed.
			
 
				-
			
 
				-In OpenDevin-SWE-Bench fork, we try to pre-build the **testbed** (i.e., code of the repository we want the agent to edit) AND the **conda environment**, so that in evaluation (inference) time, we can directly leverage existing environments for efficient evaluation.
			
 
				-
			
 
				-NOTE: We only support SWE-Bench lite for now. But modifying our existing scripts for full SWE-Bench should be quite straightforward.
			
 
				-
			
 
				-## How to pre-build your testbed
			
 
				-
			
 
				-### Setup Eval Workspace (Util + Data)
			
 
				-
			
 
				-Setup your eval workspace by:
			
 
				-1. Clone OpenDevin SWE-Bench [fork](https://github.com/OpenDevin/OD-SWE-bench.git)
			
 
				-2. Prepare SWE-Bench data
			
 
				-
			
 
				-Run the following command to do the above two steps. The results will be saved to `evaluation/swe_bench/eval_workspace`.
			
 
				-
			
 
				-```bash
			
 
				-./evaluation/swe_bench/scripts/setup/prepare_swe_utils.sh
			
 
				-```
			
 
				-
			
 
				-### Pre-build Conda Env and Test Bed
			
 
				-
			
 
				-```bash
			
 
				-./evaluation/swe_bench/scripts/setup/swe_env_setup.sh
			
 
				-```
			
 
				-
			
 
				-### Build the pre-build conda env and testbed into ONE docker image
			
 
				-
			
 
				-```bash
			
 
				-pushd evaluation/swe_bench
			
 
				-docker build -t ghcr.io/opendevin/eval-swe-bench:full-v1.2.1 -f ./scripts/docker/Dockerfile.full.v1.2.1 .
			
 
				-docker push ghcr.io/opendevin/eval-swe-bench:full-v1.2.1
			
 
				-```
			
--- a/evaluation/swe_bench/scripts/docker/Dockerfile.builder
+++ b/evaluation/swe_bench/scripts/docker/Dockerfile.builder
@@ -1,17 +0,0 @@
 
				# Base "builder" image for SWE-Bench evaluation: the OpenDevin sandbox plus
				# system packages, a bash-backed /bin/sh, a log directory and a git identity.
				#
				# Build:
				#   pushd evaluation/swe_bench
				#   docker build -t ghcr.io/opendevin/eval-swe-bench:builder -f ./scripts/docker/Dockerfile.builder .
				FROM ghcr.io/opendevin/sandbox:main

				# Install the system packages and drop the apt caches in the same layer.
				RUN apt-get update \
				    && apt-get install -y \
				        bash \
				        gcc \
				        git \
				        jq \
				        libffi-dev \
				        libfreetype-dev \
				        libfreetype6 \
				        libfreetype6-dev \
				        pkg-config \
				        rsync \
				        wget \
				    && apt-get clean \
				    && rm -rf /var/lib/apt/lists/*

				# Make /bin/sh resolve to bash.
				RUN ln -sfn /bin/bash /bin/sh

				# Log directory writable by any user inside the container.
				RUN mkdir -p /opendevin/logs && chmod 777 /opendevin/logs

				# Setup Git: global identity for commits made inside the container.
				RUN git config --global user.email "swebench@swebench.ai" \
				    && git config --global user.name "swebench"

				CMD ["/bin/bash"]
			
--- a/evaluation/swe_bench/scripts/docker/Dockerfile.builder_with_conda
+++ b/evaluation/swe_bench/scripts/docker/Dockerfile.builder_with_conda
@@ -1,19 +0,0 @@
 
				# Builder image with Miniforge and the `swe-bench-eval` conda environment
				# pre-installed.
				#
				# Build:
				#   pushd evaluation/swe_bench
				#   docker build -t ghcr.io/opendevin/eval-swe-bench:builder_with_conda -f ./scripts/docker/Dockerfile.builder_with_conda .
				FROM ghcr.io/opendevin/eval-swe-bench:builder

				# Install Mamba/Conda (Miniforge) to /swe_util/miniforge3.
				# Download, install and delete the installer in ONE layer so the installer
				# script does not stay in the image.
				RUN mkdir -p /swe_util \
				    && wget "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh" \
				    && bash "Miniforge3-$(uname)-$(uname -m).sh" -b -p /swe_util/miniforge3 \
				    && rm "Miniforge3-$(uname)-$(uname -m).sh"

				# `RUN export PATH=...` only affects the shell of that single RUN step; ENV is
				# what persists into later build steps and into running containers.
				ENV PATH=/swe_util/miniforge3/bin:$PATH
				RUN /swe_util/miniforge3/bin/mamba init bash

				# Setup SWE-Bench Eval Env
				RUN /bin/bash -c "/swe_util/miniforge3/bin/mamba create -n swe-bench-eval python==3.11.5 -y"
				RUN /bin/bash -c ". /swe_util/miniforge3/etc/profile.d/conda.sh && conda activate swe-bench-eval && \
				pip install requests python-dotenv GitPython datasets pandas beautifulsoup4 ghapi"
				RUN /bin/bash -c ". /swe_util/miniforge3/etc/profile.d/conda.sh && conda config --set changeps1 False && conda config --append channels conda-forge"

				CMD ["/bin/bash"]
			
--- a/evaluation/swe_bench/scripts/docker/Dockerfile.full.v1.1
+++ b/evaluation/swe_bench/scripts/docker/Dockerfile.full.v1.1
@@ -1,13 +0,0 @@
 
				# Adds the small files of the evaluation workspace on top of the `full_deps`
				# image.
				#
				# Build:
				#   pushd evaluation/swe_bench
				#   docker build -t ghcr.io/opendevin/eval-swe-bench:full-v1.1 -f ./scripts/docker/Dockerfile.full.v1.1 .
				FROM ghcr.io/opendevin/eval-swe-bench:full_deps

				# ================== COPY Smaller things ==================
				# Bind-mount the host's ./eval_workspace for this step only and rsync its
				# contents into /swe_util, leaving out the `eval_data` and `miniforge3`
				# folders. What remains is typically the OD codebase.
				RUN --mount=type=bind,source=./eval_workspace,target=/eval_workspace \
				    rsync --archive --recursive --progress \
				        --exclude='eval_data' \
				        --exclude='miniforge3' \
				        /eval_workspace/ /swe_util/
			
--- a/evaluation/swe_bench/scripts/docker/Dockerfile.full.v1.2
+++ b/evaluation/swe_bench/scripts/docker/Dockerfile.full.v1.2
@@ -1,12 +0,0 @@
 
				# Adds the agent-side Python tooling on top of the full-v1.1 image.
				#
				# Build:
				#   docker build -t ghcr.io/opendevin/eval-swe-bench:full-v1.2 -f ./scripts/docker/Dockerfile.full.v1.2 .
				FROM ghcr.io/opendevin/eval-swe-bench:full-v1.1

				# System OpenGL library (presumably needed by opencv-python below - TODO confirm).
				RUN apt-get update && apt-get install -y \
				    libgl1-mesa-glx \
				    && rm -rf /var/lib/apt/lists/*

				# install basic dependencies for CodeActAgent
				# --no-cache-dir keeps pip's download/wheel cache out of the image layers.
				RUN pip3 install --no-cache-dir --upgrade pip
				RUN pip3 install --no-cache-dir jupyterlab notebook jupyter_kernel_gateway flake8
				# TODO: those dependencies are needed for agentskills, we should pack them in a new sandbox image
				RUN pip3 install --no-cache-dir python-docx PyPDF2 python-pptx pylatexenc openai opencv-python
			
--- a/evaluation/swe_bench/scripts/docker/Dockerfile.full.v1.2.1
+++ b/evaluation/swe_bench/scripts/docker/Dockerfile.full.v1.2.1
@@ -1,10 +0,0 @@
 
				# Refreshes the OD-SWE-bench checkout inside the full-v1.2 image.
				#
				# Build:
				#   pushd evaluation/swe_bench
				#   docker build -t ghcr.io/opendevin/eval-swe-bench:full-v1.2.1 -f ./scripts/docker/Dockerfile.full.v1.2.1 .
				FROM ghcr.io/opendevin/eval-swe-bench:full-v1.2

				# ================== Update OD-SWE-Bench ==================
				# Bind-mount the host's ./eval_workspace for this step only and rsync the
				# contents of its OD-SWE-bench folder into /swe_util/OD-SWE-bench.
				RUN --mount=type=bind,source=./eval_workspace,target=/eval_workspace \
				    rsync --archive --recursive --progress \
				        /eval_workspace/OD-SWE-bench/ /swe_util/OD-SWE-bench
			
--- a/evaluation/swe_bench/scripts/docker/Dockerfile.full_deps
+++ b/evaluation/swe_bench/scripts/docker/Dockerfile.full_deps
@@ -1,72 +0,0 @@
 
				# Image holding the pre-built SWE-Bench evaluation assets under /swe_util.
				# YOU SHOULD ENSURE ./eval_workspace CONTAINS THE EVALUATION WORKSPACE
				# (testbed, conda). Check BUILD_TESTBED_AND_ENV.md for more details.
				#
				# Files are pulled in through a per-step bind mount plus rsync instead of COPY
				# so that --exclude can be used, see
				# https://github.com/moby/moby/issues/15771#issuecomment-1762893340
				#
				# Build:
				#   pushd evaluation/swe_bench
				#   docker build -t ghcr.io/opendevin/eval-swe-bench:full_deps -f ./scripts/docker/Dockerfile.full_deps .
				FROM ghcr.io/opendevin/eval-swe-bench:builder

				RUN mkdir -p /swe_util

				# ================== Prepare Eval Data ==================
				# Everything in eval_data except the "testbeds" folder.
				RUN --mount=type=bind,source=./eval_workspace,target=/eval_workspace \
				    rsync --archive --recursive --progress \
				        --exclude='testbeds' \
				        /eval_workspace/eval_data /swe_util/

				# The testbeds, minus the matplotlib and scikit-learn ones.
				RUN --mount=type=bind,source=./eval_workspace,target=/eval_workspace \
				    rsync --archive --recursive --progress \
				        --exclude='matplotlib*' \
				        --exclude='scikit-learn*' \
				        /eval_workspace/eval_data/testbeds /swe_util/eval_data/

				# The larger testbeds each get a layer of their own: matplotlib ...
				RUN --mount=type=bind,source=./eval_workspace,target=/eval_workspace \
				    rsync --archive --recursive --progress \
				        /eval_workspace/eval_data/testbeds/matplotlib* /swe_util/eval_data/testbeds/

				# ... and scikit-learn.
				RUN --mount=type=bind,source=./eval_workspace,target=/eval_workspace \
				    rsync --archive --recursive --progress \
				        /eval_workspace/eval_data/testbeds/scikit-learn* /swe_util/eval_data/testbeds/

				# ================== Prepare Miniforge3 ==================
				# The conda installation without its two large folders, `envs` and `pkgs`.
				RUN --mount=type=bind,source=./eval_workspace,target=/eval_workspace \
				    rsync --archive --recursive --progress \
				        --exclude='envs' \
				        --exclude='pkgs' \
				        /eval_workspace/miniforge3 /swe_util/

				# `pkgs` in a separate layer (~9.4GB).
				RUN --mount=type=bind,source=./eval_workspace,target=/eval_workspace \
				    rsync --archive --recursive --progress \
				        /eval_workspace/miniforge3/pkgs /swe_util/miniforge3/

				# `envs` in a separate layer, minus the matplotlib, scikit-learn and pydata
				# environments, which are copied individually below.
				RUN --mount=type=bind,source=./eval_workspace,target=/eval_workspace \
				    rsync --archive --recursive --progress \
				        --exclude='matplotlib*' \
				        --exclude='scikit-learn*' \
				        --exclude='pydata*' \
				        /eval_workspace/miniforge3/envs /swe_util/miniforge3/

				# The excluded environments, one layer each.
				RUN --mount=type=bind,source=./eval_workspace,target=/eval_workspace \
				    rsync --archive --recursive --progress \
				        /eval_workspace/miniforge3/envs/matplotlib* /swe_util/miniforge3/envs/

				RUN --mount=type=bind,source=./eval_workspace,target=/eval_workspace \
				    rsync --archive --recursive --progress \
				        /eval_workspace/miniforge3/envs/scikit-learn* /swe_util/miniforge3/envs/

				RUN --mount=type=bind,source=./eval_workspace,target=/eval_workspace \
				    rsync --archive --recursive --progress \
				        /eval_workspace/miniforge3/envs/pydata* /swe_util/miniforge3/envs/
			
--- a/evaluation/swe_bench/scripts/docker/README.md
+++ b/evaluation/swe_bench/scripts/docker/README.md
@@ -1,13 +0,0 @@
 
				-# Docker Build Guide
			
 
				-
			
 
				-## Builder
			
 
				-
			
 
				-This builds the Docker image used by `evaluation/swe_bench/scripts/setup/prepare_swe_utils.sh`, which downloads the datasets.
			
 
				-
			
 
				-```bash
			
 
				-pushd evaluation/swe_bench
			
 
				-# This builds base image with basic dependencies
			
 
				-docker build -t ghcr.io/opendevin/eval-swe-bench:builder -f ./scripts/docker/Dockerfile.builder .
			
 
				-# This builds image with SWE-Bench conda environment pre-installed
			
 
				-docker build -t ghcr.io/opendevin/eval-swe-bench:builder_with_conda -f ./scripts/docker/Dockerfile.builder_with_conda .
			
 
				-```
			
--- a/evaluation/swe_bench/scripts/eval/download_swe_bench_data.py
+++ b/evaluation/swe_bench/scripts/eval/download_swe_bench_data.py
@@ -1,34 +0,0 @@
 
				-import argparse
			
 
				-import json
			
 
				-
			
 
				-import pandas as pd
			
 
				-from datasets import load_dataset
			
 
				-
			
 
				# Download the SWE-bench and SWE-bench_Lite datasets and write their splits as
				# JSON record files into `output_dir`.
				parser = argparse.ArgumentParser()
				parser.add_argument(
				    'output_dir',
				    type=str,
				    # A positional argument only honours `default` when it is optional;
				    # without nargs='?' argparse requires the argument and ignores the default.
				    nargs='?',
				    default='eval_data/instances',
				    help='Path to the directory to save the instances.',
				)
				args = parser.parse_args()


				def _export_split(split, file_name):
				    """Decode the JSON-encoded test-list columns of `split` and save it.

				    `split` is one split of a loaded dataset. Its FAIL_TO_PASS and PASS_TO_PASS
				    columns are parsed with `json.loads`, and the frame is written to
				    `<output_dir>/<file_name>` as a list of records. Returns the DataFrame.
				    """
				    frame = split.to_pandas()
				    for column in ('FAIL_TO_PASS', 'PASS_TO_PASS'):
				        frame[column] = frame[column].apply(json.loads)
				    frame.to_json(f'{args.output_dir}/{file_name}', orient='records')
				    return frame


				# Full SWE-bench: test split only.
				dataset = load_dataset('princeton-nlp/SWE-bench')
				_export_split(dataset['test'], 'swe-bench-test.json')

				# SWE-bench Lite: test and dev splits, plus both concatenated.
				dataset = load_dataset('princeton-nlp/SWE-bench_Lite')
				test = _export_split(dataset['test'], 'swe-bench-lite-test.json')
				dev = _export_split(dataset['dev'], 'swe-bench-lite-dev.json')

				all_data = pd.concat([test, dev])
				all_data.to_json(f'{args.output_dir}/swe-bench-lite-all.json', orient='records')
			
--- a/evaluation/swe_bench/scripts/setup/_swe_env_setup.sh
+++ b/evaluation/swe_bench/scripts/setup/_swe_env_setup.sh
@@ -1,81 +0,0 @@
 
				-#!/bin/bash
			
 
				-# THIS SCRIPT ONLY NEED TO BE RUN ONCE BEFORE EVALUATION
			
 
				-set -e
			
 
				-
			
 
				# Prepare conda, the swe-bench-eval environment and the testbeds for every
				# instance listed in the given instance file.
				#
				# Arguments:
				#   $1 - file name (not path) of the instance file inside
				#        /swe_util/eval_data/instances
				# Exits with status 1 when not run as user 'opendevin' or when any log in
				# /swe_util/eval_data/testbed_logs lacks the "Init Succeeded" marker.
				function setup_environment_and_testbed {
				    local instance_file_name=$1

				    # throw error if user name is not opendevin
				    if [ "$USER" != "opendevin" ]; then
				        echo "Error: This script is intended to be run by the 'opendevin' user only." >&2
				        exit 1
				    fi

				    # =======================================================
				    # Install & Setup Conda

				    # install Miniforge only if /swe_util does NOT already have it
				    if [ ! -d /swe_util/miniforge3 ]; then
				        pushd /swe_util
				        echo "Downloading and installing Miniforge3"
				        wget "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh"
				        bash "Miniforge3-$(uname)-$(uname -m).sh" -b -p /swe_util/miniforge3
				        # restore the caller's working directory (balances the pushd above)
				        popd
				    fi

				    echo 'export PATH=/swe_util/miniforge3/bin:$PATH' >> ~/.bashrc
				    # load conda's shell functions into this non-interactive shell
				    eval "$(/swe_util/miniforge3/bin/conda shell.bash hook)"
				    conda init bash
				    source ~/.bashrc
				    conda config --set changeps1 False
				    conda config --append channels conda-forge

				    # =======================================================
				    # Install swe-bench-eval environment if it does not exist
				    ENV_EXISTS=$(conda info --envs | awk '/swe-bench-eval/ {print $1}')
				    echo "ENV_EXISTS: $ENV_EXISTS"
				    if [ -z "$ENV_EXISTS" ]; then
				        echo "Environment swe-bench-eval does not exist. Creating the environment."
				        conda create -n swe-bench-eval python==3.11.5 -y
				        conda activate swe-bench-eval
				        pip install requests python-dotenv GitPython datasets pandas beautifulsoup4 ghapi
				    fi
				    conda activate swe-bench-eval
				    echo 'swe-bench-eval environment is ready.'

				    # =======================================================
				    # Resolve the instance file (swe-bench-test-lite.json / swe-bench-test.json)
				    INSTANCE_DATA_FILE=/swe_util/eval_data/instances/$instance_file_name
				    echo "Instance data file loaded: $INSTANCE_DATA_FILE"

				    # =======================================================
				    # generate testbed & conda environment for ALL instances in the test file
				    echo "Generating testbed & conda environment for all instances in the test file"
				    export PYTHONPATH=/swe_util/OD-SWE-bench:$PYTHONPATH
				    python3 /swe_util/OD-SWE-bench/swebench/harness/engine_testbed.py \
				        --instances_path "$INSTANCE_DATA_FILE" \
				        --log_dir /swe_util/eval_data/testbed_logs \
				        --conda_path /swe_util/miniforge3 \
				        --testbed /swe_util/eval_data/testbeds \
				        --timeout 1000

				    # Check every log in /swe_util/eval_data/testbed_logs to see if it contains
				    # "Init Succeeded". If not, print the log file name and exit.
				    for log_file in /swe_util/eval_data/testbed_logs/*; do
				        # quoted so log names containing spaces are checked as one file
				        if ! grep -q "Init Succeeded" "$log_file"; then
				            echo "Error: $log_file does not contain 'Init Succeeded'"
				            exit 1
				        fi
				    done
				    echo "All logs contain 'Init Succeeded'. Testbed & conda environment setup is successful."
				}
			
 
				-
			
 
				# Accept only the two known instance file names; anything else is rejected
				# before any setup work starts.
				case "$1" in
				    swe-bench-test-lite.json | swe-bench-test.json)
				        ;;
				    *)
				        echo "Error: Invalid input file name. Please provide either swe-bench-test-lite.json or swe-bench-test.json"
				        exit 1
				        ;;
				esac

				# Run the setup for the validated instance file.
				echo "Calling setup_environment_and_testbed with $1"
				setup_environment_and_testbed "$1"
			
--- a/evaluation/swe_bench/scripts/setup/get_agent_report.sh
+++ b/evaluation/swe_bench/scripts/setup/get_agent_report.sh
@@ -1,86 +0,0 @@
 
				#!/bin/bash
				# Convert an agent output file to the SWE-Bench prediction format, run the
				# SWE-Bench evaluation on it, build the model report and optionally merge the
				# fine-grained report back into the agent output.
				#
				# Options:
				#   --output-file FILE      agent output file (required)
				#   --agent-name NAME       agent name used as the model name (required)
				#   --dataset NAME          instance file name without ".json" (required)
				#   --num-processes N       evaluation processes (default: 15)
				#   --experiment-name NAME  defaults to "<parent dir>_<dir>" of the output file
				#   --merge-report          merge the fine-grained report into the agent output
				# Required environment: EVAL_DATA_DIR, OD_SWE_BENCH, MINICONDA3.

				# Initialize variables
				output_file=""
				agent_name=""
				dataset=""
				num_processes=15
				experiment_name=""
				merge_report=false

				# Parse command-line arguments
				while [[ "$#" -gt 0 ]]; do
				    case $1 in
				        --output-file) output_file="$2"; shift ;;
				        --agent-name) agent_name="$2"; shift ;;
				        --dataset) dataset="$2"; shift ;;
				        --num-processes) num_processes="$2"; shift ;;
				        --experiment-name) experiment_name="$2"; shift ;;
				        --merge-report) merge_report=true ;;
				        *) echo "Unknown parameter passed: $1"; exit 1 ;;
				    esac
				    shift
				done

				# Check if arguments are provided
				if [[ -z "$output_file" || -z "$agent_name" || -z "$dataset" ]]; then
				    echo "output-file, agent-name and dataset are required!"
				    exit 1
				fi
				echo "output file: $output_file"
				echo "agent name: $agent_name"
				echo "dataset: $dataset"
				echo "num processes: $num_processes"
				if [ -n "$experiment_name" ]
				then
				    echo "use provided experiment name: $experiment_name"
				else
				    # derive the name from the two directories that contain the output file
				    current_folder=$(basename "$(dirname "$output_file")")
				    parent_folder=$(basename "$(dirname "$(dirname "$output_file")")")
				    experiment_name="${parent_folder}_${current_folder}"
				    echo "use generated experiment name: $experiment_name"
				fi

				# Convert the agent output to the SWE-Bench format
				if [ -z "$EVAL_DATA_DIR" ]; then
				    echo "EVAL_DATA_DIR is not set."
				    exit 1
				fi
				target_file="${EVAL_DATA_DIR}/outputs/${experiment_name}_${dataset}.json"
				python process_output_json_file.py "$output_file" "$agent_name" "$target_file"

				# Run the evaluation script
				if [ -z "$OD_SWE_BENCH" ]; then
				    echo "OD_SWE_BENCH is not set."
				    exit 1
				fi
				if [ -z "$MINICONDA3" ]; then
				    echo "MINICONDA3 is not set."
				    exit 1
				fi
				mkdir -p "$EVAL_DATA_DIR/eval_logs/$experiment_name"
				# --num_processes now honours the --num-processes option instead of a fixed 15
				export PYTHONPATH=$OD_SWE_BENCH && cd "$OD_SWE_BENCH" && . "$MINICONDA3/etc/profile.d/conda.sh" && conda activate "$MINICONDA3/envs/swe-bench-eval" && python swebench/harness/run_evaluation.py \
				    --swe_bench_tasks "$EVAL_DATA_DIR/instances/$dataset.json" \
				    --temp_dir "$EVAL_DATA_DIR/eval_temp" \
				    --testbed "$EVAL_DATA_DIR/testbeds" \
				    --conda_path "$MINICONDA3" \
				    --predictions_path "$target_file" \
				    --log_dir "$EVAL_DATA_DIR/eval_logs/$experiment_name" \
				    --num_processes "$num_processes" \
				    --skip_existing \
				    --timeout 1600 \
				    --verbose

				# Get the report
				cp "$target_file" "$EVAL_DATA_DIR/eval_logs/$experiment_name"
				export PYTHONPATH=$OD_SWE_BENCH && cd "$OD_SWE_BENCH" && . "$MINICONDA3/etc/profile.d/conda.sh" && conda activate "$MINICONDA3/envs/swe-bench-eval" && python swebench/metrics/get_model_report.py \
				    --model "$agent_name" \
				    --swe_bench_tasks "$EVAL_DATA_DIR/instances/$dataset.json" \
				    --predictions_path "$EVAL_DATA_DIR/eval_logs/$experiment_name/${experiment_name}_${dataset}.json" \
				    --log_dir "$EVAL_DATA_DIR/eval_logs/$experiment_name/$agent_name"

				# Merge report to the agent output
				if [ "$merge_report" = true ]; then
				    cd /swe_util && python merge_fine_grained_report.py --od_output_file "$output_file" \
				    --fine_grained_report_file "$EVAL_DATA_DIR/eval_logs/$experiment_name/${experiment_name}_${dataset}.report.json"
				fi
			
--- a/evaluation/swe_bench/scripts/setup/get_model_report.sh
+++ b/evaluation/swe_bench/scripts/setup/get_model_report.sh
@@ -1,61 +0,0 @@
 
				#!/bin/bash
				# Run the SWE-Bench evaluation on a predictions file and build the model report.
				#
				# Options:
				#   --output-file FILE      predictions file to evaluate (required)
				#   --model-name NAME       model name (required)
				#   --dataset NAME          instance file name without ".json" (required)
				#   --num-processes N       evaluation processes (default: 15)
				#   --experiment-name NAME  defaults to "<model-name>__<dataset>"
				# Required environment: EVAL_DATA_DIR, OD_SWE_BENCH, MINICONDA3.

				# Input arguments
				output_file=""
				model_name=""
				dataset=""
				num_processes=15
				experiment_name=""

				# Parse command-line arguments
				while [[ "$#" -gt 0 ]]; do
				    case $1 in
				        --output-file) output_file="$2"; shift ;;
				        --model-name) model_name="$2"; shift ;;
				        --dataset) dataset="$2"; shift ;;
				        --num-processes) num_processes="$2"; shift ;;
				        --experiment-name) experiment_name="$2"; shift ;;
				        *) echo "Unknown parameter passed: $1"; exit 1 ;;
				    esac
				    shift
				done

				# Check if arguments are provided
				if [[ -z "$output_file" || -z "$model_name" || -z "$dataset" ]]; then
				    echo "output-file, model-name and dataset are required!"
				    exit 1
				fi
				echo "output file: $output_file"
				echo "model name: $model_name"
				echo "dataset: $dataset"
				echo "num processes: $num_processes"
				if [ -n "$experiment_name" ]
				then
				    echo "use provided experiment name: $experiment_name"
				else
				    experiment_name=${model_name}__${dataset}
				    echo "use generated experiment name: $experiment_name"
				fi

				# Refuse to run with an unset environment: an empty EVAL_DATA_DIR would make
				# the mkdir below create /eval_logs at the filesystem root.
				if [ -z "$EVAL_DATA_DIR" ]; then
				    echo "EVAL_DATA_DIR is not set."
				    exit 1
				fi
				if [ -z "$OD_SWE_BENCH" ]; then
				    echo "OD_SWE_BENCH is not set."
				    exit 1
				fi
				if [ -z "$MINICONDA3" ]; then
				    echo "MINICONDA3 is not set."
				    exit 1
				fi

				# Run the evaluation script
				mkdir -p "$EVAL_DATA_DIR/eval_logs/$experiment_name"
				export PYTHONPATH=$OD_SWE_BENCH && cd "$OD_SWE_BENCH" && . "$MINICONDA3/etc/profile.d/conda.sh" && conda activate "$MINICONDA3/envs/swe-bench-eval" && python swebench/harness/run_evaluation.py \
				    --swe_bench_tasks "$EVAL_DATA_DIR/instances/$dataset.json" \
				    --temp_dir "$EVAL_DATA_DIR/eval_temp" \
				    --testbed "$EVAL_DATA_DIR/testbeds" \
				    --conda_path "$MINICONDA3" \
				    --predictions_path "$output_file" \
				    --log_dir "$EVAL_DATA_DIR/eval_logs/$experiment_name" \
				    --num_processes "$num_processes" \
				    --skip_existing \
				    --timeout 1600 \
				    --verbose

				# Get the report
				predictions_fname=$(basename "$output_file")
				cp "$output_file" "$EVAL_DATA_DIR/eval_logs/$experiment_name"
				export PYTHONPATH=$OD_SWE_BENCH && cd "$OD_SWE_BENCH" && . "$MINICONDA3/etc/profile.d/conda.sh" && conda activate "$MINICONDA3/envs/swe-bench-eval" && python swebench/metrics/get_model_report.py \
				    --model "$model_name" \
				    --swe_bench_tasks "$EVAL_DATA_DIR/instances/$dataset.json" \
				    --predictions_path "$EVAL_DATA_DIR/eval_logs/$experiment_name/$predictions_fname" \
				    --log_dir "$EVAL_DATA_DIR/eval_logs/$experiment_name/$model_name"
			
--- a/evaluation/swe_bench/scripts/setup/merge_fine_grained_report.py
+++ b/evaluation/swe_bench/scripts/setup/merge_fine_grained_report.py
@@ -1,29 +0,0 @@
 
				-import argparse
			
 
				-import json
			
 
				-
			
 
				-
			
 
				def merge_fine_grained_report(od_output_file, fine_grained_report_file):
				    """Attach the per-instance fine-grained report to each agent output record.

				    Args:
				        od_output_file: Path to the agent output, one JSON object per line,
				            each carrying an 'instance_id' key.
				        fine_grained_report_file: Path to a JSON file mapping instance id to
				            its report.

				    Writes the merged records as JSON lines next to the input: a trailing
				    '.jsonl' becomes '.merged.jsonl'; any other name gets '.merged.jsonl'
				    appended, so the input file is never overwritten.

				    Raises:
				        KeyError: if a record's instance id is missing from the report.
				    """
				    suffix = '.jsonl'
				    # Only rewrite the trailing extension; str.replace would also touch
				    # '.jsonl' occurring inside directory names.
				    if od_output_file.endswith(suffix):
				        stem = od_output_file[: -len(suffix)]
				    else:
				        stem = od_output_file
				    merged_od_output_file = stem + '.merged.jsonl'

				    with open(fine_grained_report_file) as report_handle:
				        fine_grained_report = json.load(report_handle)

				    merged_report = []
				    with open(od_output_file) as output_handle:
				        for raw_line in output_handle:
				            if not raw_line.strip():
				                # blank lines carry no record
				                continue
				            record = json.loads(raw_line)
				            record['fine_grained_report'] = fine_grained_report[
				                record['instance_id']
				            ]
				            merged_report.append(record)

				    # dump the merged report as a jsonl file
				    with open(merged_od_output_file, 'w') as f:
				        for record in merged_report:
				            f.write(json.dumps(record) + '\n')
				    print(f'Agent output with report merged created at {merged_od_output_file}')
			
 
				-
			
 
				-
			
 
				if __name__ == '__main__':
				    # Command-line entry point: both file paths are passed as options.
				    parser = argparse.ArgumentParser()
				    parser.add_argument(
				        '--od_output_file',
				        help='Path to the OD output file',
				    )
				    parser.add_argument(
				        '--fine_grained_report_file',
				        help='Path to the fine grained report file',
				    )
				    args = parser.parse_args()

				    merge_fine_grained_report(
				        od_output_file=args.od_output_file,
				        fine_grained_report_file=args.fine_grained_report_file,
				    )
			
--- a/evaluation/swe_bench/scripts/setup/process_output_json_file.py
+++ b/evaluation/swe_bench/scripts/setup/process_output_json_file.py
@@ -1,35 +0,0 @@
 
				-import json
			
 
				-import sys
			
 
				-
			
 
				-
			
 
				def process_jsonl(input_file, model_name, output_file):
				    """Convert agent output (JSON lines) into a SWE-Bench prediction list.

				    Args:
				        input_file: Path to a JSON-lines file; every non-empty line must
				            contain 'instance_id' and 'git_patch'.
				        model_name: Value stored as 'model_name_or_path' in every record.
				        output_file: Path of the JSON file (a list of records) to write.

				    Returns:
				        True when the output file was written, False when an error occurred.
				        Errors are printed, not raised.
				    """
				    try:
				        # Read and convert everything BEFORE opening the output for writing,
				        # so a bad input cannot truncate an existing output file.
				        data = []
				        with open(input_file, 'r') as infile:
				            for line in infile:
				                if line.strip():  # Ensure the line is not empty
				                    json_obj = json.loads(line)
				                    # Keep only the required fields and add the model name
				                    data.append(
				                        {
				                            'instance_id': json_obj['instance_id'],
				                            'model_patch': json_obj['git_patch'],
				                            'model_name_or_path': model_name,
				                        }
				                    )
				        with open(output_file, 'w') as outfile:
				            # Write the list of JSON objects to a file
				            json.dump(data, outfile, indent=2)
				        print(f'Output JSON list created at {output_file}')
				        return True
				    except Exception as e:
				        print(f'Error: {e}')
				        return False
			
 
				-
			
 
				-
			
 
				# Usage: python script.py input.jsonl model_name output.json
				if __name__ == '__main__':
				    if len(sys.argv) != 4:
				        print('Usage: python script.py <input_file> <model_name> <output_file>')
				        # a usage error must not look like success to the calling shell script
				        sys.exit(1)
				    input_file = sys.argv[1]
				    model_name = sys.argv[2]
				    output_file = sys.argv[3]
				    # Exit non-zero only on an explicit False result; any other return value
				    # (including None) leaves the exit status at 0.
				    if process_jsonl(input_file, model_name, output_file) is False:
				        sys.exit(1)
			
--- a/evaluation/swe_bench/scripts/setup/swe_env_setup.sh
+++ b/evaluation/swe_bench/scripts/setup/swe_env_setup.sh
@@ -1,31 +0,0 @@
 
				#!/bin/bash
				# THIS SCRIPT ONLY NEED TO BE RUN ONCE BEFORE EVALUATION
				#
				# Copies the setup and report scripts into the eval workspace, then runs the
				# in-container setup script inside the builder image with the workspace
				# mounted at /swe_util. Paths are relative, so run it from the repository root.

				EVAL_DOCKER_IMAGE=ghcr.io/opendevin/eval-swe-bench:builder
				EVAL_WORKSPACE="evaluation/swe_bench/eval_workspace"
				EVAL_WORKSPACE=$(realpath "$EVAL_WORKSPACE")

				SETUP_INSTANCE_FILENAME=swe-bench-test.json # OR swe-bench-test-lite.json

				if [ ! -d "$EVAL_WORKSPACE" ]; then
				    mkdir -p "$EVAL_WORKSPACE"
				fi

				if [ -f "$EVAL_WORKSPACE/swe_env_setup.sh" ]; then
				    rm "$EVAL_WORKSPACE/swe_env_setup.sh"
				fi
				SCRIPT_DIR=evaluation/swe_bench/scripts/setup

				# The in-container script is stored without its leading underscore.
				cp "$SCRIPT_DIR/_swe_env_setup.sh" "$EVAL_WORKSPACE/swe_env_setup.sh"
				# The remaining helpers keep their names.
				for helper in \
				    swe_entry.sh \
				    get_model_report.sh \
				    get_agent_report.sh \
				    process_output_json_file.py \
				    merge_fine_grained_report.py
				do
				    cp "$SCRIPT_DIR/$helper" "$EVAL_WORKSPACE/$helper"
				done

				# Create an 'opendevin' user with the host user's uid inside the container,
				# then run the setup script as that user. $(id -u) is expanded on the host.
				docker run \
				    -v "$EVAL_WORKSPACE":/swe_util \
				    -e UID="$(id -u)" \
				    --rm -it "$EVAL_DOCKER_IMAGE" \
				    bash -c "useradd -rm -d /home/opendevin -s /bin/bash -u $(id -u) opendevin && su opendevin -c 'bash /swe_util/swe_env_setup.sh $SETUP_INSTANCE_FILENAME'"
			
 
				-#