
[eval] improve update output script for swe-bench (#4180)

Xingyao Wang 1 year ago
Parent
Commit
245334e89d

+ 11 - 5
evaluation/swe_bench/scripts/cleanup_remote_runtime.sh

@@ -12,16 +12,22 @@ n_runtimes=$(echo $response | jq -r '.total')
 echo "Found ${n_runtimes} runtimes. Stopping them..."
 
 runtime_ids=$(echo $response | jq -r '.runtimes | .[].runtime_id')
-# Loop through each runtime and stop it
-counter=1
-for runtime_id in $runtime_ids; do
+
+# Function to stop a single runtime
+stop_runtime() {
+  local runtime_id=$1
+  local counter=$2
   echo "Stopping runtime ${counter}/${n_runtimes}: ${runtime_id}"
   curl --silent --location --request POST "${BASE_URL}/stop" \
     --header "X-API-Key: ${ALLHANDS_API_KEY}" \
     --header "Content-Type: application/json" \
     --data-raw "{\"runtime_id\": \"${runtime_id}\"}"
   echo
-  ((counter++))
-done
+}
+export -f stop_runtime
+export BASE_URL ALLHANDS_API_KEY n_runtimes
+
+# Use GNU Parallel to stop runtimes in parallel
+echo "$runtime_ids" | parallel -j 16 --progress stop_runtime {} {#}
 
 echo "All runtimes have been stopped."

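For readers unfamiliar with the `export -f` + GNU Parallel pattern above, here is a minimal Python sketch of the same idea (stop all runtimes with bounded concurrency). It is illustrative only and not part of this commit: the `/list` endpoint and its response shape are assumptions, since the hunk above only shows how an already-fetched response is consumed.

```python
# Illustrative sketch only (not part of this commit): the same
# "stop all runtimes with bounded concurrency" idea as the GNU Parallel
# change above, expressed with Python's concurrent.futures. BASE_URL and
# ALLHANDS_API_KEY mirror the shell script's env vars; the '/list' endpoint
# and its response shape are assumptions.
import os
from concurrent.futures import ThreadPoolExecutor

import requests

BASE_URL = os.environ['BASE_URL']
HEADERS = {
    'X-API-Key': os.environ['ALLHANDS_API_KEY'],
    'Content-Type': 'application/json',
}


def stop_runtime(runtime_id: str) -> str:
    # Mirrors the curl call in stop_runtime(): POST /stop with the runtime id.
    resp = requests.post(
        f'{BASE_URL}/stop', headers=HEADERS, json={'runtime_id': runtime_id}
    )
    resp.raise_for_status()
    return runtime_id


if __name__ == '__main__':
    # Assumed listing endpoint; the shell script obtains the same data earlier.
    runtimes = requests.get(f'{BASE_URL}/list', headers=HEADERS).json()['runtimes']
    runtime_ids = [r['runtime_id'] for r in runtimes]
    print(f'Found {len(runtime_ids)} runtimes. Stopping them...')

    # 16 workers, matching `parallel -j 16` in the script.
    with ThreadPoolExecutor(max_workers=16) as pool:
        for i, rid in enumerate(pool.map(stop_runtime, runtime_ids), start=1):
            print(f'Stopped runtime {i}/{len(runtime_ids)}: {rid}')
```
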
+ 76 - 0
evaluation/swe_bench/scripts/eval/summarize_outputs.py

@@ -0,0 +1,76 @@
+#!/usr/bin/env python3
+import argparse
+import json
+from collections import Counter
+
+ERROR_KEYWORDS = [
+    'Agent encountered an error while processing the last action',
+    'APIError',
+    'Action execution failed',
+]
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('output_file', type=str, help='The file to summarize')
+    args = parser.parse_args()
+
+    with open(args.output_file, 'r') as file:
+        lines = file.readlines()
+
+    num_lines = len(lines)
+    num_error_lines = 0
+    num_agent_stuck_in_loop = 0
+
+    num_resolved = 0
+    num_empty_patch = 0
+
+    error_counter = Counter()
+
+    for line in lines:
+        _d = json.loads(line)
+        patch = _d.get('test_result', {}).get('git_patch', '')
+        if patch == '':
+            num_empty_patch += 1
+            continue
+
+        report = _d.get('report', {}) or {}
+        resolved = report.get('resolved', False)
+        if resolved:
+            num_resolved += 1
+
+        error = _d.get('error', None)
+
+        if error is not None and isinstance(error, str):
+            agent_stuck_in_loop = 'Agent got stuck in a loop' in error
+            contains_error = bool(error) and not agent_stuck_in_loop
+            if agent_stuck_in_loop:
+                error_counter['Agent got stuck in a loop'] += 1
+                num_agent_stuck_in_loop += 1
+            elif contains_error:
+                error_counter[error] += 1
+            continue
+
+        for keyword in ERROR_KEYWORDS:
+            if keyword in line:
+                error_counter[keyword] += 1
+                num_error_lines += 1
+                break
+
+    # print the error counter (with percentage)
+    print('-' * 100)
+    print(
+        f'# of resolved: {num_resolved} / {num_lines} ({num_resolved / num_lines * 100:.2f}%)'
+    )
+    print(
+        f'# of empty patch: {num_empty_patch} / {num_lines} ({num_empty_patch / num_lines * 100:.2f}%)'
+    )
+    print(
+        f'# of error lines: {num_error_lines} / {num_lines} ({num_error_lines / num_lines * 100:.2f}%)'
+    )
+    print(
+        f'# of loop: {num_agent_stuck_in_loop} / {num_lines} ({num_agent_stuck_in_loop / num_lines * 100:.2f}%)'
+    )
+    print('-' * 100)
+    print('Detailed error breakdown:')
+    for error, count in error_counter.items():
+        print(f'{error}: {count} ({count / num_lines * 100:.2f}%)')

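As a reading aid, a minimal sketch of the per-line JSON that summarize_outputs.py expects. Only the field names (`test_result.git_patch`, `report.resolved`, `error`) come from the script above; the instance id and concrete values are invented for illustration.

```python
# Hypothetical example of one line in an output .jsonl file, limited to the
# fields summarize_outputs.py actually reads. All concrete values are invented.
import json

sample_line = {
    'instance_id': 'example__repo-123',  # invented id
    'test_result': {'git_patch': 'diff --git a/f.py b/f.py\n+pass\n'},
    'report': {'resolved': True},
    'error': None,  # e.g. 'Agent got stuck in a loop' when the agent looped
}

with open('output.jsonl', 'w') as f:
    f.write(json.dumps(sample_line) + '\n')

# Summarize it, e.g.:
#   python evaluation/swe_bench/scripts/eval/summarize_outputs.py output.jsonl
```
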
+ 117 - 14
evaluation/swe_bench/scripts/eval/update_output_with_eval.py

@@ -10,16 +10,36 @@ parser.add_argument('input_file', type=str)
 args = parser.parse_args()
 
 dirname = os.path.dirname(args.input_file)
-report_json = os.path.join(dirname, 'report.json')
 
 df = pd.read_json(args.input_file, lines=True)
 
-output_md_filepath = os.path.join(dirname, 'README.md')
 instance_id_to_status = defaultdict(
-    lambda: {'resolved': False, 'empty_generation': False}
+    lambda: {
+        'empty_generation': False,
+        'resolved': False,
+        'failed_apply_patch': False,
+        'error_eval': False,
+        'test_timeout': False,
+    }
 )
-if os.path.exists(report_json):
-    with open(report_json, 'r') as f:
+
+
+# Apply the status to the dataframe
+def apply_report(row):
+    instance_id = row['instance_id']
+    if instance_id in instance_id_to_status:
+        return dict(instance_id_to_status[instance_id])
+    return row.get('report', {})
+
+
+swebench_official_report_json = os.path.join(dirname, 'report.json')
+openhands_remote_report_jsonl = args.input_file.replace(
+    '.jsonl', '.swebench_eval.jsonl'
+)
+
+if os.path.exists(swebench_official_report_json):
+    output_md_filepath = os.path.join(dirname, 'README.md')
+    with open(swebench_official_report_json, 'r') as f:
         report = json.load(f)
 
     output_md = (
@@ -70,15 +90,101 @@ if os.path.exists(report_json):
             f'- [{instance_id}](./eval_outputs/{instance_id}/run_instance.log)\n'
         )
 
-    # Apply the status to the dataframe
-    def apply_report(row):
-        instance_id = row['instance_id']
-        if instance_id in instance_id_to_status:
-            return dict(instance_id_to_status[instance_id])
-        return row.get('report', {})
+    df['report'] = df.apply(apply_report, axis=1)
+
+    with open(output_md_filepath, 'w') as f:
+        f.write(output_md)
 
+elif os.path.exists(openhands_remote_report_jsonl):
+    output_md_filepath = args.input_file.replace('.jsonl', '.swebench_eval.md')
+
+    df_eval = pd.read_json(openhands_remote_report_jsonl, lines=True, orient='records')
+
+    assert len(df['instance_id'].unique()) == len(
+        df
+    ), 'There are duplicate instance ids in the original output which is not allowed'
+    assert len(df_eval['instance_id'].unique()) == len(
+        df_eval
+    ), 'There are duplicate instance ids in the eval report which is not allowed'
+
+    for _, row in df_eval.iterrows():
+        instance_id_to_status[row['instance_id']] = row['test_result']['report']
     df['report'] = df.apply(apply_report, axis=1)
 
+    _n_instances = len(df)
+    _n_resolved = len(df[df['report'].apply(lambda x: x.get('resolved', False))])
+    _n_unresolved = _n_instances - _n_resolved
+    _n_empty_patch = len(
+        df[df['report'].apply(lambda x: x.get('empty_generation', False))]
+    )
+    _n_error = len(df[df['report'].apply(lambda x: x.get('error_eval', False))])
+    output_md = (
+        '# SWE-bench Report\n'
+        'This folder contains the evaluation results of the SWE-bench using the [official evaluation docker containerization](https://github.com/princeton-nlp/SWE-bench/blob/main/docs/20240627_docker/README.md#choosing-the-right-cache_level).\n\n'
+        '## Summary\n'
+        f'- submitted instances: {_n_instances}\n'
+        f'- empty patch instances: {_n_empty_patch}\n'
+        f'- resolved instances: {_n_resolved}\n'
+        f'- unresolved instances: {_n_unresolved}\n'
+        f'- error instances: {_n_error}\n'
+    )
+
+    def _instance_id_to_log_path(instance_id):
+        path = f"{args.input_file.replace('.jsonl', '.swebench_eval.logs')}/instance_{instance_id}.log"
+        # make it relative path
+        path = os.path.relpath(path, start=dirname)
+        return path
+
+    output_md += '\n## Resolved Instances\n'
+    # instance_id to status
+    for instance_id in sorted(
+        df[df['report'].apply(lambda x: x.get('resolved', False))][
+            'instance_id'
+        ].unique()
+    ):
+        instance_id_to_status[instance_id]['resolved'] = True
+        output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n'
+
+    output_md += '\n## Unresolved Instances\n'
+    for instance_id in sorted(
+        df[~df['report'].apply(lambda x: x.get('resolved', False))][
+            'instance_id'
+        ].unique()
+    ):
+        output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n'
+
+    output_md += '\n## Error Instances\n'
+    for instance_id in sorted(
+        df[df['report'].apply(lambda x: x.get('error_eval', False))][
+            'instance_id'
+        ].unique()
+    ):
+        instance_id_to_status[instance_id]['error_eval'] = True
+        output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n'
+
+    output_md += '\n## Empty Patch Instances\n'
+    for instance_id in sorted(
+        df[df['report'].apply(lambda x: x.get('empty_generation', False))][
+            'instance_id'
+        ].unique()
+    ):
+        instance_id_to_status[instance_id]['empty_generation'] = True
+        output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n'
+
+    output_md += '\n## Incomplete Instances\n'
+    for instance_id in sorted(
+        df[df['report'].apply(lambda x: x.get('test_timeout', False))][
+            'instance_id'
+        ].unique()
+    ):
+        output_md += f'- [{instance_id}]({_instance_id_to_log_path(instance_id)})\n'
+    with open(output_md_filepath, 'w') as f:
+        f.write(output_md)
+else:
+    print(
+        f'No report file found: Both {swebench_official_report_json} and {openhands_remote_report_jsonl} do not exist.'
+    )
+    exit()
 
 if os.path.exists(args.input_file + '.bak'):
     conf = input('Existing backup file found. Do you want to overwrite it? (y/n)')
@@ -89,6 +195,3 @@ if os.path.exists(args.input_file + '.bak'):
 # backup the original file
 os.rename(args.input_file, args.input_file + '.bak')
 df.to_json(args.input_file, orient='records', lines=True)
-
-with open(output_md_filepath, 'w') as f:
-    f.write(output_md)

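The heart of the change to update_output_with_eval.py is the `apply_report` merge: per-instance evaluation statuses collected in `instance_id_to_status` replace the `report` column of the original output, while untouched instances keep their old report. A standalone sketch of that merge, with invented instance ids and statuses:

```python
# Standalone sketch of the apply_report merge used by update_output_with_eval.py.
# Instance ids and statuses below are invented for illustration.
from collections import defaultdict

import pandas as pd

# Eval results keyed by instance id (only one of the two instances was evaluated).
instance_id_to_status = defaultdict(
    lambda: {'empty_generation': False, 'resolved': False},
    {'example__repo-1': {'empty_generation': False, 'resolved': True}},
)

# Original agent output, as loaded with pd.read_json(..., lines=True).
df = pd.DataFrame(
    [
        {'instance_id': 'example__repo-1', 'report': {}},
        {'instance_id': 'example__repo-2', 'report': {}},
    ]
)


def apply_report(row):
    # Same logic as the script: prefer the eval status when one exists,
    # otherwise keep whatever report the row already carries.
    instance_id = row['instance_id']
    if instance_id in instance_id_to_status:
        return dict(instance_id_to_status[instance_id])
    return row.get('report', {})


df['report'] = df.apply(apply_report, axis=1)
print(df)
# example__repo-1 now reports {'empty_generation': False, 'resolved': True};
# example__repo-2 keeps its original (empty) report.
```

Note that membership checks (`in`) do not trigger a `defaultdict`'s factory, so instances that never appear in the eval report keep their original `report` value.
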
+ 3 - 0
evaluation/swe_bench/scripts/eval_infer_remote.sh

@@ -41,3 +41,6 @@ fi
 
 # Run the command
 eval $COMMAND
+
+# update the output with evaluation results
+poetry run python evaluation/swe_bench/scripts/eval/update_output_with_eval.py $INPUT_FILE